Example #1
0
def readAndIndex(iterator, with_value=True):
    '''read intervals from a gtf stream and build a genomic index.

    Parameters
    ----------
    iterator :
        iterator yielding gtf records with ``contig``, ``start`` and
        ``end`` attributes.
    with_value : bool
        if True, each interval stores its gtf record as payload.

    Returns
    -------
    index :
        an :class:`IndexedGenome.IndexedGenome` when *with_value* is
        True, otherwise an :class:`IndexedGenome.Simple`.
    '''

    if with_value:
        idx = IndexedGenome.IndexedGenome()
        for entry in iterator:
            idx.add(entry.contig, entry.start, entry.end, entry)
        return idx

    idx = IndexedGenome.Simple()
    for entry in iterator:
        idx.add(entry.contig, entry.start, entry.end)
    return idx
def buildQuicksectMask(bed_file):
    '''build a Quicksect mask from regions listed in a bed file.

    Each region is widened by one base on either side so that the
    mask covers interval boundaries accurately.

    Parameters
    ----------
    bed_file : string
        path to a bed file listing the regions to mask.

    Returns
    -------
    mask :
        an :class:`IndexedGenome.Quicksect` containing the regions.
    '''
    mask = IndexedGenome.Quicksect()

    count = 0
    for region in Bed.iterator(iotools.openFile(bed_file)):
        # extend the region by one base in both directions to make
        # an accurate mask
        mask.add(region.contig, region.start - 1, region.end + 1, 1)
        count += 1

    E.info("Built Quicksect mask for %i regions" % count)

    return mask
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets.

    For each input bed file, the corresponding ``<track>_intervals``
    database table is read and indexed by genomic position.  For each
    interval in *reference*, the maximum *field* value of overlapping
    intervals per track is written to *outfile* as a tab-separated
    table.

    Parameters
    ----------
    infiles : list
        input bed files (``.bed.gz``); one column per file is emitted.
    outfile : string
        output filename.
    field : string
        column of the ``_intervals`` tables to correlate (e.g. peakval).
    reference : string
        bed file with the reference intervals driving the output rows.
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    try:
        for infile in infiles:
            track = P.snip(infile, ".bed.gz")
            tablename = "%s_intervals" % P.tablequote(track)
            cc = dbhandle.cursor()
            # table/field names are quoted pipeline-internal values,
            # not untrusted input
            statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals(
            )
            cc.execute(statement)
            ix = IndexedGenome.IndexedGenome()
            for contig, start, end, peakval in cc:
                ix.add(contig, start, end, peakval)
            cc.close()
            idx.append(ix)
            tracks.append(track)
    finally:
        # BUG FIX: the connection (and cursors) were never closed,
        # leaking a database handle per call.
        dbhandle.close()

    outs = iotools.openFile(outfile, "w")
    try:
        outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

        for bed in Bed.iterator(infile=iotools.openFile(reference, "r")):

            row = []
            for ix in idx:
                try:
                    intervals = list(ix.get(bed.contig, bed.start, bed.end))
                except KeyError:
                    # contig absent from this track's index
                    row.append("")
                    continue

                if len(intervals) == 0:
                    peakval = ""
                else:
                    # payload is the third element of each interval tuple
                    peakval = str(max([x[2] for x in intervals]))
                row.append(peakval)

            outs.write(str(bed) + "\t" + "\t".join(row) + "\n")
    finally:
        outs.close()
Example #4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Filters a genome-mapped bam file against transcript models,
    junction reads and masked regions, writing filtered alignments
    to stdout and optional side outputs (mismapped reads, stats).
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = iotools.read_map(
            iotools.open_file(options.filename_map), has_header=True)
        # invert: map file is name -> number, we need number -> name
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(iotools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    # transcript not present in the map: skip it
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = iotools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if transcripts_samfile is not None:
        transcripts_samfile.close()

    # BUG FIX: junctions_samfile was opened but never closed
    if junctions_samfile is not None:
        junctions_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
Example #5
0
def annotateWindows(contig, windows, gff_data, fasta, options):
    """annotate windows.

    Indexes *gff_data* by position, then for each (start, end) window
    on *contig* collects overlapping intervals, optionally transforms
    them, applies the configured decorator and writes one tab-separated
    line per window to ``options.stdout``.

    Parameters
    ----------
    contig : string
        contig the windows are on.
    windows : iterable
        (start, end) tuples.
    gff_data : iterable
        gff/gtf records to annotate the windows with.
    fasta :
        genome sequence accessor, passed through to interval decorators.
    options :
        command line options (transform, decorator, is_gtf, skip_empty,
        loglevel, output streams).
    """

    index = IndexedGenome.IndexedGenome()
    for g in gff_data:
        index.add(g.contig, g.start, g.end, g)

    is_gtf = options.is_gtf

    if options.transform == "none":
        def transform(wstart, wend, intervals):
            # identity: strip the gff payload, keep coordinates only
            return [(i[0], i[1]) for i in intervals]
    elif options.transform == "overlap":
        transform = transform_overlap
    elif options.transform == "complement":
        transform = transform_complement
    elif options.transform == "third_codon":
        transform = transform_third_codon
    else:
        raise ValueError("unknown transform %s" % options.transform)

    # interval decorators work on (start, end) tuples; score decorators
    # work on the list of interval scores instead
    work_on_intervals = True
    if options.decorator == "counts":
        decorator = decorator_counts
    elif options.decorator == "mean-length":
        decorator = decorator_mean_length
    elif options.decorator == "median-length":
        decorator = decorator_median_length
    elif options.decorator == "percent-coverage":
        decorator = decorator_percent_coverage
    elif options.decorator == "gc":
        decorator = decorator_percent_gc
    elif options.decorator == "median-score":
        decorator = decorator_median_score
        work_on_intervals = False
    elif options.decorator == "mean-score":
        decorator = decorator_mean_score
        work_on_intervals = False
    elif options.decorator == "stddev-score":
        decorator = decorator_stddev_score
        work_on_intervals = False
    elif options.decorator == "min-score":
        decorator = decorator_min_score
        work_on_intervals = False
    elif options.decorator == "max-score":
        decorator = decorator_max_score
        work_on_intervals = False
    else:
        raise ValueError("unknown decorator %s" % options.decorator)

    for start, end in windows:

        # counts/length before/after transformation
        n1, l1, n2, l2 = 0, 0, 0, 0

        values, intervals_with_gff, genes, transcripts = [], [], set(), set()

        try:
            for istart, iend, value in index.get(contig, start, end):
                n1 += 1
                l1 += iend - istart
                intervals_with_gff.append((istart, iend, value))
                values.append(value.score)
                if is_gtf:
                    genes.add(value.gene_id)
                    transcripts.add(value.transcript_id)
        except KeyError:
            # contig not in the index: window has no annotations
            pass

        if n1 == 0 and options.skip_empty:
            continue

        if work_on_intervals:

            if options.loglevel >= 3:
                # BUG FIX: this logged `intervals`, which is only
                # assigned below - the debug path raised a NameError
                options.stdlog.write(
                    "# intervals in window %i:%i before transformation: %s\n" %
                    (start, end, str(intervals_with_gff)))

            intervals = transform(start, end, intervals_with_gff)

            for xstart, xend in intervals:
                n2 += 1
                l2 += xend - xstart

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# intervals in window %i:%i after transformation: %s\n" %
                    (start, end, str(intervals)))

            score, extra_info = decorator(intervals, start, end, contig, fasta)

        else:
            if len(values) > 0:
                values = list(map(float, values))
                score, extra_info = decorator(values, start, end, contig)
            else:
                score, extra_info = 0, None

            l2 = 0
            n2 = 0

        if is_gtf:
            ngenes, ntranscripts = len(genes), len(transcripts)
        else:
            ngenes, ntranscripts = 0, 0

        if extra_info:
            # keep the output strictly tab-separated
            extra_info = re.sub("\t", ";", extra_info)
        options.stdout.write("\t".join(
            map(str, (contig, start, end, ngenes, ntranscripts, n1, l1, n2, l2,
                      score, extra_info))) + "\n")
Example #6
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Concatenates the supplied bed files, merges overlapping intervals
    and reports, for each merged interval, the number of input samples
    overlapping it.
    """
    import os

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--bed-file",
                        dest="infiles",
                        type=str,
                        metavar="bed",
                        help="supply list of bed files",
                        action="append")

    parser.set_defaults(infiles=[])

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    # positional arguments are also treated as input bed files
    args.infiles.extend(unknown)
    if len(args.infiles) == 0:
        raise ValueError('please provide at least 1 bed file')

    E.info("concatenating bed files")
    # concatenate the list of files
    tmp = tempfile.NamedTemporaryFile(delete=False, mode="w")
    tmp_merge = tempfile.NamedTemporaryFile(delete=False, mode="w")
    try:
        infs = args.infiles
        for inf in infs:
            for bed in Bed.iterator(iotools.open_file(inf)):
                tmp.write("%s\n" % bed)
        tmp.close()

        E.info("merging bed entries")
        # merge the bed entries in the file
        name = tmp.name
        tmp_bed = pybedtools.BedTool(name)
        tmp_bed.sort().merge().saveas(tmp_merge.name)
        tmp_merge.close()

        E.info("indexing bed entries")
        # index the bed entries
        merged = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(tmp_merge.name)):
            merged.add(bed.contig, bed.start, bed.end)
    finally:
        # BUG FIX: the delete=False temporary files were never removed
        tmp.close()
        tmp_merge.close()
        os.unlink(tmp.name)
        os.unlink(tmp_merge.name)

    counts = collections.defaultdict(int)
    # list of samples
    samples = args.infiles

    E.info("counting no. samples overlapping each interval")
    for sample in samples:
        # count each merged interval at most once per sample
        found = set()
        for bed in Bed.iterator(iotools.open_file(sample)):
            if merged.contains(bed.contig, bed.start, bed.end):
                key = [bed.contig] + \
                    [x for x in merged.get(bed.contig, bed.start, bed.end)]
                key = (key[0], key[1][0], key[1][1])
                if key in found:
                    continue
                found.add(key)

                # tuple of interval description as key - (contig, start, end)
                counts[key] += 1

    # open outfile
    args.stdout.write("contig\tstart\tend\tcount\n")

    E.info("outputting result")
    for interval, count in sorted(counts.items()):
        args.stdout.write("\t".join(map(str, interval)) + "\t" + str(count) +
                          "\n")

    # write footer and output benchmark information.
    E.stop()
Example #7
0
    def __call__(self, track, slice=None):
        """Compute per-field ROC curves of interval reproducibility.

        For each field in ``self.mFields``, intervals of all replicates
        of *track* are loaded from the database, merged into a common
        set of regions, and each merged region is scored by whether it
        is found in every replicate.  Returns an ordered dict mapping
        field -> {"FPR": ..., field: ...} ROC coordinates.
        """

        result = odict()

        # merged regions are computed once (for the first field) and
        # reused for all subsequent fields
        merged = None
        rocs = []

        for field in self.mFields:
            # one result set per replicate: (contig, start, end, value)
            data = []
            for replicate in EXPERIMENTS.getTracks(track):
                statement = "SELECT contig, start, end,%(field)s FROM %(replicate)s_intervals" % locals(
                )
                data.append(self.get(statement))

            # one positional index per replicate
            idx = []
            for x in range(len(data)):
                i = IndexedGenome.IndexedGenome()
                for contig, start, end, peakval in data[x]:
                    i.add(contig, start, end, peakval)
                idx.append(i)

            def _iter(all):
                # yield maximal merged regions from the sorted union of
                # all replicate intervals (overlapping/adjacent-by-contig
                # intervals are fused)
                all.sort()
                last_contig, first_start, last_end, last_value = all[0]
                for contig, start, end, value in all[1:]:
                    if contig != last_contig or last_end < start:
                        yield (last_contig, first_start, last_end)
                        last_contig, first_start, last_end = contig, start, end
                    else:
                        last_end = max(last_end, end)
                yield (last_contig, first_start, last_end)

            if not merged:
                all = [x for x in itertools.chain(*data)]
                merged = list(_iter(all))

            roc_data = []
            for contig, start, end in merged:
                # overlapping intervals of each replicate for this region
                intervals = []
                for i in idx:
                    try:
                        intervals.append(list(i.get(contig, start, end)))
                    except KeyError:
                        # contig absent from this replicate's index
                        continue

                if len(intervals) == 0:
                    continue

                # reproducible = present in every replicate
                is_repro = len([x for x in intervals if x != []]) == len(data)
                # region score = maximum field value over all replicates
                value = max([x[2] for x in itertools.chain(*intervals)])

                # fpr, tpr
                roc_data.append((value, is_repro))

            # sort by descending score for ROC computation
            roc_data.sort()
            roc_data.reverse()

            roc = list(zip(*Stats.computeROC(roc_data)))
            result[field] = odict((("FPR", roc[0]), (field, roc[1])))

        return result