Example #1
def toIntronIntervals(chunk):
    """convert a set of gtf elements within a transcript to intron coordinates.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates
    """
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand, chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk if x.feature == "exon"])
    return Intervals.complement(intervals)
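
All of the examples on this page lean on CGAT's Intervals module. Below is a minimal sketch of the two helpers they rely on, assuming half-open (start, end) tuples; the names mirror CGAT's Intervals.combine and Intervals.complement, but the implementation is illustrative, not CGAT's own.

def combine(intervals):
    """Merge overlapping or touching (start, end) intervals into a
    sorted, disjoint list."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged


def complement(intervals, start=None, end=None):
    """Return the gaps between intervals; with explicit bounds, also
    return the uncovered flanks inside [start, end)."""
    intervals = combine(intervals)
    gaps = [(a[1], b[0]) for a, b in zip(intervals, intervals[1:])]
    if start is not None and intervals and start < intervals[0][0]:
        gaps.insert(0, (start, intervals[0][0]))
    if end is not None and intervals and intervals[-1][1] < end:
        gaps.append((intervals[-1][1], end))
    return gaps


# two exons of one transcript give a single intron
assert complement([(100, 200), (300, 400)]) == [(200, 300)]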
Example #2
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand,
                                     chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk
                                   if x.feature == "exon"])
    return Intervals.complement(intervals)
Example #3
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand,
                                     chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine([(x.start, x.end)
                                   for x in chunk if x.feature == "exon"])
    return Intervals.complement(intervals)
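
A quick usage sketch for toIntronIntervals, with a hypothetical namedtuple standing in for the GTF records; real records come from CGAT's GTF iterators, and Intervals is assumed to behave as in the sketch above.

from collections import namedtuple

# hypothetical stand-in, just enough to exercise toIntronIntervals
GTFRecord = namedtuple(
    "GTFRecord", "contig strand transcript_id feature start end")

chunk = [
    GTFRecord("chr1", "+", "tx1", "exon", 100, 200),
    GTFRecord("chr1", "+", "tx1", "exon", 300, 400),
    GTFRecord("chr1", "+", "tx1", "CDS", 150, 200),  # non-exon, ignored
]

print(toIntronIntervals(chunk))  # expected: [(200, 300)]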
Example #4
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--feature",
                      dest="feature",
                      type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw",
                      dest="unstranded_wig",
                      type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw",
                      dest="plus_wig",
                      type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw",
                      dest="minus_wig",
                      type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed",
                      dest="bedfile",
                      type="string",
                      help="tabix indexed bed file with tag counts"),
    parser.add_option("-c",
                      "--use-centre",
                      dest="centre",
                      action="store_true",
                      default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":

        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]

        iterator = _exon_iterator(iterator)

    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded_wig")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])

    outlines = []
    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")

        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32",
                                            use_centre=options.centre)

        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32",
                                              use_centre=options.centre)

        intron_counts = intron_counts.sum()

        if options.feature == "exon":

            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                try:
                    exon_id = feature[0].exon_number
                except AttributeError:
                    exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = float(intron_counts)

        outlines.append([
            gene_id, transcript_id, exon_id,
            str(float(exon_counts)),
            str(intron_counts)
        ])

    options.stdout.write("\t".join([
        "gene_id", "transcript_id", "exon_id", "exon_count", "intron_count"
    ]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
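
The counting loop above reduces to "sum the tags that land inside a set of intervals". A stripped-down illustration, with a plain dict standing in for whatever iCLIP.count_intervals returns (the script only relies on it having a .sum()):

def count_in_intervals(per_base_counts, intervals):
    """Sum tag counts whose position falls in any half-open interval."""
    return sum(count for pos, count in per_base_counts.items()
               if any(start <= pos < end for start, end in intervals))

per_base = {105: 3, 250: 2, 310: 1}   # position -> tag count on one strand
exons = [(100, 200), (300, 400)]
introns = [(200, 300)]                # = Intervals.complement(exons)

print(count_in_intervals(per_base, exons))    # 4
print(count_in_intervals(per_base, introns))  # 2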
Example #5
    def processChunk( query_id, matches ):
        """process a set of matches from query_id"""

        global ninput, noutput, nskipped
        global nfull_matches, npartial_matches, ngood_matches
        global nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches
        global nremoved_regions, nqueries_removed_region
        global outfile_empty
        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentary matches

        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue
                
            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * math.pow( options.threshold_max_error_rate, match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue
            
            new_matches.append(match)

        matches = new_matches

        # filter matches        
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\
                                     (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) )
            nskipped += 1
            return
        
        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                if options.threshold_max_query_gaps and options.threshold_max_query_gaps > match.mQueryNGapsCounts:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and options.threshold_max_query_gapchars > match.mQueryNGapsBases:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and options.threshold_max_sbjct_gaps > match.mSbjctNGapsCounts:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and options.threshold_max_sbjct_gapchars > match.mSbjctNGapsBases:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue
                
                new_matches.append( match )
            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\
                                     (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) )
            nskipped += 1
            return

        ## Remove queries matching a forbidden region. This section
        ## will remove the full query if any of its matches overlaps a
        ## forbidden region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find( match.mSbjctFrom, match.mSbjctTo )
                if found and not options.keep_forbidden:
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write( "%s\toverlap with forbidden region\n" % query_id )
            return 

        ## check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)
            
        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        ## compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append( (match.mQueryFrom, match.mQueryTo) )
        
        rest = Intervals.complement( intervals, 0, match.mQueryLength )
        
        query_coverage = 100.0 * (match.mQueryLength - sum( map( lambda x: x[1] - x[0], rest) ) ) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append( query_id )
        elif  query_coverage > options.threshold_good_query_coverage:
            well_matched.append( query_id )
        else:
            partially_matched.append( query_id )

        aggregate_coverages.append( query_coverage )

        ## select matches to output
        matches, msg = selectMatches( query_id, matches, options, queries_fasta )

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write( "%s\n" %\
                                              "\t".join( map(str, (
                                match.mQueryId, match.mSbjctId, 
                                match.strand,
                                "%5.2f" % match.mQueryCoverage,
                                "%5.2f" % match.mSbjctCoverage,
                                "%5.2f" % match.mPid,
                                match.mQueryLength,
                                match.mSbjctLength,
                                match.mQueryFrom, match.mQueryTo,
                                match.mSbjctFrom, match.mSbjctTo,
                                ",".join( map(str,match.mBlockSizes) ),
                                ",".join( map(str,match.mQueryBlockStarts)),
                                ",".join( map(str,match.mSbjctBlockStarts)), 
                                ))))
                elif options.output_format == "psl":
                    options.stdout.write( str(match) + "\n" )

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write( "%s\tno matches selected: %s\n" % (query_id, msg) )
            nempty += 1
Example #6
def readWorkspace(infile,
                  workspace_builder="raw",
                  label="none",
                  map_id2annotation={}):
    """read workspace from infile.

    A workspace is a collection of intervals with two labels associated
    to each interval, one for the 5' and one for the 3' end.

    Available workspace builders are:

    gff
       take a gff file.

    gtf-intergenic
       build workspace from intergenic segments in a gtf file.

    gtf-intronic
       build workspace from intronic segments in a gtf file

    gtf-genic
       the workspace is built from genes (first to last exon).

    Available labels are:

    none
       no labels are given to the ends of workspaces

    direction
       labels are given based on the 5'/3' end of the
       bounding exon

    annotation
       labels are given based on a gene2annotation map.

    returns a list of segments for each contig in a dictionary
    """

    if label == "none":
        label_f = lambda x, y: (("X", ), ("X", ))
        info_f = lambda x: None
    elif label == "direction":
        label_f = lambda x, y: ((("5", "3")[x], ), (("3", "5")[y], ))
        info_f = lambda x: x.strand == "+"
    elif label == "annotation":
        label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y])
        info_f = lambda x: x.gene_id

    if workspace_builder == "gff":
        workspace = GTF.readAsIntervals(GFF.iterator(infile))

    elif workspace_builder == "gtf-intergenic":

        workspace = collections.defaultdict(list)
        # get all genes
        for e in GTF.merged_gene_iterator(GTF.iterator(infile)):
            workspace[e.contig].append((e.start, e.end, info_f(e)))

        # convert to intergenic regions.
        # overlapping genes are merged and the label
        # of the right-most entry is retained
        for contig in list(workspace.keys()):
            segs = workspace[contig]
            segs.sort()
            last = segs[0]
            new_segs = []
            for this in segs[1:]:
                if last[1] >= this[0]:
                    if this[1] > last[1]:
                        last = (last[0], this[1], this[2])
                    continue
                assert last[1] < this[0], "this=%s, last=%s" % (this, last)

                new_segs.append((last[1], this[0], label_f(last[2], this[2])))
                last = this
            workspace[contig] = new_segs

    elif workspace_builder == "gtf-intronic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            introns = Intervals.complement(exons)

            r = ee[0]
            for start, end in introns:
                workspace[r.contig].append(
                    (start, end, label_f(info_f(r), info_f(r))))
    elif workspace_builder == "gtf-genic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            start, end = exons[0][0], exons[-1][1]
            r = ee[0]
            workspace[r.contig].append(
                (start, end, label_f(info_f(r), info_f(r))))

    else:
        raise ValueError("unknown workspace_builder %s" % workspace_builder)

    return workspace
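
The gtf-intergenic branch above is a merge-then-gap pass over sorted gene segments. Here is the same logic stripped to plain (start, end, info) tuples on one contig; the function name is mine, for illustration.

def intergenic_segments(segs, label_f):
    """Merge overlapping (start, end, info) segments, then emit the gaps
    between consecutive merged segments, labelled from the flanking infos."""
    segs = sorted(segs)
    last = segs[0]
    gaps = []
    for this in segs[1:]:
        if last[1] >= this[0]:  # overlap: extend, keep right-most label
            if this[1] > last[1]:
                last = (last[0], this[1], this[2])
            continue
        gaps.append((last[1], this[0], label_f(last[2], this[2])))
        last = this
    return gaps

genes = [(0, 100, "geneA"), (80, 150, "geneB"), (400, 500, "geneC")]
print(intergenic_segments(genes, lambda a, b: (a, b)))
# [(150, 400, ('geneB', 'geneC'))]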
Example #7
def __init__(self, *args, **kwargs):
    Sampler.__init__(self, *args, **kwargs)
    self.mGapLengths = [
        x[1] - x[0] for x in Intervals.complement(
            self.mObserved, self.mWorkStart, self.mWorkEnd)
    ]
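
A toy check of what self.mGapLengths ends up holding: the lengths of the uncovered stretches of the work window. This assumes sorted, disjoint observed intervals; the values are illustrative.

observed = [(10, 20), (40, 50)]       # self.mObserved, already disjoint
work_start, work_end = 0, 100         # self.mWorkStart, self.mWorkEnd

# bounded complement: pair up window edges with interval edges
edges = [work_start] + [p for iv in observed for p in iv] + [work_end]
gaps = [(a, b) for a, b in zip(edges[::2], edges[1::2]) if a < b]

print([e - s for s, e in gaps])       # [10, 20, 50]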
Example #8
    def processChunk(query_id, matches):
        """process a set of matches from query_id"""

        global ninput, noutput, nskipped
        global nfull_matches, npartial_matches, ngood_matches
        global nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches
        global nremoved_regions, nqueries_removed_region
        global outfile_empty
        ninput += 1

        full_matches = []
        good_matches = []
        partial_matches = []

        x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0
        nmatches = len(matches)

        new_matches = []

        # absolute filters applicable to non-fragmentary matches

        for match in matches:

            if match.mPid < options.threshold_min_pid:
                nremoved_pid += 1
                continue

            if match.mNMatches < options.threshold_min_matches:
                nremoved_nmatches += 1
                continue

            if options.threshold_max_error_rate:
                r = 100.0 * \
                    math.pow(
                        options.threshold_max_error_rate, match.mNMatches + match.mNMismatches)
                if match.mPid < r:
                    nremoved_pid += 1
                    x_nremoved_pid += 1
                    continue

            new_matches.append(match)

        matches = new_matches

        # filter matches
        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        if options.keep_unique_matches and len(matches) == 1:
            pass
        else:
            new_matches = []

            for match in matches:

                if match.mQueryCoverage < options.threshold_min_query_coverage:
                    nremoved_query_coverage += 1
                    x_nquery_coverage += 1
                    continue

                if options.threshold_max_query_gaps and options.threshold_max_query_gaps > match.mQueryNGapsCounts:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_query_gapchars and options.threshold_max_query_gapchars > match.mQueryNGapsBases:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gaps and options.threshold_max_sbjct_gaps > match.mSbjctNGapsCounts:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                if options.threshold_max_sbjct_gapchars and options.threshold_max_sbjct_gapchars > match.mSbjctNGapsBases:
                    nremoved_gaps += 1
                    x_nremoved_gaps += 1
                    continue

                new_matches.append(match)
            matches = new_matches

        if len(matches) == 0:
            if outfile_empty:
                outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %
                                    (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches))
            nskipped += 1
            return

        # Remove queries matching a forbidden region. This section
        # will remove the full query if any of its matches overlaps a
        # forbidden region.
        keep = True
        for match in matches:
            if intersectors and match.mSbjctId in intersectors:
                found = intersectors[match.mSbjctId].find(
                    match.mSbjctFrom, match.mSbjctTo)
                if found and not options.keep_forbidden:
                    nremoved_regions += 1
                    keep = False
                    continue

        if not keep:
            nqueries_removed_region += 1
            if outfile_empty:
                outfile_empty.write(
                    "%s\toverlap with forbidden region\n" % query_id)
            return

        # check for full length matches
        for match in matches:
            if match.mQueryCoverage >= 99.9:
                full_matches.append(match)
            if match.mQueryCoverage > options.threshold_good_query_coverage:
                good_matches.append(match)
            else:
                partial_matches.append(match)

        if full_matches:
            nfull_matches += 1
        elif good_matches:
            ngood_matches += 1
        elif partial_matches:
            npartial_matches += 1

        # compute coverage of sequence with matches
        intervals = []
        for match in full_matches + good_matches + partial_matches:
            intervals.append((match.mQueryFrom, match.mQueryTo))

        rest = Intervals.complement(intervals, 0, match.mQueryLength)

        query_coverage = 100.0 * \
            (match.mQueryLength -
             sum([x[1] - x[0] for x in rest])) / match.mQueryLength

        if query_coverage >= 99.9:
            fully_matched.append(query_id)
        elif query_coverage > options.threshold_good_query_coverage:
            well_matched.append(query_id)
        else:
            partially_matched.append(query_id)

        aggregate_coverages.append(query_coverage)

        # select matches to output
        matches, msg = selectMatches(query_id, matches, options, queries_fasta)

        if len(matches) > 0:
            for match in matches:
                if options.query_forward_coordinates:
                    match.convertCoordinates()

                if options.output_format == "map":
                    options.stdout.write("%s\n" %
                                         "\t".join(map(str, (
                                             match.mQueryId, match.mSbjctId,
                                             match.strand,
                                             "%5.2f" % match.mQueryCoverage,
                                             "%5.2f" % match.mSbjctCoverage,
                                             "%5.2f" % match.mPid,
                                             match.mQueryLength,
                                             match.mSbjctLength,
                                             match.mQueryFrom, match.mQueryTo,
                                             match.mSbjctFrom, match.mSbjctTo,
                                             ",".join(
                                                 map(str, match.mBlockSizes)),
                                             ",".join(
                                                 map(str, match.mQueryBlockStarts)),
                                             ",".join(
                                                 map(str, match.mSbjctBlockStarts)),
                                         ))))
                elif options.output_format == "psl":
                    options.stdout.write(str(match) + "\n")

            noutput += 1
        else:
            if outfile_empty:
                outfile_empty.write(
                    "%s\tno matches selected: %s\n" % (query_id, msg))
            nempty += 1
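
The query_coverage arithmetic above amounts to covered bases over query length, with the uncovered bases coming from the bounded complement. A self-contained worked example with overlapping matches:

query_length = 1000
match_spans = [(0, 400), (350, 700)]     # (mQueryFrom, mQueryTo) pairs

# merge overlaps, then count covered bases (equivalent to subtracting
# the summed complement, as the code above does)
merged = []
for start, end in sorted(match_spans):
    if merged and start <= merged[-1][1]:
        merged[-1] = (merged[-1][0], max(merged[-1][1], end))
    else:
        merged.append((start, end))

covered = sum(end - start for start, end in merged)
print(100.0 * covered / query_length)    # 70.0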
Example #9
def readWorkspace(infile,
                  workspace_builder="raw",
                  label="none",
                  map_id2annotation={}):
    """read workspace from infile.

    A workspace is a collection of intervals with two labels associated
    to each interval, one for the 5' and one for the 3' end.

    Available workspace builders are:

    gff
       take a gff file. 

    gtf-intergenic
       build workspace from intergenic segments in a gtf file. 

    gtf-intronic
       build workspace from intronic segments in a gtf file

    gtf-genic
       the workspace is built from genes (first to last exon).

    Available labels are:

    none
       no labels are given to the ends of workspaces

    direction
       labels are given based on the 5'/3' end of the
       bounding exon

    annotation
       labels are given based on a gene2annotation map.

    returns a list of segments for each contig in a dictionary
    """

    if label == "none":
        label_f = lambda x, y: (("X",), ("X",))
        info_f = lambda x: None
    elif label == "direction":
        label_f = lambda x, y: ((("5", "3")[x],), (("3", "5")[y],))
        info_f = lambda x: x.strand == "+"
    elif label == "annotation":
        label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y])
        info_f = lambda x: x.gene_id

    if workspace_builder == "gff":
        workspace = GTF.readAsIntervals(GFF.iterator(infile))

    elif workspace_builder == "gtf-intergenic":

        workspace = collections.defaultdict(list)
        # get all genes
        for e in GTF.merged_gene_iterator(GTF.iterator(infile)):
            workspace[e.contig].append((e.start, e.end, info_f(e)))

        # convert to intergenic regions.
        # overlapping genes are merged and the label
        # of the right-most entry is retained
        for contig in workspace.keys():
            segs = workspace[contig]
            segs.sort()
            last = segs[0]
            new_segs = []
            for this in segs[1:]:
                if last[1] >= this[0]:
                    if this[1] > last[1]:
                        last = (last[0], this[1], this[2])
                    continue
                assert last[1] < this[0], "this=%s, last=%s" % (this, last)

                new_segs.append((last[1], this[0],
                                 label_f(last[2], this[2])))
                last = this
            workspace[contig] = new_segs

    elif workspace_builder == "gtf-intronic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            introns = Intervals.complement(exons)

            r = ee[0]
            for start, end in introns:
                workspace[r.contig].append((start,
                                            end,
                                            label_f(info_f(r), info_f(r))
                                            ))
    elif workspace_builder == "gtf-genic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            start, end = exons[0][0], exons[-1][1]
            r = ee[0]
            workspace[r.contig].append((start,
                                        end,
                                        label_f(info_f(r), info_f(r))
                                        ))

    else:
        raise ValueError("unknown workspace_builder %s" % workspace_builder)

    return workspace
Example #10
def __init__(self, *args, **kwargs):
    Sampler.__init__(self, *args, **kwargs)
    self.mGapLengths = [x[1] - x[0]
                        for x in Intervals.complement(
                            self.mObserved, self.mWorkStart, self.mWorkEnd)]
Example #11
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="which feature to use: gene/transcript/exon")
    parser.add_option("--unstranded-bw", dest="unstranded_wig", type="string",
                      help="BigWig with tag counts on both strands")
    parser.add_option("--plus-bw", dest="plus_wig", type="string",
                      help="BigWig with tag counts from plus strand")
    parser.add_option("--minus-bw", dest="minus_wig", type="string",
                      help="BigWig with tag counts from minus strand")
    parser.add_option("--bed", dest="bedfile", type="string",
                      help="tabix indexed bed file with tag counts"),
    parser.add_option("-c", "--use-centre", dest="centre", action="store_true",
                      default=False,
                      help="Use centre of read rather than start")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    if options.unstranded_wig:
        bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig)
    elif options.plus_wig:
        if not options.minus_wig:
            raise ValueError(
                "Please provide wigs for both strands or use --unstranded_wig")
        bamfile = iCLIP.make_getter(plus_wig=options.plus_wig,
                                    minus_wig=options.minus_wig)
    elif options.bedfile:
        bamfile = iCLIP.make_getter(bedfile=options.bedfile)
    else:
        bamfile = pysam.AlignmentFile(args[0])
        
    outlines = []
    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")

        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32",
                                            use_centre=options.centre)

        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32",
                                              use_centre=options.centre)

        intron_counts = intron_counts.sum()

        if options.feature == "exon":

            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                try:
                    exon_id = feature[0].exon_number
                except AttributeError:
                    exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = float(intron_counts)
            
        outlines.append([gene_id,
                         transcript_id,
                         exon_id,
                         str(float(exon_counts)),
                         str(intron_counts)])

    options.stdout.write("\t".join(["gene_id",
                                    "transcript_id",
                                    "exon_id",
                                    "exon_count",
                                    "intron_count"])+"\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
Example #12
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--feature",
                      dest="feature",
                      type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":

        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]

        iterator = _exon_iterator(iterator)

    bamfile = pysam.AlignmentFile(args[0])
    outlines = []
    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")

        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32")

        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32")

        intron_counts = intron_counts.sum()

        if options.feature == "exon":
            exon_id = feature[0].exon_id
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id

        outlines.append([
            gene_id, transcript_id, exon_id,
            str(exon_counts),
            str(intron_counts)
        ])

    options.stdout.write("\t".join([
        "gene_id", "transcript_id", "exon_id", "exon_count", "intron_count"
    ]) + "\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()
Example #13
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript", "exon"],
                      default="transcript",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    iterator = GTF.iterator(options.stdin)

    if options.feature == "gene":
        iterator = GTF.flat_gene_iterator(iterator)
    elif options.feature == "transcript":
        iterator = GTF.transcript_iterator(iterator)
    elif options.feature == "exon":
        def _exon_iterator(gff_iterator):
            for exon in gff_iterator:
                yield [exon]
        iterator = _exon_iterator(iterator)

    bamfile = pysam.AlignmentFile(args[0])
    outlines = []
    for feature in iterator:
        exons = GTF.asRanges(feature, "exon")

        exon_counts = iCLIP.count_intervals(bamfile,
                                            exons,
                                            feature[0].contig,
                                            feature[0].strand,
                                            dtype="uint32")

        exon_counts = exon_counts.sum()

        introns = Intervals.complement(exons)
        intron_counts = iCLIP.count_intervals(bamfile,
                                              introns,
                                              feature[0].contig,
                                              feature[0].strand,
                                              dtype="uint32")

        intron_counts = intron_counts.sum()

        if options.feature == "exon":

            try:
                exon_id = feature[0].exon_id
            except AttributeError:
                exon_id = "missing"

            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id
            intron_counts = "NA"
        else:
            exon_id = "NA"
            gene_id = feature[0].gene_id
            transcript_id = feature[0].transcript_id

        outlines.append([gene_id,
                         transcript_id,
                         exon_id,
                         str(exon_counts),
                         str(intron_counts)])

    options.stdout.write("\t".join(["gene_id",
                                    "transcript_id",
                                    "exon_id",
                                    "exon_count",
                                    "intron_count"])+"\n")

    outlines = ["\t".join(outline) for outline in outlines]
    outlines = "\n".join(outlines)
    options.stdout.write(outlines + "\n")

    # write footer and output benchmark information.
    E.Stop()