def toIntronIntervals(chunk): """convert a set of gtf elements within a transcript to intron coordinates. Will use first transcript_id found. Note that coordinates will still be forward strand coordinates """ if len(chunk) == 0: return [] contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand, chunk[0].transcript_id) for gff in chunk: assert gff.strand == strand, "features on different strands." assert gff.contig == contig, "features on different contigs." intervals = Intervals.combine([(x.start, x.end) for x in chunk if x.feature == "exon"]) return Intervals.complement(intervals)
def toIntronIntervals(chunk): '''convert a set of gtf elements within a transcript to intron coordinates. Will use first transcript_id found. Note that coordinates will still be forward strand coordinates ''' if len(chunk) == 0: return [] contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand, chunk[0].transcript_id) for gff in chunk: assert gff.strand == strand, "features on different strands." assert gff.contig == contig, "features on different contigs." intervals = Intervals.combine([(x.start, x.end) for x in chunk if x.feature == "exon"]) return Intervals.complement(intervals)
def toIntronIntervals(chunk): '''convert a set of gtf elements within a transcript to intron coordinates. Will raise an error if more than one transcript is submitted. Note that coordinates will still be forward strand coordinates ''' if len(chunk) == 0: return [] contig, strand, transcript_id = chunk[ 0].contig, chunk[0].strand, chunk[0].transcript_id for gff in chunk: assert gff.strand == strand, "features on different strands." assert gff.contig == contig, "features on different contigs." assert gff.transcript_id == transcript_id, "more than one transcript submitted" intervals = Intervals.combine([(x.start, x.end) for x in chunk if x.feature == "exon"]) return Intervals.complement(intervals)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-f", "--feature", dest="feature", type="choice", choices=["gene", "transcript", "exon"], default="transcript", help="which feature to use: gene/transcript/exon") parser.add_option("--unstranded-bw", dest="unstranded_wig", type="string", help="BigWig with tag counts on both strands") parser.add_option("--plus-bw", dest="plus_wig", type="string", help="BigWig with tag counts from plus strand") parser.add_option("--minus-bw", dest="minus_wig", type="string", help="BigWig with tag counts from minus strand") parser.add_option("--bed", dest="bedfile", type="string", help="tabix indexed bed file with tag counts"), parser.add_option("-c", "--use-centre", dest="centre", action="store_true", default=False, help="Use centre of read rather than start") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) iterator = GTF.iterator(options.stdin) if options.feature == "gene": iterator = GTF.flat_gene_iterator(iterator) elif options.feature == "transcript": iterator = GTF.transcript_iterator(iterator) elif options.feature == "exon": def _exon_iterator(gff_iterator): for exon in gff_iterator: yield [exon] iterator = _exon_iterator(iterator) if options.unstranded_wig: bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig) elif options.plus_wig: if not options.minus_wig: raise ValueError( "Please provide wigs for both strands or use --unstranded_wig") bamfile = iCLIP.make_getter(plus_wig=options.plus_wig, minus_wig=options.minus_wig) elif options.bedfile: bamfile = iCLIP.make_getter(bedfile=options.bedfile) else: bamfile = pysam.AlignmentFile(args[0]) outlines = [] for feature in iterator: exons = GTF.asRanges(feature, "exon") exon_counts = iCLIP.count_intervals(bamfile, exons, feature[0].contig, feature[0].strand, dtype="uint32", use_centre=options.centre) exon_counts = exon_counts.sum() introns = Intervals.complement(exons) intron_counts = iCLIP.count_intervals(bamfile, introns, feature[0].contig, feature[0].strand, dtype="uint32", use_centre=options.centre) intron_counts = intron_counts.sum() if options.feature == "exon": try: exon_id = feature[0].exon_id except AttributeError: try: exon_id = feature[0].exon_number except AttributeError: exon_id = "missing" gene_id = feature[0].gene_id transcript_id = feature[0].transcript_id intron_counts = "NA" else: exon_id = "NA" gene_id = feature[0].gene_id transcript_id = feature[0].transcript_id intron_counts = float(intron_counts) outlines.append([ gene_id, transcript_id, exon_id, str(float(exon_counts)), str(intron_counts) ]) options.stdout.write("\t".join([ "gene_id", "transcript_id", "exon_id", "exon_count", "intron_count" ]) + "\n") outlines = ["\t".join(outline) for outline in outlines] outlines = "\n".join(outlines) options.stdout.write(outlines + "\n") # write footer and output benchmark information. E.Stop()
def processChunk( query_id, matches ): """process a set of matches from query_id""" global ninput, noutput, nskipped global nfull_matches, npartial_matches, ngood_matches global nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches global nremoved_regions, nqueries_removed_region global outfile_empty ninput += 1 full_matches = [] good_matches = [] partial_matches = [] x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0 nmatches = len(matches) new_matches = [] # absolute filters applicable to non-fragmentory matches for match in matches: if match.mPid < options.threshold_min_pid: nremoved_pid += 1 continue if match.mNMatches < options.threshold_min_matches: nremoved_nmatches += 1 continue if options.threshold_max_error_rate: r = 100.0 * math.power( options.threshold_max_error_rate, match.mNMatches + match.mNMismatches) if match.mPid < r: nremoved_pid += 1 x_nremoved_pid += 1 continue new_matches.append(match) matches = new_matches # filter matches if len(matches) == 0: if outfile_empty: outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\ (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) ) nskipped += 1 return if options.keep_unique_matches and len(matches) == 1: pass else: new_matches = [] for match in matches: if match.mQueryCoverage < options.threshold_min_query_coverage: nremoved_query_coverage += 1 x_nquery_coverage += 1 continue if options.threshold_max_query_gaps and options.threshold_max_query_gaps > match.mQueryNGapsCounts: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if options.threshold_max_query_gapchars and options.threshold_max_query_gapchars > match.mQueryNGapsBases: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if options.threshold_max_sbjct_gaps and options.threshold_max_sbjct_gaps > match.mSbjctNGapsCounts: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if options.threshold_max_sbjct_gapchars and options.threshold_max_sbjct_gapchars > match.mSbjctNGapsBases: nremoved_gaps += 1 x_nremoved_gaps += 1 continue new_matches.append( match ) matches = new_matches if len(matches) == 0: if outfile_empty: outfile_empty.write( "%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" %\ (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches ) ) nskipped += 1 return ## Remove queries matching to a forbidden region. This section ## will remove the full query if any of its matches matches in a ## forbidden region. keep = True for match in matches: if intersectors and match.mSbjctId in intersectors: found = intersectors[match.mSbjctId].find( match.mSbjctFrom, match.mSbjctTo ) if found and not options.keep_forbidden or (found and not options.keep_forbidden): nremoved_regions += 1 keep = False continue if not keep: nqueries_removed_region += 1 if outfile_empty: outfile_empty.write( "%s\toverlap with forbidden region\n" % query_id ) return ## check for full length matches for match in matches: if match.mQueryCoverage >= 99.9: full_matches.append(match) if match.mQueryCoverage > options.threshold_good_query_coverage: good_matches.append(match) else: partial_matches.append(match) if full_matches: nfull_matches += 1 elif good_matches: ngood_matches += 1 elif partial_matches: npartial_matches += 1 ## compute coverage of sequence with matches intervals = [] for match in full_matches + good_matches + partial_matches: intervals.append( (match.mQueryFrom, match.mQueryTo) ) rest = Intervals.complement( intervals, 0, match.mQueryLength ) query_coverage = 100.0 * (match.mQueryLength - sum( map( lambda x: x[1] - x[0], rest) ) ) / match.mQueryLength if query_coverage >= 99.9: fully_matched.append( query_id ) elif query_coverage > options.threshold_good_query_coverage: well_matched.append( query_id ) else: partially_matched.append( query_id ) aggregate_coverages.append( query_coverage ) ## select matches to output matches, msg = selectMatches( query_id, matches, options, queries_fasta ) if len(matches) > 0: for match in matches: if options.query_forward_coordinates: match.convertCoordinates() if options.output_format == "map": options.stdout.write( "%s\n" %\ "\t".join( map(str, ( match.mQueryId, match.mSbjctId, match.strand, "%5.2f" % match.mQueryCoverage, "%5.2f" % match.mSbjctCoverage, "%5.2f" % match.mPid, match.mQueryLength, match.mSbjctLength, match.mQueryFrom, match.mQueryTo, match.mSbjctFrom, match.mSbjctTo, ",".join( map(str,match.mBlockSizes) ), ",".join( map(str,match.mQueryBlockStarts)), ",".join( map(str,match.mSbjctBlockStarts)), )))) elif options.output_format == "psl": options.stdout.write( str(match) + "\n" ) noutput += 1 else: if outfile_empty: outfile_empty.write( "%s\tno matches selected: %s\n" % (query_id, msg) ) nempty += 1
def readWorkspace(infile, workspace_builder="raw", label="none", map_id2annotation={}): """read workspace from infile. A workspace is a collection of intervals with two labels associated to each interval, one for the 5' and one for the 3' end. Available workspace builders are: gff take a gff file. gtf-intergenic build workspace from intergenic segments in a gtf file. gtf-intronic build workspace from intronic segments in a gtf file gtf-genic the workspace is built from genes (first to last exon). Available labels are: none no labels are given to the ends of workspaces direction labels are given based on the 5'/3' end of the bounding exon annotation labels are given based on a gene2annotation map. returns a list of segments for each contig in a dictionary """ if label == "none": label_f = lambda x, y: (("X", ), ("X", )) info_f = lambda x: None elif label == "direction": label_f = lambda x, y: ((("5", "3")[x], ), (("3", "5")[y], )) info_f = lambda x: x.strand == "+" elif label == "annotation": label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y]) info_f = lambda x: x.gene_id if workspace_builder == "gff": workspace = GTF.readAsIntervals(GFF.iterator(infile)) elif workspace_builder == "gtf-intergenic": workspace = collections.defaultdict(list) # get all genes for e in GTF.merged_gene_iterator(GTF.iterator(infile)): workspace[e.contig].append((e.start, e.end, info_f(e))) # convert to intergenic regions. # overlapping genes are merged and the labels # of the right-most entry is retained for contig in list(workspace.keys()): segs = workspace[contig] segs.sort() last = segs[0] new_segs = [] for this in segs[1:]: if last[1] >= this[0]: if this[1] > last[1]: last = (last[0], this[1], this[2]) continue assert last[1] < this[0], "this=%s, last=%s" % (this, last) new_segs.append((last[1], this[0], label_f(last[2], this[2]))) last = this workspace[contig] = new_segs elif workspace_builder == "gtf-intronic": workspace = collections.defaultdict(list) # the current procedure will count nested genes # twice for ee in GTF.flat_gene_iterator(GTF.iterator(infile)): exons = Intervals.combine([(e.start, e.end) for e in ee]) introns = Intervals.complement(exons) r = ee[0] for start, end in introns: workspace[r.contig].append( (start, end, label_f(info_f(r), info_f(r)))) elif workspace_builder == "gtf-genic": workspace = collections.defaultdict(list) # the current procedure will count nested genes # twice for ee in GTF.flat_gene_iterator(GTF.iterator(infile)): exons = Intervals.combine([(e.start, e.end) for e in ee]) start, end = exons[0][0], exons[-1][1] r = ee[0] workspace[r.contig].append( (start, end, label_f(info_f(r), info_f(r)))) else: raise ValueError("unknown workspace_builder %s" % workspace_builder) return workspace
def __init__(self, *args, **kwargs): Sampler.__init__(self, *args, **kwargs) self.mGapLengths = [ x[1] - x[0] for x in Intervals.complement( self.mObserved, self.mWorkStart, self.mWorkEnd) ]
def processChunk(query_id, matches): """process a set of matches from query_id""" global ninput, noutput, nskipped global nfull_matches, npartial_matches, ngood_matches global nremoved_pid, nremoved_query_coverage, nempty, nremoved_gaps, nremoved_nmatches global nremoved_regions, nqueries_removed_region global outfile_empty ninput += 1 full_matches = [] good_matches = [] partial_matches = [] x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches = 0, 0, 0, 0 nmatches = len(matches) new_matches = [] # absolute filters applicable to non-fragmentory matches for match in matches: if match.mPid < options.threshold_min_pid: nremoved_pid += 1 continue if match.mNMatches < options.threshold_min_matches: nremoved_nmatches += 1 continue if options.threshold_max_error_rate: r = 100.0 * \ math.power( options.threshold_max_error_rate, match.mNMatches + match.mNMismatches) if match.mPid < r: nremoved_pid += 1 x_nremoved_pid += 1 continue new_matches.append(match) matches = new_matches # filter matches if len(matches) == 0: if outfile_empty: outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" % (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches)) nskipped += 1 return if options.keep_unique_matches and len(matches) == 1: pass else: new_matches = [] for match in matches: if match.mQueryCoverage < options.threshold_min_query_coverage: nremoved_query_coverage += 1 x_nquery_coverage += 1 continue if options.threshold_max_query_gaps and options.threshold_max_query_gaps > match.mQueryNGapsCounts: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if options.threshold_max_query_gapchars and options.threshold_max_query_gapchars > match.mQueryNGapsBases: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if options.threshold_max_sbjct_gaps and options.threshold_max_sbjct_gaps > match.mSbjctNGapsCounts: nremoved_gaps += 1 x_nremoved_gaps += 1 continue if options.threshold_max_sbjct_gapchars and options.threshold_max_sbjct_gapchars > match.mSbjctNGapsBases: nremoved_gaps += 1 x_nremoved_gaps += 1 continue new_matches.append(match) matches = new_matches if len(matches) == 0: if outfile_empty: outfile_empty.write("%s\tall matches removed after applying thresholds: before=%i, npid=%i, nqcoverage=%i, ngaps=%i, nmatches=%i\n" % (query_id, nmatches, x_nremoved_pid, x_nquery_coverage, x_nremoved_gaps, x_nremoved_nmatches)) nskipped += 1 return # Remove queries matching to a forbidden region. This section # will remove the full query if any of its matches matches in a # forbidden region. keep = True for match in matches: if intersectors and match.mSbjctId in intersectors: found = intersectors[match.mSbjctId].find( match.mSbjctFrom, match.mSbjctTo) if found and not options.keep_forbidden or (found and not options.keep_forbidden): nremoved_regions += 1 keep = False continue if not keep: nqueries_removed_region += 1 if outfile_empty: outfile_empty.write( "%s\toverlap with forbidden region\n" % query_id) return # check for full length matches for match in matches: if match.mQueryCoverage >= 99.9: full_matches.append(match) if match.mQueryCoverage > options.threshold_good_query_coverage: good_matches.append(match) else: partial_matches.append(match) if full_matches: nfull_matches += 1 elif good_matches: ngood_matches += 1 elif partial_matches: npartial_matches += 1 # compute coverage of sequence with matches intervals = [] for match in full_matches + good_matches + partial_matches: intervals.append((match.mQueryFrom, match.mQueryTo)) rest = Intervals.complement(intervals, 0, match.mQueryLength) query_coverage = 100.0 * \ (match.mQueryLength - sum([x[1] - x[0] for x in rest])) / match.mQueryLength if query_coverage >= 99.9: fully_matched.append(query_id) elif query_coverage > options.threshold_good_query_coverage: well_matched.append(query_id) else: partially_matched.append(query_id) aggregate_coverages.append(query_coverage) # select matches to output matches, msg = selectMatches(query_id, matches, options, queries_fasta) if len(matches) > 0: for match in matches: if options.query_forward_coordinates: match.convertCoordinates() if options.output_format == "map": options.stdout.write("%s\n" % "\t".join(map(str, ( match.mQueryId, match.mSbjctId, match.strand, "%5.2f" % match.mQueryCoverage, "%5.2f" % match.mSbjctCoverage, "%5.2f" % match.mPid, match.mQueryLength, match.mSbjctLength, match.mQueryFrom, match.mQueryTo, match.mSbjctFrom, match.mSbjctTo, ",".join( map(str, match.mBlockSizes)), ",".join( map(str, match.mQueryBlockStarts)), ",".join( map(str, match.mSbjctBlockStarts)), )))) elif options.output_format == "psl": options.stdout.write(str(match) + "\n") noutput += 1 else: if outfile_empty: outfile_empty.write( "%s\tno matches selected: %s\n" % (query_id, msg)) nempty += 1
def readWorkspace(infile, workspace_builder="raw", label="none", map_id2annotation={}): """read workspace from infile. A workspace is a collection of intervals with two labels associated to each interval, one for the 5' and one for the 3' end. Available workspace builders are: gff take a gff file. gtf-intergenic build workspace from intergenic segments in a gtf file. gtf-intronic build workspace from intronic segments in a gtf file gtf-genic the workspace is built from genes (first to last exon). Available labels are: none no labels are given to the ends of workspaces direction labels are given based on the 5'/3' end of the bounding exon annotation labels are given based on a gene2annotation map. returns a list of segments for each contig in a dictionary """ if label == "none": label_f = lambda x, y: (("X",), ("X",)) info_f = lambda x: None elif label == "direction": label_f = lambda x, y: ((("5", "3")[x],), (("3", "5")[y],)) info_f = lambda x: x.strand == "+" elif label == "annotation": label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y]) info_f = lambda x: x.gene_id if workspace_builder == "gff": workspace = GTF.readAsIntervals(GFF.iterator(infile)) elif workspace_builder == "gtf-intergenic": workspace = collections.defaultdict(list) # get all genes for e in GTF.merged_gene_iterator(GTF.iterator(infile)): workspace[e.contig].append((e.start, e.end, info_f(e))) # convert to intergenic regions. # overlapping genes are merged and the labels # of the right-most entry is retained for contig in workspace.keys(): segs = workspace[contig] segs.sort() last = segs[0] new_segs = [] for this in segs[1:]: if last[1] >= this[0]: if this[1] > last[1]: last = (last[0], this[1], this[2]) continue assert last[1] < this[0], "this=%s, last=%s" % (this, last) new_segs.append((last[1], this[0], label_f(last[2], this[2]))) last = this workspace[contig] = new_segs elif workspace_builder == "gtf-intronic": workspace = collections.defaultdict(list) # the current procedure will count nested genes # twice for ee in GTF.flat_gene_iterator(GTF.iterator(infile)): exons = Intervals.combine([(e.start, e.end) for e in ee]) introns = Intervals.complement(exons) r = ee[0] for start, end in introns: workspace[r.contig].append((start, end, label_f(info_f(r), info_f(r)) )) elif workspace_builder == "gtf-genic": workspace = collections.defaultdict(list) # the current procedure will count nested genes # twice for ee in GTF.flat_gene_iterator(GTF.iterator(infile)): exons = Intervals.combine([(e.start, e.end) for e in ee]) start, end = exons[0][0], exons[-1][1] r = ee[0] workspace[r.contig].append((start, end, label_f(info_f(r), info_f(r)) )) else: raise ValueError("unknown workspace_builder %s" % workspace_builder) return workspace
def __init__(self, *args, **kwargs): Sampler.__init__(self, *args, **kwargs) self.mGapLengths = [x[1] - x[0] for x in Intervals.complement(self.mObserved, self.mWorkStart, self.mWorkEnd)]
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-f", "--feature", dest="feature", type="choice", choices=["gene", "transcript", "exon"], default="transcript", help="which feature to use: gene/transcript/exon") parser.add_option("--unstranded-bw", dest="unstranded_wig", type="string", help="BigWig with tag counts on both strands") parser.add_option("--plus-bw", dest="plus_wig", type="string", help="BigWig with tag counts from plus strand") parser.add_option("--minus-bw", dest="minus_wig", type="string", help="BigWig with tag counts from minus strand") parser.add_option("--bed", dest="bedfile", type="string", help="tabix indexed bed file with tag counts"), parser.add_option("-c", "--use-centre", dest="centre", action="store_true", default=False, help="Use centre of read rather than start") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) iterator = GTF.iterator(options.stdin) if options.feature == "gene": iterator = GTF.flat_gene_iterator(iterator) elif options.feature == "transcript": iterator = GTF.transcript_iterator(iterator) elif options.feature == "exon": def _exon_iterator(gff_iterator): for exon in gff_iterator: yield [exon] iterator = _exon_iterator(iterator) if options.unstranded_wig: bamfile = iCLIP.make_getter(plus_wig=options.unstranded_wig) elif options.plus_wig: if not options.minus_wig: raise ValueError( "Please provide wigs for both strands or use --unstranded_wig") bamfile = iCLIP.make_getter(plus_wig=options.plus_wig, minus_wig=options.minus_wig) elif options.bedfile: bamfile = iCLIP.make_getter(bedfile=options.bedfile) else: bamfile = pysam.AlignmentFile(args[0]) outlines = [] for feature in iterator: exons = GTF.asRanges(feature, "exon") exon_counts = iCLIP.count_intervals(bamfile, exons, feature[0].contig, feature[0].strand, dtype="uint32", use_centre=options.centre) exon_counts = exon_counts.sum() introns = Intervals.complement(exons) intron_counts = iCLIP.count_intervals(bamfile, introns, feature[0].contig, feature[0].strand, dtype="uint32", use_centre=options.centre) intron_counts = intron_counts.sum() if options.feature == "exon": try: exon_id = feature[0].exon_id except AttributeError: try: exon_id = feature[0].exon_number except AttributeError: exon_id = "missing" gene_id = feature[0].gene_id transcript_id = feature[0].transcript_id intron_counts = "NA" else: exon_id = "NA" gene_id = feature[0].gene_id transcript_id = feature[0].transcript_id intron_counts = float(intron_counts) outlines.append([gene_id, transcript_id, exon_id, str(float(exon_counts)), str(intron_counts)]) options.stdout.write("\t".join(["gene_id", "transcript_id", "exon_id", "exon_count", "intron_count"])+"\n") outlines = ["\t".join(outline) for outline in outlines] outlines = "\n".join(outlines) options.stdout.write(outlines + "\n") # write footer and output benchmark information. E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-f", "--feature", dest="feature", type="choice", choices=["gene", "transcript", "exon"], default="transcript", help="supply help") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) iterator = GTF.iterator(options.stdin) if options.feature == "gene": iterator = GTF.flat_gene_iterator(iterator) elif options.feature == "transcript": iterator = GTF.transcript_iterator(iterator) elif options.feature == "exon": def _exon_iterator(gff_iterator): for exon in gff_iterator: yield [exon] iterator = _exon_iterator(iterator) bamfile = pysam.AlignmentFile(args[0]) outlines = [] for feature in iterator: exons = GTF.asRanges(feature, "exon") exon_counts = iCLIP.count_intervals(bamfile, exons, feature[0].contig, feature[0].strand, dtype="uint32") exon_counts = exon_counts.sum() introns = Intervals.complement(exons) intron_counts = iCLIP.count_intervals(bamfile, introns, feature[0].contig, feature[0].strand, dtype="uint32") intron_counts = intron_counts.sum() if options.feature == "exon": exon_id = feature[0].exon_id gene_id = feature[0].gene_id transcript_id = feature[0].transcript_id intron_counts = "NA" else: exon_id = "NA" gene_id = feature[0].gene_id transcript_id = feature[0].transcript_id outlines.append([ gene_id, transcript_id, exon_id, str(exon_counts), str(intron_counts) ]) options.stdout.write("\t".join([ "gene_id", "transcript_id", "exon_id", "exon_count", "intron_count" ]) + "\n") outlines = ["\t".join(outline) for outline in outlines] outlines = "\n".join(outlines) options.stdout.write(outlines + "\n") # write footer and output benchmark information. E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-f", "--feature", dest="feature", type="choice", choices=["gene", "transcript", "exon"], default="transcript", help="supply help") # add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) iterator = GTF.iterator(options.stdin) if options.feature == "gene": iterator = GTF.flat_gene_iterator(iterator) elif options.feature == "transcript": iterator = GTF.transcript_iterator(iterator) elif options.feature == "exon": def _exon_iterator(gff_iterator): for exon in gff_iterator: yield [exon] iterator = _exon_iterator(iterator) bamfile = pysam.AlignmentFile(args[0]) outlines = [] for feature in iterator: exons = GTF.asRanges(feature, "exon") exon_counts = iCLIP.count_intervals(bamfile, exons, feature[0].contig, feature[0].strand, dtype="uint32") exon_counts = exon_counts.sum() introns = Intervals.complement(exons) intron_counts = iCLIP.count_intervals(bamfile, introns, feature[0].contig, feature[0].strand, dtype="uint32") intron_counts = intron_counts.sum() if options.feature == "exon": try: exon_id = feature[0].exon_id except AttributeError: exon_id = "missing" gene_id = feature[0].gene_id transcript_id = feature[0].transcript_id intron_counts = "NA" else: exon_id = "NA" gene_id = feature[0].gene_id transcript_id = feature[0].transcript_id outlines.append([gene_id, transcript_id, exon_id, str(exon_counts), str(intron_counts)]) options.stdout.write("\t".join(["gene_id", "transcript_id", "exon_id", "exon_count", "intron_count"])+"\n") outlines = ["\t".join(outline) for outline in outlines] outlines = "\n".join(outlines) options.stdout.write(outlines + "\n") # write footer and output benchmark information. E.Stop()