def toSequence(chunk, fasta):
    """Return the concatenated sequence covered by the features in *chunk*.

    All features must lie on the same contig and strand (checked with
    assertions). Their ``(start, end)`` intervals are merged with
    ``Intervals.combine`` so that overlapping regions contribute only
    once. For features that are not on the positive strand the merged
    intervals are mirrored against the contig length and visited in
    reverse order before ``fasta.getSequence`` is called for each of them.

    An empty *chunk* yields the empty string.
    """
    if len(chunk) == 0:
        return ""

    first = chunk[0]
    contig = first.contig
    strand = first.strand
    for entry in chunk:
        assert entry.strand == strand, "features on different strands."
        assert entry.contig == contig, "features on different contigs."

    merged = Intervals.combine([(entry.start, entry.end) for entry in chunk])
    contig_length = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        # mirror the coordinates and walk the intervals back to front
        merged = [(contig_length - end, contig_length - start)
                  for start, end in reversed(merged)]

    return "".join(
        fasta.getSequence(contig, strand, start, end)
        for start, end in merged)
def toSequence(chunk, fasta):
    """Build a single sequence string from a list of gff features.

    The features in *chunk* have to share contig and strand; this is
    enforced with assertions. Overlapping features are merged through
    ``Intervals.combine``. When the strand is not positive, every merged
    interval is flipped relative to the contig length and the list order
    is reversed, so the pieces are concatenated in the right order.

    Returns ``""`` for an empty *chunk*.
    """
    if len(chunk) == 0:
        return ""

    head = chunk[0]
    contig, strand = head.contig, head.strand
    for feature in chunk:
        assert feature.strand == strand, "features on different strands."
        assert feature.contig == contig, "features on different contigs."

    regions = Intervals.combine(
        [(feature.start, feature.end) for feature in chunk])
    lcontig = fasta.getLength(contig)
    is_forward = Genomics.IsPositiveStrand(strand)

    if not is_forward:
        flipped = [(lcontig - end, lcontig - start) for start, end in regions]
        regions = flipped[::-1]

    pieces = []
    for start, end in regions:
        pieces.append(fasta.getSequence(contig, strand, start, end))
    return "".join(pieces)
def combineMergedIntervals(bedfiles):
    '''Yield merged intervals that are supported by every bed file.

    Algorithm:

    1. pool the intervals of all tracks per contig,
    2. merge overlapping intervals with ``Intervals.combine``,
    3. yield ``(contig, start, end)`` for each merged interval for which
       ``isContainedInAll`` returns true.

    Contigs are visited in sorted order.
    '''
    # step 1: pool the intervals of all tracks
    pooled = collections.defaultdict(list)
    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            pooled[contig].extend(
                [(bed.start, bed.end)
                 for bed in bedfile.fetch(contig, parser=pysam.asBed())])

    # step 2: merge overlapping intervals per contig
    merged = {}
    for contig, intervals in pooled.items():
        merged[contig] = Intervals.combine(intervals)

    # step 3: report only intervals present in all bed files
    for contig, intervals in sorted(merged.items()):
        for start, end in intervals:
            if not isContainedInAll(contig, start, end, bedfiles):
                continue
            yield contig, start, end
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals
    3. report all intervals for which ``isContainedInAll`` is true.

    This is a generator yielding ``(contig, start, end)`` tuples. Contigs
    are reported in sorted order so the output is deterministic.
    '''
    # get all intervals
    data_per_contig = collections.defaultdict(list)
    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            data_per_contig[contig].extend(
                [(bed.start, bed.end)
                 for bed in bedfile.fetch(contig, parser=pysam.asBed())])

    # merge intervals; iterate over a snapshot of the keys because the
    # values are replaced while looping
    for contig in list(data_per_contig):
        data_per_contig[contig] = Intervals.combine(data_per_contig[contig])

    # filter intervals - take only those present in all bedfiles.
    # items() works on Python 2 and 3, whereas iteritems() is Python 2 only.
    for contig, data in sorted(data_per_contig.items()):
        for start, end in data:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
def annotateRegulons(iterator, fasta, tss, options):
    """Write one "regulon" GTF entry per transcript of every gene.

    For each transcript a window is placed around an anchor position.
    If *tss* is true, the anchor is the transcript start in transcription
    direction (smallest start on the forward strand, largest end on the
    negative strand); otherwise it is the opposite end. The window
    reaches ``options.upstream`` / ``options.downstream`` bases to either
    side, oriented by strand, and is clipped to ``[0, contig length]``.

    With ``options.merge_promotors`` the windows of a gene are merged via
    ``Intervals.combine`` and renumbered "1", "2", ... as transcript ids.
    Entries are written to ``options.stdout``; counts are logged with
    ``E.info``.
    """
    gene_iterator = GTF.gene_iterator(iterator)
    ngenes = ntranscripts = nregulons = 0
    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        reference = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(reference.strand)
        lcontig = fasta.getLength(reference.contig)

        regulons, transcript_ids = [], []
        for transcript in gene:
            ntranscripts += 1
            first_base = min(x.start for x in transcript)
            last_base = max(x.end for x in transcript)

            # tss on the negative strand and tts on the forward strand
            # both sit at the largest coordinate of the transcript
            if bool(tss) == bool(is_negative_strand):
                anchor = last_base
            else:
                anchor = first_base

            if is_negative_strand:
                lower, upper = anchor - downstream, anchor + upstream
            else:
                lower, upper = anchor - upstream, anchor + downstream

            # clip the window to the contig boundaries
            regulons.append((min(lcontig, max(0, lower)),
                             min(lcontig, max(0, upper))))
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = [str(rank)
                              for rank in range(1, len(regulons) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(reference, reference.gene_id, reference.gene_id)
        gtf.source = "regulon"

        for transcript_id, (start, end) in zip(transcript_ids, regulons):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
def annotateRegulons(iterator, fasta, tss, options):
    """Annotate regulons for all genes delivered by *iterator*.

    A regulon is a window around the transcription start site (*tss* is
    true) or around the transcription termination site (*tss* is false)
    of a transcript. The window extends ``options.upstream`` bases
    upstream and ``options.downstream`` bases downstream with respect to
    the strand of the gene and is truncated at the contig boundaries.

    If ``options.merge_promotors`` is set, the windows of one gene are
    merged with ``Intervals.combine`` and numbered consecutively. Each
    window is written as a GTF line with source "regulon" to
    ``options.stdout``; summary counts go to ``E.info``.
    """
    gene_iterator = GTF.gene_iterator(iterator)
    ngenes = 0
    ntranscripts = 0
    nregulons = 0
    upstream = options.upstream
    downstream = options.downstream

    for gene in gene_iterator:
        ngenes += 1
        template = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)
        lcontig = fasta.getLength(template.contig)

        regulons = []
        transcript_ids = []
        for transcript in gene:
            ntranscripts += 1
            leftmost = min([x.start for x in transcript])
            rightmost = max([x.end for x in transcript])

            if tss:
                # window around the transcription start site
                anchor = rightmost if is_negative_strand else leftmost
            else:
                # window around the transcription termination site
                anchor = leftmost if is_negative_strand else rightmost

            if is_negative_strand:
                window = (anchor - downstream, anchor + upstream)
            else:
                window = (anchor - upstream, anchor + downstream)

            clipped = tuple(min(lcontig, max(0, pos)) for pos in window)
            regulons.append(clipped)
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merging may change the sort order, hence renumber
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % number
                              for number in range(1, len(regulons) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.source = "regulon"

        for index, (start, end) in enumerate(regulons):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[index]
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """Yield only those groups whose *feature* entries cover enough bases.

    For every group delivered by *gff_iterator* the intervals of entries
    whose ``feature`` attribute equals *feature* are merged with
    ``Intervals.combine``. The group is yielded unchanged when the summed
    length of the merged intervals is at least *min_length*.
    """
    for group in gff_iterator:
        merged = Intervals.combine(
            [(entry.start, entry.end)
             for entry in group if entry.feature == feature])

        covered = 0
        for start, end in merged:
            covered += end - start

        if covered >= min_length:
            yield group
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """Filter gene groups by the total length of one feature type.

    Each item of *gff_iterator* is a collection of gff entries. Entries
    with ``entry.feature == feature`` are reduced to ``(start, end)``
    pairs and merged with ``Intervals.combine``. Items whose merged
    intervals sum to *min_length* or more are passed through; all others
    are dropped.
    """
    for gffs in gff_iterator:
        selected = []
        for entry in gffs:
            if entry.feature == feature:
                selected.append((entry.start, entry.end))

        total = sum(end - start for start, end in Intervals.combine(selected))
        if total >= min_length:
            yield gffs
def get_windows(pvalues, window_size, threshold):
    """Return merged windows around all positions with their minimum value.

    A half-open window ``(pos - window_size, pos + window_size + 1)`` is
    built around every label in ``pvalues.index``. Overlapping windows are
    merged with ``Intervals.combine`` and, for each merged window, the
    minimum of *pvalues* over the labels ``start .. end - 1`` is computed.

    :param pvalues: pandas object indexed by position.
    :param window_size: number of positions added to either side.
    :param threshold: accepted for interface compatibility; it is not
        used by this function.
    :return: list of ``((start, end), minimum)`` tuples.
    """
    # windows are half-open: [pos - window_size, pos + window_size + 1)
    windows = [(pos - window_size, pos + window_size + 1)
               for pos in pvalues.index.values]
    merged_windows = Intervals.combine(windows)

    # .loc slices by label and includes both end points, hence end - 1.
    # (.ix was removed from pandas in version 1.0.)
    windows_min_p = [pvalues.loc[float(start):float(end - 1)].min()
                     for start, end in merged_windows]

    # materialise the pairs so callers get a list on Python 2 and 3 alike
    return list(zip(merged_windows, windows_min_p))
def annotateExons(iterator, fasta, options):
    """Write one GTF entry per distinct exon interval of every gene.

    The exons of each transcript are sorted in place by start (and
    reversed on the negative strand), so that an exon's rank within its
    transcript can be recorded. Every distinct ``(start, end)`` interval
    of a gene is written once to ``options.stdout`` with the attributes
    ``ntranscripts`` (transcripts in the gene), ``nused`` (how many
    transcripts use the interval) and ``pos`` (comma-separated
    ``rank:number_of_exons`` pairs).

    Genes in which distinct exon intervals overlap are counted. If
    ``options.loglevel >= 1`` a summary line goes to ``options.stdlog``.
    *fasta* is not used.
    """
    ninput = noutput = noverlapping = 0

    for gene in GTF.gene_iterator(iterator):
        ninput += 1
        usage = collections.defaultdict(list)
        ntranscripts = len(gene)
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        for exons in gene:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()
            nexons = len(exons)
            for rank, exon in enumerate(exons, 1):
                usage[(exon.start, exon.end)].append((rank, nexons))

        reference = gene[0][0]
        template = GTF.Entry()
        template.fromGTF(reference, reference.gene_id, reference.gene_id)
        template.addAttribute("ntranscripts", ntranscripts)

        entries = []
        for (start, end), positions in usage.items():
            entry = GTF.Entry().copy(template)
            entry.start, entry.end = start, end
            entry.addAttribute("nused", len(positions))
            entry.addAttribute(
                "pos", ",".join("%i:%i" % position for position in positions))
            entries.append(entry)

        entries.sort(key=lambda x: x.start)
        for entry in entries:
            options.stdout.write("%s\n" % str(entry))

        # check for exon overlap: merging reduces the number of intervals
        # only if some of them overlap
        spans = [(entry.start, entry.end) for entry in entries]
        if len(Intervals.combine(spans)) != len(spans):
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" %
            (ninput, noutput, noverlapping))
def annotateTTS(iterator, fasta, options):
    """Write a termination-site window for every transcript of each gene.

    For a transcript on the negative strand the window is
    ``(max(0, mi - p), max(p, mi))`` and otherwise
    ``(min(ma, lcontig - p), min(lcontig, ma + p))``, where ``mi`` / ``ma``
    are the smallest start / largest end of the transcript, ``p`` is
    ``options.promotor`` and ``lcontig`` is the contig length.

    With ``options.merge_promotors`` the windows of a gene are merged via
    ``Intervals.combine`` and numbered consecutively. Windows are written
    as GTF lines with source "tts" to ``options.stdout``; a summary is
    written to ``options.stdlog`` when ``options.loglevel >= 1``.
    """
    ngenes = ntranscripts = npromotors = 0

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        reference = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(reference.strand)
        lcontig = fasta.getLength(reference.contig)

        tts, transcript_ids = [], []
        for transcript in gene:
            ntranscripts += 1
            mi = min(x.start for x in transcript)
            ma = max(x.end for x in transcript)
            transcript_ids.append(transcript[0].transcript_id)

            # if tts is directly at start/end of contig, the tts will
            # be within an exon. otherwise, it is outside an exon.
            if is_negative_strand:
                window = (max(0, mi - options.promotor),
                          max(options.promotor, mi))
            else:
                window = (min(ma, lcontig - options.promotor),
                          min(lcontig, ma + options.promotor))
            tts.append(window)

        if options.merge_promotors:
            # merge the windows (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = [str(rank) for rank in range(1, len(tts) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(reference, reference.gene_id, reference.gene_id)
        gtf.source = "tts"

        for transcript_id, (start, end) in zip(transcript_ids, tts):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator.

    The exons of every transcript are sorted in place by start and
    reversed on the negative strand, so that the rank of an exon within
    its transcript can be recorded. Each distinct ``(start, end)``
    interval of a gene is written once to ``options.stdout`` with the
    attributes ``ntranscripts``, ``nused`` (number of transcripts using
    the interval) and ``pos`` (comma-separated ``rank:nexons`` pairs).

    Genes with overlapping distinct exon intervals are counted and a
    summary is written to ``options.stdlog`` if ``options.loglevel >= 1``.
    *fasta* is not used.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for rank, exon in enumerate(exons, 1):
                intervals[(exon.start, exon.end)].append((rank, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        # items() exists on Python 2 and 3; iteritems() is Python 2 only
        for r, pos in intervals.items():
            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap: merging shrinks the list only when
        # intervals overlap
        exon_intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(exon_intervals)
        nafter = len(Intervals.combine(exon_intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" %
            (ninput, noutput, noverlapping))
def annotateTTS(iterator, fasta, options):
    """Annotate termination sites for all genes delivered by *iterator*.

    One window per transcript is computed from the transcript extent
    (smallest start ``mi``, largest end ``ma``), ``options.promotor``
    (``p``) and the contig length (``lcontig``):

    * negative strand: ``(max(0, mi - p), max(p, mi))``
    * otherwise: ``(min(ma, lcontig - p), min(lcontig, ma + p))``

    If ``options.merge_promotors`` is set, the windows of a gene are
    merged with ``Intervals.combine`` and renumbered. Each window is
    written as a GTF line with source "tts" to ``options.stdout``. A
    summary is written to ``options.stdlog`` if ``options.loglevel >= 1``.
    """
    gene_iterator = GTF.gene_iterator(iterator)
    ngenes = 0
    ntranscripts = 0
    npromotors = 0

    for gene in gene_iterator:
        ngenes += 1
        template = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)
        lcontig = fasta.getLength(template.contig)

        tts = []
        transcript_ids = []
        for transcript in gene:
            ntranscripts += 1
            starts = [x.start for x in transcript]
            ends = [x.end for x in transcript]
            mi, ma = min(starts), max(ends)
            transcript_ids.append(transcript[0].transcript_id)

            # if tts is directly at start/end of contig, the tts will
            # be within an exon. otherwise, it is outside an exon.
            if is_negative_strand:
                left = max(0, mi - options.promotor)
                right = max(options.promotor, mi)
            else:
                left = min(ma, lcontig - options.promotor)
                right = min(lcontig, ma + options.promotor)
            tts.append((left, right))

        if options.merge_promotors:
            # merging may change the sort order, hence renumber
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % number
                              for number in range(1, len(tts) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.source = "tts"

        for index, (start, end) in enumerate(tts):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[index]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
def get_windows(pvalues, window_size, threshold):
    """Return merged windows around all positions with their minimum value.

    A half-open window ``(pos - window_size, pos + window_size + 1)`` is
    built around every label in ``pvalues.index``. Overlapping windows are
    merged with ``Intervals.combine`` and, for each merged window, the
    minimum of *pvalues* over the labels ``start .. end - 1`` is computed.

    :param pvalues: pandas object indexed by position.
    :param window_size: number of positions added to either side.
    :param threshold: accepted for interface compatibility; it is not
        used by this function.
    :return: list of ``((start, end), minimum)`` tuples.
    """
    # windows are half-open: [pos - window_size, pos + window_size + 1)
    windows = [(pos - window_size, pos + window_size + 1)
               for pos in pvalues.index.values]
    merged_windows = Intervals.combine(windows)

    # .loc slices by label and includes both end points, hence end - 1.
    # (.ix was removed from pandas in version 1.0.)
    windows_min_p = [
        pvalues.loc[float(start):float(end - 1)].min()
        for start, end in merged_windows
    ]

    # materialise the pairs so callers get a list on Python 2 and 3 alike
    return list(zip(merged_windows, windows_min_p))
def count(self, bed):
    '''Recompute the per-track overlap statistics for *bed*.

    For every track in ``self.tracks`` the intervals returned by
    ``self.index[track][bed.contig].find(bed.start, bed.end)`` are
    collected; a ``KeyError`` during that lookup counts as "no overlaps".
    ``self.data`` is set to a list with one ``(number of overlapping
    intervals, Intervals.calculateOverlap(...))`` tuple per track, where
    the overlap is computed between *bed* and the merged intervals.
    '''
    per_track = []
    for track in self.tracks:
        try:
            hits = self.index[track][bed.contig].find(bed.start, bed.end)
            overlaps = [(hit[0], hit[1]) for hit in hits]
        except KeyError:
            overlaps = []

        noverlaps = len(overlaps)
        covered = Intervals.calculateOverlap(
            [(bed.start, bed.end)],
            Intervals.combine(overlaps))
        per_track.append((noverlaps, covered))

    self.data = per_track
def count(self, bed):
    '''Update ``self.data`` with overlap counts of *bed* for each track.

    Each track in ``self.tracks`` contributes one tuple: the number of
    indexed intervals found for ``bed.contig`` between ``bed.start`` and
    ``bed.end``, and the value of ``Intervals.calculateOverlap`` between
    *bed* and those intervals after merging them with
    ``Intervals.combine``. If the index lookup raises ``KeyError`` the
    track is treated as having no overlapping intervals.
    '''
    def find_overlaps(track):
        # a KeyError during the lookup means no overlaps for this track
        try:
            return [(x[0], x[1])
                    for x in self.index[track][bed.contig].find(
                        bed.start, bed.end)]
        except KeyError:
            return []

    results = []
    for track in self.tracks:
        found = find_overlaps(track)
        results.append(
            (len(found),
             Intervals.calculateOverlap([(bed.start, bed.end)],
                                        Intervals.combine(found))))
    self.data = results
def toIntronIntervals(chunk):
    """Return intron intervals for the gtf entries of one transcript.

    All entries must share contig and strand (checked with assertions).
    The intervals of entries with feature "exon" are merged with
    ``Intervals.combine`` and the result of ``Intervals.complement`` on
    the merged exons is returned.

    Note that coordinates will still be forward strand coordinates.
    An empty *chunk* gives an empty list.
    """
    if len(chunk) == 0:
        return []

    first = chunk[0]
    contig = first.contig
    strand = first.strand
    # the transcript id of the first entry is read but not checked
    # against the other entries
    transcript_id = first.transcript_id

    for entry in chunk:
        assert entry.strand == strand, "features on different strands."
        assert entry.contig == contig, "features on different contigs."

    exons = [(entry.start, entry.end)
             for entry in chunk if entry.feature == "exon"]
    return Intervals.complement(Intervals.combine(exons))
def findRetainedIntrons(infile, outfile):
    """Write retained-intron intervals found in *infile* to *outfile*.

    Within each gene every ordered pair of transcripts ``(first, second)``
    is compared. If *first* has introns that *second* lacks while
    *second* has no intron that *first* lacks, the introns unique to
    *first* that lie strictly inside the exon span of *second* are
    collected. The collected intervals of a gene are merged with
    ``Intervals.combine`` and written as "exon" entries modelled on the
    first entry of the gene.

    :param infile: name of the input GTF file.
    :param outfile: name of the output GTF file.
    """
    # both files are closed when the block is left, also on error
    with IOTools.openFile(infile) as inf, \
            IOTools.openFile(outfile, "w") as outf:

        for gene in GTF.gene_iterator(GTF.iterator(inf)):

            # sort exons and derive introns once per transcript instead
            # of once per transcript pair
            transcripts = []
            for transcript in gene:
                exons = sorted(
                    [entry for entry in transcript
                     if entry.feature == "exon"],
                    key=lambda x: x.start)
                transcripts.append(
                    (exons, set(GTF.toIntronIntervals(exons))))

            introns_out = []

            # now find if any of the transcripts are retained intron
            # versions of any of the others
            for (_, first_introns), (second, second_introns) in \
                    itertools.product(transcripts, repeat=2):

                novel_introns = first_introns - second_introns
                if not novel_introns or second_introns - first_introns:
                    continue

                # keep only introns strictly inside the span of `second`;
                # a list is built explicitly because filter() is lazy on
                # Python 3
                introns_out.extend(
                    [intron for intron in novel_introns
                     if intron[0] > second[0].start and
                     intron[1] < second[-1].end])

            introns_out = Intervals.combine(introns_out)

            template = gene[0][0]
            template.feature = "exon"

            for start, end in introns_out:
                entry = GTF.Entry().copy(template)
                entry.start = start
                entry.end = end
                outf.write("%s\n" % str(entry))
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged with ``Intervals.combine``.

    :param gffs: sequence of gff entries.
    :param feature: a single feature name (string), a collection of
        feature names, or a false value to use all entries.
    :return: the result of ``Intervals.combine`` on the selected
        ``(start, end)`` intervals.
    """
    # `str` replaces the Python 2 only `basestring`
    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
def asRanges(gffs, feature=None):
    """Return the merged ``(start, end)`` ranges of a set of gff entries.

    *feature* selects the entries to use: a string keeps entries with
    exactly that feature, any other true value is treated as a collection
    of accepted features, and a false value keeps all entries (a slice
    copy of *gffs*). The selected intervals are merged with
    ``Intervals.combine``.
    """
    if isinstance(feature, str):
        def keep(entry):
            return entry.feature == feature
    elif feature:
        def keep(entry):
            return entry.feature in feature
    else:
        keep = None

    if keep is None:
        selected = gffs[:]
    else:
        selected = [entry for entry in gffs if keep(entry)]

    return Intervals.combine([(entry.start, entry.end) for entry in selected])
def asRanges(gffs, feature=None):
    """Collect the intervals of gff entries and merge overlapping ones.

    If *feature* is a string, only entries with that feature are used; if
    it is another true value, entries whose feature is contained in it
    are used; otherwise all entries of *gffs* are used. The intervals are
    passed through ``Intervals.combine`` and its result is returned.
    """
    single_feature = isinstance(feature, str)

    if not single_feature and not feature:
        chosen = gffs[:]
    else:
        chosen = []
        for entry in gffs:
            if single_feature:
                matches = entry.feature == feature
            else:
                matches = entry.feature in feature
            if matches:
                chosen.append(entry)

    ranges = []
    for entry in chosen:
        ranges.append((entry.start, entry.end))
    return Intervals.combine(ranges)
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged with ``Intervals.combine``.

    :param gffs: sequence of gff entries.
    :param feature: a single feature name (string), a collection of
        feature names, or a false value to use all entries.
    :return: the result of ``Intervals.combine`` on the selected
        ``(start, end)`` intervals.
    """
    # `str` replaces the Python 2 only `basestring`
    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
def toIntronIntervals(chunk):
    '''Convert the gtf entries of a transcript into intron intervals.

    Contig and strand of all entries have to agree (enforced with
    assertions). Exon intervals are merged with ``Intervals.combine``;
    the value returned is ``Intervals.complement`` of the merged exons.

    Note that coordinates will still be forward strand coordinates.
    Returns an empty list for an empty *chunk*.
    '''
    if len(chunk) == 0:
        return []

    head = chunk[0]
    # the transcript id is read from the first entry only; other entries
    # are not compared against it
    contig, strand, transcript_id = (head.contig,
                                     head.strand,
                                     head.transcript_id)

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    exon_intervals = []
    for gff in chunk:
        if gff.feature == "exon":
            exon_intervals.append((gff.start, gff.end))

    merged = Intervals.combine(exon_intervals)
    return Intervals.complement(merged)
def toIntronIntervals(chunk):
    '''Return the intron intervals of a single transcript.

    All entries in *chunk* must agree in strand, contig and
    transcript_id; an ``AssertionError`` is raised otherwise. Intervals
    of entries with feature "exon" are merged with ``Intervals.combine``
    and ``Intervals.complement`` of the merged exons is returned.

    Note that coordinates will still be forward strand coordinates.
    An empty *chunk* gives an empty list.
    '''
    if len(chunk) == 0:
        return []

    first = chunk[0]
    contig = first.contig
    strand = first.strand
    transcript_id = first.transcript_id

    for entry in chunk:
        assert entry.strand == strand, "features on different strands."
        assert entry.contig == contig, "features on different contigs."
        assert entry.transcript_id == transcript_id, "more than one transcript submitted"

    exons = [(entry.start, entry.end)
             for entry in chunk if entry.feature == "exon"]
    return Intervals.complement(Intervals.combine(exons))
def __str__(self):
    # Render the collected counts as one tab-separated line: number of
    # gene ids, number of transcript ids, number of single-exon
    # transcripts and Stats.Summary objects for exons per transcript,
    # exon sizes, intron sizes and transcript lengths.
    single_exon_transcripts = 0
    exons_per_transcript = []
    exon_sizes = []
    intron_sizes = []
    transcript_lengths = []

    for exons in self.counts_exons_per_transcript.values():
        # note: the stored list is sorted in place
        exons.sort()
        merged = Intervals.combine(exons)

        transcript_lengths.append(merged[-1][1] - merged[0][0])
        exons_per_transcript.append(len(merged))
        exon_sizes.extend(end - start for start, end in merged)

        if len(merged) == 1:
            single_exon_transcripts += 1
        else:
            # an intron is the gap between two consecutive merged exons
            intron_sizes.extend(
                following[0] - previous[1]
                for previous, following in zip(merged, merged[1:]))

    columns = (
        len(self.counts_gene_ids),
        len(self.counts_transcript_ids),
        single_exon_transcripts,
        Stats.Summary(exons_per_transcript),
        Stats.Summary(exon_sizes),
        Stats.Summary(intron_sizes),
        Stats.Summary(transcript_lengths),
    )
    return "\t".join(str(column) for column in columns)
def find_retained_introns(gene):
    '''Given a bundle of transcripts, find intervals matching retained
    introns. A retained intron is defined as an interval from an
    exon/intron boundary to the next where both boundaries are in the
    same exon of another transcript.

    The introns of all transcripts in *gene* are pooled (duplicates
    removed) and sorted. For every transcript the sorted ranges returned
    by ``GTF.asRanges`` are scanned in parallel with the pooled introns;
    introns lying completely inside one range are collected, merged with
    ``Intervals.combine`` and yielded as copies of the transcript's first
    entry with adjusted ``start`` and ``end``.

    This is a generator of GTF entries.
    '''
    intron_intervals = [GTF.toIntronIntervals(transcript)
                        for transcript in gene]
    intron_intervals = sorted(
        set(itertools.chain.from_iterable(intron_intervals)))

    for transcript in gene:
        exons = iter(sorted(GTF.asRanges(transcript)))
        introns = iter(intron_intervals)
        retained_introns = []

        # The built-in next() works on Python 2.6+ and 3, unlike the
        # Python 2 only iterator.next() method. StopIteration from either
        # iterator ends the scan and is caught here, so it cannot leak
        # out of this generator.
        try:
            intron = next(introns)
            exon = next(exons)
            while True:
                if exon[1] < intron[0]:
                    # exon ends before the intron starts: advance exon
                    exon = next(exons)
                    continue

                if intron[0] >= exon[0] and intron[1] <= exon[1]:
                    E.debug("exon %s of transcript %s contains intron %s" %
                            (exon, transcript[0].transcript_id, intron))
                    retained_introns.append(intron)

                intron = next(introns)
        except StopIteration:
            pass

        retained_introns = Intervals.combine(retained_introns)

        for intron in retained_introns:
            entry = GTF.Entry()
            entry = entry.copy(transcript[0])
            entry.start = intron[0]
            entry.end = intron[1]
            yield entry
def __str__(self):
    # Summarise the counters as a tab-separated line. The columns are:
    # number of genes, number of transcripts, single-exon transcripts,
    # followed by Stats.Summary of exons per transcript, exon sizes,
    # intron sizes and transcript lengths.
    nsingle = 0
    nexons = []
    sizes_exons = []
    sizes_introns = []
    lengths = []

    for intervals in self.counts_exons_per_transcript.values():
        # sorts the stored list in place before merging
        intervals.sort()
        merged = Intervals.combine(intervals)

        first_start = merged[0][0]
        final_end = merged[-1][1]
        lengths.append(final_end - first_start)
        nexons.append(len(merged))

        for start, end in merged:
            sizes_exons.append(end - start)

        if len(merged) == 1:
            nsingle += 1
            continue

        # distances between the end of one exon and the start of the next
        for index in range(1, len(merged)):
            sizes_introns.append(merged[index][0] - merged[index - 1][1])

    fields = [
        len(self.counts_gene_ids),
        len(self.counts_transcript_ids),
        nsingle,
        Stats.Summary(nexons),
        Stats.Summary(sizes_exons),
        Stats.Summary(sizes_introns),
        Stats.Summary(lengths),
    ]
    return "\t".join([str(field) for field in fields])
def readWorkspace(infile,
                  workspace_builder="raw",
                  label="none",
                  map_id2annotation={}):
    """read workspace from infile.

    A workspace is a collection of intervals with two labels associated
    to each interval, one for the 5' and one for the 3' end.

    Available workspace builders are:

    gff
       take a gff file.

    gtf-intergenic
       build workspace from intergenic segments in a gtf file.

    gtf-intronic
       build workspace from intronic segments in a gtf file

    gtf-genic
       the workspace is built from genes (first to last exon).

    Any other value of *workspace_builder* - including its default
    "raw" - raises a ValueError.

    Available labels are:

    none
       no labels are given to the ends of workspaces

    direction
       labels are given based on the 5'/3' end of the
       bounding exon

    annotation
       labels are given based on a gene2annotation map.

    For any other value of *label* no labelling functions are defined;
    the builders that use them (all except "gff") will then fail.

    *map_id2annotation* is only read (for ``label="annotation"``), never
    modified.

    returns a list of segments for each contig in a dictionary
    """

    # label_f maps the info values of the two flanking entries to a pair
    # of labels; info_f extracts the info value from a gtf entry.
    if label == "none":
        label_f = lambda x, y: (("X",), ("X",))
        info_f = lambda x: None
    elif label == "direction":
        # x and y are booleans (strand == "+") used as tuple indices
        label_f = lambda x, y: ((("5", "3")[x],), (("3", "5")[y],))
        info_f = lambda x: x.strand == "+"
    elif label == "annotation":
        label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y])
        info_f = lambda x: x.gene_id

    if workspace_builder == "gff":
        workspace = GTF.readAsIntervals(GFF.iterator(infile))

    elif workspace_builder == "gtf-intergenic":

        workspace = collections.defaultdict(list)
        # get all genes
        for e in GTF.merged_gene_iterator(GTF.iterator(infile)):
            workspace[e.contig].append((e.start, e.end, info_f(e)))

        # convert to intergenic regions.
        # overlapping genes are merged and the labels
        # of the right-most entry is retained
        for contig in workspace.keys():
            segs = workspace[contig]
            segs.sort()
            last = segs[0]
            new_segs = []
            for this in segs[1:]:
                if last[1] >= this[0]:
                    # overlapping or adjacent genes: extend `last` if
                    # `this` reaches further, then skip to the next gene
                    if this[1] > last[1]:
                        last = (last[0], this[1], this[2])
                    continue
                assert last[1] < this[0], "this=%s, last=%s" % (this, last)

                # the gap between the two genes is the intergenic segment
                new_segs.append((last[1], this[0],
                                 label_f(last[2], this[2])))
                last = this
            workspace[contig] = new_segs

    elif workspace_builder == "gtf-intronic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            introns = Intervals.complement(exons)

            # both ends of an intron are labelled from the gene's first entry
            r = ee[0]
            for start, end in introns:
                workspace[r.contig].append((start,
                                            end,
                                            label_f(info_f(r), info_f(r))
                                            ))
    elif workspace_builder == "gtf-genic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            # span from the start of the first to the end of the last
            # merged exon
            start, end = exons[0][0], exons[-1][1]
            r = ee[0]
            workspace[r.contig].append((start,
                                        end,
                                        label_f(info_f(r), info_f(r))
                                        ))

    else:
        raise ValueError("unknown workspace_builder %s" % workspace_builder)

    return workspace
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gff/gtf entries from ``options.stdin``, groups them (per
    transcript for gtf input, otherwise per chunk or per joined group),
    optionally removes masked regions and/or extends the ends, and writes
    the corresponding sequences in fasta format to ``options.stdout``.
    Counts of processed and skipped entries are logged with ``E.info``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # `out` holds the forward-strand coordinates reported in the fasta
        # header. It refers to the same list as `intervals` until
        # `intervals` is rebound below.
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # mirror the coordinates and reverse the order of the intervals
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]

        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        total_length = sum([len(x) for x in s])
        if (total_length < options.min_length or
                (options.max_length and total_length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), total_length))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # the 5' padding belongs in front of the FIRST segment;
                # s[1] padded the wrong segment and raised an IndexError
                # for entries with a single interval
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" % (name,
                                    contig,
                                    strand,
                                    ";".join(["%i-%i" % x for x in out]),
                                    seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.Stop()
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    The input needs to be sorted by contig and start position.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    If *resolve_blocks* is set, entries within a group are only merged
    if their block intervals (``toIntervals()``) overlap; otherwise the
    first entry of a group is extended to the largest end coordinate.

    If *stranded* is set, groups are built separately for each strand.

    Groups built from fewer than *min_intervals* entries are skipped.

    The score gives the number of intervals that have been merged.

    Raises ValueError if both *remove_inconsistent* and *by_name* are
    set (raised when the generator is first advanced).
    """

    if remove_inconsistent and by_name:
        raise ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        """yield lists of consecutive bed entries that belong together."""
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        iterator = iter(iterator)
        # an empty input yields nothing; a bare next() would leak
        # StopIteration out of the generator (RuntimeError, PEP 479).
        last = next(iterator, None)
        if last is None:
            return

        strand = last.strand if stranded else "."

        max_end[strand] = last.end
        to_join[strand] = [last]
        if by_name:
            # seed with the first entry's name so that the second entry
            # is compared against it and not against the empty default.
            last_name[strand] = last.name

        for bed in iterator:

            strand = bed.strand if stranded else "."

            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:
                # new contig: flush the groups of all strands
                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or
                  (by_name and last_name[strand] != bed.name)):
                # gap too large or name changed: close the current group
                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = []

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        for strand in to_join:
            if to_join[strand]:
                yield to_join[strand]

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set([x.name for x in to_join])
            if len(names) > 1:
                c.skipped_inconsistent_intervals += 1
                continue

        if resolve_blocks:
            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1

            # repeat pairwise merging until a full pass merges nothing
            merged = True
            while merged:
                joined = []
                not_joined = []
                merged = False

                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1,
                                                      intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []

            to_join = sorted(to_join, key=lambda x: int(x.start))

            # keep only those entries that were created from the merge
            # of at least the minimum number of intervals
            for bed in to_join:

                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:

            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            # the first entry is re-used (modified in place) as the
            # merged entry
            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    The input needs to be sorted by contig and start position.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    If *resolve_blocks* is set, entries within a group are only merged
    if their block intervals (``toIntervals()``) overlap; otherwise the
    first entry of a group is extended to the largest end coordinate.

    If *stranded* is set, groups are built separately for each strand.

    Groups built from fewer than *min_intervals* entries are skipped.

    The score gives the number of intervals that have been merged.

    Raises ValueError if both *remove_inconsistent* and *by_name* are
    set (raised when the generator is first advanced).
    """

    if remove_inconsistent and by_name:
        raise ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        """yield lists of consecutive bed entries that belong together."""
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        iterator = iter(iterator)
        # an empty input yields nothing; a bare next() would leak
        # StopIteration out of the generator (RuntimeError, PEP 479).
        last = next(iterator, None)
        if last is None:
            return

        strand = last.strand if stranded else "."

        max_end[strand] = last.end
        to_join[strand] = [last]
        if by_name:
            # seed with the first entry's name; otherwise the first two
            # entries would be merged even if their names differ.
            last_name[strand] = last.name

        for bed in iterator:

            strand = bed.strand if stranded else "."

            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:
                # new contig: flush the groups of all strands
                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or
                  (by_name and
                   last_name[strand] and
                   last_name[strand] != bed.name)):
                # gap too large or name changed: close the current group
                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = list()

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        # sorted for a reproducible output order across strands
        for strand in sorted(to_join):
            if to_join[strand]:
                yield to_join[strand]

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set([x.name for x in to_join])
            if len(names) > 1:
                c.skipped_inconsistent_intervals += 1
                continue

        if resolve_blocks:
            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1

            # repeat pairwise merging until a full pass merges nothing
            merged = True
            while merged:
                joined = []
                not_joined = []
                merged = False

                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1,
                                                      intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []

            to_join = sorted(to_join, key=lambda x: int(x.start))

            # keep only those entries that were created from the merge
            # of at least the minimum number of intervals
            for bed in to_join:

                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:

            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            # the first entry is re-used (modified in place) as the
            # merged entry
            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))
def main(argv=None):
    """script main.

    Parses command line options from *argv* (``sys.argv`` if *argv* is
    not given), reads GTF formatted records from ``options.stdin`` and
    writes the manipulated records to ``options.stdout``.

    Exactly one operation is applied per invocation: the options are
    tested in a fixed ``if``/``elif`` order and the first one that is
    set wins. If none of the dedicated options is set, the genes are
    processed with ``--merge-exons``, ``--merge-transcripts`` or
    ``--merge-introns``.

    Raises ValueError if a gene lies on several strands or contigs
    while merging, or if ``--filter=gene|transcript`` is used without
    ``--apply`` and without ``--sample-size``.
    """

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--merge-exons", dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t", "--merge-transcripts", dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes", dest="merge_genes",
                      action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance", dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j", "--join-exons", dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes", dest="unset_genes", type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort", dest="sort", type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u", "--with-utr", dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i", "--merge-introns", dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g", "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id", dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G", "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d", "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns", dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f", "--filter", dest="filter", type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r", "--rename", dest="rename", type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes", dest="renumber_genes",
                      type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a", "--apply", dest="filename_filter",
                      type="string", metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter", dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length", dest="intron_min_length",
                      type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length", dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option("--intron-border", dest="intron_border", type="int",
                      help="number of residues to exclude at intron at either end "
                      "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes", dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand", dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping", dest="remove_overlapping",
                      type="string", metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates", dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates", dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates", dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(
                GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                entry_id = f(entry)
                if "_dup" in entry_id:
                    # remove both the duplicate and the transcript it
                    # duplicates
                    remove.add(re.sub(r"_dup\d+", "", entry_id))
                    remove.add(entry_id)

            for entry in store:
                entry_id = f(entry)
                if entry_id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (entry_id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                entry_id = f(entry)
                counts[entry_id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                # keep the first entry of each run of identical
                # coordinates
                for entry in store:
                    entry_id = f(entry)
                    if entry_id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (entry_id, counts[entry_id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = entry_id

            else:
                # drop every entry whose identifier occurs more than once
                for entry in store:
                    entry_id = f(entry)
                    if counts[entry_id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (entry_id, counts[entry_id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        with open(options.add_protein_id, "r") as inf:
            transcript2protein = IOTools.readMap(inf)

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(
                        ("removing transcript '%s' due to "
                         "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute(
                "protein_id", transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons]), max(
                [x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):
            """yield lists of genes that overlap on one contig/strand."""
            gff_chunks = iter(gff_chunks)
            # empty input yields nothing; a bare next() would leak
            # StopIteration out of the generator (RuntimeError, PEP 479).
            last = next(gff_chunks, None)
            if last is None:
                return
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(
                GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        with open(options.filename_filter, "r") as inf:
            map_old2new = IOTools.readMap(inf)

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute(
                        "transcript_id", map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig,
                               min([x.start for x in gff]),
                               max([x.end for x in gff]),
                               gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            # sweep over genes sorted by position and keep the longest
            # gene of each cluster of overlapping genes
            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1

        elif options.filter in ("longest-transcript",
                                "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                """return the transcript with the largest genomic span."""
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript represent the largest number
                of exons over all transcripts.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end)
                                      for x in transcript
                                      if x.feature == "exon"])
                # count how often each exon occurs across all transcripts.
                # (itertools.groupby only groups adjacent duplicates and
                # would undercount exons shared between transcripts.)
                exon_counts = collections.Counter(all_exons)
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)]
                                 for x in transcript if x.feature == "exon"])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                transcript = _select(gene)
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                with open(options.filename_filter, "r") as inf:
                    ids, nerrors = IOTools.ReadList(inf)
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                # raise instead of assert: asserts are stripped under -O
                raise ValueError(
                    "please supply either a filename "
                    "with ids to filter with (--apply) or a sample-size.")

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                # introns are the gaps between consecutive merged ranges
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [
                        x for x in output_ranges if x[1] - x[0] > l]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)

            if strand != ".":
                # accumulate feature lengths starting at the 5' end
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                # restore the input order
                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids)
                    if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids)
                          if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" %
               len(dup_transcript))

        # running suffix per duplicated identifier
        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute('gene_id',
                                     gtf.gene_id + "." +
                                     str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute('transcript_id',
                                     gtf.transcript_id + "." +
                                     str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(
                GTF.iterator(options.stdin),
                strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError("can not merge gene '%s' on multiple strands: %s" % (
                    gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError("can not merge gene '%s' on multiple contigs: %s" % (
                    gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start

                utr_ranges = []
                # exonic sequence outside of the CDS is labeled as UTR;
                # the side relative to the CDS midpoint and the strand
                # decide between 5' and 3'
                for start, end in Intervals.truncate(exon_ranges,
                                                     cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # NOTE: utr_ranges are not combined at distance as they
                # would need to be combined per feature.
                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    # single exon genes have no introns
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
def main(argv=None):
    """script main.

    Convert GFF/GTF features read from stdin into PSL (BLAT) formatted
    alignments written to ``options.stdout``. Each group of features
    yielded by the input iterator is collapsed into one ungapped-block
    alignment of a query (the concatenated intervals) against the
    contig.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header",
                      action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries",
                      dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default].""")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    # Forward argv so that callers passing an explicit argument list are
    # honoured (the sibling scripts call E.Start the same way).
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    # Read from options.stdin rather than sys.stdin so that input
    # redirection set up by E.Start (pipe options) is respected.
    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin),
                                  feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(options.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        # Lay the merged intervals end-to-end on the query axis; each
        # interval becomes one diagonal of the alignment.
        xstart = 0
        intervals = Intervals.combine([(x.start, x.end) for x in gffs])
        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend

        # The original code read the comprehension variable after the
        # comprehension, which only works on Python 2 (where it leaks and
        # holds the last element). Name that element explicitly.
        last = gffs[-1]

        entry = Blat.Match()
        entry.mQueryId = last.transcript_id
        entry.mSbjctId = last.contig
        entry.strand = last.strand

        # Use the true sequence lengths when a fasta file is supplied and
        # knows the identifier; otherwise fall back to the extent of the
        # alignment itself.
        if genome_fasta and entry.mSbjctId in genome_fasta:
            entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    Read GFF/GTF features from stdin and write the corresponding genomic
    sequences in fasta format to ``options.stdout``. Features are grouped
    (per transcript for GTF input), optionally filtered by feature type,
    optionally truncated by mask regions, optionally extended at either
    end and optionally masked before output.

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if no genome file is given.
        NotImplementedError: if mask regions are supplied for a contig
            without ``--remove-masked-regions``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes. "
        "[default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'. If "
        "set to the empty string, all entries are output [%default].")

    parser.add_option(
        "-f", "--filename-masks", dest="filename_masks", type="string",
        metavar="gff",
        help="mask sequences with regions given in gff file [%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions", action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-length", dest="min_length", type="int",
        help="set minimum length for sequences output [%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        masker=None)

    (options, args) = E.Start(parser, argv=argv)

    # Every output record needs the genome; fail early with a clear
    # message instead of a NameError on the first feature.
    if not options.genome_file:
        raise ValueError(
            "a genome file is required, please supply --genome-file")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(sys.stdin))
    else:
        gffs = GTF.iterator(sys.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            # NOTE(review): this is the only use of the name ``GFF`` in
            # this function - confirm it is imported by the module.
            e = GTF.readAsIntervals(GFF.iterator(infile))

        # convert intervals to intersectors
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # The iterator yields groups (lists) of features. For GTF input each
    # group shares a transcript id.
    for ichunk in iterator:

        ninput += 1

        if feature:
            # build a real list: the length is needed below
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from %s:%i..%i - %s" %
                   (ichunk[0].contig, ichunk[0].start, ichunk[0].end,
                    str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [
                        (x.start, x.end)
                        for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    # string exceptions are not valid Python; raise a
                    # proper exception type
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: %s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # ``out`` holds the forward-strand coordinates reported in the
        # fasta header. It deliberately aliases ``intervals`` so that
        # in-place extension below ("5", "3", "both") shows up in the
        # header, while "5only"/"3only" rebind ``intervals`` only.
        out = intervals

        if options.extend_at:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # convert to reverse-strand coordinates, last interval first
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if l < options.min_length or \
                (options.max_length and l > options.max_length):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# skipped because length out of bounds %s: regions=%s len=%i\n" %
                    (name, str(intervals), l))
            continue

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" % (name,
                                    contig,
                                    strand,
                                    ";".join(["%i-%i" % x for x in out]),
                                    "\n".join(s)))

        noutput += 1

    E.info(
        "ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, nskipped_masked=%i, nskipped_length=%i" %
        (ninput, noutput, nmasked, nskipped_noexons,
         nskipped_masked, nskipped_length))

    E.Stop()
def main(argv=None):
    """script main.

    Manipulate a gene set in GTF format. The input is read from
    ``options.stdin`` and the result written to ``options.stdout``.
    Exactly one operation is performed per run; the options are tested
    in a fixed order (see the ``if``/``elif`` chain below) and the first
    one that is set wins. If none of the explicit operations is set, the
    per-gene merging branch (``--merge-exons``, ``--merge-transcripts``,
    ``--merge-introns``) is executed.

    parses command line options in sys.argv, unless *argv* is given.

    Raises:
        ValueError: if ``--filter=gene|transcript`` is given without
            ``--apply`` or ``--sample-size``, or if a gene to be merged
            lies on several strands or contigs.
    """
    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--merge-exons", dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t", "--merge-transcripts", dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes", dest="merge_genes",
                      action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance", dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j", "--join-exons", dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes", dest="unset_genes", type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort", dest="sort", type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u", "--with-utr", dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts", action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i", "--merge-introns", dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g", "--set-transcript-to-gene",
                      "--set-transcript2gene", dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript", action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id", dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G", "--set-gene-to-transcript",
                      "--set-gene2transcript", dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d", "--set-score2distance",
                      dest="set_score2distance", action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns", dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f", "--filter", dest="filter", type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r", "--rename", dest="rename", type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes", dest="renumber_genes",
                      type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts", dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a", "--apply", dest="filename_filter",
                      type="string", metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter", dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length", dest="intron_min_length",
                      type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length", dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option(
        "--intron-border", dest="intron_border", type="int",
        help="number of residues to exclude at intron at either end "
        "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes", dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand", dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping", dest="remove_overlapping",
                      type="string", metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates", dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates", dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates", dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        # was misspelled "transripts2genes", so the intended default was
        # never applied to the real option
        transcripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                           strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            # first pass: mark every "_dup" transcript and its base id
            for entry in gffs:
                ninput += 1
                store.append(entry)
                key = f(entry)
                if "_dup" in key:
                    remove.add(re.sub(r"_dup\d+", "", key))
                    remove.add(key)

            for entry in store:
                key = f(entry)
                if key not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (key))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin),
                                         strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                               strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                key = f(entry)
                counts[key] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                # keep the first of each run of identical coordinates
                for entry in store:
                    key = f(entry)
                    if key == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (key, counts[key]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = key
            else:
                # drop every entry whose id was seen more than once
                for entry in store:
                    key = f(entry)
                    if counts[key] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (key, counts[key]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        # close the mapping file once it has been read
        with open(options.add_protein_id, "r") as inf:
            transcript2protein = IOTools.readMap(inf)

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start = min([x.start for x in exons])
            all_end = max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):
            """yield lists of genes that overlap each other."""
            gff_chunks = iter(gff_chunks)
            # An empty input ends the generator cleanly. Letting
            # StopIteration escape (or raising it explicitly) inside a
            # generator is a RuntimeError under PEP 479.
            try:
                last = next(gff_chunks)
            except StopIteration:
                return

            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                        gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                # a new contig, a new strand or a gap starts a new group
                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(
                GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        with open(options.filename_filter, "r") as inf:
            map_old2new = IOTools.readMap(inf)

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue
            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id",
                                     map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig,
                               min([x.start for x in gff]),
                               max([x.end for x in gff]),
                               gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            # sweep over the sorted genes; within each cluster of
            # overlapping genes remember the longest one
            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1

        elif options.filter in ("longest-transcript",
                                "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                """return the transcript with the largest genomic span."""
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript represent the largest number
                of exons over all transcripts.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end)
                                      for x in transcript
                                      if x.feature == "exon"])

                # Count how often each exon occurs over all transcripts.
                # itertools.groupby only groups *adjacent* equal items,
                # so on this unsorted list it produced wrong counts.
                exon_counts = collections.Counter(all_exons)

                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)]
                                 for x in transcript
                                 if x.feature == "exon"])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                with open(options.filename_filter, "r") as inf:
                    ids, nerrors = IOTools.ReadList(inf)
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter
                reset_strand = options.reset_strand

                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                # user error, not an internal invariant: an assert would
                # be stripped under -O
                raise ValueError(
                    "please supply either a filename "
                    "with ids to filter with (--apply) or a sample-size.")

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                # introns are the gaps between consecutive merged ranges
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [x for x in output_ranges
                                     if x[1] - x[0] > l]

                for start, end in output_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)

            if strand != ".":
                t = 0
                # walk the features in transcription order; the list is
                # restored to its input order afterwards
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()

            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = []
        transcript_ids = []
        gtfs = []

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        # one counting pass instead of list.count() per identifier
        dup_gene = [item for item, count
                    in collections.Counter(gene_ids).items() if count > 1]
        dup_transcript = [item for item, count
                          in collections.Counter(transcript_ids).items()
                          if count > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" %
               len(dup_transcript))

        # running suffix per duplicated identifier
        gene_dict = dict.fromkeys(dup_gene, 0)
        transcript_dict = dict.fromkeys(dup_transcript, 0)

        for gtf in gtfs:
            if gtf.feature == "CDS":
                gene_id = gtf.gene_id
                transcript_id = gtf.transcript_id
                if gene_id in gene_dict:
                    gene_dict[gene_id] += 1
                    gtf.setAttribute(
                        'gene_id',
                        gene_id + "." + str(gene_dict[gene_id]))

                if transcript_id in transcript_dict:
                    transcript_dict[transcript_id] += 1
                    gtf.setAttribute(
                        'transcript_id',
                        transcript_id + "." +
                        str(transcript_dict[transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin),
                                           strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start

                # exonic sequence outside the CDS is UTR; which end it
                # belongs to depends on the side of the CDS midpoint and
                # on the strand
                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges,
                                                     cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    # single exon genes have no introns
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for exons, introns and flanking
    regions up- and downstream of each transcript. With "exons" or
    "introns" in ``options.detail`` the respective features are labelled
    as first/middle/last exon/intron, otherwise plainly as "exon" or
    "intron". Flanks are labelled ``upstream_<distance>`` and
    ``downstream_<distance>``.

    This method annotates per transcript. In order to achieve a unique
    tiling, use only a single transcript per gene and remove any overlap
    between genes.

    Within a gene, intervals sharing the same feature label are merged
    with ``Intervals.combine`` and written to ``options.stdout`` sorted
    by start coordinate.

    Arguments
    ---------
    iterator :
        Iterator of GTF entries, passed to ``GTF.gene_iterator``.
    fasta :
        Object providing ``getLength(contig)``. Genes on contigs for
        which this raises ``KeyError`` are skipped.
    options :
        Option object. The attributes ``increment``, ``flank``,
        ``detail`` and ``stdout`` are read.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    # nskipped counts both genes on unknown contigs and transcripts
    # without any exon.
    ngenes, ntranscripts, nskipped = 0, 0, 0

    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                # record ``interval`` labelled ``anno`` for the current
                # transcript.
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end)
                     for x in transcript if x.feature == "exon"]
            if not exons:
                # nothing to annotate - without this ``continue`` the
                # code below fails with an IndexError on exons[0].
                nskipped += 1
                continue

            exons.sort()

            # introns are the gaps between consecutive exons
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            # on the negative strand the order of features is reversed
            # and the flanks swap their meaning.
            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

        # groupby requires its input to be sorted by the grouping key
        results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for exons, introns and flanking
    regions up- and downstream of each transcript. With "exons" or
    "introns" in ``options.detail`` the respective features are labelled
    as first/middle/last exon/intron, otherwise plainly as "exon" or
    "intron". Flanks are labelled ``upstream_<distance>`` and
    ``downstream_<distance>``.

    This method annotates per transcript. In order to achieve a unique
    tiling, use only a single transcript per gene and remove any overlap
    between genes.

    Within a gene, intervals sharing the same feature label are merged
    with ``Intervals.combine`` and written to ``options.stdout`` sorted
    by start coordinate.

    Arguments
    ---------
    iterator :
        Iterator of GTF entries, passed to ``GTF.gene_iterator``.
    fasta :
        Object providing ``getLength(contig)``. Genes on contigs for
        which this raises ``KeyError`` are skipped.
    options :
        Option object. The attributes ``increment``, ``flank``,
        ``detail`` and ``stdout`` are read.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    # nskipped counts both genes on unknown contigs and transcripts
    # without any exon.
    ngenes, ntranscripts, nskipped = 0, 0, 0

    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                # record ``interval`` labelled ``anno`` for the current
                # transcript.
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end)
                     for x in transcript if x.feature == "exon"]
            if not exons:
                # nothing to annotate - without this ``continue`` the
                # code below fails with an IndexError on exons[0].
                nskipped += 1
                continue

            exons.sort()

            # introns are the gaps between consecutive exons
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            # (``range`` works on both python 2 and 3, ``xrange`` only
            # exists on python 2)
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            # on the negative strand the order of features is reversed
            # and the flanks swap their meaning.
            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

        # groupby requires its input to be sorted by the grouping key
        results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
def readWorkspace(infile,
                  workspace_builder="raw",
                  label="none",
                  map_id2annotation=None):
    """read workspace from infile.

    A workspace is a collection of intervals with two labels associated
    to each interval, one for the 5' and one for the 3' end.

    Available workspace builders are:

    gff
       take a gff file. Labels are not used by this builder.

    gtf-intergenic
       build workspace from intergenic segments in a gtf file.

    gtf-intronic
       build workspace from intronic segments in a gtf file

    gtf-genic
       the workspace is built from genes (first to last exon).

    Note that the default value ``raw`` is not one of the available
    builders, so a builder has to be given explicitly.

    Available labels are:

    none
       no labels are given to the ends of workspaces

    direction
       labels are given based on the 5'/3' end of the
       bounding exon

    annotation
       labels are given based on a gene2annotation map.

    Arguments
    ---------
    infile :
        Open file (or iterable of lines) handed to the gff/gtf iterator.
    workspace_builder : string
        One of the builders listed above.
    label : string
        One of the labels listed above.
    map_id2annotation : dict
        Mapping of gene_id to annotation, only used with
        ``label="annotation"``. Defaults to an empty mapping.

    Raises
    ------
    ValueError
        If `workspace_builder` is unknown or if a gtf based builder is
        combined with an unknown `label`.

    returns a list of segments for each contig in a dictionary
    """
    # avoid a shared mutable default argument
    if map_id2annotation is None:
        map_id2annotation = {}

    gtf_builders = ("gtf-intergenic", "gtf-intronic", "gtf-genic")
    if workspace_builder != "gff" and workspace_builder not in gtf_builders:
        raise ValueError("unknown workspace_builder %s" % workspace_builder)

    # label_f: builds the pair of labels for a segment from the info of
    #          its left and right neighbour
    # info_f:  extracts the info needed for labelling from a gtf entry
    if label == "none":
        def label_f(x, y):
            return (("X", ), ("X", ))

        def info_f(x):
            return None
    elif label == "direction":
        def label_f(x, y):
            # x and y are booleans (is on positive strand) and are used
            # as indices into the tuples.
            return ((("5", "3")[x], ), (("3", "5")[y], ))

        def info_f(x):
            return x.strand == "+"
    elif label == "annotation":
        def label_f(x, y):
            return (map_id2annotation[x], map_id2annotation[y])

        def info_f(x):
            return x.gene_id
    elif workspace_builder in gtf_builders:
        # the gtf builders need labels - fail early with a clear message
        # instead of an UnboundLocalError further down.
        raise ValueError("unknown label %s" % label)

    if workspace_builder == "gff":
        workspace = GTF.readAsIntervals(GFF.iterator(infile))

    elif workspace_builder == "gtf-intergenic":

        workspace = collections.defaultdict(list)
        # get all genes
        for e in GTF.merged_gene_iterator(GTF.iterator(infile)):
            workspace[e.contig].append((e.start, e.end, info_f(e)))

        # convert to intergenic regions.
        # overlapping genes are merged and the labels
        # of the right-most entry is retained
        for contig in list(workspace.keys()):
            segs = workspace[contig]
            segs.sort()
            last = segs[0]
            new_segs = []
            for this in segs[1:]:
                if last[1] >= this[0]:
                    if this[1] > last[1]:
                        last = (last[0], this[1], this[2])
                    continue
                assert last[1] < this[0], \
                    "this=%s, last=%s" % (this, last)

                new_segs.append((last[1], this[0],
                                 label_f(last[2], this[2])))
                last = this
            workspace[contig] = new_segs

    elif workspace_builder == "gtf-intronic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            introns = Intervals.complement(exons)

            r = ee[0]
            for start, end in introns:
                workspace[r.contig].append(
                    (start, end, label_f(info_f(r), info_f(r))))

    else:
        # workspace_builder == "gtf-genic"
        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            start, end = exons[0][0], exons[-1][1]
            r = ee[0]
            workspace[r.contig].append(
                (start, end, label_f(info_f(r), info_f(r))))

    return workspace
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gff (or gtf with ``--is-gtf``) formatted features from
    ``options.stdin``, groups them into chunks (per transcript for gtf),
    fetches the sequence of each chunk from the indexed genome given by
    ``--genome-file`` and writes one fasta entry per chunk to
    ``options.stdout``. Chunks can optionally be filtered by feature,
    truncated by mask regions, extended, masked, filtered by length and
    folded.

    Raises
    ------
    ValueError
        If no genome file has been specified.
    NotImplementedError
        If mask regions are given without ``--remove-masked-regions``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge-adjacent", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes."
        " [default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'."
        " If set to the empty string, all entries are output "
        "[%default].")

    parser.add_option(
        "-f", "--maskregions-bed-file", dest="filename_masks",
        type="string", metavar="gff",
        help="mask sequences with regions given in gff file "
        "[%default].")

    parser.add_option(
        "--remove-masked-regions", dest="remove_masked_regions",
        action="store_true",
        help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-interval-length", dest="min_length", type="int",
        help="set minimum length for sequences output "
        "[%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output "
        "[%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no end, 3', 5' or both ends. If "
        "3only or 5only are set, only the added sequence "
        "is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--extend-with", dest="extend_with", type="string",
        help="extend using base [default=%default]")

    parser.add_option(
        "--masker", dest="masker", type="choice",
        choices=("dust", "dustmasker", "softmask", "none"),
        help="apply masker [%default].")

    parser.add_option(
        "--fold-at", dest="fold_at", type="int",
        help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False
    )

    # pass argv on so that an explicitly given argument list is honoured
    (options, args) = E.Start(parser, argv=argv)

    if not options.genome_file:
        # fasta and contigs are required below - fail with a clear
        # message instead of a NameError.
        raise ValueError("please specify a genome file (--genome-file)")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            # build a list (not a lazy filter object) as the chunk is
            # measured and indexed below.
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                # parse "key=value" fields; fields without "=" (such as
                # the empty field after a trailing ";") are ignored.
                attr_dict = {}
                for field in chunk[0].attributes.split(";"):
                    if "=" not in field:
                        continue
                    key, value = field.split("=", 1)
                    attr_dict[key] = value
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError(
                        "masking with regions is not implemented, "
                        "use --remove-masked-regions")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # coordinates reported in the fasta header. Note that ``out``
        # refers to the same list as ``intervals`` until the latter is
        # re-assigned below.
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # convert to reverse strand coordinates for sequence retrieval
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        total_length = sum([len(x) for x in s])
        if (total_length < options.min_length or
                (options.max_length and total_length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), total_length))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # the 5' extension goes in front of the first segment
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i+n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(">%s %s:%s:%s\n%s\n" % (name,
                                                     contig,
                                                     strand,
                                                     ";".join(
                                                         ["%i-%i" %
                                                          x for x in out]),
                                                     seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.Stop()
def main(argv=None): if not argv: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "--merge-exons-distance", dest="merge_exons_distance", type="int", help="distance in nucleotides between " "exons to be merged [%default].", ) parser.add_option( "--pattern-identifier", dest="pattern", type="string", help="pattern to use for renaming genes/transcripts. " "The pattern should contain a %i, for example " "--pattern-identifier=ENSG%010i [%default].", ) parser.add_option( "--sort-order", dest="sort_order", type="choice", choices=("gene", "gene+transcript", "transcript", "position", "contig+gene", "position+gene", "gene+position"), help="sort input data [%default].", ) parser.add_option( "-u", "--with-utr", dest="with_utr", action="store_true", help="include utr in merged transcripts " "[%default].", ) parser.add_option( "--filter-method", dest="filter_method", type="choice", choices=( "gene", "transcript", "longest-gene", "longest-transcript", "representative-transcript", "proteincoding", "lincrna", ), help="Filter method to apply. Available filters are: " "'gene': filter by gene_id given in ``--map-tsv-file``, " "'transcript': filter by transcript_id given in ``--map-tsv-file``, " "'longest-gene': output the longest gene for overlapping genes ," "'longest-transcript': output the longest transcript per gene," "'representative-transcript': output the representative transcript " "per gene. The representative transcript is the transcript " "that shares most exons with other transcripts in a gene. " "The input needs to be sorted by gene. " "'proteincoding': only output protein coding features. " "'lincrna': only output lincRNA features. 
" "[%default].", ) parser.add_option( "-a", "--map-tsv-file", dest="filename_filter", type="string", metavar="tsv", help="filename of ids to map/filter [%default].", ) parser.add_option( "--gff-file", dest="filename_gff", type="string", metavar="GFF", help="second filename of features (see --remove-overlapping) " "[%default]", ) parser.add_option( "--invert-filter", dest="invert_filter", action="store_true", help="when using --filter, invert selection " "(like grep -v). " "[%default].", ) parser.add_option( "--sample-size", dest="sample_size", type="int", help="extract a random sample of size # if the option " "'--method=filter --filter-method' is set " "[%default].", ) parser.add_option( "--intron-min-length", dest="intron_min_length", type="int", help="minimum length for introns (for --exons-file2introns) " "[%default].", ) parser.add_option( "--min-exons-length", dest="min_exons_length", type="int", help="minimum length for gene (sum of exons) " "(--sam-fileple-size) [%default].", ) parser.add_option( "--intron-border", dest="intron_border", type="int", help="number of residues to exclude at intron at either end " "(--exons-file2introns) [%default].", ) parser.add_option( "--ignore-strand", dest="ignore_strand", action="store_true", help="remove strandedness of features (set to '.') when " "using ``transcripts2genes`` or ``filter``" "[%default].", ) parser.add_option( "--permit-duplicates", dest="strict", action="store_false", help="permit duplicate genes. " "[%default]" ) parser.add_option( "--duplicate-feature", dest="duplicate_feature", type="choice", choices=("gene", "transcript", "both", "ucsc", "coordinates"), help="remove duplicates by gene/transcript. " "If ``ucsc`` is chosen, transcripts ending on _dup# are " "removed. 
This is necessary to remove duplicate entries " "that are next to each other in the sort order " "[%default]", ) parser.add_option( "-m", "--method", dest="method", type="choice", action="append", choices=( "add-protein-id", "exons2introns", "filter", "find-retained-introns", "genes-to-unique-chunks", "intersect-transcripts", "join-exons", "merge-exons", "merge-transcripts", "merge-genes", "merge-introns", "remove-overlapping", "remove-duplicates", "rename-genes", "rename-transcripts", "rename-duplicates", "renumber-genes", "renumber-transcripts", "set-transcript-to-gene", "set-gene-to-transcript", "set-protein-to-transcript", "set-score-to-distance", "set-gene_biotype-to-source", "sort", "transcript2genes", "unset-genes", ), help="Method to apply [%default]." "Please only select one.", ) parser.set_defaults( sort_order="gene", filter_method="gene", pattern="%i", merge_exons_distance=0, filename_filter=None, intron_border=None, intron_min_length=None, sample_size=0, min_exons_length=0, ignore_strand=False, with_utr=False, invert_filter=False, duplicate_feature=None, strict=True, method=None, ) (options, args) = E.Start(parser, argv=argv) ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0 if options.method is None: raise ValueError("please specify a --method") if len(options.method) > 1: raise ValueError("multiple --method arguements specified") else: options.method = options.method[0] if options.method == "set-transcript-to-gene": for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("transcript_id", gff.gene_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.method == "set-gene_biotype-to-source": for gff in GTF.iterator(options.stdin): ninput += 1 if "gene_biotype" not in gff: gff.setAttribute("gene_biotype", gff.source) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.method == "remove-duplicates": counts = collections.defaultdict(int) if options.duplicate_feature == "ucsc": store 
= [] remove = set() f = lambda x: x[0].transcript_id gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False) outf = lambda x: "\n".join([str(y) for y in x]) for entry in gffs: ninput += 1 store.append(entry) id = f(entry) if "_dup" in id: remove.add(re.sub("_dup\d+", "", id)) remove.add(id) for entry in store: id = f(entry) if id not in remove: options.stdout.write(outf(entry) + "\n") noutput += 1 else: ndiscarded += 1 E.info("discarded duplicates for %s" % (id)) else: if options.duplicate_feature == "gene": gffs = GTF.gene_iterator(GTF.iterator(options.stdin), strict=False) f = lambda x: x[0][0].gene_id outf = lambda x: "\n".join(["\n".join([str(y) for y in xx]) for xx in x]) elif options.duplicate_feature == "transcript": gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False) f = lambda x: x[0].transcript_id outf = lambda x: "\n".join([str(y) for y in x]) elif options.duplicate_feature == "coordinates": gffs = GTF.chunk_iterator(GTF.iterator(options.stdin)) f = lambda x: x[0].contig + "_" + str(x[0].start) + "-" + str(x[0].end) outf = lambda x: "\n".join([str(y) for y in x]) store = [] for entry in gffs: ninput += 1 store.append(entry) id = f(entry) counts[id] += 1 # Assumes GTF file sorted by contig then start last_id = "" if options.duplicate_feature == "coordinates": for entry in store: id = f(entry) if id == last_id: ndiscarded += 1 E.info("discarded duplicates for %s: %i" % (id, counts[id])) else: options.stdout.write(outf(entry) + "\n") noutput += 1 last_id = id else: for entry in store: id = f(entry) if counts[id] == 1: options.stdout.write(outf(entry) + "\n") noutput += 1 else: ndiscarded += 1 E.info("discarded duplicates for %s: %i" % (id, counts[id])) elif "sort" == options.method: for gff in GTF.iterator_sorted(GTF.iterator(options.stdin), sort_order=options.sort_order): ninput += 1 options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif "set-gene-to-transcript" == options.method: for gff in 
GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("gene_id", gff.transcript_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif "set-protein-to-transcript" == options.method: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("protein_id", gff.transcript_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif "add-protein-id" == options.method: transcript2protein = IOTools.readMap(IOTools.openFile(options.filename_filter, "r")) missing = set() for gff in GTF.iterator(options.stdin): ninput += 1 if gff.transcript_id not in transcript2protein: if gff.transcript_id not in missing: E.debug(("removing transcript '%s' due to " "missing protein id") % gff.transcript_id) missing.add(gff.transcript_id) ndiscarded += 1 continue gff.setAttribute("protein_id", transcript2protein[gff.transcript_id]) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 E.info("transcripts removed due to missing protein ids: %i" % len(missing)) elif "join-exons" == options.method: for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(exons[0].strand) contig = exons[0].contig transid = exons[0].transcript_id geneid = exons[0].gene_id biotype = exons[0].source all_start, all_end = min([x.start for x in exons]), max([x.end for x in exons]) y = GTF.Entry() y.contig = contig y.source = biotype y.feature = "transcript" y.start = all_start y.end = all_end y.strand = strand y.transcript_id = transid y.gene_id = geneid options.stdout.write("%s\n" % str(y)) elif "merge-genes" == options.method: # merges overlapping genes # gffs = GTF.iterator_sorted_chunks( GTF.flat_gene_iterator(GTF.iterator(options.stdin)), sort_by="contig-strand-start" ) def iterate_chunks(gff_chunks): last = gff_chunks.next() to_join = [last] for gffs in gff_chunks: d = gffs[0].start - last[-1].end if gffs[0].contig == last[0].contig and gffs[0].strand == last[0].strand: assert gffs[0].start >= 
last[0].start, ( "input file should be sorted by contig, strand " "and position: d=%i:\nlast=\n%s\nthis=\n%s\n" ) % (d, "\n".join([str(x) for x in last]), "\n".join([str(x) for x in gffs])) if gffs[0].contig != last[0].contig or gffs[0].strand != last[0].strand or d > 0: yield to_join to_join = [] last = gffs to_join.append(gffs) yield to_join raise StopIteration for chunks in iterate_chunks(gffs): ninput += 1 if len(chunks) > 1: gene_id = "merged_%s" % chunks[0][0].gene_id transcript_id = "merged_%s" % chunks[0][0].transcript_id info = ",".join([x[0].gene_id for x in chunks]) else: gene_id = chunks[0][0].gene_id transcript_id = chunks[0][0].transcript_id info = None intervals = [] for c in chunks: intervals += [(x.start, x.end) for x in c] intervals = Intervals.combine(intervals) # take single strand strand = chunks[0][0].strand for start, end in intervals: y = GTF.Entry() y.fromGTF(chunks[0][0], gene_id, transcript_id) y.start = start y.end = end y.strand = strand if info: y.addAttribute("merged", info) options.stdout.write("%s\n" % str(y)) nfeatures += 1 noutput += 1 elif options.method == "renumber-genes": map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 if gtf.gene_id not in map_old2new: map_old2new[gtf.gene_id] = options.pattern % (len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[gtf.gene_id]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.method == "unset-genes": map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = gtf.transcript_id if key not in map_old2new: map_old2new[key] = options.pattern % (len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.method == "renumber-transcripts": map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = (gtf.gene_id, gtf.transcript_id) if key not in map_old2new: map_old2new[key] = options.pattern % (len(map_old2new) + 1) gtf.setAttribute("transcript_id", 
map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.method == "transcripts2genes": transcripts = set() genes = set() ignore_strand = options.ignore_strand for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(options.stdin)): ninput += 1 for gtf in gtfs: if ignore_strand: gtf.strand = "." options.stdout.write("%s\n" % str(gtf)) transcripts.add(gtf.transcript_id) genes.add(gtf.gene_id) nfeatures += 1 noutput += 1 E.info("transcripts2genes: transcripts=%i, genes=%i" % (len(transcripts), len(genes))) elif options.method in ("rename-genes", "rename-transcripts"): map_old2new = IOTools.readMap(IOTools.openFile(options.filename_filter, "r")) if options.method == "rename-transcripts": is_gene_id = False elif options.method == "rename-genes": is_gene_id = True for gff in GTF.iterator(options.stdin): ninput += 1 if is_gene_id: if gff.gene_id in map_old2new: gff.setAttribute("gene_id", map_old2new[gff.gene_id]) else: E.debug("removing missing gene_id %s" % gff.gene_id) ndiscarded += 1 continue else: if gff.transcript_id in map_old2new: gff.setAttribute("transcript_id", map_old2new[gff.transcript_id]) else: E.debug("removing missing transcript_id %s" % gff.transcript_id) ndiscarded += 1 continue noutput += 1 options.stdout.write("%s\n" % str(gff)) elif options.method == "filter": keep_genes = set() if options.filter_method == "longest-gene": iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin)) coords = [] gffs = [] for gff in iterator: gff.sort(key=lambda x: x.start) coords.append((gff[0].contig, min([x.start for x in gff]), max([x.end for x in gff]), gff[0].gene_id)) gffs.append(gff) coords.sort() last_contig = None max_end = 0 longest_gene_id = None longest_length = None for contig, start, end, gene_id in coords: ninput += 1 if contig != last_contig or start >= max_end: if longest_gene_id: keep_genes.add(longest_gene_id) longest_gene_id = gene_id longest_length = end - start max_end = end else: if end - start > longest_length: 
longest_length, longest_gene_id = end - start, gene_id last_contig = contig max_end = max(max_end, end) keep_genes.add(longest_gene_id) invert = options.invert_filter for gff in gffs: keep = gff[0].gene_id in keep_genes if (keep and not invert) or (not keep and invert): noutput += 1 for g in gff: nfeatures += 1 options.stdout.write("%s\n" % g) else: ndiscarded += 1 elif options.filter_method in ("longest-transcript", "representative-transcript"): iterator = GTF.gene_iterator(GTF.iterator(options.stdin)) def selectLongestTranscript(gene): r = [] for transcript in gene: transcript.sort(key=lambda x: x.start) length = transcript[-1].end - transcript[0].start r.append((length, transcript)) r.sort() return r[-1][1] def selectRepresentativeTranscript(gene): """select a representative transcript. The representative transcript represent the largest number of exons over all transcripts. """ all_exons = [] for transcript in gene: all_exons.extend([(x.start, x.end) for x in transcript if x.feature == "exon"]) exon_counts = {} for key, exons in itertools.groupby(all_exons): exon_counts[key] = len(list(exons)) transcript_counts = [] for transcript in gene: count = sum([exon_counts[(x.start, x.end)] for x in transcript if x.feature == "exon"]) # add transcript id to sort to provide a stable # segmentation. transcript_counts.append((count, transcript[0].transcript_id, transcript)) transcript_counts.sort() return transcript_counts[-1][-1] if options.filter_method == "longest-transcript": _select = selectLongestTranscript elif options.filter_method == "representative-transcript": _select = selectRepresentativeTranscript for gene in iterator: ninput += 1 # sort in order to make reproducible which # gene is chosen. 
transcript = _select(sorted(gene)) noutput += 1 for g in transcript: nfeatures += 1 options.stdout.write("%s\n" % g) elif options.filter_method in ("gene", "transcript"): if options.filename_filter: ids, nerrors = IOTools.ReadList(IOTools.openFile(options.filename_filter, "r")) E.info("read %i ids" % len(ids)) ids = set(ids) by_gene = options.filter_method == "gene" by_transcript = options.filter_method == "transcript" invert = options.invert_filter ignore_strand = options.ignore_strand for gff in GTF.iterator(options.stdin): ninput += 1 keep = False if by_gene: keep = gff.gene_id in ids if by_transcript: keep = gff.transcript_id in ids if (invert and keep) or (not invert and not keep): continue if ignore_strand: gff.strand = "." options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.sample_size: if options.filter_method == "gene": iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin)) elif options.filter_method == "transcript": iterator = GTF.transcript_iterator(GTF.iterator(options.stdin)) if options.min_exons_length: iterator = GTF.iterator_min_feature_length( iterator, min_length=options.min_exons_length, feature="exon" ) data = [x for x in iterator] ninput = len(data) if len(data) > options.sample_size: data = random.sample(data, options.sample_size) for d in data: noutput += 1 for dd in d: nfeatures += 1 options.stdout.write(str(dd) + "\n") else: assert False, "please supply either a filename " "with ids to filter with (--map-tsv-file) or a sample-size." elif options.filter_method in ("proteincoding", "lincrna", "processed-pseudogene"): # extract entries by transcript/gene biotype. # This filter uses a test on the source field (ENSEMBL pre v78) # a regular expression on the attributes (ENSEMBL >= v78). 
tag = { "proteincoding": "protein_coding", "processed-pseudogene": "processed_pseudogene", "lincrna": "lincRNA", }[options.filter_method] rx = re.compile('"%s"' % tag) if not options.invert_filter: f = lambda x: x.source == tag or rx.search(x.attributes) else: f = lambda x: x.source != tag and not rx.search(x.attributes) for gff in GTF.iterator(options.stdin): ninput += 1 if f(gff): options.stdout.write(str(gff) + "\n") noutput += 1 else: ndiscarded += 1 elif options.method == "exons2introns": for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") input_ranges = Intervals.combine(cds_ranges + exon_ranges) if len(input_ranges) > 1: last = input_ranges[0][1] output_ranges = [] for start, end in input_ranges[1:]: output_ranges.append((last, start)) last = end if options.intron_border: b = options.intron_border output_ranges = [(x[0] + b, x[1] - b) for x in output_ranges] if options.intron_min_length: l = options.intron_min_length output_ranges = [x for x in output_ranges if x[1] - x[0] > l] for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "intron" entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 else: ndiscarded += 1 elif options.method == "set-score-to-distance": for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(gffs[0].strand) all_start, all_end = min([x.start for x in gffs]), max([x.end for x in gffs]) if strand != ".": t = 0 if strand == "-": gffs.reverse() for gff in gffs: gff.score = t t += gff.end - gff.start if strand == "-": gffs.reverse() for gff in gffs: options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.method == "remove-overlapping": index = GTF.readAndIndex(GTF.iterator(IOTools.openFile(options.filename_gff, "r"))) 
for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 found = False for e in gffs: if index.contains(e.contig, e.start, e.end): found = True break if found: ndiscarded += 1 else: noutput += 1 for e in gffs: nfeatures += 1 options.stdout.write("%s\n" % str(e)) elif options.method == "intersect-transcripts": for gffs in GTF.gene_iterator(GTF.iterator(options.stdin), strict=options.strict): ninput += 1 r = [] for g in gffs: if options.with_utr: ranges = GTF.asRanges(g, "exon") else: ranges = GTF.asRanges(g, "CDS") r.append(ranges) result = r[0] for x in r[1:]: result = Intervals.intersect(result, x) entry = GTF.Entry() entry.copy(gffs[0][0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "exon" for start, end in result: entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 elif "rename-duplicates" == options.method: # note: this will only rename entries with "CDS" in feature column assert options.duplicate_feature in ["gene", "transcript", "both"], ( "for renaming duplicates, --duplicate-feature must be set to one " "of 'gene', transcript' or 'both'" ) gene_ids = list() transcript_ids = list() gtfs = list() for gtf in GTF.iterator(options.stdin): gtfs.append(gtf) if gtf.feature == "CDS": gene_ids.append(gtf.gene_id) transcript_ids.append(gtf.transcript_id) dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1] dup_transcript = [item for item in set(transcript_ids) if transcript_ids.count(item) > 1] E.info("Number of duplicated gene_ids: %i" % len(dup_gene)) E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript)) gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene)))) transcript_dict = dict(zip(dup_transcript, ([0] * len(dup_transcript)))) for gtf in gtfs: if gtf.feature == "CDS": if options.duplicate_feature in ["both", "gene"]: if gtf.gene_id in dup_gene: gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1 gtf.setAttribute("gene_id", 
gtf.gene_id + "." + str(gene_dict[gtf.gene_id])) if options.duplicate_feature in ["both", "transcript"]: if gtf.transcript_id in dup_transcript: transcript_dict[gtf.transcript_id] = transcript_dict[gtf.transcript_id] + 1 gtf.setAttribute( "transcript_id", gtf.transcript_id + "." + str(transcript_dict[gtf.transcript_id]) ) options.stdout.write("%s\n" % gtf) elif options.method in ("merge-exons", "merge-introns", "merge-transcripts"): for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin), strict=options.strict): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") # sanity checks strands = set([x.strand for x in gffs]) contigs = set([x.contig for x in gffs]) if len(strands) > 1: raise ValueError("can not merge gene '%s' on multiple strands: %s" % (gffs[0].gene_id, str(strands))) if len(contigs) > 1: raise ValueError("can not merge gene '%s' on multiple contigs: %s" % (gffs[0].gene_id, str(contigs))) strand = Genomics.convertStrand(gffs[0].strand) if cds_ranges and options.with_utr: cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1] midpoint = (cds_end - cds_start) / 2 + cds_start utr_ranges = [] for start, end in Intervals.truncate(exon_ranges, cds_ranges): if end - start > 3: if strand == ".": feature = "UTR" elif strand == "+": if start < midpoint: feature = "UTR5" else: feature = "UTR3" elif strand == "-": if start < midpoint: feature = "UTR3" else: feature = "UTR5" utr_ranges.append((feature, start, end)) output_feature = "CDS" output_ranges = cds_ranges else: output_feature = "exon" output_ranges = exon_ranges utr_ranges = [] result = [] try: biotypes = [x["gene_biotype"] for x in gffs] biotype = ":".join(set(biotypes)) except (KeyError, AttributeError): biotype = None if options.method == "merge-exons": # need to combine per feature - skip # utr_ranges = Intervals.combineAtDistance( # utr_ranges, # options.merge_exons_distance) output_ranges = Intervals.combineAtDistance(output_ranges, 
options.merge_exons_distance) for feature, start, end in utr_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.feature = feature entry.transcript_id = "merged" if biotype: entry.addAttribute("gene_biotype", biotype) entry.start = start entry.end = end result.append(entry) for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" if biotype: entry.addAttribute("gene_biotype", biotype) entry.feature = output_feature entry.start = start entry.end = end result.append(entry) elif options.method == "merge-transcripts": entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id if biotype: entry.addAttribute("gene_biotype", biotype) entry.start = output_ranges[0][0] entry.end = output_ranges[-1][1] result.append(entry) elif options.method == "merge-introns": if len(output_ranges) >= 2: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id if biotype: entry.addAttribute("gene_biotype", biotype) entry.start = output_ranges[0][1] entry.end = output_ranges[-1][0] result.append(entry) else: ndiscarded += 1 continue result.sort(key=lambda x: x.start) for x in result: options.stdout.write("%s\n" % str(x)) nfeatures += 1 noutput += 1 elif options.method == "find-retained-introns": for gene in GTF.gene_iterator(GTF.iterator(options.stdin)): ninput += 1 found_any = False for intron in find_retained_introns(gene): found_any = True options.stdout.write("%s\n" % str(intron)) nfeatures += 1 if found_any: noutput += 1 elif options.method == "genes-to-unique-chunks": for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)): ninput += 1 for exon in gene_to_blocks(gene): options.stdout.write("%s\n" % str(exon)) nfeatures += 1 noutput += 1 else: raise ValueError("unknown method '%s'" % options.method) E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" % (ninput, noutput, nfeatures, 
ndiscarded)) E.Stop()