def testNoOverlap(self):
    """truncate() leaves intervals unchanged when the mask only abuts them."""
    cases = (
        ([(0, 5), (10, 15)], [(5, 10)], [(0, 5), (10, 15)]),
        ([(5, 10)], [(0, 5), (10, 15)], [(5, 10)]),
        ([(0, 5), (5, 10)], [(10, 15)], [(0, 5), (5, 10)]),
    )
    for intervals, mask, expected in cases:
        self.assertEqual(Intervals.truncate(intervals, mask), expected)
def testMultiple(self):
    """intersect() with two intervals in the first argument."""
    cases = (
        ([(0, 5), (10, 15)], [(0, 5)], [(0, 5)]),
        ([(0, 5), (10, 15)], [(0, 10)], [(0, 5)]),
        ([(0, 5), (10, 15)], [(0, 15)], [(0, 5), (10, 15)]),
        ([(0, 5), (5, 10)], [(0, 10)], [(0, 5), (5, 10)]),
    )
    for first, second, expected in cases:
        self.assertEqual(Intervals.intersect(first, second), expected)
def testSingle(self):
    """truncate() of a single interval by a single masking interval."""
    cases = (
        ([(0, 5)], [(0, 5)], []),
        ([(0, 5)], [(0, 3)], [(3, 5)]),
        ([(0, 3)], [(0, 5)], []),
        ([(0, 5)], [(3, 5)], [(0, 3)]),
        ([(3, 5)], [(0, 5)], []),
        ([(5, 10)], [(5, 10)], []),
        ([(5, 10)], [(5, 20)], []),
        ([(5, 10)], [(0, 10)], []),
        # the same case is checked twice; kept as in the original suite
        ([(5, 10)], [(0, 10)], []),
        ([(5, 10)], [(0, 20)], []),
    )
    for intervals, mask, expected in cases:
        self.assertEqual(Intervals.truncate(intervals, mask), expected)
def toSequence(chunk, fasta):
    """Build one sequence from the features in *chunk*.

    All features must lie on the same contig and strand. Their
    (start, end) ranges are merged with ``Intervals.combine`` before
    fetching, so overlapping features contribute each base only once.
    For features that are not on the positive strand the ranges are
    flipped against the contig length and processed in reverse order
    before the sequence is fetched from *fasta*.

    Returns the empty string for an empty *chunk*.
    """
    if len(chunk) < 1:
        return ""

    first = chunk[0]
    contig, strand = first.contig, first.strand

    for feature in chunk:
        assert feature.strand == strand, "features on different strands."
        assert feature.contig == contig, "features on different contigs."

    merged = Intervals.combine([(f.start, f.end) for f in chunk])
    contig_length = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        # flip coordinates against the contig length, last range first
        merged = [(contig_length - end, contig_length - start)
                  for start, end in reversed(merged)]

    pieces = []
    for start, end in merged:
        pieces.append(fasta.getSequence(contig, strand, start, end))
    return "".join(pieces)
def combineMergedIntervals(bedfiles):
    '''Yield merged intervals that are supported by every bed file.

    Steps:

    1. pool the (start, end) ranges of all files per contig,
    2. merge the pooled ranges with ``Intervals.combine``,
    3. yield ``(contig, start, end)`` for every merged range that
       ``isContainedInAll`` accepts for *bedfiles*.

    Contigs are reported in sorted order.
    '''
    pooled = collections.defaultdict(list)
    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            pooled[contig].extend(
                [(bed.start, bed.end)
                 for bed in bedfile.fetch(contig, parser=pysam.asBed())])

    # merge the pooled ranges of each contig
    merged = {contig: Intervals.combine(ranges)
              for contig, ranges in pooled.items()}

    # keep only ranges that are present in all bed files
    for contig in sorted(merged):
        for start, end in merged[contig]:
            if not isContainedInAll(contig, start, end, bedfiles):
                continue
            yield contig, start, end
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """Yield only groups whose *feature* entries cover >= *min_length* bases.

    For every group from *gff_iterator* the ranges of entries whose
    ``feature`` equals *feature* are merged with ``Intervals.combine``
    and the merged lengths are summed; the group is yielded unchanged
    if that sum reaches *min_length*.
    """
    for gffs in gff_iterator:
        selected = [(entry.start, entry.end)
                    for entry in gffs if entry.feature == feature]
        covered = 0
        for merged in Intervals.combine(selected):
            covered += merged[1] - merged[0]
        if covered < min_length:
            continue
        yield gffs
def toIntronIntervals(chunk):
    '''Return the intron ranges implied by the exons in *chunk*.

    All entries must share contig and strand. Ranges of entries with
    feature "exon" are merged with ``Intervals.combine`` and the result
    of ``Intervals.complement`` on them is returned, so coordinates
    stay in the coordinate system of the input entries.

    Returns an empty list for an empty *chunk*.
    '''
    if len(chunk) < 1:
        return []

    first = chunk[0]
    contig = first.contig
    strand = first.strand
    # read as in the original implementation; not used further below
    transcript_id = first.transcript_id

    for entry in chunk:
        assert entry.strand == strand, "features on different strands."
        assert entry.contig == contig, "features on different contigs."

    exon_ranges = [(entry.start, entry.end)
                   for entry in chunk if entry.feature == "exon"]
    return Intervals.complement(Intervals.combine(exon_ranges))
def count(self, bed):
    '''Recompute the per-track overlap statistics for *bed*.

    For every track in ``self.tracks`` the index of that track is
    queried for ranges overlapping *bed*; a contig or track missing
    from the index (``KeyError``) counts as no overlap. ``self.data``
    is replaced by a list with one ``(number of hits, overlap)`` tuple
    per track, where overlap is ``Intervals.calculateOverlap`` between
    *bed* and the merged hits.
    '''
    per_track = []
    for track in self.tracks:
        try:
            hits = self.index[track][bed.contig].find(bed.start, bed.end)
            overlaps = [(hit[0], hit[1]) for hit in hits]
        except KeyError:
            overlaps = []

        nhits = len(overlaps)
        covered = Intervals.calculateOverlap(
            [(bed.start, bed.end)],
            Intervals.combine(overlaps))
        per_track.append((nhits, covered))

    self.data = per_track
def annotateExons(iterator, fasta, options):
    """Write one GTF line per distinct exon range of every gene.

    For each gene from ``GTF.gene_iterator(iterator)`` the exons of
    every transcript are sorted by start (and reversed for genes on the
    negative strand) and numbered from 1. Each distinct (start, end)
    range is written to ``options.stdout`` with these attributes:

    * ``ntranscripts`` - number of transcripts in the gene
    * ``nused`` - number of transcripts using this exact range
    * ``pos`` - comma-separated ``rank:total`` pairs, one per use

    Genes in which merging the distinct ranges reduces their number are
    counted as overlapping. A summary is written to ``options.stdlog``
    if ``options.loglevel >= 1``. *fasta* is not used.
    """
    ninput = noutput = noverlapping = 0

    for gene in GTF.gene_iterator(iterator):
        ninput += 1
        usage = collections.defaultdict(list)
        ntranscripts = len(gene)
        on_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        for exons in gene:
            # order exons in transcription direction (sorts in place)
            exons.sort(key=lambda exon: exon.start)
            if on_negative_strand:
                exons.reverse()
            total = len(exons)
            for rank, exon in enumerate(exons, 1):
                usage[(exon.start, exon.end)].append((rank, total))

        # template entry carrying the gene-level fields
        template = GTF.Entry()
        reference = gene[0][0]
        template.fromGTF(reference, reference.gene_id, reference.gene_id)
        template.addAttribute("ntranscripts", ntranscripts)

        entries = []
        for (start, end), positions in usage.items():
            entry = GTF.Entry().copy(template)
            entry.start, entry.end = start, end
            entry.addAttribute("nused", len(positions))
            entry.addAttribute(
                "pos", ",".join(["%i:%i" % position for position in positions]))
            entries.append(entry)

        entries.sort(key=lambda entry: entry.start)
        for entry in entries:
            options.stdout.write("%s\n" % str(entry))

        # check for exon overlap: merging must not reduce the count
        ranges = [(entry.start, entry.end) for entry in entries]
        if len(Intervals.combine(ranges)) != len(ranges):
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, noverlapping=%i\n" %
                             (ninput, noutput, noverlapping))
def annotateTTS(iterator, fasta, options):
    """Write termination-site regions for every gene in *iterator*.

    For each transcript a window of ``options.promotor`` bases is
    placed next to the transcript: below the smallest exon start for
    genes on the negative strand, above the largest exon end otherwise.
    The window is shifted so it stays inside ``[0, contig length]``.
    With ``options.merge_promotors`` the windows of a gene are merged
    and renamed "1", "2", ...; otherwise each window carries the
    transcript_id of its transcript. Entries are written to
    ``options.stdout`` with source "tts", and a summary goes to
    ``options.stdlog`` if ``options.loglevel >= 1``.
    """
    ngenes = ntranscripts = npromotors = 0

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        reference = gene[0][0]
        on_negative_strand = Genomics.IsNegativeStrand(reference.strand)
        contig_length = fasta.getLength(reference.contig)

        windows = []
        names = []
        for transcript in gene:
            ntranscripts += 1
            lowest = min([exon.start for exon in transcript])
            highest = max([exon.end for exon in transcript])
            names.append(transcript[0].transcript_id)
            # if the site is directly at the start/end of the contig the
            # window lies within an exon, otherwise outside of it
            if on_negative_strand:
                window = (max(0, lowest - options.promotor),
                          max(options.promotor, lowest))
            else:
                window = (min(highest, contig_length - options.promotor),
                          min(contig_length, highest + options.promotor))
            windows.append(window)

        if options.merge_promotors:
            # merging may change number and order, so rename 1..n
            windows = Intervals.combine(windows)
            names = ["%i" % rank for rank in range(1, len(windows) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(reference, reference.gene_id, reference.gene_id)
        gtf.source = "tts"

        for (start, end), name in zip(windows, names):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = name
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
                             (ngenes, ntranscripts, npromotors))
def asRanges(gffs, feature=None):
    """Return the merged (start, end) ranges of entries in *gffs*.

    *feature* selects the entries: a string keeps entries whose
    ``feature`` equals it, any other truthy value is treated as a
    container of accepted feature names, and a falsy non-string value
    keeps all entries. Ranges are merged with ``Intervals.combine``.
    """
    if isinstance(feature, str):
        selected = [entry for entry in gffs if entry.feature == feature]
    elif feature:
        selected = [entry for entry in gffs if entry.feature in feature]
    else:
        selected = gffs[:]

    ranges = []
    for entry in selected:
        ranges.append((entry.start, entry.end))
    return Intervals.combine(ranges)
def transform_third_codon(start, end, intervals_with_gff):
    """transform: only return nucleotide positions in window (start, end)
    that are in third codon position.

    *intervals_with_gff* is an iterable of ``(istart, iend, gff)``
    tuples. Every selected position is emitted as a one-base interval
    ``(c, c + 1)`` and the collected intervals are passed through
    ``Intervals.combineIntervals`` before being returned.

    Raises ValueError if a gff entry has no frame (``"."``).
    """
    intervals = []
    for istart, iend, gff in intervals_with_gff:

        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # frame = nucleotides from start to next codon
        frame = int(gff.frame)

        # to make life easier, convert to 0-based coordinates,
        # with zero starting at first position in window
        # re-arrange positions on negative strand
        if Genomics.IsNegativeStrand(gff.strand):
            # convert to negative strand coordinates counting from 0
            coordinate_offset = end
            reverse = True
            istart, iend = end - iend, end - istart
        else:
            istart, iend = istart - start, iend - start
            reverse = False
            coordinate_offset = start

        # make sure that you start on a third codon position and within
        # the window
        # NOTE(review): the previous comment said "second codon position",
        # but with frame defined as above the arithmetic below appears to
        # land on position three of a codon - confirm.
        if istart < 0:
            # interval starts before the window: move the frame along
            # and clip the start to the window
            frame = (frame + istart) % 3
            istart = 0
        if frame != 0:
            istart -= (3 - frame)
        istart += 2

        # clip the end to the window
        iend = min(iend, end - start)

        for x in range(istart, iend, 3):

            # map the window-relative position back to input coordinates
            if reverse:
                c = coordinate_offset - x - 1
            else:
                c = coordinate_offset + x
            intervals.append((c, c + 1))

    return Intervals.combineIntervals(intervals)
def __str__(self):
    """Return the collected counts as one tab-separated line.

    Columns: number of gene ids, number of transcript ids, number of
    single-exon transcripts, followed by ``Stats.Summary`` of exons per
    transcript, exon sizes, intron sizes and transcript lengths. The
    exon lists stored in ``self.counts_exons_per_transcript`` are
    sorted in place and merged with ``Intervals.combine`` before
    measuring.
    """
    nsingle = 0
    nexons = []
    exon_sizes = []
    intron_sizes = []
    transcript_lengths = []

    for exons in list(self.counts_exons_per_transcript.values()):
        exons.sort()
        merged = Intervals.combine(exons)

        transcript_lengths.append(merged[-1][1] - merged[0][0])
        nexons.append(len(merged))
        exon_sizes.extend([end - start for start, end in merged])

        if len(merged) == 1:
            nsingle += 1
            continue

        # an intron is the gap between two consecutive merged exons
        for (_, previous_end), (next_start, _) in zip(merged, merged[1:]):
            intron_sizes.append(next_start - previous_end)

    columns = (
        len(self.counts_gene_ids),
        len(self.counts_transcript_ids),
        nsingle,
        Stats.Summary(nexons),
        Stats.Summary(exon_sizes),
        Stats.Summary(intron_sizes),
        Stats.Summary(transcript_lengths),
    )
    return "\t".join([str(column) for column in columns])
def testEmpty(self):
    """fromArray() of an empty array yields no intervals."""
    observed = Intervals.fromArray([])
    self.assertEqual(observed, [])
def annotateRegulons(iterator, fasta, tss, options):
    """Write regulon regions around transcript ends for every gene.

    For each transcript a window is placed around an anchor: with a
    truthy *tss* the anchor is the smallest exon start (largest exon
    end for genes on the negative strand), otherwise the largest exon
    end (smallest exon start on the negative strand). The window
    extends ``options.upstream`` bases to one side and
    ``options.downstream`` bases to the other, with the sides swapped
    on the negative strand, and is clipped to ``[0, contig length]``.
    With ``options.merge_promotors`` the windows of a gene are merged
    and renamed "1", "2", ...; otherwise each window carries the
    transcript_id of its transcript. Entries are written to
    ``options.stdout`` with source "regulon".
    """
    gene_iterator = GTF.gene_iterator(iterator)
    ngenes = ntranscripts = nregulons = 0
    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        reference = gene[0][0]
        on_negative_strand = Genomics.IsNegativeStrand(reference.strand)
        contig_length = fasta.getLength(reference.contig)

        windows = []
        names = []
        for transcript in gene:
            ntranscripts += 1
            lowest = min([exon.start for exon in transcript])
            highest = max([exon.end for exon in transcript])

            # tss: transcript start in strand orientation, else its end
            if tss:
                anchor = highest if on_negative_strand else lowest
            else:
                anchor = lowest if on_negative_strand else highest

            if on_negative_strand:
                lower, upper = anchor - downstream, anchor + upstream
            else:
                lower, upper = anchor - upstream, anchor + downstream

            # clip to the contig
            windows.append((min(contig_length, max(0, lower)),
                            min(contig_length, max(0, upper))))
            names.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merging may change number and order, so rename 1..n
            windows = Intervals.combine(windows)
            names = ["%i" % rank for rank in range(1, len(windows) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(reference, reference.gene_id, reference.gene_id)
        gtf.source = "regulon"

        for (start, end), name in zip(windows, names):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = name
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
def testArray2(self):
    """fromArray() on a longer mask and on its logical inverse."""
    mask = [1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1]
    inverted = [not value for value in mask]
    self.assertEqual(Intervals.fromArray(mask),
                     [(0, 3), (6, 9), (12, 15)])
    self.assertEqual(Intervals.fromArray(inverted),
                     [(3, 6), (9, 12)])
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.

    One list of per-base codes, initialised to *default_code*, is
    allocated for every contig in *fasta*. Each group of entries from
    *iterator* then overwrites codes:

    * groups whose source is a key of ``MAP_ENSEMBL`` are painted with
      the mapped code;
    * groups with source "protein_coding" are painted with "u" (5' UTR)
      and "v" (3' UTR), CDS codes via ``addCDS`` and introns via
      ``addIntrons``; their splice junctions are written to the
      "junctions" output file;
    * groups on unknown contigs and protein-coding groups without CDS
      are counted and skipped.

    Finally per-code counts go to the "counts" output file and the
    annotation strings to ``options.stdout`` in fasta format.
    """
    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))

    for contig, size in list(contig_sizes.items()):
        E.debug("allocating %s: %i bases" % (contig, size))
        # a plain list of codes is used for py3 compatibility
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.open_output_file("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")

    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError:
            E.warn("contig %s not found - annotation ignored" %
                   gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]
            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig], intervals, is_positive, code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            # what is left of the exons after removing the CDS is UTR
            exons = Intervals.truncate(exons, cds)
            cds_start, cds_end = cds[0][0], cds[-1][1]

            # Intervals are half-open, so the UTR piece that directly
            # abuts the CDS ends exactly at cds_start. It must be
            # included (<=), mirroring the >= used on the other side.
            UTR5 = [x for x in exons if x[1] <= cds_start]
            UTR3 = [x for x in exons if x[0] >= cds_end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5

            addSegments(annotations[contig], UTR5, is_positive, "u")
            addIntrons(annotations[contig], UTR5, is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig], UTR3, is_positive, "v")
            addIntrons(annotations[contig], UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"],
                   is_positive)

            # add introns between CDS
            addIntrons(annotations[contig], cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds_entries = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                out_positive = "+"
                junction_coords = [(c.start, c.end - 1)
                                   for c in cds_entries]
            else:
                out_positive = "-"
                cds_entries.reverse()
                junction_coords = [(lcontig - c.end, lcontig - c.start - 1)
                                   for c in cds_entries]

            # a junction joins the end of one CDS part with the start
            # of the following one
            previous_end = junction_coords[0][1]
            for c, (next_start, next_end) in zip(cds_entries[1:],
                                                 junction_coords[1:]):
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (
                    contig,
                    out_positive,
                    previous_end,
                    next_start,
                    c.frame,
                    c.gene_id,
                    c.transcript_id,
                ))
                previous_end = next_end

    E.info("finished reading genes: %s" % str(counter))

    outfile_junctions.close()

    E.info("started counting")
    outfile = E.open_output_file("counts")
    outputCounts(outfile, annotations)
    outfile.close()

    E.info("started output")
    for k in sorted(annotations.keys()):
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GFF/GTF entries from ``options.stdin``, groups them (per
    transcript for GTF, per chunk or joined group for GFF), optionally
    removes masked regions, optionally extends the group at its ends
    and writes one fasta record per group to ``options.stdout``.

    Raises ValueError if no genome file was given, because sequences
    cannot be fetched without one.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes", dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    # forward argv so that callers passing their own argument list are
    # honoured, as promised in the docstring
    (options, args) = E.start(parser, argv=argv)

    # fail early with a clear message instead of a NameError on the
    # first entry further down
    if not options.genome_file:
        raise ValueError(
            "a genome file is required, please supply --genome-file")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                # fields without "=" (for example the empty field after
                # a trailing ";") are skipped instead of crashing
                fields = [x.split("=")
                          for x in chunk[0].attributes.split(";")]
                attr_dict = {
                    field[0]: field[1] for field in fields
                    if len(field) > 1
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [
                        (x.start, x.end)
                        for x in masks[contig].find(
                            quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # NOTE: out deliberately stays an alias of the interval list, so
        # in-place extensions below also show up in the fasta header
        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]

        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        total_length = sum([len(x) for x in s])
        if (total_length < options.min_length or
                (options.max_length and total_length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), total_length))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # prepend to the FIRST segment; indexing s[1] failed for
                # single-segment entries and misplaced the extension
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]),
                 chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand,
                 ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.stop()
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique tiling,
    use only a single transcript per gene and remove any overlap between
    genes.

    Flanks are emitted in steps of ``options.increment`` up to
    ``options.flank`` bases on both sides of a transcript; windows
    reaching outside the contig are dropped. Within a gene, intervals
    with the same annotation are merged before being written to
    ``options.stdout``.

    Genes on contigs unknown to *fasta* and transcripts without exons
    are skipped and counted in ``nskipped``.
    """
    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            ntranscripts += 1

            exons = [(x.start, x.end)
                     for x in transcript if x.feature == "exon"]
            if len(exons) == 0:
                # nothing to annotate; without this continue the code
                # below failed with an IndexError on exons[0]
                nskipped += 1
                continue

            def _add(interval, anno):
                # record *interval* with annotation *anno* for the
                # current transcript
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            exons.sort()

            # introns are the gaps between consecutive exons
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                # report everything in transcription direction
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

        # merge intervals sharing the same annotation within the gene;
        # groupby requires the entries to be sorted by the same key
        results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
def testEmpty(self):
    """truncate() of nothing by nothing yields no intervals."""
    observed = Intervals.truncate([], [])
    self.assertEqual(observed, [])
def testEmpty(self):
    """intersect() of two empty interval lists is empty."""
    observed = Intervals.intersect([], [])
    self.assertEqual(observed, [])
def cropGFF(gffs, filename_gff):
    """crop intervals in gff file.

    Regions listed in *filename_gff* are cut out of every entry in
    *gffs*. Entries that do not overlap a cropping region are yielded
    unchanged. Entries that overlap are yielded once per remaining
    segment; entries that are covered completely are dropped.

    Note that the same entry object is updated in place and yielded
    again for each remaining segment, so consumers have to use or copy
    an entry before requesting the next one.
    """
    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    # close the file once the regions have been read
    with iotools.open_file(filename_gff, "r") as infile:
        cropper = GTF.readAsIntervals(GTF.iterator(infile))

    ntotal = 0
    for contig in list(cropper.keys()):
        intersector = quicksect.IntervalTree()
        for start, end in cropper[contig]:
            intersector.add(start, end)
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:

            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(quicksect.Interval(start, end))

            if overlaps:

                # one flag per base of the entry: 1 = keep, 0 = cropped
                length = end - start
                keep = numpy.ones(length)
                for region in overlaps:
                    first = max(0, region.start - start)
                    last = min(length, region.end - start)
                    keep[first:last] = 0

                # runs of kept bases are the segments that survive
                segments = Intervals.fromArray(keep)
                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                for first, last in segments:
                    gff.start, gff.end = first + start, last + start
                    noutput += 1
                    yield gff

                continue

        noutput += 1
        yield gff

    E.info("ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i" %
           (ninput, noutput, ncropped, ndeleted))
def testHalfEmpty(self):
    """intersect() is empty as soon as one of the two inputs is empty."""
    some = [(0, 5)]
    for first, second in ((some, []), ([], some)):
        self.assertEqual(Intervals.intersect(first, second), [])
def transform_complement(start, end, intervals_with_gff):
    """transform: complement of the intervals within (start, end).

    The first two fields of every item in *intervals_with_gff* are
    merged with ``Intervals.combineIntervals`` and the result of
    ``Intervals.complementIntervals`` for *start* and *end* is returned.
    """
    ranges = [(item[0], item[1]) for item in intervals_with_gff]
    merged = Intervals.combineIntervals(ranges)
    return Intervals.complementIntervals(merged, start, end)
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    If *stranded*, entries are grouped separately per strand.

    If *resolve_blocks*, entries within a group are only joined if
    their intervals (``toIntervals()``) overlap, and groups are
    reported one entry per joined set instead of one entry per group.

    Groups built from fewer than *min_intervals* entries are dropped.

    The score gives the number of intervals that have been merged.

    The input must be sorted by contig and position. An empty input
    yields nothing.

    Raises ValueError if both *remove_inconsistent* and *by_name* are
    set. As this function is a generator, the error is raised when the
    first item is requested.
    """
    if remove_inconsistent and by_name:
        # this used to be "assert ValueError(...)", which never fails
        raise ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        # yield lists of bed entries that are to be merged
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        # accept any iterable and finish cleanly on empty input;
        # a StopIteration escaping from a generator is an error
        iterator = iter(iterator)
        try:
            last = next(iterator)
        except StopIteration:
            return

        if not stranded:
            strand = "."
        else:
            strand = last.strand

        max_end[strand] = last.end
        to_join[strand] = [last]

        for bed in iterator:
            if not stranded:
                strand = "."
            else:
                strand = bed.strand

            # distance to the right-most end seen so far on this strand
            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:
                # new contig: flush the groups of all strands
                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or
                  (by_name and last_name[strand] and
                   last_name[strand] != bed.name)):
                # gap too large or name changed: flush this strand
                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = list()

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        for strand in sorted(to_join):
            if to_join[strand]:
                yield to_join[strand]

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set([x.name for x in to_join])
            if len(names) > 1:
                c.skipped_inconsistent_intervals += 1
                continue

        if resolve_blocks:
            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1

            # repeat pairwise joining until a pass joins nothing
            merged = True
            while merged:
                joined = []
                not_joined = []
                merged = False

                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1,
                                                      intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []

            to_join = sorted(to_join, key=lambda x: int(x.start))

            # keep only those with the created from the merge of the minimum
            # number of intervals
            for bed in to_join:
                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue
                yield bed
                c.output += 1
        else:
            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            # report the first entry, stretched over the whole group
            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GFF/GTF entries from ``options.stdin`` and writes one
    ``Blat.Match`` line per transcript (GTF input) or joined group (GFF
    input) to ``options.stdout``. The merged intervals of a group are
    laid out back to back on the query and mapped onto their genomic
    position on the sbjct.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--queries-tsv-file", dest="input_filename_queries",
                      type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries. Adjacent exons of a "
                      "transcript will still be merged [default=%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    # forward argv so that callers passing their own argument list are
    # honoured, as promised in the docstring
    (options, args) = E.start(parser, argv=argv, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    # read from options.stdin (not sys.stdin directly) so that input
    # redirection set up by E.start is respected
    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin),
                                  feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(options.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        # lay the merged intervals out back to back on the query
        xstart = 0
        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gffs[0].transcript_id
        entry.mSbjctId = gffs[0].contig
        entry.strand = gffs[0].strand

        # Use the true sequence length where it is known and fall back
        # to the extent of the alignment otherwise, so that both lengths
        # are always set.
        if genome_fasta and entry.mSbjctId in genome_fasta:
            entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
def testArray1(self):
    """fromArray() on a short mask and on its logical inverse."""
    mask = [1, 1, 1, 0, 0, 0, 1, 1, 1]
    inverted = [not value for value in mask]
    self.assertEqual(Intervals.fromArray(mask), [(0, 3), (6, 9)])
    self.assertEqual(Intervals.fromArray(inverted), [(3, 6)])
def transform_overlap(start, end, intervals_with_gff):
    """transform: overlap of the intervals with the window (start, end).

    The first two fields of every item in *intervals_with_gff* are
    merged with ``Intervals.combineIntervals`` and the result of
    ``Intervals.pruneIntervals`` for *start* and *end* is returned.
    """
    ranges = [(item[0], item[1]) for item in intervals_with_gff]
    merged = Intervals.combineIntervals(ranges)
    return Intervals.pruneIntervals(merged, start, end)