def setUp(self):
    """Write the workspace and segment GTF fixtures for this test case.

    The workspace file receives one 10kb gene every 100kb along ``chr1``.
    The segments file receives short segments (50-150 bases) that start at
    the end of each gene; the step between consecutive segments grows by a
    random amount, so segments are densest right after a gene and thin out
    over the following 100kb.
    """
    AnnotatorDistanceCheck.setUp(self)

    # ``with`` closes the file even if building or writing an entry fails.
    with open(self.workspace, "w") as outfile:
        gene = GTF.Entry()
        gene.contig, gene.strand = "chr1", "+"
        gene.gene_id, gene.transcript_id = "gene1", "trans1"
        # 10kb genes every 100kb for 10Mb
        for x in range(100000, 10000000, 100000):
            gene.gene_id, gene.transcript_id = "gene%i" % x, "trans1"
            gene.start, gene.end = x, x + 10000
            outfile.write(str(gene) + "\n")

    # segments: densest next to the gene, sparser further away
    with open(self.segments, "w") as outfile:
        segment = GTF.Entry()
        segment.contig, segment.strand = "chr1", "+"
        for x in range(110000, 10000000, 100000):
            y = x
            inc = 200
            while y < x + 100000:
                segment.gene_id, segment.transcript_id = "gene%i" % (y), "trans1"
                segment.start, segment.end = y, y + random.randint(50, 150)
                outfile.write(str(segment) + "\n")
                y += inc
                # widen the gap a little more after every segment
                inc += random.randint(0, 100)
def setUp(self):
    """Create a temporary directory holding the workspace and segment GTFs.

    Sets ``self.tmpdir``, ``self.workspace`` and ``self.segments``.  The
    workspace contains one 10kb gene every 100kb along ``chr1``; the
    segments file contains one segment of random length (50-150 bases)
    every 1kb.
    """
    self.tmpdir = tempfile.mkdtemp()
    self.workspace = os.path.join(self.tmpdir, "workspace.gtf")
    self.segments = os.path.join(self.tmpdir, "segments.gtf")

    # ``with`` closes the file even if building or writing an entry fails.
    with open(self.workspace, "w") as outfile:
        gene = GTF.Entry()
        gene.contig, gene.strand = "chr1", "+"
        gene.gene_id, gene.transcript_id = "gene1", "trans1"
        # 10kb genes every 100kb for 10Mb
        for x in range(100000, 10000000, 100000):
            gene.gene_id, gene.transcript_id = "gene%i" % x, "trans1"
            gene.start, gene.end = x, x + 10000
            outfile.write(str(gene) + "\n")

    # segments: uniformly distributed every 1kb with random length
    with open(self.segments, "w") as outfile:
        segment = GTF.Entry()
        segment.contig, segment.strand = "chr1", "+"
        for x in range(0, 10000000, 1000):
            segment.gene_id, segment.transcript_id = "gene%i" % x, "trans1"
            segment.start, segment.end = x, x + random.randint(50, 150)
            outfile.write(str(segment) + "\n")
def targetScanParse(infile, lnc_gtf):
    '''
    Turn targetScan output lines into GTF entries, one per predicted MRE.

    *lnc_gtf* is read first and indexed by transcript id.  The first line
    of *infile* is skipped as a header.  From every following tab-separated
    line the code reads column 0 (target transcript id, surrounding double
    quotes removed), column 1 (miRNA name, prefixed with "mmu-"), columns
    3 and 4 (alignment start/end relative to the target start) and
    column 8 (seed class).

    This is a generator yielding one ``GTF.Entry`` with feature "MRE" per
    data line.  A target id missing from *lnc_gtf* raises ``KeyError``.
    '''
    # index the reference entries; a later entry with the same
    # transcript id replaces an earlier one
    targets = {}
    lnc_file = IOTools.openFile(lnc_gtf)
    for transcript in GTF.transcript_iterator(GTF.iterator(lnc_file)):
        for record in transcript:
            copied = GTF.Entry().copy(record)
            targets[copied.transcript_id] = copied
    lnc_file.close()

    for lineno, raw in enumerate(infile, 1):
        fields = raw.split("\t")
        if lineno <= 1:
            # header line
            continue

        mre = GTF.Entry()
        gene_id = fields[0].strip('"')
        target = targets[gene_id]
        align_start = int(fields[3])
        align_end = int(fields[4])
        size = align_end - align_start
        miRNA = "mmu-%s" % fields[1]
        seed_class = fields[8]

        # place the alignment on the genome relative to the target start
        mre.contig = target.contig
        mre.feature = "MRE"
        mre.start = target.start + align_start
        mre.end = mre.start + size
        mre.source = target.source
        mre.strand = target.strand

        mre.addAttribute('miRNA', miRNA)
        mre.addAttribute('target', gene_id)

        try:
            mre.addAttribute('exon_number',
                             target.asDict()['exon_number'])
        except KeyError:
            E.info("No exon number data in GTF for %s" % gene_id)
            mre.addAttribute('exon_number', '.')

        if target.source != "protein_coding":
            try:
                mre.addAttribute('exon_status',
                                 target.asDict()['exon_status'])
            except KeyError:
                E.info("No exon status data in GTF for %s" % gene_id)
                mre.addAttribute('exon_status', '.')
        else:
            mre.addAttribute('exon_status', "protein_coding")

        location = (mre.contig, mre.start, mre.end)
        mre.transcript_id = "%s_%s:%i-%i" % ((gene_id,) + location)
        mre.gene_id = "%s_%s:%i-%i" % ((miRNA,) + location)
        mre.addAttribute('seed_class', seed_class)

        yield mre
def main(argv=None):
    """Write randomly placed fixed-size fragments of a genome as GFF/GTF.

    Draws ``--sample-size`` intervals of ``--fragment-size`` bases from the
    indexed genome given by ``--genome-file`` and writes one "exon" entry
    with source "random" per interval to ``options.stdout``.  With
    ``--as-gtf`` each entry gets a sequential gene_id/transcript_id built
    from ``pattern_id``.

    NOTE(review): *argv* is accepted but not forwarded to ``E.Start``;
    left unchanged here - confirm whether that is intended.
    """
    parser = E.OptionParser(
        version="%prog version: $Id: fasta2gff.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option("-f", "--fragment-size", dest="fragment_size", type="int",
                      help="fixed size of fragments [default=%default].")

    # The help text used to be a copy of the --fragment-size description.
    parser.add_option("-s", "--sample-size", dest="sample_size", type="int",
                      help="number of fragments to sample [default=%default].")

    parser.set_defaults(
        as_gtf=False,
        genome_file=None,
        fragment_size=1000,
        sample_size=10000,
        pattern_id="%08i",
    )

    (options, args) = E.Start(parser)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    # Both output modes use the same entry type; a single entry is reused
    # for every sampled fragment.
    entry = GTF.Entry()
    entry.feature = "exon"
    entry.source = "random"

    for n in range(options.sample_size):

        entry.contig, entry.strand, entry.start, entry.end = \
            fasta.getRandomCoordinates(options.fragment_size)

        if entry.strand == "-":
            # mirror the interval on the contig for reverse-strand hits
            lcontig = contigs[entry.contig]
            entry.start, entry.end = lcontig - entry.end, lcontig - entry.start

        if options.as_gtf:
            entry.gene_id = options.pattern_id % n
            entry.transcript_id = entry.gene_id

        options.stdout.write(str(entry) + "\n")

    E.Stop()
def annotateExons(iterator, fasta, options):
    """Write every distinct exon of each gene with usage annotation.

    For each gene delivered by ``GTF.gene_iterator`` the distinct
    (start, end) exon intervals are written to ``options.stdout`` sorted
    by start.  Each line carries the attributes ``ntranscripts`` (number
    of transcripts in the gene), ``nused`` (how many transcripts contain
    the exon) and ``pos`` (comma-separated ``rank:total`` positions of the
    exon within those transcripts, counted in transcription direction).
    A summary is written to ``options.stdlog`` when ``options.loglevel``
    is at least 1.  *fasta* is not used.
    """
    ninput, noutput, noverlapping = 0, 0, 0

    for gene in GTF.gene_iterator(iterator):
        ninput += 1
        usage = collections.defaultdict(list)
        ntranscripts = len(gene)
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        for transcript in gene:
            # order exons 5' to 3' (sorted in place)
            transcript.sort(key=lambda x: x.start)
            if is_negative_strand:
                transcript.reverse()
            total = len(transcript)
            for rank, exon in enumerate(transcript, 1):
                usage[(exon.start, exon.end)].append((rank, total))

        # template shared by all output lines of this gene
        first_exon = gene[0][0]
        template = GTF.Entry()
        template.fromGTF(first_exon, first_exon.gene_id, first_exon.gene_id)
        template.addAttribute("ntranscripts", ntranscripts)

        records = []
        for (start, end), positions in usage.items():
            record = GTF.Entry().copy(template)
            record.start, record.end = start, end
            record.addAttribute("nused", len(positions))
            record.addAttribute(
                "pos", ",".join(["%i:%i" % p for p in positions]))
            records.append(record)

        records.sort(key=lambda x: x.start)

        for record in records:
            options.stdout.write("%s\n" % str(record))

        # a gene counts as overlapping if merging its exons reduces
        # the number of intervals
        spans = [(record.start, record.end) for record in records]
        if len(Intervals.combine(spans)) != len(spans):
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" %
            (ninput, noutput, noverlapping))
def UTR5(transcript):
    """Return the 5' UTR parts of *transcript* as a list of exon entries.

    The exon ranges are truncated by the CDS ranges; of the remaining
    pieces those lying before the first CDS range ("+" strand) or after
    the last CDS range ("-" strand) are kept.  Each piece is returned as
    a copy of the transcript's first "exon" entry with adjusted
    coordinates.  An empty list is returned if there is no CDS.
    """
    exon_ranges = GTF.asRanges(transcript, "exon")
    cds_ranges = GTF.asRanges(transcript, "CDS")
    noncoding = Intervals.truncate(exon_ranges, cds_ranges)

    if len(cds_ranges) == 0:
        return list()

    if transcript[0].strand == "-":
        # reverse strand: the 5' end lies beyond the last CDS range
        cds_end = cds_ranges[-1][1]
        utr5 = [piece for piece in noncoding if piece[0] >= cds_end]
    else:
        cds_start = cds_ranges[0][0]
        utr5 = [piece for piece in noncoding if piece[-1] <= cds_start]

    # first exon entry serves as template for the returned entries
    for candidate in transcript:
        if candidate.feature == "exon":
            template_exon = candidate
            break

    returned_exons = []
    for piece in utr5:
        entry = GTF.Entry().fromGTF(template_exon)
        entry.start = piece[0]
        entry.end = piece[1]
        returned_exons.append(entry)

    return returned_exons
def writeGFF(blocks, first, filename):
    """Write one "synteny" entry per block to *filename*.

    If *first* is true, the coordinates of the first genome in each block
    are used (``contig1``, ``mFrom1``, ``mTo1``), otherwise those of the
    second genome.  The block id is stored in the entry's info field as
    ``Block=<id>``.  A progress message is written to the module-level
    ``outfile``.
    """
    outfile.write("writing gff entries to %s\n" % filename)

    entry = GTF.Entry()
    entry.source = "gpipe"
    entry.feature = "synteny"

    # ``with`` closes the file even if formatting an entry fails.
    with open(filename, "w") as outfile_gff:
        for block in blocks:
            if first:
                entry.name = block.contig1
                entry.start = block.mFrom1
                entry.end = block.mTo1
            else:
                entry.name = block.contig2
                entry.start = block.mFrom2
                entry.end = block.mTo2
            entry.info = "Block=%i" % block.mBlockId
            outfile_gff.write(str(entry) + "\n")
def update(self, bed):
    """Process *bed* by passing it on as a single-exon GTF list.

    The bed interval is converted to a ``GTF.Entry`` with feature 'exon'
    and handed to ``gtf2table.Classifier.update`` as a one-element list.
    """
    exon = GTF.Entry()
    exon.fromBed(bed)
    exon.feature = 'exon'
    gtf2table.Classifier.update(self, [exon])
def setUp(self):
    """Write a small workspace: three genes on ``chr1`` with two 1kb exons each.

    gene1 and gene2 are on the "+" strand, gene3 on the "-" strand; all
    use the transcript id "trans1".
    """
    AnnotatorDistanceCheck.setUp(self)

    # (strand, gene_id, exon intervals)
    genes = (
        ("+", "gene1", ((0, 1000), (3000, 4000))),
        ("+", "gene2", ((10000, 11000), (13000, 14000))),
        ("-", "gene3", ((20000, 21000), (23000, 24000))),
    )

    e = GTF.Entry()
    # ``with`` closes the file even if building or writing an entry fails.
    with open(self.workspace, "w") as outfile:
        for strand, gene_id, exons in genes:
            e.contig, e.strand, e.gene_id, e.transcript_id = \
                "chr1", strand, gene_id, "trans1"
            for start, end in exons:
                e.start, e.end = start, end
                outfile.write(str(e) + "\n")
def annotateRegulons(iterator, fasta, tss, options):
    """Write a regulon interval around the start or end of each transcript.

    If *tss* is true, the window is placed around the transcription start,
    otherwise around the transcription end.  The window reaches
    ``options.upstream`` bases upstream and ``options.downstream`` bases
    downstream (in transcription direction) and is clipped to
    ``[0, contig length]``.  With ``options.merge_promotors`` the windows
    of a gene are merged and numbered from 1.  Entries are written to
    ``options.stdout`` with source "regulon"; a summary is logged through
    ``E.info``.
    """
    ngenes, ntranscripts, nregulons = 0, 0, 0
    upstream, downstream = options.upstream, options.downstream

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        first_entry = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(first_entry.strand)
        lcontig = fasta.getLength(first_entry.contig)

        regulons = []
        transcript_ids = []
        for transcript in gene:
            ntranscripts += 1
            mi = min(x.start for x in transcript)
            ma = max(x.end for x in transcript)

            # the start of a reverse-strand transcript and the end of a
            # forward-strand transcript are both at the largest coordinate
            if bool(tss) == bool(is_negative_strand):
                anchor = ma
            else:
                anchor = mi

            if is_negative_strand:
                left, right = anchor - downstream, anchor + upstream
            else:
                left, right = anchor - upstream, anchor + downstream

            # clip to the contig
            regulons.append((min(lcontig, max(0, left)),
                             min(lcontig, max(0, right))))
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % (x + 1) for x in range(len(regulons))]

        gtf = GTF.Entry()
        gtf.fromGTF(first_entry, first_entry.gene_id, first_entry.gene_id)
        gtf.source = "regulon"

        for (start, end), transcript_id in zip(regulons, transcript_ids):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
def update(self, bed):
    """Process *bed* by passing it on as a single-exon GTF list.

    The bed interval is converted to a ``GTF.Entry`` with feature 'exon'
    and handed to ``GeneModelAnalysis.Classifier.update`` as a
    one-element list.
    """
    exon = GTF.Entry()
    exon.fromBed(bed)
    exon.feature = 'exon'
    GeneModelAnalysis.Classifier.update(self, [exon])
def setUp(self):
    """Build a synthetic multi-exon gene on ``chr1`` plus an indexed fasta.

    Creates ``self.mNExons`` exons of ``self.mExonSize`` bases separated by
    introns of ``self.mIntronSize`` bases, starting at ``self.mOffset``.
    The exonic sequence is "123" repeated, introns are "GT" + "T"s + "AG",
    and the gene is flanked by ``self.mOffset`` "C"s before and "G"s after.

    ``self.mSplitCodonsPrev`` / ``self.mSplitCodonsNext`` map positions of
    codons split by an intron to the coordinate on the other side of that
    intron.  The sequence is stored in ``self.mSequence`` and indexed into
    ``self.mFasta``.
    """
    self.mExons = []
    self.mSplitCodonsNext = {}
    self.mSplitCodonsPrev = {}
    self.mSpliceSize = 4
    self.mExonSize = 100
    self.mIntronSize = 900
    self.strand = "+"
    self.mNExons = 9
    self.mOffset = 1000
    self.frame = 0
    self.mIncrement = self.mIntronSize + self.mExonSize

    # spliced sequence first; introns are inserted while exons are built
    bases = list("123" * int((self.mNExons * self.mExonSize) / 3))

    coding_length = 0
    exon_start = self.mOffset
    for exon_id in range(1, self.mNExons + 1):
        exon = GTF.Entry()
        exon.contig, exon.strand = "chr1", "+"
        exon.gene_id, exon.transcript_id = "gene1", "trans1"
        exon.start, exon.end = exon_start, exon_start + self.mExonSize
        exon.frame = (3 - (coding_length % 3)) % 3
        coding_length += exon.end - exon.start
        self.mExons.append(exon)

        if exon.frame != 0:
            # the codon at the start of this exon is split by the
            # preceding intron
            previous_end = exon_start - self.mIntronSize
            for y in range(0, exon.frame):
                self.mSplitCodonsPrev[exon_start + y] = previous_end
            for y in range(0, 3 - exon.frame):
                self.mSplitCodonsNext[previous_end - y - 1] = exon_start

        if exon_id < self.mNExons:
            # insert the intron that follows this exon
            p = exon_id * self.mExonSize + self.mIntronSize * (exon_id - 1)
            bases[p:p] = list("GT" + "T" * (self.mIntronSize - 4) + "AG")

        exon_start += self.mIncrement

    # flanking sequence on both sides of the gene
    bases[0:0] = "C" * self.mOffset
    bases.append("G" * self.mOffset)

    # only the (now unused) file name is needed for the database
    tmpfile = tempfile.NamedTemporaryFile()
    tmpfile.close()

    sequence = "".join(bases)
    self.mSequence = sequence
    self.contigSize = len(sequence)
    IndexedFasta.createDatabase(tmpfile.name, iter([("chr1", sequence), ]))
    self.mFasta = IndexedFasta.IndexedFasta(tmpfile.name)
def _add(interval, anno):
    """Append an entry of feature type *anno* covering *interval* to ``results``.

    Contig, gene_id, transcript_id and strand are taken from the first
    entry of ``transcript``; both ``transcript`` and ``results`` come from
    the enclosing scope.
    """
    head = transcript[0]
    entry = GTF.Entry()
    for field in ("contig", "gene_id", "transcript_id", "strand"):
        setattr(entry, field, getattr(head, field))
    entry.feature = anno
    entry.start, entry.end = interval
    results.append(entry)
def test_entry(frame, strand, xfrom, xto, start, end, ref):
    """Run ``transform_third_codon`` on one entry and report a mismatch.

    Builds an entry with the given *frame*, *strand* and extent
    (*xfrom*, *xto*), transforms the window *start*-*end* and compares the
    result with *ref*.  On a mismatch the expected and observed values
    are printed.
    """
    entry = GTF.Entry()
    entry.frame = frame
    entry.strand = strand
    entry.start = xfrom
    entry.end = xto

    intervals = transform_third_codon(start, end, [(xfrom, xto, entry)])
    if ref != intervals:
        # Show both sides; printing the comparison itself was always True.
        print("failed:", ref, "!=", intervals)
def annotateTTS(iterator, fasta, options):
    """Write an interval of ``options.promotor`` bases beyond each transcript end.

    For forward-strand genes the interval follows the largest exon end,
    for reverse-strand genes it precedes the smallest exon start; both are
    limited by the contig boundaries.  With ``options.merge_promotors``
    the intervals of a gene are merged and numbered from 1.  Entries are
    written to ``options.stdout`` with source "tts"; a summary goes to
    ``options.stdlog`` when ``options.loglevel`` is at least 1.
    """
    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        first_entry = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(first_entry.strand)
        lcontig = fasta.getLength(first_entry.contig)

        tts = []
        transcript_ids = []
        for transcript in gene:
            ntranscripts += 1
            mi = min(x.start for x in transcript)
            ma = max(x.end for x in transcript)
            transcript_ids.append(transcript[0].transcript_id)

            # if the tts is directly at the start/end of the contig, it
            # will be within an exon. otherwise, it is outside an exon.
            if is_negative_strand:
                window = (max(0, mi - options.promotor),
                          max(options.promotor, mi))
            else:
                window = (min(ma, lcontig - options.promotor),
                          min(lcontig, ma + options.promotor))
            tts.append(window)

        if options.merge_promotors:
            # merge the intervals (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(first_entry, first_entry.gene_id, first_entry.gene_id)
        gtf.source = "tts"

        for (start, end), transcript_id in zip(tts, transcript_ids):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
def CDS(transcript):
    """Return copies of the CDS entries of *transcript* relabelled as exons.

    The result is an empty list if the transcript has no "CDS" entries.
    """
    relabelled = []
    for part in transcript:
        if part.feature == "CDS":
            relabelled.append(GTF.Entry().fromGTF(part))

    for entry in relabelled:
        entry.feature = "exon"

    return relabelled
def setUp(self):
    """Write workspace genes with growing gaps and segments filling the gaps.

    10kb genes are placed along ``chr1``; the intergenic gap after each
    gene grows by a random amount.  Each gap is filled with short segments
    (50-150 bases) whose spacing also grows by random increments.
    """
    AnnotatorDistanceCheck.setUp(self)

    w = GTF.Entry()
    w.contig, w.strand, w.gene_id, w.transcript_id = \
        "chr1", "+", "gene1", "trans1"
    e = GTF.Entry()
    e.contig, e.strand = "chr1", "+"

    # ``with`` closes both files even if writing an entry fails.
    with open(self.workspace, "w") as work_outfile, \
            open(self.segments, "w") as segs_outfile:
        # 10kb genes, size of intergenic space grows by random increment
        x = 0
        w_inc = 0
        while x < 10000000:
            # The gene goes into the workspace entry ``w``.  It used to be
            # written through the segment entry ``e``, so workspace lines
            # carried missing or stale segment ids.
            w.gene_id, w.transcript_id = "gene%i" % x, "trans1"
            w.start, w.end = x, x + 10000
            work_outfile.write(str(w) + "\n")
            x += 10000
            w_inc += random.randint(0, 10000)
            end = x + w_inc

            # fill the gap [x, end) with segments
            y = x
            s_inc = 0
            while y < end:
                e.gene_id, e.transcript_id = "gene%i" % (y), "trans1"
                e.start, e.end = y, y + random.randint(50, 150)
                segs_outfile.write(str(e) + "\n")
                # NOTE(review): the step starts at 0, so the first
                # position can be written more than once - confirm this
                # is intended.
                y += s_inc
                s_inc += random.randint(0, 100)
            x = end
def process(self, contig, start, end, reads, qualities):
    """Write one exon entry for the interval *contig*:*start*-*end*.

    The entry has source "maq" and the mean of *reads* (formatted with
    "%5.2f") as score; it is written to ``self.mOutFile``.  *qualities*
    is not used.
    """
    # NOTE(review): ``id`` is not assigned in this method; unless the
    # module defines a global of that name this is the builtin ``id``
    # function - confirm which identifier was intended.
    identifier = self.mIdFormat % id

    entry = GTF.Entry()
    entry.contig = contig
    entry.start, entry.end = start, end
    entry.gene_id = identifier
    entry.transcript_id = entry.gene_id
    entry.feature = "exon"
    entry.source = "maq"

    summary = Stats.Summary(reads)
    entry.score = "%5.2f" % summary['mean']

    self.mOutFile.write(str(entry) + "\n")
def buildRepeatTrack(infile, outfile):
    '''build a repeat track as negative control.

    Counts the entries in the gzipped GFF file *infile*, draws a random
    sample of ``PARAMS["ancestral_repeats_samplesize"]`` entry indices
    and writes the sampled entries as GTF to the gzipped *outfile*.  The
    zero-padded entry index is used as both gene_id and transcript_id.

    ``random.sample`` raises ``ValueError`` if the requested sample size
    exceeds the number of entries.
    '''
    # first pass: count the entries; the handle is closed afterwards
    with gzip.open(infile, "r") as inf:
        nrepeats = sum(1 for _ in GFF.iterator(inf))

    sample = set(random.sample(xrange(nrepeats),
                               PARAMS["ancestral_repeats_samplesize"]))

    gtf = GTF.Entry()
    # second pass: write the sampled entries; both handles are closed
    # even if the conversion fails part-way
    with gzip.open(infile, "r") as inf, gzip.open(outfile, "w") as outf:
        for x, gff in enumerate(GFF.iterator(inf)):
            if x not in sample:
                continue
            gtf.fromGFF(gff, "%08i" % x, "%08i" % x)
            outf.write("%s\n" % str(gtf))

    E.debug("created sample of %i repeats out of %i in %s" %
            (len(sample), nrepeats, outfile))
def tts(transcript, upstream=500, downstream=500):
    """Return a one-element list with the window around the transcript end.

    The window reaches *upstream* bases into the transcript and
    *downstream* bases beyond it, measured from the largest exon end on
    the "+" strand and from the smallest exon start otherwise.  The entry
    is a copy of the first "exon" entry with adjusted coordinates.
    """
    exon_entries = [part for part in transcript if part.feature == "exon"]
    template = exon_entries[0]

    if template.strand == "+":
        site = max(part.end for part in exon_entries)
        start = site - upstream
        end = start + upstream + downstream
    else:
        site = min(part.start for part in exon_entries)
        end = site + upstream
        start = end - upstream - downstream

    window = GTF.Entry().fromGTF(template)
    window.start = start
    window.end = end
    return [window]
def convert_set(gffs, gene_pattern, transcript_pattern, options):
    ''' creates the gene_id and transcript_id fields from a string format
    pattern using fields of the gff.

    *gene_pattern* and *transcript_pattern* are %-format strings that are
    filled from ``gff.asDict()``.  Each entry of *gffs* is updated in
    place, copied into a ``GTF.Entry`` and written to ``options.stdout``.
    A "Parent" attribute, if present, is joined with commas.
    '''
    for gff in gffs:
        gff.gene_id = str(gene_pattern) % gff.asDict()
        # transcript_id used to be built from gene_pattern, which left
        # transcript_pattern without any effect
        gff.transcript_id = str(transcript_pattern) % gff.asDict()

        gtf_entry = GTF.Entry()
        gtf_entry.copy(gff)
        if "Parent" in gtf_entry:
            gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

        options.stdout.write(str(gtf_entry) + "\n")
def flank3(transcript, length=500):
    """Return a one-element list with the *length* bases flanking the transcript end.

    On the "+" strand the flank starts at the largest exon end; otherwise
    it ends at the smallest exon start.  The entry is a copy of the first
    "exon" entry with adjusted coordinates.
    """
    exon_entries = [part for part in transcript if part.feature == "exon"]
    template = exon_entries[0]

    if template.strand == "+":
        start = max(part.end for part in exon_entries)
        end = start + length
    else:
        end = min(part.start for part in exon_entries)
        start = end - length

    flank = GTF.Entry().fromGTF(template)
    flank.start = start
    flank.end = end
    return [flank]
def filterMREsTSV(input_file, filter_set):
    '''
    Filter MREs in a GTF-formatted file based on a collection of miRNA IDs.

    Yields a copy of every entry in *input_file* whose 'miRNA' attribute
    is contained in *filter_set*.  An entry without a 'miRNA' attribute
    raises ``KeyError``.

    Return a generator object.
    '''
    mre_file = IOTools.openFile(input_file, "rb")
    # try/finally closes the file even when the consumer stops iterating
    # early or an entry fails to parse
    try:
        for transcript in GTF.transcript_iterator(GTF.iterator(mre_file)):
            for mre in transcript:
                if mre.asDict()['miRNA'] in filter_set:
                    entry = GTF.Entry()
                    entry.copy(mre)
                    yield entry
    finally:
        mre_file.close()
def findRetainedIntrons(infile, outfile):
    """Write introns of a gene that another of its transcripts retains.

    Every ordered pair (first, second) of transcripts of a gene is
    compared.  If *first* has introns that *second* lacks while *second*
    has no intron that *first* lacks, the introns unique to *first* that
    lie strictly inside the exon span of *second* are collected.  The
    collected intervals are merged per gene and written to *outfile* as
    "exon" entries copied from the gene's first entry.
    """
    # ``with`` makes sure the output is flushed and closed; it used to be
    # left open.
    with IOTools.openFile(outfile, "w") as outf, \
            IOTools.openFile(infile) as inf:

        for gene in GTF.gene_iterator(GTF.iterator(inf)):

            introns_out = []

            # now find if any of the transcripts are retained intron
            # versions of any of the others
            for first, second in itertools.product(gene, gene):

                first = sorted(
                    [entry for entry in first if entry.feature == "exon"],
                    key=lambda x: x.start)
                second = sorted(
                    [entry for entry in second if entry.feature == "exon"],
                    key=lambda x: x.start)

                first_introns = set(GTF.toIntronIntervals(first))
                second_introns = set(GTF.toIntronIntervals(second))

                novel_introns = first_introns - second_introns
                if not novel_introns or second_introns - first_introns:
                    continue

                # keep introns strictly inside the span of ``second``; a
                # list (not ``filter``) so this also works on Python 3
                introns_out.extend(
                    intron for intron in novel_introns
                    if intron[0] > second[0].start and
                    intron[1] < second[-1].end)

            introns_out = Intervals.combine(introns_out)

            # the gene's first entry is relabelled and used as template
            template = gene[0][0]
            template.feature = "exon"

            for intron in introns_out:
                entry = GTF.Entry().copy(template)
                entry.start = intron[0]
                entry.end = intron[1]
                outf.write("%s\n" % str(entry))
def introns(transcript):
    """Return the introns of *transcript* as a list of entries.

    The intervals come from ``GTF.toIntronIntervals``; each is returned as
    a copy of the transcript's first "exon" entry with the intron
    coordinates.
    """
    intron_intervals = GTF.toIntronIntervals(transcript)

    # first exon entry serves as template for the returned entries
    for candidate in transcript:
        if candidate.feature == "exon":
            template_exon = candidate
            break

    returned_exons = []
    for interval in intron_intervals:
        entry = GTF.Entry().fromGTF(template_exon)
        entry.start = interval[0]
        entry.end = interval[1]
        returned_exons.append(entry)

    return returned_exons
def filter_overlapping_genes(infile, outfile):
    '''Filter out exons that overlap with exons from another gene.

    Runs a shell pipeline through ``P.run()`` and then rewrites its
    result to *outfile*, storing the 1-based position of each exon within
    its transcript in the "exon_id" attribute.
    '''
    # NOTE: the local names ``infile``, ``outfile``, ``tmp1``, ``tmp2``
    # and ``statement`` appear as %(name)s placeholders in the command,
    # so they must keep these names (presumably P.run() fills them in
    # from this frame - confirm).
    tmp1 = P.getTempFilename()
    tmp2 = P.getTempFilename(shared=True)

    # The first command in the statement truncates exons that overlap
    # on opposite strands, the second exons that overlap on the same
    # strand. The first part of the second command identifies exons
    # that overlap on the same strand, the second part removes them
    # from the geneset.
    statement = (
        " bedtools subtract -a %(infile)s -b %(infile)s -S > %(tmp1)s;"
        " checkpoint;"
        " bedtools merge -i <( sort -k1,1 -k4,4n %(tmp1)s)"
        " -c 6 -o count -d -2"
        " | awk '$4>1'"
        " | bedtools subtract -a %(tmp1)s -b stdin"
        " | python %(scriptsdir)s/gtf2gtf.py"
        " --method=set-transcript-to-gene -L %(outfile)s.log"
        " | python %(scriptsdir)s/gtf2gtf.py --method=sort"
        " -L %(outfile)s.log"
        " | gzip > %(tmp2)s;"
        " checkpoint;"
        " rm %(tmp1)s")

    P.run()

    # renumber exons as new exons have probably been created.
    with IOTools.openFile(outfile, "w") as outf:
        transcripts = GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(tmp2)))
        for transcript in transcripts:
            for position, original in enumerate(transcript, 1):
                renumbered = GTF.Entry().fromGTF(original)
                renumbered["exon_id"] = int(position)
                outf.write(str(renumbered) + "\n")

    os.unlink(tmp2)
def setUp(self):
    """Write a workspace of two genes with three 100-base transcripts each.

    Genes start 10kb apart, the transcripts of a gene 1kb apart.  The
    strand alternates between genes, starting with "+".
    """
    AnnotatorDistanceCheck.setUp(self)

    e = GTF.Entry()
    e.contig, e.strand = "chr1", "+"

    inc, size = 1000, 100

    # ``with`` closes the file even if building or writing an entry fails.
    with open(self.workspace, "w") as outfile:
        for x in range(0, 2):
            start = x * 10000
            for y in range(0, 3):
                e.gene_id, e.transcript_id = \
                    "gene_%i" % x, "transcript_%i" % y
                e.start, e.end = start, start + size
                outfile.write(str(e) + "\n")
                start += inc
            # the next gene goes on the opposite strand
            e.strand = "-" if e.strand == "+" else "+"
def addSegment(feature, start, end, template, options):
    """add a generic segment of type *feature*.

    Writes an entry spanning *start*-*end* to ``options.stdout``.  The
    entry is copied from *template* with all attributes cleared.  If
    *template* is a tuple, its first element is the template and the
    gene_id of its second element is stored in the attribute
    "downstream_gene_id".  For features other than exon/CDS/UTR/UTR3/UTR5
    the score is set to ".".

    Returns 1 if a segment was written and 0 if the interval is empty
    (``start >= end``).
    """
    if start >= end:
        return 0

    entry = GTF.Entry()

    # isinstance works on Python 2 and 3 (types.TupleType is gone in 3)
    # and also accepts tuple subclasses
    if isinstance(template, tuple):
        entry.copy(template[0])
        entry.clearAttributes()
        entry.addAttribute("downstream_gene_id", template[1].gene_id)
    else:
        entry.copy(template)
        entry.clearAttributes()

    entry.start, entry.end = start, end
    entry.feature = feature
    if feature not in ("exon", "CDS", "UTR", "UTR3", "UTR5"):
        entry.score = "."

    options.stdout.write(str(entry) + "\n")

    return 1
def exportSegments(infiles, outfile):
    """Export the segments assigned to a track as GTF.

    The track name is *outfile* without its ".gtf" suffix and is used as
    a column of the ``assignments`` table.  All distinct segments whose
    gene has a true value in that column are written to *outfile*.
    Attributes are not exported.  *infiles* is not used.
    """
    track = outfile[:-len(".gtf")]

    # NOTE: the track is a column name and therefore cannot be bound as
    # an SQL parameter; it is derived from the output file name.
    # ignores the attributes
    statement = """SELECT DISTINCT contig, end, feature, frame, s.gene_id,
                          score, source, start, strand, transcript_id
                   FROM segments AS s, assignments AS a
                   WHERE a.gene_id = s.gene_id and a.%(track)s""" % {
        "track": track}

    dbhandle = sqlite3.connect(PARAMS["database"])
    # try/finally and ``with`` release the connection and the output
    # file even if the query or the conversion fails
    try:
        cc = dbhandle.cursor()
        cc.execute(statement)
        with open(outfile, "w") as outf:
            for row in cc:
                gtf = GTF.Entry()
                (gtf.contig, gtf.end, gtf.feature, gtf.frame, gtf.gene_id,
                 gtf.score, gtf.source, gtf.start, gtf.strand,
                 gtf.transcript_id) = row
                outf.write(str(gtf) + "\n")
    finally:
        dbhandle.close()
def main(): ''' main function ''' parser = E.OptionParser( version= "%prog version: $Id: gtf2tsv.py 2887 2010-04-07 08:48:04Z andreas $", usage=globals()["__doc__"]) parser.add_option( "-o", "--only-attributes", dest="only_attributes", action="store_true", help="output attributes as separate columns [default=%default].") parser.add_option( "-f", "--full", dest="full", action="store_true", help="output attributes as separate columns [default=%default].") parser.add_option( "-i", "--invert", dest="invert", action="store_true", help="convert tab-separated table back to gtf [default=%default].") parser.add_option( "-m", "--map", dest="map", type="choice", choices=("transcript2gene", "peptide2gene", "peptide2transcript"), help="output a map mapping transcripts to genes [default=%default].") parser.set_defaults( only_attributes=False, full=False, invert=False, map=None, ) (options, args) = E.Start(parser) if options.full: # output full table with column for each attribute attributes = set() data = [] for gtf in GTF.iterator(options.stdin): data.append(gtf) attributes = attributes.union(set(gtf.keys())) # remove gene_id and transcript_id, as they are used # explicitely later attributes.difference_update(["gene_id", "transcript_id"]) attributes = sorted(list(attributes)) if options.only_attributes: header = ["gene_id", "transcript_id"] + attributes else: header = [ "contig", "source", "feature", "start", "end", "score", "strand", "frame", "gene_id", "transcript_id", ] + attributes options.stdout.write("\t".join(header) + "\n") if options.only_attributes: for gtf in data: options.stdout.write("\t".join( map(str, ( gtf.gene_id, gtf.transcript_id, )))) for a in attributes: if a in ("gene_id", "transcript_id"): continue try: val = getattr(gtf, a) except AttributeError: val = "" options.stdout.write("\t%s" % val) options.stdout.write("\n") else: for gtf in data: options.stdout.write("\t".join( map(str, ( gtf.contig, gtf.source, gtf.feature, gtf.start, gtf.end, gtf.score, 
gtf.strand, gtf.frame, gtf.gene_id, gtf.transcript_id, )))) for a in attributes: try: val = getattr(gtf, a) except AttributeError: val = "" options.stdout.write("\t%s" % val) options.stdout.write("\n") elif options.invert: gtf = GTF.Entry() header = None for line in options.stdin: if line.startswith("#"): continue data = line[:-1].split("\t") if not header: header = data map_header2column = dict([(y, x) for x, y in enumerate(header)]) continue # fill gtf entry with data try: gtf.contig = data[map_header2column["contig"]] gtf.source = data[map_header2column["source"]] gtf.feature = data[map_header2column["feature"]] # subtract -1 to start for 0-based coordinates gtf.start = int(data[map_header2column["start"]]) gtf.end = int(data[map_header2column["end"]]) gtf.score = data[map_header2column["score"]] gtf.strand = data[map_header2column["strand"]] gtf.frame = data[map_header2column["frame"]] gtf.gene_id = data[map_header2column["gene_id"]] gtf.transcript_id = data[map_header2column["transcript_id"]] gtf.parseInfo(data[map_header2column["attributes"]], line) except KeyError, msg: raise KeyError("incomplete entry %s: %s: %s" % (str(data), str(map_header2column), msg)) # output gtf entry in gtf format options.stdout.write("%s\n" % str(gtf))