def __init__(self, instream, taxon_id=None):
    """Set up the stream state around the given input.

    ``instream`` is handed to ``FlatFileIterator``; ``taxon_id`` is handed
    to ``GOCollection``.  An empty output queue, an empty UniProt mapping
    and the feature-type -> pseudogenic-type table are created as well.
    """
    gt.extended.CustomStream.__init__(self)
    self.iterator = FlatFileIterator(instream)
    self.outqueue = collections.deque()
    self.go_coll = GOCollection(taxon_id)
    self.uniprots = {}

    # Replacement types for features of records flagged as pseudo.
    pseudo_types = {'gene': 'pseudogene'}
    for rna_type in ('mRNA', 'rRNA', 'tRNA'):
        pseudo_types[rna_type] = 'pseudogenic_transcript'
    pseudo_types['exon'] = 'pseudogenic_exon'
    self.typemaps = pseudo_types
def __init__(self, instream, taxon_id=None):
    """Set up the stream state around the given input.

    ``instream`` is handed to ``FlatFileIterator``; ``taxon_id`` is handed
    to ``GOCollection``.  An empty output queue, an empty UniProt mapping
    and the feature-type -> pseudogenic-type table are created as well.
    """
    gt.extended.CustomStream.__init__(self)
    self.iterator = FlatFileIterator(instream)
    self.outqueue = collections.deque()
    self.go_coll = GOCollection(taxon_id)
    self.uniprots = {}

    # Replacement types for features of records flagged as pseudo.
    pseudo_types = {'gene': 'pseudogene'}
    for rna_type in ('mRNA', 'rRNA', 'tRNA'):
        pseudo_types[rna_type] = 'pseudogenic_transcript'
    pseudo_types['exon'] = 'pseudogenic_exon'
    self.typemaps = pseudo_types
class TableInStream(gt.extended.CustomStream):
    """Custom stream that turns flat-file records into feature nodes.

    Every record pulled from the wrapped ``FlatFileIterator`` yields one
    gene node from ``next``.  Depending on the record's ``type`` the gene
    gets an RNA or mRNA child; for protein-coding records a polypeptide
    node is queued and handed out by the following ``next`` call.
    """

    def __init__(self, instream, taxon_id=None):
        """Set up the stream state around the given input.

        ``instream`` is handed to ``FlatFileIterator``; ``taxon_id`` is
        handed to ``GOCollection``.
        """
        gt.extended.CustomStream.__init__(self)
        self.iterator = FlatFileIterator(instream)
        self.outqueue = collections.deque()
        self.go_coll = GOCollection(taxon_id)
        self.uniprots = {}

        # Replacement types for features of records flagged as pseudo.
        pseudo_types = {'gene': 'pseudogene'}
        for rna_type in ('mRNA', 'rRNA', 'tRNA'):
            pseudo_types[rna_type] = 'pseudogenic_transcript'
        pseudo_types['exon'] = 'pseudogenic_exon'
        self.typemaps = pseudo_types

    def get_transcript_length(self, v):
        """Return the summed length of the exons in ``v['Gene Model']``.

        Only entries of type ``exon`` whose start does not exceed their
        end are counted; each contributes ``End - Start + 1``.
        """
        return sum(
            int(part['End']) - int(part['Start']) + 1
            for part in v['Gene Model']
            if int(part['Start']) <= int(part['End'])
            and part['Type'] == 'exon')

    def finaltype(self, v, mtype):
        """Return the feature type to emit for ``mtype`` in record ``v``.

        For records with a truthy ``pseudo`` entry, types listed in
        ``self.typemaps`` are replaced by their pseudogenic counterpart;
        every other type is returned unchanged.
        """
        flagged_pseudo = 'pseudo' in v and v['pseudo']
        if not flagged_pseudo:
            return mtype
        return self.typemaps.get(mtype, mtype)

    def _attach_child(self, parent, v, ftype, start, end):
        """Create a node of (mapped) type ``ftype`` and add it to ``parent``.

        Sequence ID and strand are taken from the record ``v``.
        """
        node = gt.extended.FeatureNode.create_new(
            v['seqid'], self.finaltype(v, ftype), start, end, v['strand'])
        parent.add_child(node)

    def _warn_invalid_range(self, v, part):
        """Report on stderr that a gene model entry has start > end."""
        sys.stderr.write("invalid feature range, skipping " +
                         str(part['Type']) + " " + v['ID'] + "\n")

    def make_noncoding(self, v, gtype, gene):
        """Attach an RNA node of type ``gtype`` (plus parts) to ``gene``.

        The RNA spans the record's ``start``..``stop`` and gets the ID
        ``<record ID>:<gtype>``.  Each valid ``Gene Model`` entry becomes
        a child of the RNA; entries with start > end are reported on
        stderr and skipped.
        """
        rna = gt.extended.FeatureNode.create_new(
            v['seqid'], self.finaltype(v, gtype),
            int(v['start']), int(v['stop']), v['strand'])
        rna.add_attribute("ID", v['ID'] + ":" + gtype)
        gene.add_child(rna)

        if 'Gene Model' not in v:
            return
        for part in v['Gene Model']:
            start = int(part['Start'])
            end = int(part['End'])
            if start <= end:
                self._attach_child(rna, v, part['Type'], start, end)
            else:
                self._warn_invalid_range(v, part)

    def _add_gene_model(self, v, transcript):
        """Add gene model parts, UTRs and CDS segments to ``transcript``.

        ``utr_5`` / ``utr_3`` are read as UTR lengths measured along the
        concatenated exons.  "Left" and "right" refer to the ends of the
        exon list as it is iterated; on the '-' strand the list is
        reversed in place and the 3' UTR is the left one.
        """
        left_key, left_type = 'utr_5', 'five_prime_UTR'
        right_key, right_type = 'utr_3', 'three_prime_UTR'
        if ('utr_5' in v or 'utr_3' in v) and v['strand'] == '-':
            left_key, right_key = right_key, left_key
            left_type, right_type = right_type, left_type

        utr_lengths = {}
        for key in ('utr_5', 'utr_3'):
            if key in v:
                utr_lengths[key] = int(v[key])
        left_len = utr_lengths.get(left_key)
        right_len = utr_lengths.get(right_key)

        if v['strand'] == '-':
            # NOTE: this reverses the record's own list.
            v['Gene Model'].reverse()
        total = self.get_transcript_length(v)

        # Transcript offset at which the right-hand UTR begins.
        right_from = None
        if right_len is not None:
            right_from = total - right_len

        consumed = 0  # exon bases seen before the current exon
        for part in v['Gene Model']:
            start = int(part['Start'])
            end = int(part['End'])
            if start > end:
                self._warn_invalid_range(v, part)
                continue

            self._attach_child(transcript, v, part['Type'], start, end)
            if part['Type'] != 'exon':
                continue

            # Work out which stretch of this exon (if any) is coding.
            after = consumed + (end - start + 1)
            cds_start = start
            cds_end = end

            if left_len is not None:
                if after <= left_len:
                    # exon lies completely inside the left UTR
                    self._attach_child(transcript, v, left_type, start, end)
                    cds_start = None
                elif consumed < left_len:
                    # left UTR ends inside this exon
                    cut = start + (left_len - consumed)
                    self._attach_child(transcript, v, left_type,
                                       start, cut - 1)
                    cds_start = cut

            if right_len is not None and after > right_from:
                if consumed < right_from:
                    # right UTR begins inside this exon
                    cut = start + (right_from - consumed)
                    self._attach_child(transcript, v, right_type, cut, end)
                    cds_end = cut - 1
                else:
                    # exon lies completely inside the right UTR
                    self._attach_child(transcript, v, right_type,
                                       cds_start, end)
                    cds_end = None

            if cds_end and cds_start:
                self._attach_child(transcript, v, 'CDS', cds_start, cds_end)
            consumed = after

    def make_coding(self, v, gene):
        """Attach an mRNA (with parts) to ``gene`` and queue a polypeptide.

        The transcript gets the ID ``<record ID>.1``.  The polypeptide
        spans the record's ``start``..``stop``, derives from the
        transcript and is appended to ``self.outqueue``.  Finally the
        record is passed to the GO collection, if that is truthy.
        """
        transcript = gt.extended.FeatureNode.create_new(
            v['seqid'], self.finaltype(v, "mRNA"),
            int(v['start']), int(v['stop']), v['strand'])
        transcript.add_attribute("ID", v['ID'] + ".1")
        gene.add_child(transcript)

        if 'Gene Model' in v:
            self._add_gene_model(v, transcript)

        # make polypeptide
        polypeptide = gt.extended.FeatureNode.create_new(
            v['seqid'], "polypeptide",
            int(v['start']), int(v['stop']), v['strand'])
        transcript_id = transcript.get_attribute("ID")
        polypeptide.add_attribute("ID", transcript_id + ":pep")
        polypeptide.add_attribute("Derives_from", transcript_id)
        if 'product' in v and v['product'] != "null":
            encoded = urllib.quote(v['product'], safe=' ')
            polypeptide.add_attribute("product", 'term%3D' + encoded)
        self.outqueue.append(polypeptide)

        # register GO terms
        if self.go_coll:
            self.go_coll.add_item(v)

    def next(self):
        """Return the next node, or ``None`` when the input is exhausted.

        Queued nodes are returned first.  Otherwise one record is read
        and converted into a gene node, which is returned.
        """
        if self.outqueue:
            return self.outqueue.popleft()

        try:
            v = self.iterator.next()
        except exceptions.StopIteration:
            return None
        if not v:
            return None

        gene = gt.extended.FeatureNode.create_new(
            v['seqid'], self.finaltype(v, "gene"),
            int(v['start']), int(v['stop']), v['strand'])
        gene.add_attribute("ID", v['ID'])
        if 'name' in v:
            gene.add_attribute("Name", v['name'])

        # track UniProtID -> gene mappings
        # XXX handle multiple transcripts per product once supported by
        # EuPathDB!
        if 'uniprot_id' in v:
            self.uniprots[v['uniprot_id']] = v['ID']

        # non-coding RNA
        rna_match = re.match(r"(.RNA) encoding", v['type'])
        if rna_match:
            self.make_noncoding(v, rna_match.group(1), gene)

        # protein coding gene
        if v['type'] == 'protein coding':
            self.make_coding(v, gene)

        return gene
class TableInStream(gt.extended.CustomStream):
    """Custom stream that turns flat-file records into feature nodes.

    Every record pulled from the wrapped ``FlatFileIterator`` yields one
    gene node from ``next``.  Depending on the record's ``type`` the gene
    gets an RNA or mRNA child; for protein-coding records a polypeptide
    node is queued and handed out by the following ``next`` call.
    """

    def __init__(self, instream, taxon_id=None):
        """Set up the stream state around the given input.

        ``instream`` is handed to ``FlatFileIterator``; ``taxon_id`` is
        handed to ``GOCollection``.
        """
        gt.extended.CustomStream.__init__(self)
        self.iterator = FlatFileIterator(instream)
        self.outqueue = collections.deque()
        self.go_coll = GOCollection(taxon_id)
        self.uniprots = {}

        # Replacement types for features of records flagged as pseudo.
        pseudo_types = {'gene': 'pseudogene'}
        for rna_type in ('mRNA', 'rRNA', 'tRNA'):
            pseudo_types[rna_type] = 'pseudogenic_transcript'
        pseudo_types['exon'] = 'pseudogenic_exon'
        self.typemaps = pseudo_types

    def get_transcript_length(self, v):
        """Return the summed length of the exons in ``v['Gene Model']``.

        Only entries of type ``exon`` whose start does not exceed their
        end are counted; each contributes ``End - Start + 1``.
        """
        return sum(
            int(part['End']) - int(part['Start']) + 1
            for part in v['Gene Model']
            if int(part['Start']) <= int(part['End'])
            and part['Type'] == 'exon')

    def finaltype(self, v, mtype):
        """Return the feature type to emit for ``mtype`` in record ``v``.

        For records with a truthy ``pseudo`` entry, types listed in
        ``self.typemaps`` are replaced by their pseudogenic counterpart;
        every other type is returned unchanged.
        """
        flagged_pseudo = 'pseudo' in v and v['pseudo']
        if not flagged_pseudo:
            return mtype
        return self.typemaps.get(mtype, mtype)

    def _attach_child(self, parent, v, ftype, start, end):
        """Create a node of (mapped) type ``ftype`` and add it to ``parent``.

        Sequence ID and strand are taken from the record ``v``.
        """
        node = gt.extended.FeatureNode.create_new(
            v['seqid'], self.finaltype(v, ftype), start, end, v['strand'])
        parent.add_child(node)

    def _warn_invalid_range(self, v, part):
        """Report on stderr that a gene model entry has start > end."""
        sys.stderr.write("invalid feature range, skipping " +
                         str(part['Type']) + " " + v['ID'] + "\n")

    def make_noncoding(self, v, gtype, gene):
        """Attach an RNA node of type ``gtype`` (plus parts) to ``gene``.

        The RNA spans the record's ``start``..``stop`` and gets the ID
        ``<record ID>:<gtype>``.  Each valid ``Gene Model`` entry becomes
        a child of the RNA; entries with start > end are reported on
        stderr and skipped.
        """
        rna = gt.extended.FeatureNode.create_new(
            v['seqid'], self.finaltype(v, gtype),
            int(v['start']), int(v['stop']), v['strand'])
        rna.add_attribute("ID", v['ID'] + ":" + gtype)
        gene.add_child(rna)

        if 'Gene Model' not in v:
            return
        for part in v['Gene Model']:
            start = int(part['Start'])
            end = int(part['End'])
            if start <= end:
                self._attach_child(rna, v, part['Type'], start, end)
            else:
                self._warn_invalid_range(v, part)

    def _add_gene_model(self, v, transcript):
        """Add gene model parts, UTRs and CDS segments to ``transcript``.

        ``utr_5`` / ``utr_3`` are read as UTR lengths measured along the
        concatenated exons.  "Left" and "right" refer to the ends of the
        exon list as it is iterated; on the '-' strand the list is
        reversed in place and the 3' UTR is the left one.
        """
        left_key, left_type = 'utr_5', 'five_prime_UTR'
        right_key, right_type = 'utr_3', 'three_prime_UTR'
        if ('utr_5' in v or 'utr_3' in v) and v['strand'] == '-':
            left_key, right_key = right_key, left_key
            left_type, right_type = right_type, left_type

        utr_lengths = {}
        for key in ('utr_5', 'utr_3'):
            if key in v:
                utr_lengths[key] = int(v[key])
        left_len = utr_lengths.get(left_key)
        right_len = utr_lengths.get(right_key)

        if v['strand'] == '-':
            # NOTE: this reverses the record's own list.
            v['Gene Model'].reverse()
        total = self.get_transcript_length(v)

        # Transcript offset at which the right-hand UTR begins.
        right_from = None
        if right_len is not None:
            right_from = total - right_len

        consumed = 0  # exon bases seen before the current exon
        for part in v['Gene Model']:
            start = int(part['Start'])
            end = int(part['End'])
            if start > end:
                self._warn_invalid_range(v, part)
                continue

            self._attach_child(transcript, v, part['Type'], start, end)
            if part['Type'] != 'exon':
                continue

            # Work out which stretch of this exon (if any) is coding.
            after = consumed + (end - start + 1)
            cds_start = start
            cds_end = end

            if left_len is not None:
                if after <= left_len:
                    # exon lies completely inside the left UTR
                    self._attach_child(transcript, v, left_type, start, end)
                    cds_start = None
                elif consumed < left_len:
                    # left UTR ends inside this exon
                    cut = start + (left_len - consumed)
                    self._attach_child(transcript, v, left_type,
                                       start, cut - 1)
                    cds_start = cut

            if right_len is not None and after > right_from:
                if consumed < right_from:
                    # right UTR begins inside this exon
                    cut = start + (right_from - consumed)
                    self._attach_child(transcript, v, right_type, cut, end)
                    cds_end = cut - 1
                else:
                    # exon lies completely inside the right UTR
                    self._attach_child(transcript, v, right_type,
                                       cds_start, end)
                    cds_end = None

            if cds_end and cds_start:
                self._attach_child(transcript, v, 'CDS', cds_start, cds_end)
            consumed = after

    def make_coding(self, v, gene):
        """Attach an mRNA (with parts) to ``gene`` and queue a polypeptide.

        The transcript gets the ID ``<record ID>.1``.  The polypeptide
        spans the record's ``start``..``stop``, derives from the
        transcript and is appended to ``self.outqueue``.  Finally the
        record is passed to the GO collection, if that is truthy.
        """
        transcript = gt.extended.FeatureNode.create_new(
            v['seqid'], self.finaltype(v, "mRNA"),
            int(v['start']), int(v['stop']), v['strand'])
        transcript.add_attribute("ID", v['ID'] + ".1")
        gene.add_child(transcript)

        if 'Gene Model' in v:
            self._add_gene_model(v, transcript)

        # make polypeptide
        polypeptide = gt.extended.FeatureNode.create_new(
            v['seqid'], "polypeptide",
            int(v['start']), int(v['stop']), v['strand'])
        transcript_id = transcript.get_attribute("ID")
        polypeptide.add_attribute("ID", transcript_id + ":pep")
        polypeptide.add_attribute("Derives_from", transcript_id)
        if 'product' in v and v['product'] != "null":
            encoded = urllib.quote(v['product'], safe=' ')
            polypeptide.add_attribute("product", 'term%3D' + encoded)
        self.outqueue.append(polypeptide)

        # register GO terms
        if self.go_coll:
            self.go_coll.add_item(v)

    def next(self):
        """Return the next node, or ``None`` when the input is exhausted.

        Queued nodes are returned first.  Otherwise one record is read
        and converted into a gene node, which is returned.
        """
        if self.outqueue:
            return self.outqueue.popleft()

        try:
            v = self.iterator.next()
        except exceptions.StopIteration:
            return None
        if not v:
            return None

        gene = gt.extended.FeatureNode.create_new(
            v['seqid'], self.finaltype(v, "gene"),
            int(v['start']), int(v['stop']), v['strand'])
        gene.add_attribute("ID", v['ID'])
        if 'name' in v:
            gene.add_attribute("Name", v['name'])

        # track UniProtID -> gene mappings
        # XXX handle multiple transcripts per product once supported by
        # EuPathDB!
        if 'uniprot_id' in v:
            self.uniprots[v['uniprot_id']] = v['ID']

        # non-coding RNA
        rna_match = re.match(r"(.RNA) encoding", v['type'])
        if rna_match:
            self.make_noncoding(v, rna_match.group(1), gene)

        # protein coding gene
        if v['type'] == 'protein coding':
            self.make_coding(v, gene)

        return gene