def makeGene(self, root):
    """Build a Gene from a parsed feature node and attach its transcripts.

    A new Gene is created, stored back into ``root["object"]`` and given
    the ID found in ``root["extra"]["ID"]``. Every child node is passed
    through ``self.labelStructure``; each Transcript that comes back is
    added to the gene and linked back to it through its ``gene`` and
    ``geneId`` attributes. Finally the entries of ``root["extra"]`` are
    serialized into ``gene.extraFields`` as consecutive ``key=value;``
    items.

    Args:
        root: mapping describing the gene node. It is indexed with
            ``"extra"`` (which in turn is indexed with ``"ID"``) and may
            carry a ``"children"`` entry. Keys and values of ``"extra"``
            are concatenated with ``+``, so they have to be strings.

    Returns:
        The newly created Gene.
    """
    gene = Gene()
    root["object"] = gene
    gene.ID = root["extra"]["ID"]

    children = root.get("children")
    if children is None:
        # NOTE(review): a node without a "children" entry returns here, so
        # extraFields is not filled in for it. Kept as in the original;
        # confirm whether that is intended.
        return gene

    for child in children:
        obj = self.labelStructure(child)
        # isinstance() is False for None, so unlabelled children are
        # skipped by the same test.
        if isinstance(obj, Transcript):
            gene.addTranscript(obj)
            obj.gene = gene
            obj.geneId = gene.getId()

    extra = root["extra"]
    # Build the string in one pass instead of repeated += concatenation.
    gene.extraFields = "".join(
        key + "=" + extra[key] + ";" for key in extra
    )
    return gene
def makeGene(self, root):
    """Build a Gene from a parsed feature node and attach its transcripts.

    A new Gene is created, stored back into ``root["object"]`` and given
    the ID found in ``root["extra"]["ID"]``. Every child node is passed
    through ``self.labelStructure``; each Transcript that comes back is
    added to the gene and linked back to it through its ``gene`` and
    ``geneId`` attributes. Finally the entries of ``root["extra"]`` are
    serialized into ``gene.extraFields`` as consecutive ``key=value;``
    items.

    Args:
        root: mapping describing the gene node. It is indexed with
            ``"extra"`` (which in turn is indexed with ``"ID"``) and may
            carry a ``"children"`` entry. Keys and values of ``"extra"``
            are concatenated with ``+``, so they have to be strings.

    Returns:
        The newly created Gene.
    """
    gene = Gene()
    root["object"] = gene
    gene.ID = root["extra"]["ID"]

    children = root.get("children")
    if children is None:
        # NOTE(review): a node without a "children" entry returns here, so
        # extraFields is not filled in for it. Kept as in the original;
        # confirm whether that is intended.
        return gene

    for child in children:
        obj = self.labelStructure(child)
        # isinstance() is False for None, so unlabelled children are
        # skipped by the same test.
        if isinstance(obj, Transcript):
            gene.addTranscript(obj)
            obj.gene = gene
            obj.geneId = gene.getId()

    extra = root["extra"]
    # Build the string in one pass instead of repeated += concatenation.
    gene.extraFields = "".join(
        key + "=" + extra[key] + ";" for key in extra
    )
    return gene
def assembleTranscriptome(self):
    """Build Gene and Transcript objects from the collected lookup tables.

    For every entry of ``self.uniqGeneSet`` (unpacked as ``geneId,
    chromosome, strand``) the exon and CDS tuples of all its transcripts
    are merged, sorted and handed to a new Gene, which is appended to
    ``self.geneList``. Each transcript of that gene is then created with
    index arrays that point into the gene-level exon / CDS lists and is
    attached through ``gene.addTranscript``.

    Raises:
        Exception: if a transcript has a different number of coding exons
            and coding frames.
    """
    for uniqGene in self.uniqGeneSet:
        geneId, chromosome, strand = uniqGene
        geneNames = list(self.uniqGene_to_names[uniqGene])
        geneType = self.uniqGene_to_source[uniqGene]
        transcriptIds = self.uniqGene_to_transcriptIds[uniqGene]

        # Merge the exon / CDS tuples of all transcripts of this gene.
        mergedExons = set()
        mergedCds = set()
        for transcriptId in transcriptIds:
            mergedExons |= self.transcriptId_to_exons[transcriptId]
            mergedCds |= self.transcriptId_to_cds[transcriptId]

        # GTF files are usually sorted, but this can't be assumed.
        # NOTE(review): a falsy strand reverses the order; presumably that
        # is the minus strand - confirm against the parser.
        geneExons = sorted(mergedExons, reverse=not strand)
        geneCds = sorted(mergedCds, reverse=not strand)

        gene = Gene(geneId, chromosome, strand, geneType, geneNames,
                    geneExons, geneCds)
        self.geneList.append(gene)

        # Position of every exon / CDS tuple inside the gene-level lists.
        # enumerate() replaces zip(..., xrange(1000000)): no arbitrary cap
        # and no dependency on the Python 2-only xrange.
        exonIndexOf = {exon: i for i, exon in enumerate(geneExons)}
        cdsIndexOf = {cds: i for i, cds in enumerate(geneCds)}

        # Construct the transcripts of this gene.
        for transcriptId in transcriptIds:
            transcriptNames = self.transcriptId_to_names[transcriptId]
            protId = self.transcriptId_to_protId.get(transcriptId)

            exons = sorted(self.transcriptId_to_exons[transcriptId])
            codingExons = sorted(self.transcriptId_to_cds[transcriptId])

            # Typecode 'H' stores the values as unsigned shorts.
            exonIndices = array('H', [exonIndexOf[e] for e in exons])
            codingExonIndices = array(
                'H', [cdsIndexOf[e] for e in codingExons])
            codingFrames = array('H', [
                int(frame) for _, frame in sorted(
                    self.transcriptId_to_codingFrames[transcriptId])
            ])

            # Sorting the (exonNumber, interval) pairs orders the
            # intervals by exon number before the number is dropped.
            startCodon = tuple([
                interval for _, interval in sorted(
                    self.transcriptId_to_startCodons[transcriptId])
            ])
            stopCodon = tuple([
                interval for _, interval in sorted(
                    self.transcriptId_to_stopCodons[transcriptId])
            ])

            if len(codingExons) != len(codingFrames):
                # The format arguments must be a tuple; without the
                # parentheses '%' was applied to geneId alone.
                raise Exception(
                    "Number of coding Exons and Frames differ for %s %s"
                    % (geneId, transcriptId))

            transcript = Transcript(gene, transcriptId,
                                    list(transcriptNames), protId,
                                    exonIndices, codingExonIndices,
                                    codingFrames, startCodon, stopCodon)
            gene.addTranscript(transcript)