Exemple #1
0
 def makeGene(self,root):
     """Build a Gene from a parsed record node and attach its transcripts.

     ``root`` is a dict-like node. It must contain an "extra" mapping with
     an "ID" entry and may contain a "children" list. The new Gene is
     stored back into ``root["object"]``. Every child is passed through
     ``self.labelStructure``; results that are Transcript instances are
     added to the gene and linked back to it (``gene`` / ``geneId``).

     Returns the Gene. When ``root`` has no "children" entry the gene is
     returned early and ``extraFields`` is not assigned by this method.
     """
     gene=Gene()
     root["object"]=gene
     gene.ID=root["extra"]["ID"]
     children=root.get("children")
     # NOTE(review): this early return skips the extraFields assignment
     # below; kept as-is because callers may rely on it -- confirm intent.
     if children is None: return gene
     for child in children:
         obj=self.labelStructure(child)
         # isinstance is False for None, so no separate None check is needed.
         if isinstance(obj,Transcript):
             gene.addTranscript(obj)
             obj.gene=gene
             obj.geneId=gene.getId()
     extra=root["extra"]
     # Serialize every extra attribute as "key=value;" in iteration order.
     gene.extraFields="".join(key+"="+extra[key]+";" for key in extra)
     return gene
Exemple #2
0
 def makeGene(self, root):
     """Build a Gene from a parsed record node and attach its transcripts.

     ``root`` is a dict-like node. It must contain an "extra" mapping with
     an "ID" entry and may contain a "children" list. The new Gene is
     stored back into ``root["object"]``. Every child is passed through
     ``self.labelStructure``; results that are Transcript instances are
     added to the gene and linked back to it (``gene`` / ``geneId``).

     Returns the Gene. When ``root`` has no "children" entry the gene is
     returned early and ``extraFields`` is not assigned by this method.
     """
     gene = Gene()
     root["object"] = gene
     gene.ID = root["extra"]["ID"]
     children = root.get("children")
     # NOTE(review): this early return skips the extraFields assignment
     # below; kept as-is because callers may rely on it -- confirm intent.
     if children is None: return gene
     for child in children:
         obj = self.labelStructure(child)
         # isinstance is False for None, so no separate None check is needed.
         if isinstance(obj, Transcript):
             gene.addTranscript(obj)
             obj.gene = gene
             obj.geneId = gene.getId()
     extra = root["extra"]
     # Serialize every extra attribute as "key=value;" in iteration order.
     gene.extraFields = "".join(
         key + "=" + extra[key] + ";" for key in extra)
     return gene
Exemple #3
0
    def assembleTranscriptome(self):
        """
        Build a Gene object (with its Transcripts) for every entry of uniqGeneSet.

        For each (geneId, chromosome, strand) key in ``self.uniqGeneSet`` the
        exon and CDS tuples of all of the gene's transcripts are merged,
        sorted and handed to a new ``Gene``, which is appended to
        ``self.geneList``. Each transcript is then created with indices that
        point into the gene's sorted exon / CDS lists and is attached to the
        gene via ``gene.addTranscript``.

        Raises:
            Exception: if a transcript has a different number of coding
                exons and coding frames.
        """
        # construct Genes
        for uniqGene in self.uniqGeneSet:
            geneId, chromosome, strand = uniqGene

            geneNames = list(self.uniqGene_to_names[uniqGene])
            geneType = self.uniqGene_to_source[uniqGene]
            transcriptIds = self.uniqGene_to_transcriptIds[uniqGene]
            geneExons = set()
            geneCds = set()

            # collect the exon / CDS tuples of all transcripts of this gene
            for transcriptId in transcriptIds:
                geneExons |= self.transcriptId_to_exons[transcriptId]
                geneCds |= self.transcriptId_to_cds[transcriptId]

            # usually gtf files are sorted, but this can't be assumed;
            # the order is reversed when `strand` is falsy
            geneExons = sorted(geneExons, reverse=not strand)
            geneCds = sorted(geneCds, reverse=not strand)

            gene = Gene(geneId, chromosome, strand, geneType, geneNames,
                        geneExons, geneCds)

            # map each exon / CDS tuple to its position in the gene's list
            exonIndexOf = {exon: idx for idx, exon in enumerate(geneExons)}
            cdsIndexOf = {cds: idx for idx, cds in enumerate(geneCds)}

            self.geneList.append(gene)

            # construct transcripts
            for transcriptId in transcriptIds:
                transcriptNames = self.transcriptId_to_names[transcriptId]

                # None when the transcript has no protein id
                protId = self.transcriptId_to_protId.get(transcriptId)
                exons = sorted(self.transcriptId_to_exons[transcriptId])
                codingExons = sorted(self.transcriptId_to_cds[transcriptId])
                # typecode 'H' stores unsigned shorts, so indices and frames
                # outside that range raise OverflowError
                exonIndices = array('H', [exonIndexOf[e] for e in exons])
                codingExonIndices = array(
                    'H', [cdsIndexOf[e] for e in codingExons])
                codingFrames = array('H', [
                    int(frame) for _exonNumber, frame in sorted(
                        self.transcriptId_to_codingFrames[transcriptId])
                ])
                startCodon = tuple(
                    interval for _exonNumber, interval in sorted(
                        self.transcriptId_to_startCodons[transcriptId]))
                stopCodon = tuple(
                    interval for _exonNumber, interval in sorted(
                        self.transcriptId_to_stopCodons[transcriptId]))

                if len(codingExons) != len(codingFrames):
                    # Both values must be passed as one tuple to `%`;
                    # otherwise the formatting itself raises TypeError.
                    raise Exception(
                        "Number of coding Exons and Frames differ for %s %s" %
                        (geneId, transcriptId))

                transcript = Transcript(gene, transcriptId,
                                        list(transcriptNames), protId,
                                        exonIndices, codingExonIndices,
                                        codingFrames, startCodon, stopCodon)

                gene.addTranscript(transcript)