def _get_mutalyzer_record(reference, db_transcripts):
    """
    Creates a Mutalyzer specific record from the transcript entries
    retrieved from the gbparser database.

    Transcripts are grouped per gene name and every transcript becomes one
    Locus in the transcript list of its gene.

    :param reference: A gbparser database reference entry.
    :param db_transcripts: A gbparser database list of transcript.
    :return: The Mutalyzer record, or None when the sequence file of the
        reference cannot be read (IOError).
    """
    record = _bare_record(reference)

    # Extracting the transcripts from the DB entries.
    transcripts = [_transcript_entry(db_transcript)
                   for db_transcript in db_transcripts]

    # Generating the actual record entries in the Mutalyzer format.
    gene_dict = {}
    for entry in transcripts:
        if entry['gene'] in gene_dict:
            gene = gene_dict[entry['gene']]
        else:
            gene = Gene(entry['gene'])
        if entry['strand'] == '+':
            gene.orientation = 1
        elif entry['strand'] == '-':
            gene.orientation = -1
        gene.transcriptList.append(_locus_from_entry(gene, entry))
        gene_dict[gene.name] = gene
    record.geneList = list(gene_dict.values())

    # Get the sequence.
    seq_path = settings.SEQ_PATH + reference.checksum_sequence + '.sequence'
    try:
        seq = Seq(_get_sequence_mmap(seq_path, 1, reference.length + 1),
                  generic_dna)
    except IOError:
        return None
    record.seq = seq
    return record


def _parse_positions(raw):
    """
    Convert a comma separated string of integers into a list of ints.

    A real list is returned (not the result of map()), so callers can use
    len() and truthiness on it regardless of the Python version.

    :param raw: Comma separated positions, possibly empty or None.
    :return: A list of ints, or None when `raw` is empty or None.
    """
    if not raw:
        return None
    return [int(position) for position in raw.split(',')]


def _transcript_entry(db_transcript):
    """
    Copy the fields of one gbparser transcript into a plain dictionary.

    The CDS/protein keys are only present when both the protein accession
    and the protein version are set on the database entry.

    :param db_transcript: A gbparser database transcript entry.
    :return: Dictionary describing the transcript.
    """
    entry = {
        'gene': db_transcript.gene,
        'strand': db_transcript.strand,
        'transcript_start': db_transcript.transcript_start,
        'transcript_stop': db_transcript.transcript_stop,
        'transcript_product': db_transcript.transcript_product,
        'exons': [],
        'transcriptID': db_transcript.transcript_accession + '.' +
                        db_transcript.transcript_version,
    }
    if db_transcript.protein_accession is not None \
            and db_transcript.protein_version is not None:
        entry['cds_start'] = db_transcript.cds_start
        entry['cds_stop'] = db_transcript.cds_stop
        entry['protein_product'] = db_transcript.protein_product
        entry['proteinID'] = '%s.%s' % (db_transcript.protein_accession,
                                        db_transcript.protein_version)

    starts = _parse_positions(db_transcript.exons_start)
    stops = _parse_positions(db_transcript.exons_stop)
    # Exons are only taken over when both lists are present and pair up one
    # to one; otherwise the entry keeps an empty exon list.
    if starts and stops and len(starts) == len(stops):
        entry['exons'] = [{'start': start, 'stop': stop}
                          for start, stop in zip(starts, stops)]
    return entry


def _locus_from_entry(gene, entry):
    """
    Build the Mutalyzer Locus for one transcript dictionary.

    :param gene: The Gene the transcript belongs to; used to obtain a new
        locus tag.
    :param entry: Dictionary as created by _transcript_entry().
    :return: The populated Locus.
    """
    locus = Locus(gene.newLocusTag())

    locus.mRNA = PList()
    locus.mRNA.location = [entry['transcript_start'],
                           entry['transcript_stop']]
    locus.transcriptID = entry['transcriptID']

    locus.exon = PList()
    if entry['exons']:
        positions = []
        for exon in entry['exons']:
            positions.extend([exon['start'], exon['stop']])
        locus.exon.positionList = positions
    else:
        # Without exon information the transcript boundaries are used.
        locus.exon.positionList = locus.mRNA.location
    # The mRNA and exon position lists are deliberately the same list
    # object, so the in-place sort below is visible through both.
    locus.mRNA.positionList = locus.exon.positionList
    locus.mRNA.positionList.sort()

    if entry.get('proteinID'):
        locus.CDS = PList()
        locus.CDS.location = [entry['cds_start'], entry['cds_stop']]
        locus.CDS.positionList = cds_position_list(
            locus.mRNA.positionList, locus.CDS.location)
        locus.proteinID = entry['proteinID']
        locus.transcriptProduct = entry['transcript_product']
        locus.proteinProduct = entry['protein_product']
        locus.linkMethod = 'ncbi'
        locus.transcribe = True
        locus.translate = True
    else:
        locus.linkMethod = None
        locus.transcribe = True
        locus.translate = False
    locus.locusTag = ''
    return locus
def _get_mutalyzer_record(reference, db_transcripts):
    """
    Creates a Mutalyzer specific record from the transcript entries
    retrieved from the gbparser database.

    :param reference: A gbparser database reference entry.
    :param db_transcripts: A gbparser database list of transcript.
    :return: The Mutalyzer record, or None if reading the sequence file
        raises an IOError.
    """
    record = _bare_record(reference)

    # Extracting the transcripts from the DB entries.
    transcripts = []
    for db_transcript in db_transcripts:
        transcript = {
            'gene': db_transcript.gene,
            'strand': db_transcript.strand,
            'transcript_start': db_transcript.transcript_start,
            'transcript_stop': db_transcript.transcript_stop,
            'transcript_product': db_transcript.transcript_product,
            'exons': [],
            'exons_start': db_transcript.exons_start,
            'exons_stop': db_transcript.exons_stop,
            'transcriptID': db_transcript.transcript_accession + '.' +
                            db_transcript.transcript_version,
        }
        # Protein related keys only exist for transcripts that have both a
        # protein accession and a protein version.
        if db_transcript.protein_accession is not None \
                and db_transcript.protein_version is not None:
            transcript['cds_start'] = db_transcript.cds_start
            transcript['cds_stop'] = db_transcript.cds_stop
            transcript['protein_product'] = db_transcript.protein_product
            transcript['proteinID'] = '%s.%s' % \
                (db_transcript.protein_accession,
                 db_transcript.protein_version)
            transcript['linkMethod'] = 'ncbi'

        # Build real lists here: the result of map() has no len() on
        # Python 3 and is always truthy there, which would break the check
        # below.
        starts = None
        if db_transcript.exons_start:
            starts = [int(value)
                      for value in db_transcript.exons_start.split(',')]
        stops = None
        if db_transcript.exons_stop:
            stops = [int(value)
                     for value in db_transcript.exons_stop.split(',')]

        # Exons are dropped when the start and stop lists do not pair up.
        if starts and stops and len(starts) == len(stops):
            for start, stop in zip(starts, stops):
                transcript['exons'].append({'start': start, 'stop': stop})
        transcripts.append(transcript)

    # Generating the actual record entries in the Mutalyzer format.
    gene_dict = {}
    for db_transcript in transcripts:
        gene_name = db_transcript['gene']
        if gene_name in gene_dict:
            gene = gene_dict[gene_name]
        else:
            gene = Gene(gene_name)
        if db_transcript['strand'] == '+':
            gene.orientation = 1
        elif db_transcript['strand'] == '-':
            gene.orientation = -1

        transcript = Locus(gene.newLocusTag())

        transcript.mRNA = PList()
        transcript.mRNA.location = [db_transcript['transcript_start'],
                                    db_transcript['transcript_stop']]
        transcript.transcriptID = db_transcript['transcriptID']

        transcript.exon = PList()
        if db_transcript.get('exons') \
                and isinstance(db_transcript.get('exons'), list):
            exon_list = []
            for exon in db_transcript['exons']:
                exon_list.extend([exon['start'], exon['stop']])
            transcript.exon.positionList = exon_list
        else:
            # No exons known: fall back to the transcript boundaries.
            transcript.exon.positionList = transcript.mRNA.location
        # Both position lists refer to the same list object, so the sort is
        # seen through the exon list as well.
        transcript.mRNA.positionList = transcript.exon.positionList
        transcript.mRNA.positionList.sort()

        if db_transcript.get('proteinID'):
            transcript.CDS = PList()
            transcript.CDS.location = [db_transcript['cds_start'],
                                       db_transcript['cds_stop']]
            transcript.CDS.positionList = cds_position_list(
                transcript.mRNA.positionList, transcript.CDS.location)
            transcript.proteinID = db_transcript['proteinID']
            transcript.transcriptProduct = \
                db_transcript['transcript_product']
            transcript.proteinProduct = db_transcript['protein_product']
            transcript.linkMethod = 'ncbi'
            transcript.transcribe = True
            transcript.translate = True
        else:
            transcript.linkMethod = None
            transcript.transcribe = True
            transcript.translate = False
        transcript.locusTag = ''

        gene.transcriptList.append(transcript)
        gene_dict[gene.name] = gene

    record.geneList = list(gene_dict.values())

    # Get the sequence.
    seq_path = settings.SEQ_PATH + reference.checksum_sequence + '.sequence'
    try:
        seq = Seq(_get_sequence_mmap(seq_path, 1, reference.length + 1),
                  generic_dna)
    except IOError:
        return None
    else:
        record.seq = seq
        return record
def _get_mutalyzer_record(reference, db_transcripts):
    """
    Creates a Mutalyzer specific record from the transcript entries
    retrieved from the gbparser database.

    Transcripts without a protein accession or protein version are treated
    as non-coding: they get no CDS, no protein ID and are not translated.

    :param reference: A gbparser database reference entry.
    :param db_transcripts: A gbparser database list of transcript.
    :return: The Mutalyzer record, or None if reading the sequence file
        raises an IOError.
    """
    record = _bare_record(reference)

    # Extracting the transcripts from the DB entries.
    transcripts = []
    for transcript in db_transcripts:
        # Concatenating the protein accession and version raises a
        # TypeError when either one is None, so only build the ID when both
        # are available.
        protein_id = None
        if transcript.protein_accession is not None \
                and transcript.protein_version is not None:
            protein_id = (transcript.protein_accession + '.' +
                          transcript.protein_version)

        my_transcript = {
            'gene': transcript.gene,
            'strand': transcript.strand,
            'transcript_start': transcript.transcript_start,
            'transcript_stop': transcript.transcript_stop,
            'cds_start': transcript.cds_start,
            'cds_stop': transcript.cds_stop,
            'exons': [],
            'exons_start': transcript.exons_start,
            'exons_stop': transcript.exons_stop,
            'transcriptID': transcript.transcript_accession + '.' +
                            transcript.transcript_version,
            'proteinID': protein_id,
            'linkMethod': 'ncbi' if protein_id else None,
        }

        # Real lists are needed: the result of map() has no len() on
        # Python 3.
        starts = None
        if transcript.exons_start:
            starts = [int(value)
                      for value in transcript.exons_start.split(',')]
        stops = None
        if transcript.exons_stop:
            stops = [int(value)
                     for value in transcript.exons_stop.split(',')]

        # Exons are dropped when the start and stop lists do not pair up.
        if starts and stops and len(starts) == len(stops):
            for start, stop in zip(starts, stops):
                my_transcript['exons'].append({'start': start, 'stop': stop})
        transcripts.append(my_transcript)

    # Generating the actual record entries in the Mutalyzer format.
    gene_dict = {}
    for transcript in transcripts:
        if transcript['gene'] in gene_dict:
            gene = gene_dict[transcript['gene']]
        else:
            gene = Gene(transcript['gene'])
        if transcript['strand'] == '+':
            gene.orientation = 1
        elif transcript['strand'] == '-':
            gene.orientation = -1

        my_transcript = Locus(gene.newLocusTag())

        my_transcript.mRNA = PList()
        my_transcript.mRNA.location = [transcript['transcript_start'],
                                       transcript['transcript_stop']]

        my_transcript.exon = PList()
        if transcript.get('exons') \
                and isinstance(transcript.get('exons'), list):
            exon_list = []
            for exon in transcript['exons']:
                exon_list.extend([exon['start'], exon['stop']])
            my_transcript.exon.positionList = exon_list
        else:
            # No exons known: fall back to the transcript boundaries.
            my_transcript.exon.positionList = my_transcript.mRNA.location
        # Both position lists refer to the same list object, so the sort is
        # seen through the exon list as well.
        my_transcript.mRNA.positionList = my_transcript.exon.positionList
        my_transcript.mRNA.positionList.sort()

        my_transcript.transcriptID = transcript['transcriptID']
        my_transcript.transcribe = True
        if transcript['proteinID']:
            my_transcript.CDS = PList()
            my_transcript.CDS.location = [transcript['cds_start'],
                                          transcript['cds_stop']]
            my_transcript.CDS.positionList = cds_position_list(
                my_transcript.mRNA.positionList, my_transcript.CDS.location)
            my_transcript.proteinID = transcript['proteinID']
            my_transcript.linkMethod = 'ncbi'
            my_transcript.translate = True
        else:
            my_transcript.linkMethod = None
            my_transcript.translate = False

        gene.transcriptList.append(my_transcript)
        gene_dict[gene.name] = gene

    record.geneList = list(gene_dict.values())

    # Get the sequence.
    seq_path = settings.SEQ_PATH + reference.checksum_sequence + '.sequence'
    try:
        seq = Seq(_get_sequence_mmap(seq_path, 1, reference.length + 1),
                  generic_dna)
    except IOError:
        return None
    else:
        record.seq = seq
        return record
def create_record(self, filename):
    """
    Create a GenRecord.Record from a GenBank file

    The file is parsed with BioPython first; its features are then
    converted to genes and transcripts (Locus instances) on the record.
    Genes that end up without any transcript are removed from the record.

    @arg filename: The full path to the compressed GenBank file
    @type filename: unicode

    @return: A GenRecord.Record instance
    @rtype: object (record)
    """
    # first create an intermediate genbank record with BioPython
    file_handle = bz2.BZ2File(filename, "r")
    try:
        biorecord = SeqIO.read(codecs.getreader('utf-8')(file_handle),
                               "genbank")
    finally:
        # Make sure the archive is closed, also when parsing fails.
        file_handle.close()

    record = Record()
    record.seq = biorecord.seq

    # Note: The .source_* values may be different from the values we are
    #     working with, e.g. for UD slices where these values (taken from
    #     the genbank file) are from the original NC reference. We try to
    #     set the .id field to the working value in the caller.
    record.source_id = biorecord.id
    record.source_accession, record.source_version = \
        biorecord.id.split('.')[:2]
    record.source_gi = biorecord.annotations['gi']
    record.organism = biorecord.annotations['organism']

    # Todo: This will change once we support protein references
    if isinstance(biorecord.seq.alphabet, ProteinAlphabet):
        return record

    exonList = []
    geneDict = {}

    accInfo = biorecord.annotations['accessions']
    if len(accInfo) >= 3 and accInfo[1] == "REGION:":
        # Todo: This information is present in the genbank file if it is a
        #     UD sliced from a chromosome. We can get the same information
        #     for NM references from our mapping database and that way
        #     also provide chromosomal variant descriptions for those.
        region = accInfo[2]
        if "complement" in region:
            record.orientation = -1
            record.chromOffset = int(region.split('.')[2][:-1])
        else:
            record.chromOffset = int(accInfo[2].split('.')[0])

    for i in biorecord.features:
        if i.qualifiers:
            if i.type == "source":
                if "mol_type" in i.qualifiers:
                    if i.qualifiers["mol_type"][0] in ["mRNA",
                                                       "transcribed RNA"]:
                        record.molType = 'n'
                    else:
                        record.molType = 'g'
                if "organelle" in i.qualifiers:
                    record.organelle = i.qualifiers["organelle"][0]
                    if record.organelle == "mitochondrion":
                        record.molType = 'm'

                fakeGene = Locus("001")
                record.source.transcriptList.append(fakeGene)
                fakeGene.CDS = PList()
                fakeGene.CDS.location = self.__location2pos(i.location)

            if "gene" in i.qualifiers:
                if not unicode(i.location.start).isdigit() or \
                        not unicode(i.location.end).isdigit():
                    # Feature is not completely in reference. Either start
                    # or end is not a Bio.SeqFeature.ExactPosition.
                    continue

                geneName = i.qualifiers["gene"][0]
                if i.type == "gene":
                    if geneName not in geneDict:
                        myGene = Gene(geneName)
                        record.geneList.append(myGene)
                        if i.strand:
                            myGene.orientation = i.strand
                        myGene.location = self.__location2pos(i.location)
                        geneDict[geneName] = tempGene(geneName)
                else:
                    if geneName not in geneDict:
                        # We should have seen a gene entry for this gene
                        # by now. Could be that it was skipped because it
                        # was not completely in reference (see check
                        # above). In that case we just ignore any of its
                        # features.
                        continue

                if i.type in ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA",
                              "tmRNA"]:
                    geneDict[geneName].rnaList.append(i)
                if i.type == "CDS":
                    geneDict[geneName].cdsList.append(i)
                if i.type == "exon":
                    exonLocation = self.__location2pos(i.location)
                    if exonLocation:
                        exonList.extend(exonLocation)

    if record.molType in ['g', 'm']:
        for j in geneDict.keys():
            myGene = geneDict[j]
            self.link(myGene.rnaList, myGene.cdsList)
            for i in myGene.rnaList:
                if i.usable:
                    myRealGene = record.findGene(i.gene)
                    if i.locus_tag:
                        # Note: We use the last three characters of the
                        # locus_tag as a unique transcript version id.
                        # This is also used for the protein-transcript
                        # link table.
                        # Normally, locus_tag ends with three digits, but
                        # for some (e.g. mobA on NC_011228, a plasmid) it
                        # ends with two digits prepended with an
                        # underscore. Or prepended with a letter. We
                        # really want a number, so 'fix' this by only
                        # looking for a numeric part.
                        try:
                            version = LOCUS_TAG_VERSION.findall(
                                i.locus_tag)[0].zfill(3)
                        except IndexError:
                            version = '000'
                        myTranscript = Locus(version)
                    else:
                        myTranscript = Locus(myRealGene.newLocusTag())
                    myTranscript.mRNA = PList()
                    myTranscript.mRNA.positionList = i.positionList
                    myTranscript.mRNA.location = i.location
                    myTranscript.transcribe = True
                    myTranscript.transcriptID = i.transcript_id
                    myTranscript.transcriptProduct = i.product
                    myTranscript.locusTag = i.locus_tag
                    if i.link:
                        myTranscript.CDS = PList()
                        myTranscript.CDS.positionList = i.link.positionList
                        myTranscript.CDS.location = i.link.location
                        myTranscript.translate = True
                        myTranscript.proteinID = i.link.protein_id
                        myTranscript.linkMethod = i.linkMethod
                        myTranscript.proteinProduct = i.link.product
                        # The translation table is a qualifier of the linked
                        # CDS, so read it from the feature that was tested.
                        if "transl_table" in i.link.qualifiers:
                            myTranscript.txTable = \
                                int(i.link.qualifiers["transl_table"][0])
                    myRealGene.transcriptList.append(myTranscript)
            for i in myGene.cdsList:
                if not i.linked and \
                        (i.usable or not geneDict[myGene.name].rnaList):
                    myRealGene = record.findGene(i.gene)
                    if i.locus_tag:
                        # Same numeric locus_tag suffix rule as for the
                        # RNA features above.
                        try:
                            version = LOCUS_TAG_VERSION.findall(
                                i.locus_tag)[0].zfill(3)
                        except IndexError:
                            version = '000'
                        myTranscript = Locus(version)
                    else:
                        myTranscript = Locus(myRealGene.newLocusTag())
                    myTranscript.CDS = PList()
                    myTranscript.CDS.positionList = i.positionList
                    myTranscript.CDS.location = i.location
                    myTranscript.proteinID = i.protein_id
                    myTranscript.proteinProduct = i.product
                    if "transl_table" in i.qualifiers:
                        myTranscript.txTable = \
                            int(i.qualifiers["transl_table"][0])
                    myRealGene.transcriptList.append(myTranscript)
    else:
        if geneDict:
            myGene = geneDict[geneDict.keys()[0]]
            myRealGene = record.geneList[0]
            if myGene.cdsList:
                myCDS = myGene.cdsList[0]
                self.__tagByDict(myCDS, "protein_id")
                self.__tagByDict(myCDS, "product")
            else:
                myCDS = None
            myTranscript = Locus("001")
            myTranscript.exon = PList()
            if exonList:
                myTranscript.exon.positionList = exonList
            else:
                myTranscript.exon.location = myRealGene.location
            if myCDS:
                myTranscript.CDS = PList()
                myTranscript.CDS.location = \
                    self.__location2pos(myCDS.location)
            # NOTE(review): without a CDS the last operand relies on the
            # default value of Locus.CDS having a .location -- confirm.
            if exonList or myRealGene.location or \
                    myTranscript.CDS.location:
                myTranscript.transcriptID = biorecord.id
                if myCDS:
                    myTranscript.proteinID = myCDS.protein_id
                    myTranscript.proteinProduct = myCDS.product
                    myTranscript.linkMethod = "exhaustion"
                    myTranscript.transcribe = True
                    # Read the table from the CDS itself; the feature loop
                    # variable only holds the last feature of the file.
                    if "transl_table" in myCDS.qualifiers:
                        myTranscript.txTable = \
                            int(myCDS.qualifiers["transl_table"][0])
                myRealGene.transcriptList.append(myTranscript)

    # Iterate over a copy: removing from the list that is being iterated
    # skips the element that follows each removed gene.
    for gene in list(record.geneList):
        if not gene.transcriptList:
            record.geneList.remove(gene)

    return record
def create_record(self, filename):
    """
    Create a GenRecord.Record from a GenBank file

    The GenBank file is read with BioPython and its features are turned
    into genes with transcripts on the returned record. Genes for which no
    transcript could be created are removed again before returning.

    @arg filename: The full path to the compressed GenBank file
    @type filename: unicode

    @return: A GenRecord.Record instance
    @rtype: object (record)
    """
    def new_locus(feature, gene):
        # Create the Locus for a feature of the given gene.
        #
        # Note: We use the numeric part at the end of the locus_tag as a
        #     unique transcript version id. This is also used for the
        #     protein-transcript link table.
        #     Normally, locus_tag ends with three digits, but for some
        #     (e.g. mobA on NC_011228, a plasmid) it ends with two digits
        #     prepended with an underscore. Or prepended with a letter. We
        #     really want a number, so 'fix' this by only looking for a
        #     numeric part.
        if not feature.locus_tag:
            return Locus(gene.newLocusTag())
        try:
            version = LOCUS_TAG_VERSION.findall(
                feature.locus_tag)[0].zfill(3)
        except IndexError:
            version = '000'
        return Locus(version)

    # first create an intermediate genbank record with BioPython
    compressed = bz2.BZ2File(filename, "r")
    try:
        reader = codecs.getreader('utf-8')(compressed)
        biorecord = SeqIO.read(reader, "genbank")
    finally:
        # Release the file even if BioPython fails to parse it.
        compressed.close()

    record = Record()
    record.seq = biorecord.seq

    # Note: The .source_* values may be different from the values we are
    #     working with, e.g. for UD slices where these values (taken from
    #     the genbank file) are from the original NC reference. We try to
    #     set the .id field to the working value in the caller.
    record.source_id = biorecord.id
    record.source_accession, record.source_version = \
        biorecord.id.split('.')[:2]
    record.source_gi = biorecord.annotations['gi']
    record.organism = biorecord.annotations['organism']

    # Todo: This will change once we support protein references
    if isinstance(biorecord.seq.alphabet, ProteinAlphabet):
        return record

    exonList = []
    geneDict = {}

    accInfo = biorecord.annotations['accessions']
    if len(accInfo) >= 3 and accInfo[1] == "REGION:":
        # Todo: This information is present in the genbank file if it is a
        #     UD sliced from a chromosome. We can get the same information
        #     for NM references from our mapping database and that way
        #     also provide chromosomal variant descriptions for those.
        region = accInfo[2]
        if "complement" in region:
            record.orientation = -1
            record.chromOffset = int(region.split('.')[2][:-1])
        else:
            record.chromOffset = int(accInfo[2].split('.')[0])

    rna_types = ["mRNA", "misc_RNA", "ncRNA", "rRNA", "tRNA", "tmRNA"]
    for feature in biorecord.features:
        if not feature.qualifiers:
            continue

        if feature.type == "source":
            if "mol_type" in feature.qualifiers:
                if feature.qualifiers["mol_type"][0] in ["mRNA",
                                                         "transcribed RNA"]:
                    record.molType = 'n'
                else:
                    record.molType = 'g'
            if "organelle" in feature.qualifiers:
                record.organelle = feature.qualifiers["organelle"][0]
                if record.organelle == "mitochondrion":
                    record.molType = 'm'

            fakeGene = Locus("001")
            record.source.transcriptList.append(fakeGene)
            fakeGene.CDS = PList()
            fakeGene.CDS.location = self.__location2pos(feature.location)

        if "gene" not in feature.qualifiers:
            continue
        if not unicode(feature.location.start).isdigit() or \
                not unicode(feature.location.end).isdigit():
            # Feature is not completely in reference. Either start
            # or end is not a Bio.SeqFeature.ExactPosition.
            continue

        geneName = feature.qualifiers["gene"][0]
        if feature.type == "gene":
            if geneName not in geneDict:
                myGene = Gene(geneName)
                record.geneList.append(myGene)
                if feature.strand:
                    myGene.orientation = feature.strand
                myGene.location = self.__location2pos(feature.location)
                geneDict[geneName] = tempGene(geneName)
        elif geneName not in geneDict:
            # We should have seen a gene entry for this gene by now.
            # Could be that it was skipped because it was not completely
            # in reference (see check above). In that case we just ignore
            # any of its features.
            continue

        if feature.type in rna_types:
            geneDict[geneName].rnaList.append(feature)
        if feature.type == "CDS":
            geneDict[geneName].cdsList.append(feature)
        if feature.type == "exon":
            exonLocation = self.__location2pos(feature.location)
            if exonLocation:
                exonList.extend(exonLocation)

    if record.molType in ['g', 'm']:
        for j in geneDict.keys():
            myGene = geneDict[j]
            self.link(myGene.rnaList, myGene.cdsList)

            # Transcripts described by an RNA feature, with their linked
            # CDS when there is one.
            for rna in myGene.rnaList:
                if not rna.usable:
                    continue
                myRealGene = record.findGene(rna.gene)
                myTranscript = new_locus(rna, myRealGene)
                myTranscript.mRNA = PList()
                myTranscript.mRNA.positionList = rna.positionList
                myTranscript.mRNA.location = rna.location
                myTranscript.transcribe = True
                myTranscript.transcriptID = rna.transcript_id
                myTranscript.transcriptProduct = rna.product
                myTranscript.locusTag = rna.locus_tag
                if rna.link:
                    cds = rna.link
                    myTranscript.CDS = PList()
                    myTranscript.CDS.positionList = cds.positionList
                    myTranscript.CDS.location = cds.location
                    myTranscript.translate = True
                    myTranscript.proteinID = cds.protein_id
                    myTranscript.linkMethod = rna.linkMethod
                    myTranscript.proteinProduct = cds.product
                    # Test and read the same (CDS) feature; reading the
                    # qualifier from the RNA feature instead raises a
                    # KeyError when only the CDS carries it.
                    if "transl_table" in cds.qualifiers:
                        myTranscript.txTable = \
                            int(cds.qualifiers["transl_table"][0])
                myRealGene.transcriptList.append(myTranscript)

            # CDS features that were not linked to any RNA feature.
            for cds in myGene.cdsList:
                if cds.linked:
                    continue
                if not (cds.usable or not geneDict[myGene.name].rnaList):
                    continue
                myRealGene = record.findGene(cds.gene)
                myTranscript = new_locus(cds, myRealGene)
                myTranscript.CDS = PList()
                myTranscript.CDS.positionList = cds.positionList
                myTranscript.CDS.location = cds.location
                myTranscript.proteinID = cds.protein_id
                myTranscript.proteinProduct = cds.product
                if "transl_table" in cds.qualifiers:
                    myTranscript.txTable = \
                        int(cds.qualifiers["transl_table"][0])
                myRealGene.transcriptList.append(myTranscript)
    elif geneDict:
        myGene = geneDict[geneDict.keys()[0]]
        myRealGene = record.geneList[0]
        if myGene.cdsList:
            myCDS = myGene.cdsList[0]
            self.__tagByDict(myCDS, "protein_id")
            self.__tagByDict(myCDS, "product")
        else:
            myCDS = None

        myTranscript = Locus("001")
        myTranscript.exon = PList()
        if exonList:
            myTranscript.exon.positionList = exonList
        else:
            myTranscript.exon.location = myRealGene.location
        if myCDS:
            myTranscript.CDS = PList()
            myTranscript.CDS.location = self.__location2pos(myCDS.location)

        # NOTE(review): when there is no CDS, the last operand depends on
        # the default value of Locus.CDS offering a .location -- confirm.
        if exonList or myRealGene.location or myTranscript.CDS.location:
            myTranscript.transcriptID = biorecord.id
            if myCDS:
                myTranscript.proteinID = myCDS.protein_id
                myTranscript.proteinProduct = myCDS.product
                myTranscript.linkMethod = "exhaustion"
                myTranscript.transcribe = True
                # Use the qualifiers of the CDS that was tested, not those
                # of whatever feature happened to be iterated last.
                if "transl_table" in myCDS.qualifiers:
                    myTranscript.txTable = \
                        int(myCDS.qualifiers["transl_table"][0])
            myRealGene.transcriptList.append(myTranscript)

    # Removing from a list while iterating over it skips elements, so walk
    # over a snapshot of the gene list instead.
    for gene in list(record.geneList):
        if not gene.transcriptList:
            record.geneList.remove(gene)

    return record