def gapCdsToProteins(proteinAlignment, extraDnaSeqs=None):
    """Thread coding DNA onto a protein alignment, one codon per column.

    Intended to replace proteinToCodonAlignment().

    For every record in ``proteinAlignment`` the DNA of the same id is
    fetched with ``patric_api.getSequenceOfFeatures(ids, 'dna')`` (parsed
    here as FASTA text).  Each protein gap ('-') becomes the codon '---';
    every other protein position consumes the next three DNA characters.
    No check is made that a codon actually encodes the aligned residue.

    Args:
        proteinAlignment: iterable of records exposing ``.id`` and ``.seq``
            and providing ``get_alignment_length()`` (presumably a Biopython
            MultipleSeqAlignment -- confirm against callers).
        extraDnaSeqs: optional mapping of sequence id to a record exposing
            ``.seq``; entries whose id occurs in the alignment override or
            supplement the fetched DNA.

    Returns:
        The object produced by ``AlignIO.read(..., 'fasta')`` for the gapped
        DNA rows.

    Raises:
        Exception: if the set of DNA ids differs from the set of protein ids.
    """
    protSeqDict = {seqRecord.id: seqRecord for seqRecord in proteinAlignment}
    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')
    dnaSeqDict = SeqIO.to_dict(
        SeqIO.parse(StringIO(dnaFasta), "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))
    if extraDnaSeqs:
        for seqId in protSeqDict:
            if seqId in extraDnaSeqs:
                dnaSeqDict[seqId] = extraDnaSeqs[seqId]
                if Debug:
                    LOG.write("appending extra DNA seq %s\n" % seqId)
    if set(dnaSeqDict) != set(protSeqDict):
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n"
            % (", ".join(sorted(protSeqDict)), ", ".join(sorted(dnaSeqDict))))

    prot_align_len = proteinAlignment.get_alignment_length()
    dnaAlignFasta = StringIO()
    for seqId in dnaSeqDict:
        dnaSeq = str(dnaSeqDict[seqId].seq)
        protSeq = str(protSeqDict[seqId].seq)
        codons = []
        dnaSeqPos = 0
        for residue in protSeq:
            if residue == '-':
                codons.append('---')
            else:
                # TODO: in future use a codon table to check correct matching
                # A protein may exist while its DNA is missing or too short;
                # pad every codon to exactly three characters so all rows
                # end up the same length and the result parses as an
                # alignment.
                codons.append(dnaSeq[dnaSeqPos:dnaSeqPos + 3].ljust(3, '-'))
                dnaSeqPos += 3
        # Number of protein columns written; well defined even for an empty
        # row (the loop variable would be unbound in that case).
        protPos = len(protSeq)
        if Debug:
            LOG.write(
                seqId +
                " protPos={0}, dnaSeqPos={1}, orig_DNA_len={2}, orig_prot_len={3}\n"
                .format(protPos, dnaSeqPos, len(dnaSeq), len(protSeq)))
        if protPos < prot_align_len:
            # Row shorter than the alignment: fill the remaining columns.
            codons.append("---" * (prot_align_len - protPos))
            LOG.write(
                "padding short seq {0}, of {1} pos out to {2}, orig_DNA_len={3}, orig_prot_len={4}\n"
                .format(seqId, protPos, prot_align_len, len(dnaSeq),
                        len(protSeq)))
        dnaAlignFasta.write(">" + seqId + "\n")
        dnaAlignFasta.write("".join(codons))
        dnaAlignFasta.write("\n")
    return AlignIO.read(StringIO(dnaAlignFasta.getvalue()), 'fasta')
proteinAlignments = {} proteinAlignmentStats = {} alignmentScore = {} alignedTaxa = set() protein_alignment_time = time() for homologId in singleCopyHomologs: try: LOG.write("aligning {}\n".format(homologId)) geneIdSet = set() for genome in homologMatrix[homologId]: for proteinId in homologMatrix[homologId][genome]: if not "undefined" in proteinId: geneIdSet.add(proteinId) #geneIdSet.update(set(homologMatrix[homologId][genome])) proteinFasta = patric_api.getSequenceOfFeatures(geneIdSet, 'protein') # replace bad characters for good while it is still text (e.g., 'J' to 'X') lines = proteinFasta.split("\n") for i in range(len(lines)): if not lines[i].startswith(">"): lines[i] = lines[i].replace("J", "X") proteinFasta = "\n".join(lines) seqRecords = SeqIO.parse(StringIO(proteinFasta), "fasta", alphabet=IUPAC.extended_protein) proteinSeqDict = SeqIO.to_dict(seqRecords) for genomeId in homologMatrix[homologId]: for geneId in homologMatrix[homologId][genomeId]: if genomeId == genomeObject_genomeId:
def proteinToCodonAlignment(proteinAlignment, extraDnaSeqs=None):
    """Build a codon alignment for a protein alignment via codonalign.build.

    DNA for every protein id is fetched with
    ``patric_api.getSequenceOfFeatures(ids, 'dna')`` and parsed as FASTA;
    entries of ``extraDnaSeqs`` (optional mapping of id to DNA record) whose
    id occurs in the alignment replace or supplement the fetched records.
    The DNA records are handed to ``codonalign.build`` in the same order as
    the protein rows, and non-empty annotations of each protein record are
    copied onto the matching record of the result.

    Raises:
        Exception: if the set of DNA ids differs from the set of protein ids.
    """
    protSeqDict = {rec.id: rec for rec in proteinAlignment}

    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')
    fastaRecords = SeqIO.parse(StringIO(dnaFasta), "fasta",
                               alphabet=IUPAC.IUPACAmbiguousDNA())
    dnaSeqDict = SeqIO.to_dict(fastaRecords)

    if extraDnaSeqs:
        for seqId in protSeqDict:
            if seqId not in extraDnaSeqs:
                continue
            dnaSeqDict[seqId] = extraDnaSeqs[seqId]
            if Debug:
                LOG.write("appending extra DNA seq %s\n" % seqId)

    if set(dnaSeqDict) != set(protSeqDict):
        proteinIds = ", ".join(sorted(protSeqDict))
        dnaIds = ", ".join(sorted(dnaSeqDict))
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n"
            % (proteinIds, dnaIds))

    # Zero-length DNA is only reported; the record stays in the set.
    for seqId, dnaRecord in dnaSeqDict.items():
        if len(dnaRecord.seq) == 0:
            LOG.write("warning: seqId %s length of dna was zero\n" % seqId)

    # Keep DNA records in the same order as the protein rows.
    dnaSeqRecords = [dnaSeqDict[row.id] for row in proteinAlignment]
    if Debug:
        LOG.write("dna seqs has %d seqs\n" % (len(dnaSeqRecords)))

    # NOTE: a variant using an ambiguous codon table was disabled in the
    # original; the default codon table of codonalign.build is used.
    codonAlignment = codonalign.build(pro_align=proteinAlignment,
                                      nucl_seqs=dnaSeqRecords,
                                      max_score=1000)
    for codonRecord in codonAlignment:
        sourceAnnotations = protSeqDict[codonRecord.id].annotations
        if sourceAnnotations:
            codonRecord.annotations = sourceAnnotations.copy()
    return codonAlignment