Ejemplo n.º 1
0
def gapCdsToProteins(proteinAlignment, extraDnaSeqs=None):
    """Thread coding DNA through a protein alignment, one codon per column.

    Intended as a replacement for proteinToCodonAlignment().

    The DNA for every record id in ``proteinAlignment`` is requested from
    ``patric_api.getSequenceOfFeatures(ids, 'dna')`` and parsed as FASTA.
    Entries of ``extraDnaSeqs`` override/add DNA records for ids present in
    the protein alignment.  For each sequence, every gap character ('-') in
    the aligned protein becomes '---' and every other position consumes the
    next three nucleotides of the DNA.

    Args:
        proteinAlignment: aligned protein records; must be iterable, each
            record exposing ``.id`` and ``.seq``, and must provide
            ``get_alignment_length()``.
        extraDnaSeqs: optional mapping of sequence id to a DNA record
            (an object with a ``.seq`` attribute).

    Returns:
        The alignment obtained by parsing the generated gapped DNA FASTA
        with ``AlignIO.read(..., 'fasta')``.  Rows follow the iteration
        order of the DNA dictionary, not of ``proteinAlignment``.

    Raises:
        Exception: if the set of DNA ids differs from the set of protein ids.
    """
    protSeqDict = {}
    for seqRecord in proteinAlignment:
        protSeqDict[seqRecord.id] = seqRecord
    dnaFasta = patric_api.getSequenceOfFeatures(protSeqDict.keys(), 'dna')

    dnaSeqDict = SeqIO.to_dict(
        SeqIO.parse(StringIO(dnaFasta),
                    "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))
    for seqId in protSeqDict:
        if extraDnaSeqs and seqId in extraDnaSeqs:
            dnaSeqDict[seqId] = extraDnaSeqs[seqId]
            if Debug:
                LOG.write("appending extra DNA seq %s\n" % seqId)
    if set(dnaSeqDict.keys()) != set(protSeqDict.keys()):
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (", ".join(sorted(protSeqDict)), ", ".join(sorted(dnaSeqDict))))
    dnaAlignFasta = StringIO()
    prot_align_len = proteinAlignment.get_alignment_length()
    # Every output row must be exactly this many nucleotides long.
    codon_align_len = 3 * prot_align_len
    for seqId in dnaSeqDict:
        dnaSeq = dnaSeqDict[seqId].seq
        orig_dna_len = len(dnaSeq)
        if orig_dna_len < codon_align_len:
            # Handles cases where the protein exists but the DNA is missing
            # or too short: pad (in nucleotides) up to the full codon
            # alignment length so every 3-base slice below is complete.
            dnaSeq += '-' * (codon_align_len - orig_dna_len)
        protSeq = protSeqDict[seqId].seq
        dnaAlignFasta.write(">" + seqId + "\n")
        dnaSeqPos = 0
        for residue in protSeq:
            if residue == '-':
                codon = '---'
            else:
                #  TODO: in future use a codon table to check correct matching
                codon = str(dnaSeq[dnaSeqPos:dnaSeqPos + 3])
                dnaSeqPos += 3
            dnaAlignFasta.write(codon)
        # Number of protein columns written; expected to equal
        # prot_align_len.  Taken from len() so an empty row cannot reuse a
        # stale loop index from the previous record.
        protPos = len(protSeq)
        if Debug:
            LOG.write(
                seqId +
                " protPos={0}, dnaSeqPos={1}, orig_DNA_len={2}, orig_prot_len={3}\n"
                .format(protPos, dnaSeqPos, orig_dna_len, len(protSeq)))
        if protPos < prot_align_len:
            # Row is shorter than the alignment: fill the tail with gap codons.
            dnaAlignFasta.write("---" * (prot_align_len - protPos))
            LOG.write(
                "padding short seq {0}, of {1} pos out to {2}, orig_DNA_len={3}, orig_prot_len={4}\n"
                .format(seqId, protPos, prot_align_len, orig_dna_len,
                        len(protSeq)))
        dnaAlignFasta.write("\n")
    dnaAlignFasta_text = dnaAlignFasta.getvalue()
    retval = AlignIO.read(StringIO(dnaAlignFasta_text), 'fasta')
    return retval
# Module-level containers and bookkeeping set up before the alignment loop.
# NOTE(review): only their initialisation is visible here; presumably they
# are filled further down in the loop -- confirm against the full file.
proteinAlignments = {}
proteinAlignmentStats = {}
alignmentScore = {}
alignedTaxa = set()
# Value of time() taken immediately before the per-homolog loop starts.
protein_alignment_time = time()
for homologId in singleCopyHomologs:
    # NOTE(review): the except/finally clause belonging to this try is not
    # part of the visible fragment.
    try:
        LOG.write("aligning {}\n".format(homologId))
        # Collect the protein ids of this homolog across all genomes,
        # leaving out any id that contains the text "undefined".
        geneIdSet = set()
        for genome in homologMatrix[homologId]:
            for proteinId in homologMatrix[homologId][genome]:
                if not "undefined" in proteinId:
                    geneIdSet.add(proteinId)
            #geneIdSet.update(set(homologMatrix[homologId][genome]))

        # Fetch the protein sequences for the collected ids as FASTA text.
        proteinFasta = patric_api.getSequenceOfFeatures(geneIdSet, 'protein')
        # While the FASTA is still plain text, replace 'J' with 'X' on every
        # line that is not a header (header lines start with '>').
        lines = proteinFasta.split("\n")
        for i in range(len(lines)):
            if not lines[i].startswith(">"):
                lines[i] = lines[i].replace("J", "X")
        proteinFasta = "\n".join(lines)

        # Parse the cleaned FASTA into a dict of records keyed by record id.
        seqRecords = SeqIO.parse(StringIO(proteinFasta),
                                 "fasta",
                                 alphabet=IUPAC.extended_protein)
        proteinSeqDict = SeqIO.to_dict(seqRecords)

        # Walk every (genome, gene) pair of this homolog; genes belonging to
        # genomeObject_genomeId get special handling.
        # NOTE(review): the body of that branch is cut off in this fragment.
        for genomeId in homologMatrix[homologId]:
            for geneId in homologMatrix[homologId][genomeId]:
                if genomeId == genomeObject_genomeId:
Ejemplo n.º 3
0
def proteinToCodonAlignment(proteinAlignment, extraDnaSeqs=None):
    """Build a codon alignment for a protein alignment with codonalign.build().

    The DNA for every record id in ``proteinAlignment`` is requested from
    ``patric_api.getSequenceOfFeatures(ids, 'dna')`` and parsed as FASTA;
    entries of ``extraDnaSeqs`` whose id occurs in the protein alignment
    replace or add to the fetched records.  The DNA records are then handed
    to ``codonalign.build`` in the same order as the protein records, and the
    annotations of each protein record (when non-empty) are copied onto the
    matching record of the result.

    Args:
        proteinAlignment: aligned protein records (iterable, each with
            ``.id`` and ``.annotations``).
        extraDnaSeqs: optional mapping of sequence id to DNA record.

    Returns:
        The object returned by ``codonalign.build``.

    Raises:
        Exception: if the set of DNA ids differs from the set of protein ids.
    """
    recordsById = {record.id: record for record in proteinAlignment}
    fetchedFasta = patric_api.getSequenceOfFeatures(recordsById.keys(), 'dna')

    fastaHandle = StringIO(fetchedFasta)
    dnaById = SeqIO.to_dict(
        SeqIO.parse(fastaHandle, "fasta",
                    alphabet=IUPAC.IUPACAmbiguousDNA()))

    # Caller-supplied DNA takes precedence over what was fetched.
    if extraDnaSeqs:
        for recId in recordsById:
            if recId in extraDnaSeqs:
                dnaById[recId] = extraDnaSeqs[recId]
                if Debug:
                    LOG.write("appending extra DNA seq %s\n" % recId)

    # Both collections must describe exactly the same ids.
    if set(dnaById) != set(recordsById):
        proteinIds = ", ".join(sorted(recordsById))
        dnaIds = ", ".join(sorted(dnaById))
        raise Exception(
            "Protein and DNA sets differ:\nProteins: %s\nDNA: %s\n" %
            (proteinIds, dnaIds))

    # Empty DNA is only reported, not removed.
    for recId, dnaRecord in dnaById.items():
        if len(dnaRecord.seq) == 0:
            LOG.write("warning: seqId %s length of dna was zero\n" % recId)

    # DNA records in the same order as the protein alignment rows.
    orderedDna = [dnaById[proteinRec.id] for proteinRec in proteinAlignment]
    if Debug:
        LOG.write("dna seqs has %d seqs\n" % (len(orderedDna)))

    codonAlignment = codonalign.build(pro_align=proteinAlignment,
                                      nucl_seqs=orderedDna,
                                      max_score=1000)

    # Carry the protein annotations over to the codon-aligned records.
    for codonRecord in codonAlignment:
        sourceAnnotations = recordsById[codonRecord.id].annotations
        if sourceAnnotations:
            codonRecord.annotations = sourceAnnotations.copy()
    return codonAlignment