Exemple #1
0
def opt_drate_fix_drate_times_b(multiAlign, dRate, tree, qMat, piProb, cList, qRates=[1.]):
    """
    optimization function for rate = (iRate, dRate) based on PIP, keeping
    the product dRate * (total tree length) fixed

    The deletion rate is obtained with minimize_scalar applied to
    nllk_drate_fix_drate_times_b. A deep copy of the tree is then rescaled so
    that (optimized dRate) * (rescaled total length) equals the input product.
    input:
        multiAlign: dict, seqName->aligned string by loci
        dRate: starting deletion rate, only used to fix dRate * tree.length()
        tree: tree; the rescaling is applied to a deep copy, not to this object
        qMat, piProb: rate matrix and stationary distribution
        cList: character list (the gap '-' is appended internally)
        qRates: multipliers applied to qMat; pc0 is averaged over them
    output:
        rate: tuple (iRate, dRate)
        treeNew: rescaled deep copy of tree
    """
    dRateTimesTreeTotalBranchLength = dRate * tree.length()
    res = minimize_scalar(
        nllk_drate_fix_drate_times_b,
        args=(multiAlign, dRateTimesTreeTotalBranchLength, tree, qMat,
              piProb, cList, qRates))
    dRate = res.x
    treeNew = copy.deepcopy(tree)
    treeNew.scale_edges(dRateTimesTreeTotalBranchLength / (dRate * treeNew.length()))
    # Materialise keys as a list: on Python 3 dict.keys() is a view, which
    # cannot be indexed by downstream helpers.
    seqNames = list(multiAlign.keys())
    nLeaf = len(seqNames)
    cPhi = '-' * nLeaf
    # NOTE(review): tau and pc0 below are computed from the original `tree`,
    # not from the rescaled `treeNew` that is returned -- confirm this is
    # intended.
    tau = tree.length()
    cListExt = cList + ['-']
    piProbExt = np.append(piProb, 0)
    # Average the all-gap column probability over the rate multipliers.
    pc0 = 0
    for qRate in qRates:
        qMatExt = q_to_qext(qMat*qRate, dRate)
        pc0 += prob_msa_one_site(cPhi, seqNames, tree, qMatExt, piProbExt, dRate, cListExt)
    pc0 = pc0 / len(qRates)
    # dict.values() is not indexable on Python 3, so go through a list.
    mlen = len(list(multiAlign.values())[0])
    # With logPsi = -log(mlen!) + mlen * log(nu) + (pc0 - 1) * nu, setting
    # d(logPsi)/d(nu) = 0 gives the closed form nu = mlen / (1 - pc0).
    nu = mlen / (1. - pc0)
    # nu = iRate * (tau + 1 / dRate), solved here for iRate.
    iRate = nu / (tau + 1. / dRate)
    rate = (iRate, dRate)
    return rate, treeNew
Exemple #2
0
def opt_drate(multiAlign, tree, qMat, piProb, cList, qRates=[1.]):
    """
    optimization function for rate = (iRate, dRate) based on PIP

    The deletion rate is obtained with minimize_scalar applied to nllk_drate;
    the insertion rate then follows in closed form from the alignment length
    and the probability of the all-gap column.
    input:
        multiAlign: dict, seqName->aligned string by loci
        tree: tree, fixed
        qMat, piProb: rate matrix and stationary distribution
        cList: character list (the gap '-' is appended internally)
        qRates: multipliers applied to qMat; pc0 is averaged over them
    output:
        rate: tuple (iRate, dRate)
    """
    res = minimize_scalar(nllk_drate,
                          args=(multiAlign, tree, qMat, piProb, cList, qRates))
    dRate = res.x
    # Materialise keys as a list: on Python 3 dict.keys() is a view, which
    # cannot be indexed by downstream helpers.
    seqNames = list(multiAlign.keys())
    nLeaf = len(seqNames)
    cPhi = '-' * nLeaf
    tau = tree.length()
    cListExt = cList + ['-']
    piProbExt = np.append(piProb, 0)
    # Average the all-gap column probability over the rate multipliers.
    pc0 = 0
    for qRate in qRates:
        qMatExt = q_to_qext(qMat * qRate, dRate)
        pc0 += prob_msa_one_site(cPhi, seqNames, tree, qMatExt, piProbExt,
                                 dRate, cListExt)
    pc0 = pc0 / len(qRates)
    # dict.values() is not indexable on Python 3, so go through a list.
    mlen = len(list(multiAlign.values())[0])
    # With logPsi = -log(mlen!) + mlen * log(nu) + (pc0 - 1) * nu, setting
    # d(logPsi)/d(nu) = 0 gives the closed form nu = mlen / (1 - pc0).
    nu = mlen / (1. - pc0)
    # nu = iRate * (tau + 1 / dRate), solved here for iRate.
    iRate = nu / (tau + 1. / dRate)
    rate = (iRate, dRate)
    return rate
Exemple #3
0
def logprob_msa(multiAlign,
                tree,
                qMat,
                piProb,
                iRate,
                dRate,
                cList,
                qRates=[1.]):
    """
    calculate logP(m) for a multiple alignment under PIP
    1). calculate logP(c) for all unique c in the alignments
    2). construct a dict, c->logP(c), and use this dict for all alignments
    3). calculate logP(m) use the dictionary in 2)
    input:
        multiAlign: dict, seqName->aligned string by loci
        tree: tree, fixed
        qMat, piProb: transition matrix and stationary distribution
        iRate, dRate: insertion and deletion rate
        cList: character list
        qRates: multipliers applied to qMat; the per-column values returned
            by logprob_msa_multi_site are averaged over them
    output:
        logProbM: float, log probability of the MSA
    raises:
        ValueError: if qRates is empty
    """
    if len(qRates) == 0:
        # Previously this surfaced later as an unbound `qMatExt`.
        raise ValueError('qRates must contain at least one rate multiplier')
    # Materialise keys as a list: on Python 3 dict.keys() is a view.
    seqNames = list(multiAlign.keys())
    nLeaf = len(seqNames)
    # One tuple per alignment column. It must be a list because it is
    # measured with len() and traversed twice (a Python 3 zip iterator
    # supports neither).
    msaList = list(zip(*multiAlign.values()))
    mlen = len(msaList)
    msaUnique = list(set(msaList))
    piProbExt = np.append(piProb, 0)
    cListExt = cList + ['-']
    logProbMsaUnique = np.zeros(len(msaUnique))
    for qRate in qRates:
        qMatExt = q_to_qext(qMat * qRate, dRate)
        logProbMsaTem = logprob_msa_multi_site(msaUnique, seqNames, tree,
                                               qMatExt, piProbExt, dRate,
                                               cListExt)
        logProbMsaUnique += logProbMsaTem
    logProbMsaUnique = logProbMsaUnique / len(qRates)
    # Look each column up in the unique-column table instead of recomputing.
    logProbMsaDict = dict(zip(msaUnique, logProbMsaUnique))
    logProbMsa = np.array([logProbMsaDict[msa] for msa in msaList])
    # Calculate psi(.,.).
    tau = tree.length()
    nu = iRate * (tau + 1. / dRate)
    cPhi = '-' * nLeaf
    # NOTE(review): pc0 uses qMatExt of the LAST entry of qRates only, it is
    # not averaged over the rate multipliers -- confirm this is intended.
    pc0 = prob_msa_one_site(cPhi, seqNames, tree, qMatExt, piProbExt, dRate,
                            cListExt)
    # logPsi = -log(mlen!) + mlen * log(nu) + (pc0 - 1) * nu
    logPsi = -np.sum(np.log(np.arange(
        1, mlen + 1))) + mlen * np.log(nu) + (pc0 - 1) * nu
    logProbM = logPsi + logProbMsa.sum()
    return logProbM
Exemple #4
0
def pc0_from_dRate_and_tree(dRate, seqNames, tree, qMat, piProb, cList, qRates=[1.]):
    """
    Return P(cPhi) under PIP, where cPhi is the column made of one gap per
    sequence name, averaged over the rate multipliers in qRates.
    """
    gapColumn = '-' * len(seqNames)
    extAlphabet = cList + ['-']

    def _gap_prob(rateScale):
        # Probability of the all-gap column for one multiplier of qMat.
        return prob_msa_one_site(gapColumn, seqNames, tree,
                                 q_to_qext(qMat*rateScale, dRate),
                                 np.append(piProb, 0), dRate, extAlphabet)

    return sum(_gap_prob(rateScale) for rateScale in qRates) / len(qRates)
Exemple #5
0
def pc0_from_dRate_and_tree(dRate,
                            seqNames,
                            tree,
                            qMat,
                            piProb,
                            cList,
                            qRates=[1.]):
    """
    Probability of the all-gap column cPhi under PIP.

    One value is computed per multiplier in qRates (qMat scaled by that
    multiplier) and the arithmetic mean of those values is returned.
    """
    emptyColumn = '-' * len(seqNames)
    charsWithGap = cList + ['-']
    accumulated = 0
    for scale in qRates:
        scaledQExt = q_to_qext(qMat * scale, dRate)
        statDistExt = np.append(piProb, 0)
        accumulated = accumulated + prob_msa_one_site(
            emptyColumn, seqNames, tree, scaledQExt, statDistExt, dRate,
            charsWithGap)
    return accumulated / len(qRates)
Exemple #6
0
def logprob_msa(multiAlign, tree, qMat, piProb, iRate, dRate, cList, qRates=[1.]):
    """
    calculate logP(m) for a multiple alignment under PIP
    1). calculate logP(c) for all unique c in the alignments
    2). construct a dict, c->logP(c), and use this dict for all alignments
    3). calculate logP(m) use the dictionary in 2)
    input:
        multiAlign: dict, seqName->aligned string by loci
        tree: tree, fixed
        qMat, piProb: transition matrix and stationary distribution
        iRate, dRate: insertion and deletion rate
        cList: character list
        qRates: multipliers applied to qMat; the per-column values returned
            by logprob_msa_multi_site are averaged over them
    output:
        logProbM: float, log probability of the MSA
    raises:
        ValueError: if qRates is empty
    """
    if len(qRates) == 0:
        # Previously this surfaced later as an unbound `qMatExt`.
        raise ValueError('qRates must contain at least one rate multiplier')
    # Materialise keys as a list: on Python 3 dict.keys() is a view.
    seqNames = list(multiAlign.keys())
    nLeaf = len(seqNames)
    # One tuple per alignment column. It must be a list because it is
    # measured with len() and traversed twice (a Python 3 zip iterator
    # supports neither).
    msaList = list(zip(*multiAlign.values()))
    mlen = len(msaList)
    msaUnique = list(set(msaList))
    piProbExt = np.append(piProb, 0)
    cListExt = cList + ['-']
    logProbMsaUnique = np.zeros(len(msaUnique))
    for qRate in qRates:
        qMatExt = q_to_qext(qMat * qRate, dRate)
        logProbMsaTem = logprob_msa_multi_site(msaUnique, seqNames, tree, qMatExt, piProbExt, dRate, cListExt)
        logProbMsaUnique += logProbMsaTem
    logProbMsaUnique = logProbMsaUnique / len(qRates)
    # Look each column up in the unique-column table instead of recomputing.
    logProbMsaDict = dict(zip(msaUnique, logProbMsaUnique))
    logProbMsa = np.array([logProbMsaDict[msa] for msa in msaList])
    # Calculate psi(.,.).
    tau = tree.length()
    nu = iRate * (tau + 1. / dRate)
    cPhi = '-' * nLeaf
    # NOTE(review): pc0 uses qMatExt of the LAST entry of qRates only, it is
    # not averaged over the rate multipliers -- confirm this is intended.
    pc0 = prob_msa_one_site(cPhi, seqNames, tree, qMatExt, piProbExt, dRate, cListExt)
    # logPsi = -log(mlen!) + mlen * log(nu) + (pc0 - 1) * nu
    logPsi = -np.sum(np.log(np.arange(1, mlen+1))) + mlen * np.log(nu) + (pc0 - 1) * nu
    logProbM = logPsi + logProbMsa.sum()
    return logProbM
Exemple #7
0
def logprob_align(b, alignRes, piProb, qMat, iRate, dRate, cList, qRates=[1.]):
    """
    Log-probability of an alignment across one branch of length b, either an
    alignment of segments or of sequences within one segment.
    input:
        b: branch length
        alignRes: alignment of sequences within one segment
        piProb: stationary distribution of qMat
        qMat: rate matrix
        iRate, dRate: insertion and deletion rate
        cList: list of basic characters
        qRates: multipliers applied to qMat; per-multiplier tables are
            combined with mean_dict_value
    output:
        logProbAlign: likelihood
    """
    def _table_for(rateScale):
        # Character-pair table from prob_c_all for one multiplier of qMat.
        extQ = q_to_qext(qMat*rateScale, dRate)
        transMat = sp.linalg.expm(b*extQ)
        extPi = list(piProb) + [0]   # extra slot for the gap (epsilon)
        extChars = cList + ['-']
        return prob_c_all(extPi, transMat, b, dRate, extChars)

    tables = [_table_for(rateScale) for rateScale in qRates]
    meanTable = mean_dict_value(tables)
    # The gap/gap entry is passed separately as pc0.
    return logprob_m(alignRes, meanTable['-']['-'], meanTable, b, iRate, dRate)
Exemple #8
0
def logprob_align(b, alignRes, piProb, qMat, iRate, dRate, cList, qRates=[1.]):
    """
    Compute the log-probability of an alignment over a branch of length b
    (alignment of segments, or of sequences within one segment).
    input:
        b: branch length
        alignRes: alignment of sequences within one segment
        piProb: stationary distribution of qMat
        qMat: rate matrix
        iRate, dRate: insertion and deletion rate
        cList: list of basic characters
        qRates: multipliers applied to qMat, one table is built per entry
    output:
        logProbAlign: likelihood
    """
    perRateTables = []
    for scale in qRates:
        # Extended rate matrix for this multiplier, exponentiated over b.
        branchProb = sp.linalg.expm(b * q_to_qext(qMat * scale, dRate))
        stationaryExt = list(piProb) + [0]  # trailing 0 is the epsilon entry
        alphabetExt = cList + ['-']
        perRateTables.append(
            prob_c_all(stationaryExt, branchProb, b, dRate, alphabetExt))
    averaged = mean_dict_value(perRateTables)
    gapGapProb = averaged['-']['-']
    return logprob_m(alignRes, gapGapProb, averaged, b, iRate, dRate)