def opt_drate_fix_drate_times_b(multiAlign, dRate, tree, qMat, piProb, cList, qRates=[1.]):
    """
    Optimize rate = (iRate, dRate) under PIP while keeping the product
    dRate * (total branch length of the tree) fixed.

    input:
        multiAlign: dict, seqName -> aligned string; all values are assumed to
            have the same length (only the first one is measured)
        dRate: initial deletion rate, used only to fix dRate * tree.length()
        tree: tree object providing length() and scale_edges(); this function
            rescales a deep copy only
        qMat, piProb: rate matrix and stationary distribution
        cList: list of basic characters (without the gap '-')
        qRates: rate multipliers applied to qMat; pc0 is averaged over them
    output:
        rate: tuple (iRate, dRate)
        treeNew: deep copy of tree with edges rescaled so that the product
            dRate * treeNew.length() keeps its initial value
            (presumably scale_edges multiplies every edge by its argument)
    """
    dRateTimesTreeTotalBranchLength = dRate * tree.length()
    res = minimize_scalar(
        nllk_drate_fix_drate_times_b,
        args=(multiAlign, dRateTimesTreeTotalBranchLength, tree,
              qMat, piProb, cList, qRates))
    dRate = res.x

    treeNew = copy.deepcopy(tree)
    treeNew.scale_edges(
        dRateTimesTreeTotalBranchLength / (dRate * treeNew.length()))

    # list(...) keeps this working on Python 3, where keys()/values() are
    # views that cannot be indexed.
    seqNames = list(multiAlign.keys())
    mlen = len(list(multiAlign.values())[0])

    # NOTE(review): pc0 and tau below are evaluated on the original `tree`,
    # not on the rescaled `treeNew` that is returned -- confirm this is
    # intended.
    tau = tree.length()
    pc0 = pc0_from_dRate_and_tree(
        dRate, seqNames, tree, qMat, piProb, cList, qRates)

    # nu = mlen / (1 - pc0), and nu = iRate * (tau + 1 / dRate), which is
    # solved for iRate.
    nu = mlen / (1. - pc0)
    iRate = nu / (tau + 1. / dRate)
    rate = (iRate, dRate)
    return rate, treeNew
def opt_drate(multiAlign, tree, qMat, piProb, cList, qRates=[1.]):
    """
    Optimize rate = (iRate, dRate) under PIP for a fixed tree.

    dRate is the minimizer of nllk_drate returned by minimize_scalar; iRate
    is then derived from it in closed form.

    input:
        multiAlign: dict, seqName -> aligned string; all values are assumed to
            have the same length (only the first one is measured)
        tree: tree object providing length()
        qMat, piProb: rate matrix and stationary distribution
        cList: list of basic characters (without the gap '-')
        qRates: rate multipliers applied to qMat; pc0 is averaged over them
    output:
        rate: tuple (iRate, dRate)
    """
    res = minimize_scalar(
        nllk_drate, args=(multiAlign, tree, qMat, piProb, cList, qRates))
    dRate = res.x

    # list(...) keeps this working on Python 3, where keys()/values() are
    # views that cannot be indexed.
    seqNames = list(multiAlign.keys())
    mlen = len(list(multiAlign.values())[0])

    tau = tree.length()
    # Probability of the all-gap column, averaged over the rate multipliers.
    pc0 = pc0_from_dRate_and_tree(
        dRate, seqNames, tree, qMat, piProb, cList, qRates)

    # nu = mlen / (1 - pc0), and nu = iRate * (tau + 1 / dRate), which is
    # solved for iRate.
    nu = mlen / (1. - pc0)
    iRate = nu / (tau + 1. / dRate)
    rate = (iRate, dRate)
    return rate
def logprob_msa(multiAlign, tree, qMat, piProb, iRate, dRate, cList, qRates=[1.]):
    """
    Calculate logP(m) for a multiple alignment under PIP.

    1). calculate logP(c) for all unique columns c in the alignment
    2). construct a dict, c -> logP(c), and use this dict for all columns
    3). calculate logP(m) using the dictionary in 2)

    input:
        multiAlign: dict, seqName -> aligned string by loci
        tree: tree, fixed (it is used as given; no re-rooting is done here)
        qMat, piProb: transition matrix and stationary distribution
        iRate, dRate: insertion and deletion rate
        cList: character list
        qRates: rate multipliers applied to qMat; per-column terms and pc0
            are averaged over them
    output:
        logProbM: float, log probability of the MSA
    """
    # NOTE(review): this file contains a later definition of logprob_msa,
    # which shadows this one at import time.

    # list(...) keeps this working on Python 3, where keys() is a view and
    # zip() is a one-shot iterator without len().
    seqNames = list(multiAlign.keys())
    nLeaf = len(seqNames)
    msaList = list(zip(*multiAlign.values()))
    mlen = len(msaList)
    msaUnique = list(set(msaList))

    piProbExt = np.append(piProb, 0)
    cListExt = cList + ['-']
    cPhi = '-' * nLeaf

    # NOTE(review): the per-column log-probabilities are averaged across
    # qRates in log space, while pc0 is averaged as a probability -- confirm
    # that this is intended.
    logProbMsaUnique = np.zeros(len(msaUnique))
    pc0 = 0.
    for qRate in qRates:
        qMatExt = q_to_qext(qMat * qRate, dRate)
        logProbMsaUnique += logprob_msa_multi_site(
            msaUnique, seqNames, tree, qMatExt, piProbExt, dRate, cListExt)
        # Accumulate the all-gap column probability for every rate so that
        # pc0 is averaged over qRates, as in opt_drate and
        # pc0_from_dRate_and_tree, rather than taken from the last rate only.
        pc0 += prob_msa_one_site(
            cPhi, seqNames, tree, qMatExt, piProbExt, dRate, cListExt)
    logProbMsaUnique = logProbMsaUnique / len(qRates)
    pc0 = pc0 / len(qRates)

    logProbMsaDict = dict(zip(msaUnique, logProbMsaUnique))
    logProbMsa = np.array([logProbMsaDict[msa] for msa in msaList])

    # Calculate psi(.,.); -sum(log(1..mlen)) is -log(mlen!).
    tau = tree.length()
    nu = iRate * (tau + 1. / dRate)
    logPsi = (-np.sum(np.log(np.arange(1, mlen + 1)))
              + mlen * np.log(nu) + (pc0 - 1) * nu)
    logProbM = logPsi + logProbMsa.sum()
    return logProbM
def pc0_from_dRate_and_tree(dRate, seqNames, tree, qMat, piProb, cList, qRates=[1.]):
    """
    Calculate P(cPhi), the probability of the all-gap column, under PIP.

    The value is the arithmetic mean, over the multipliers in qRates, of
    prob_msa_one_site evaluated on a column made of one '-' per sequence.
    """
    # NOTE(review): this file contains a later definition of
    # pc0_from_dRate_and_tree, which shadows this one at import time.
    emptyColumn = '-' * len(seqNames)
    extendedChars = cList + ['-']

    total = 0
    for rateScale in qRates:
        # Extend the stationary distribution with a zero entry and the rate
        # matrix (scaled by this multiplier) via q_to_qext.
        extendedPi = np.append(piProb, 0)
        extendedQ = q_to_qext(qMat * rateScale, dRate)
        total += prob_msa_one_site(
            emptyColumn, seqNames, tree, extendedQ, extendedPi, dRate,
            extendedChars)
    return total / len(qRates)
def pc0_from_dRate_and_tree(dRate, seqNames, tree, qMat, piProb, cList, qRates=[1.]):
    """
    Return P(cPhi) under PIP: the probability of a column that is '-' in
    every sequence, averaged over the rate multipliers in qRates.
    """
    gapColumn = '-' * len(seqNames)
    alphabetExt = cList + ['-']
    # Stationary distribution with one extra zero entry appended; it does not
    # depend on the rate multiplier, so it is built once.
    piProbExt = np.append(piProb, 0)

    perRate = [
        prob_msa_one_site(
            gapColumn, seqNames, tree,
            q_to_qext(qMat * scale, dRate),
            piProbExt, dRate, alphabetExt)
        for scale in qRates
    ]
    return sum(perRate) / len(qRates)
def logprob_msa(multiAlign, tree, qMat, piProb, iRate, dRate, cList, qRates=[1.]):
    """
    Calculate logP(m) for a multiple alignment under PIP.

    1). calculate logP(c) for all unique columns c in the alignment
    2). construct a dict, c -> logP(c), and use this dict for all columns
    3). calculate logP(m) using the dictionary in 2)

    input:
        multiAlign: dict, seqName -> aligned string by loci
        tree: tree, fixed (it is used as given; no re-rooting is done here)
        qMat, piProb: transition matrix and stationary distribution
        iRate, dRate: insertion and deletion rate
        cList: character list
        qRates: rate multipliers applied to qMat; per-column terms and pc0
            are averaged over them
    output:
        logProbM: float, log probability of the MSA
    """
    # list(...) keeps this working on Python 3, where keys() is a view and
    # zip() is a one-shot iterator without len().
    seqNames = list(multiAlign.keys())
    nLeaf = len(seqNames)
    msaList = list(zip(*multiAlign.values()))
    mlen = len(msaList)
    msaUnique = list(set(msaList))

    piProbExt = np.append(piProb, 0)
    cListExt = cList + ['-']
    cPhi = '-' * nLeaf

    # NOTE(review): the per-column log-probabilities are averaged across
    # qRates in log space, while pc0 is averaged as a probability -- confirm
    # that this is intended.
    logProbMsaUnique = np.zeros(len(msaUnique))
    pc0 = 0.
    for qRate in qRates:
        qMatExt = q_to_qext(qMat * qRate, dRate)
        logProbMsaUnique += logprob_msa_multi_site(
            msaUnique, seqNames, tree, qMatExt, piProbExt, dRate, cListExt)
        # Accumulate the all-gap column probability for every rate so that
        # pc0 is averaged over qRates, as in opt_drate and
        # pc0_from_dRate_and_tree, rather than taken from the last rate only.
        pc0 += prob_msa_one_site(
            cPhi, seqNames, tree, qMatExt, piProbExt, dRate, cListExt)
    logProbMsaUnique = logProbMsaUnique / len(qRates)
    pc0 = pc0 / len(qRates)

    logProbMsaDict = dict(zip(msaUnique, logProbMsaUnique))
    logProbMsa = np.array([logProbMsaDict[msa] for msa in msaList])

    # Calculate psi(.,.); -sum(log(1..mlen)) is -log(mlen!).
    tau = tree.length()
    nu = iRate * (tau + 1. / dRate)
    logPsi = (-np.sum(np.log(np.arange(1, mlen + 1)))
              + mlen * np.log(nu) + (pc0 - 1) * nu)
    logProbM = logPsi + logProbMsa.sum()
    return logProbM
def logprob_align(b, alignRes, piProb, qMat, iRate, dRate, cList, qRates=[1.]):
    """
    Calculate the log probability of alignments of segments, or of
    alignments of sequences within one segment.

    input:
        b: branch length
        alignRes: alignment of sequences within one segment
        piProb: stationary distribution of qMat
        qMat: rate matrix
        iRate, dRate: insertion and deletion rate
        cList: list of basic characters
        qRates: rate multipliers applied to qMat; the per-rate tables from
            prob_c_all are combined with mean_dict_value
    output:
        logProbAlign: value returned by logprob_m for alignRes
    """
    # NOTE(review): this file contains a later definition of logprob_align,
    # which shadows this one at import time.

    def _table_for_rate(rateScale):
        # Build the prob_c_all table for a single rate multiplier.
        extQ = q_to_qext(qMat * rateScale, dRate)
        transProb = sp.linalg.expm(b * extQ)
        extPi = list(piProb) + [0]  # add one element for epsilon
        extChars = cList + ['-']
        return prob_c_all(extPi, transProb, b, dRate, extChars)

    pcAll = mean_dict_value([_table_for_rate(r) for r in qRates])
    # Entry for the ('-', '-') pair, passed to logprob_m as pc0.
    emptyProb = pcAll['-']['-']
    return logprob_m(alignRes, emptyProb, pcAll, b, iRate, dRate)
def logprob_align(b, alignRes, piProb, qMat, iRate, dRate, cList, qRates=[1.]):
    """
    Log probability of alignments of segments, or of alignments of
    sequences within one segment.

    input:
        b: branch length
        alignRes: alignment of sequences within one segment
        piProb: stationary distribution of qMat
        qMat: rate matrix
        iRate, dRate: insertion and deletion rate
        cList: list of basic characters
        qRates: rate multipliers applied to qMat; one prob_c_all table is
            built per multiplier and the tables are merged by mean_dict_value
    output:
        logProbAlign: value returned by logprob_m for alignRes
    """
    perRateTables = []
    for scale in qRates:
        transition = sp.linalg.expm(b * q_to_qext(qMat * scale, dRate))
        piExt = list(piProb)
        piExt.append(0)  # add one element for epsilon
        perRateTables.append(
            prob_c_all(piExt, transition, b, dRate, cList + ['-']))

    averaged = mean_dict_value(perRateTables)
    # Entry for the ('-', '-') pair, passed to logprob_m as pc0.
    gapGapProb = averaged['-']['-']
    logProbAlign = logprob_m(alignRes, gapGapProb, averaged, b, iRate, dRate)
    return logProbAlign