def sim_tree_hpip(tree, iRateSeg, dRateSeg, piProbSeg, qMatSeg, ratesList,
                  piProb, qMat, cList, fixSegNumber=False):
    """
    Generate sequences on every node of a tree using the hPIP model.

    Nodes are taken from tree.preorder_node_iter().  The node that has no
    parent gets its value from sim_segs_initial; every other node gets its
    value from segs_change applied to its parent's value and its own
    edge length.  This relies on a parent's value being set before any of
    its children is reached.

    NOTE: the piProb argument is not used as passed in; it is replaced by
    pi_from_qmat(qMat) before anything else happens.

    input:
        tree: object providing preorder_node_iter(); its nodes need
            parent_node and edge.length
        iRateSeg, dRateSeg, piProbSeg, ratesList, cList, fixSegNumber:
            forwarded to sim_segs_initial for the parentless node
        piProbSeg, iRateSeg, dRateSeg, qMatSeg, ratesList, qMat, cList:
            forwarded to segs_change for all other nodes
        piProb: ignored (see NOTE above)
    output:
        tree: the same tree object, with .value set on each visited node
    """
    piProb = pi_from_qmat(qMat)
    for node in tree.preorder_node_iter():
        if node.parent_node is not None:
            # evolve the parent's value along the edge leading to this node
            node.value = segs_change(
                node.parent_node.value, piProbSeg, iRateSeg, dRateSeg,
                qMatSeg, node.edge.length, ratesList, qMat, piProb, cList)
            continue
        # no parent: draw the starting value
        node.value = sim_segs_initial(
            iRateSeg, dRateSeg, piProbSeg, ratesList, piProb, cList,
            fixSegNumber)
    return tree
def sim_tree(tree, p, ratesList, piProbSeg, piProb, qMat, cList,
             fixSegNumber=False):
    """
    Generate sequences on every node of a tree using the GeoPIP model.

    Nodes are taken from tree.preorder_node_iter().  The node that has no
    parent gets its value from sim_segs_initial; every other node gets its
    value from segs_change applied to its parent's value and its own
    edge length.  This relies on a parent's value being set before any of
    its children is reached.

    NOTE: the piProb argument is not used as passed in; it is replaced by
    pi_from_qmat(qMat) before anything else happens.

    input:
        tree: a tree, of dendropy.Tree class
        p: parameter of geometric distribution
        ratesList: a list of all rates
        piProbSeg, cList, fixSegNumber: forwarded to sim_segs_initial
        qMat: rate matrix; also the source of the stationary distribution
        piProb: ignored (see NOTE above)
    output:
        tree: updated tree, with value at each node
    """
    piProb = pi_from_qmat(qMat)
    for node in tree.preorder_node_iter():
        if node.parent_node is not None:
            # evolve the parent's value along the edge leading to this node
            node.value = segs_change(
                node.parent_node.value, node.edge.length, qMat, piProb,
                cList)
            continue
        # no parent: draw the starting value
        node.value = sim_segs_initial(
            p, ratesList, piProbSeg, piProb, cList, fixSegNumber)
    return tree
def param_for_estep(qMat, rate=1.0, categoryPriors=1.0):
    """
    Build the content of the input.json file required for the E step.

    The result holds the rate matrix and the rate categories.  Nothing is
    written to disk here; the dictionary is only returned.

    NOTE: THIS FUNCTION CURRENTLY ONLY WORKS FOR ONE RATE, WHICH DIFFERS
    FROM THE R CODE INPUT.  The rate and categoryPriors arguments are
    accepted but not used: 'categoryPriors' is always [1].

    input:
        qMat: a rate matrix
        rate: unused
        categoryPriors: unused
    output:
        a dict with keys 'categoryPriors', 'observationErrorProbability',
        'stationaryDistributions' and 'rateMatrices'
    """
    stationary = pi_from_qmat(qMat)
    return {
        'categoryPriors': [1],
        'observationErrorProbability': 0,
        'stationaryDistributions': [stationary],
        'rateMatrices': [qMat],
    }
def qmat_paml_format(qMat):
    """
    Convert a rate matrix to PaML format, i.e., frequencies and relative
    rate parameters.

    input:
        qMat: a rate matrix (indexed here as a 4 x 4 array)
    output:
        piProb: stationary distribution of the rate matrix
        rate: relative rate parameters, in the order of
            A <-> C, A <-> G, A <-> T, C <-> G, C <-> T, G <-> T,
            scaled so that the last entry (G <-> T) equals 1
    """
    piProb = pi_from_qmat(qMat)
    # assumes qMat is a 2-D numpy array and piProb a length-4 vector, so
    # that the division broadcasts over columns -- TODO confirm
    scaled = qMat / piProb
    # entries strictly above the diagonal, taken row by row
    upperRows = [scaled[row, row + 1:4] for row in range(3)]
    relRates = np.hstack(upperRows)
    return piProb, relRates / relRates[-1]
def opt_pip_full(rate, qMat, multiAlign, javaDirectory, modelDirectory,
                 eStepFile, parametersPath, inputLoc, outputLoc, dataLoc,
                 execsLoc, rFileLoc, cList, qRates=[1.], suffix='',
                 updateQ=True, updateRate=True, updateRateFixdRateTimesb=True,
                 tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    Optimization for all parameters: iRate, dRate, qMat, tree (bDict) in PIP.

    Updates iRate and dRate, qMat and the tree (bDict) iteratively.  bDict
    is estimated first given the other parameters, so a starting bDict
    (tree) is not needed.

    Each iteration runs, in this order:
        1. (if updateRateFixdRateTimesb) opt_drate_fix_drate_times_b,
           which returns a new rate and a new tree;
        2. (if updateRate) opt_drate;
        3. (if updateQ) opt_qmat_em_full, giving a new qMat and piProb;
        4. opt_nlists_bonly for bDict, then a tree rebuilt from bDict and
           rerooted at midpoint;
        5. nllk_rate, compared with the previous iteration's value.
    The loop keeps going only while dif > tol AND bDictRelativeDif > bTol
    AND iterNum < iterMax, and it is left immediately when nllk_rate did
    not go down.

    NOTE(review): this name is defined a second time further down in this
    file; the later definition is the one bound after import.

    input:
        rate: indexable; rate[1] is used as dRate.  Presumably
            [iRate, dRate] -- TODO confirm
        qMat: starting rate matrix
        multiAlign: multiple alignment; written to
            inputLoc + '/all.align.txt'
        javaDirectory, modelDirectory, eStepFile, parametersPath,
        outputLoc, execsLoc: forwarded to opt_qmat_em_full
        inputLoc: directory for the alignment/tree output files; also
            forwarded to opt_qmat_em_full
        dataLoc, rFileLoc, suffix: used to build the R script and the
            name/dist/tree file locations
        cList: forwarded to the optimizers
        qRates: forwarded to opt_nlists_bonly and opt_drate.  The default
            is a shared list object; this function only passes it on
        updateQ, updateRate, updateRateFixdRateTimesb: switch the
            corresponding update on or off.  The dRate*b update is also
            switched off for the rest of the run once its relative change
            drops below 0.5
        tol, bTol: thresholds on the relative changes (see loop condition)
        iterMax: upper bound for the iteration counter, which starts at 1
    output:
        rate: the final rate wrapped in a one-element list
        qMat: the final rate matrix
        bDict: the final result of opt_nlists_bonly
        tree: the final tree, rerooted at midpoint
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print('simulation run: %s' % (inputLoc))    # directory for running EM
    alignInSeg = pair_align_from_multi_align(multiAlign)
    pairsList = alignInSeg.keys()
    piProb = pi_from_qmat(qMat)

    print('### initialize tree ###')
    segRateDict = {0: rate}    # only one segment
    bDict = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb,
                             qMat, cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)
    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()

    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    nllkDif = 1
    # set when the loop is left because the likelihood got worse, so that
    # the run is not reported as a success afterwards
    llkDecreased = False
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateRateFixdRateTimesb:
            print('### updating insertion and deletion rate when dRate*b is fixed ###')
            dRate = rate[1]
            print(tree.length())
            # NOTE(review): a literal [1.] is passed here instead of the
            # qRates argument -- confirm this is intended
            rateNew, tree = opt_drate_fix_drate_times_b(
                multiAlign, dRate, tree, qMat, piProb, cList, qRates=[1.])
            print(tree.length())
            print(rate)
            print(rateNew)
            print((np.array(rateNew) - np.array(rate)) / np.array(rate))
            dRateRelativeDifFixdRateTimesb = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            dRateRelativeDifFixdRateTimesb = 0

        if updateRate:
            print('### updating insertion and deletion rate ###\n')
            # rateNew = opt_rate(rate, multiAlign, tree, qMat, piProb, cList)
            rateNew = opt_drate(multiAlign, tree, qMat, piProb, cList, qRates)
            print(rate)
            print(rateNew)
            print((np.array(rateNew) - np.array(rate)) / np.array(rate))
            dRateRelativeDif = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            print('%s %s' % ('rate is fixed:', rate))
            dRateRelativeDif = 0

        if updateQ:
            print('### updating rate matrix Q ###\n')
            qMatNew, piProbNew = opt_qmat_em_full(
                qMat, cList, inputLoc, outputLoc, javaDirectory,
                modelDirectory, eStepFile, parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print('### fixing rate matrix Q ###\n')
            qMatRelativeDif = 0

        print('### updating tree ###')
        segRateDict = {0: rate}    # only one segment
        bDictNew = opt_nlists_bonly(pairsList, alignInSeg, segRateDict,
                                    piProb, qMat, cList, qRates)
        bDictRelativeDifVec = [abs(bDictNew[key] - bDict[key]) / bDict[key]
                               for key in bDict.keys()]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        # Index both dicts by the same key list: the iteration order of two
        # separate dicts is not guaranteed to match, so pairing up their
        # .values() could compare entries of different pairs.
        bKeys = list(bDict.keys())
        bOldVals = np.array([bDict[key] for key in bKeys])
        bNewVals = np.array([bDictNew[key] for key in bKeys])
        print((bNewVals - bOldVals) / bOldVals)
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(
            bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix,
            rooted=True)
        print('iter=%s: dRate diff = %s, dRate diff (fix dRate*b) = %s, Q diff = %s, bDict diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesb,
            qMatRelativeDif, bDictRelativeDif))
        dif = max(dRateRelativeDif, qMatRelativeDif, bDictRelativeDif,
                  dRateRelativeDifFixdRateTimesb)
        tree.reroot_at_midpoint()
        nllkNew = nllk_rate(rate, multiAlign, tree, qMat, piProb, cList)
        nllkDif = nllk - nllkNew
        print('%s %s' % ('llk increase =', nllkDif))
        nllk = nllkNew
        write_tree(tree, outTreeFile)
        iterNum += 1
        # if dRateRelativeDifFixdRateTimesb is small, then skip that update
        if dRateRelativeDifFixdRateTimesb < 0.5:
            updateRateFixdRateTimesb = False
        if nllkDif <= 0:
            print('Log-Lilkelihood is decreasing! BREAK')
            llkDecreased = True
            break

    if iterNum == iterMax:
        print('maximum iteration %d reached' % (iterMax))
    elif not llkDecreased:
        # only claim success when the loop ended through its own condition
        print('optimization sucess!')
    rate = [rate]
    return rate, qMat, bDict, tree
def opt_pip_full(rate, qMat, multiAlign, javaDirectory, modelDirectory,
                 eStepFile, parametersPath, inputLoc, outputLoc, dataLoc,
                 execsLoc, rFileLoc, cList, qRates=[1.], suffix='',
                 updateQ=True, updateRate=True, updateRateFixdRateTimesb=True,
                 tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    Optimization for all parameters: iRate, dRate, qMat, tree (bDict) in PIP.

    Updates iRate and dRate, qMat and the tree (bDict) iteratively.  bDict
    is estimated first given the other parameters, so a starting bDict
    (tree) is not needed.

    Each iteration runs, in this order:
        1. (if updateRateFixdRateTimesb) opt_drate_fix_drate_times_b,
           which returns a new rate and a new tree;
        2. (if updateRate) opt_drate;
        3. (if updateQ) opt_qmat_em_full, giving a new qMat and piProb;
        4. opt_nlists_bonly for bDict, then a tree rebuilt from bDict and
           rerooted at midpoint;
        5. nllk_rate, compared with the previous iteration's value.
    The loop keeps going only while dif > tol AND bDictRelativeDif > bTol
    AND iterNum < iterMax, and it is left immediately when nllk_rate did
    not go down.

    NOTE(review): an earlier definition with this same name precedes this
    one in the file; this later definition replaces it at import time.

    input:
        rate: indexable; rate[1] is used as dRate.  Presumably
            [iRate, dRate] -- TODO confirm
        qMat: starting rate matrix
        multiAlign: multiple alignment; written to
            inputLoc + '/all.align.txt'
        javaDirectory, modelDirectory, eStepFile, parametersPath,
        outputLoc, execsLoc: forwarded to opt_qmat_em_full
        inputLoc: directory for the alignment/tree output files; also
            forwarded to opt_qmat_em_full
        dataLoc, rFileLoc, suffix: used to build the R script and the
            name/dist/tree file locations
        cList: forwarded to the optimizers
        qRates: forwarded to opt_nlists_bonly and opt_drate.  The default
            is a shared list object; this function only passes it on
        updateQ, updateRate, updateRateFixdRateTimesb: switch the
            corresponding update on or off.  The dRate*b update is also
            switched off for the rest of the run once its relative change
            drops below 0.5
        tol, bTol: thresholds on the relative changes (see loop condition)
        iterMax: upper bound for the iteration counter, which starts at 1
    output:
        rate: the final rate wrapped in a one-element list
        qMat: the final rate matrix
        bDict: the final result of opt_nlists_bonly
        tree: the final tree, rerooted at midpoint
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    print('simulation run: %s' % (inputLoc))    # directory for running EM
    alignInSeg = pair_align_from_multi_align(multiAlign)
    pairsList = alignInSeg.keys()
    piProb = pi_from_qmat(qMat)

    print('### initialize tree ###')
    segRateDict = {0: rate}    # only one segment
    bDict = opt_nlists_bonly(pairsList, alignInSeg, segRateDict, piProb,
                             qMat, cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)
    tree = tree_use_r_for_unknown_number_of_leaves(
        bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted=True)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()

    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    nllkDif = 1
    # set when the loop is left because the likelihood got worse, so that
    # the run is not reported as a success afterwards
    llkDecreased = False
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateRateFixdRateTimesb:
            print('### updating insertion and deletion rate when dRate*b is fixed ###')
            dRate = rate[1]
            print(tree.length())
            # NOTE(review): a literal [1.] is passed here instead of the
            # qRates argument -- confirm this is intended
            rateNew, tree = opt_drate_fix_drate_times_b(
                multiAlign, dRate, tree, qMat, piProb, cList, qRates=[1.])
            print(tree.length())
            print(rate)
            print(rateNew)
            # the whole quotient is parenthesised so that it is what gets
            # printed regardless of how the print call/statement is parsed
            print((np.array(rateNew) - np.array(rate)) / np.array(rate))
            dRateRelativeDifFixdRateTimesb = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            dRateRelativeDifFixdRateTimesb = 0

        if updateRate:
            print('### updating insertion and deletion rate ###\n')
            # rateNew = opt_rate(rate, multiAlign, tree, qMat, piProb, cList)
            rateNew = opt_drate(multiAlign, tree, qMat, piProb, cList, qRates)
            print(rate)
            print(rateNew)
            print((np.array(rateNew) - np.array(rate)) / np.array(rate))
            dRateRelativeDif = abs(rate[1] - rateNew[1]) / rate[1]
            rate = rateNew
        else:
            print('%s %s' % ('rate is fixed:', rate))
            dRateRelativeDif = 0

        if updateQ:
            print('### updating rate matrix Q ###\n')
            qMatNew, piProbNew = opt_qmat_em_full(
                qMat, cList, inputLoc, outputLoc, javaDirectory,
                modelDirectory, eStepFile, parametersPath, execsLoc)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            qMat = qMatNew
            piProb = piProbNew
        else:
            print('### fixing rate matrix Q ###\n')
            qMatRelativeDif = 0

        print('### updating tree ###')
        segRateDict = {0: rate}    # only one segment
        bDictNew = opt_nlists_bonly(pairsList, alignInSeg, segRateDict,
                                    piProb, qMat, cList, qRates)
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        # Index both dicts by the same key list: the iteration order of two
        # separate dicts is not guaranteed to match, so pairing up their
        # .values() could compare entries of different pairs.
        bKeys = list(bDict.keys())
        bOldVals = np.array([bDict[key] for key in bKeys])
        bNewVals = np.array([bDictNew[key] for key in bKeys])
        print((bNewVals - bOldVals) / bOldVals)
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(
            bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix,
            rooted=True)
        print('iter=%s: dRate diff = %s, dRate diff (fix dRate*b) = %s, Q diff = %s, bDict diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesb,
            qMatRelativeDif, bDictRelativeDif))
        dif = max(dRateRelativeDif, qMatRelativeDif, bDictRelativeDif,
                  dRateRelativeDifFixdRateTimesb)
        tree.reroot_at_midpoint()
        nllkNew = nllk_rate(rate, multiAlign, tree, qMat, piProb, cList)
        nllkDif = nllk - nllkNew
        print('%s %s' % ('llk increase =', nllkDif))
        nllk = nllkNew
        write_tree(tree, outTreeFile)
        iterNum += 1
        # if dRateRelativeDifFixdRateTimesb is small, then skip that update
        if dRateRelativeDifFixdRateTimesb < 0.5:
            updateRateFixdRateTimesb = False
        if nllkDif <= 0:
            print('Log-Lilkelihood is decreasing! BREAK')
            llkDecreased = True
            break

    if iterNum == iterMax:
        print('maximum iteration %d reached' % (iterMax))
    elif not llkDecreased:
        # only claim success when the loop ended through its own condition
        print('optimization sucess!')
    rate = [rate]
    return rate, qMat, bDict, tree