def update_segmentation_in_align_in_seg(lenSegs, rateSegs, ratesList, multiAlign):
    """
    rebuild the per-segment pairwise alignments and the segment rate
    dictionary after the segmentation has changed

    a new segmentation changes the number of segments, so the mapping
    segId -> segRate has to be rebuilt together with the alignments

    input:
        lenSegs: length of segments
        rateSegs: estimated rate index in each segment
        ratesList: list, all rates
        multiAlign: multiple alignment, seqName -> string
    output:
        alignInSeg: dict, pair->segId->pairwise alignment
                    (non empty pairs only, i.e., no ['-', '-'])
        segRateDict: dict, segId->segRate
    """
    # tuple elements are evaluated left to right: alignments first, rates second
    return (
        pair_align_from_multi_align(multiAlign, lenSegs),
        segratedict_from_ratesegs(rateSegs, ratesList),
    )
def update_segmentation_in_align_in_seg(lenSegs, rateSegs, ratesList, multiAlign):
    """
    rebuild the per-segment pairwise alignments and the segment rate
    dictionary after the segmentation has changed

    a new segmentation changes the number of segments, so the mapping
    segId -> segRate has to be rebuilt together with the alignments

    input:
        lenSegs: length of segments
        rateSegs: estimated rate index in each segment
        ratesList: list, all rates
        multiAlign: multiple alignment, seqName -> string
    output:
        alignInSeg: dict, pair->segId->pairwise alignment
                    (non empty pairs only, i.e., no ['-', '-'])
        segRateDict: dict, segId->segRate
    """
    # tuple elements are evaluated left to right: alignments first, rates second
    return (
        pair_align_from_multi_align(multiAlign, lenSegs),
        segratedict_from_ratesegs(rateSegs, ratesList),
    )
def opt_geopip_full(m, p, qMat, segRateDict, piProbRates, ratesList, multiAlign, lenSegs, javaDirectory, modelDirectory, eStepFile, parametersPath, inputLoc, outputLoc, dataLoc, execsLoc, rFileLoc, cList, qRates=[1.], suffix='', updateQ=True, updateSeg=True, updateRate=True, updateRateFixdRateTimesTau=True, rooted=True, tol=1.e-2, bTol=1.e-3, iterMax=100): """ main function for optimization updating in segment rates, out segment rates, qMat, bDict iteratively we estimate bDict first, so bDict is not needed as input """ outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(dataLoc, suffix) rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc) seqNames = multiAlign.keys() msaList = zip(*multiAlign.values()) print 'simulation run: %s' % (inputLoc) multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(multiAlign, lenSegs) alignsInSeg = pair_align_from_multi_align(multiAlign, lenSegs) pairsList = alignsInSeg.keys() piProb = pi_from_qmat(qMat) print '### initialize branch lengths ###' bDict = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat, cList, qRates) # write_dist_from_bdict(bDict, dataLoc, suffix) # this function can be improved, by using outNameLoc, outDistLoc, outTreeLoc instead tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted) outAlignFile = inputLoc + '/' + 'all.align.txt' outTreeFile = inputLoc + '/' + 'all.tree.txt' dict_write_align_fasta(multiAlign, outAlignFile) write_tree(tree, outTreeFile) tree.reroot_at_midpoint() dif = 1.e10 bDictRelativeDif = 1 iterNum = 1 nllk = 1.e10 # nllkDif = 1 while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)): if updateSeg: print '### update segmentation ###' lenSegs, rateSegs = mle_seg_len_rate(p, msaList, ratesList, seqNames, tree, qMat, piProb, piProbRates, cList) nSeg = len(rateSegs) alignsInSeg, segRateDict = update_segmentation_in_align_in_seg(lenSegs, rateSegs, ratesList, multiAlign) 
multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(multiAlign, lenSegs) piProbRates = update_piprobrates(rateSegs, m, True) p = update_p(nSeg) print lenSegs if updateRateFixdRateTimesTau: print '### updating insertion and deletion rate when dRate*tau is fixed ###' ratesListNew, segRateDictNew, treeNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate_fix_drate_times_b(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList) print 'rate:', ratesList print 'new rate:', ratesListNew dRateRelativeDifFixdRateTimesTauList = [abs(ratesListNew[index][1] - ratesList[index][1]) / ratesList[index][1] for index in xrange(m) if ratesListNew[index][1] > 1.e-3] dRateRelativeDifFixdRateTimesTau = np.array(dRateRelativeDifFixdRateTimesTauList).max() ratesList = ratesListNew segRateDict = segRateDictNew tree = treeNew else: dRateRelativeDifFixdRateTimesTau = 0 if updateRate: print '### updating in segment rates ###' # segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList) segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList) print 'rate:', ratesList print 'new rate:', ratesListNew dRateRelativeDifList = [abs(ratesListNew[index][1] - ratesList[index][1]) / ratesList[index][1] for index in xrange(m) if ratesListNew[index][1] > 1.e-3] dRateRelativeDif = np.array(dRateRelativeDifList).max() segRateDict = segRateDictNew ratesList = ratesListNew else: dRateRelativeDif = 0 print 'rate is Fixed:', ratesList ##################### # another approach based on clustering # segRateDictNew = opt_nlists_rate_all_inseg_only(segRateDict, alignsInSeg, bDict, pairsList, piProb, qMat, cList) # # IMPROVE THIS LATER, scale each rate separately first maybe? 
# inSegRateDif = max([abs(segRateDictNew[segId] - np.array(segRateDict[segId])).max() for segId in segRateDictNew.keys()]) # # update ratesList # ratesList, segRateDictNew = ratesList_kmean(segRateDictNew, m) # print segRateDict # print segRateDictNew # segRateDict = segRateDictNew # display estiamteed in segment rates # for segId, segRate in segRateDict.iteritems(): # print 'seg id = %s, \t segRate = %s' %(segId, segRate) ###################### if updateQ: print '### updating rate matrix Q ###' # if alignUpdate: # aligns = align_mle(lans, pairsList, bDict, iRate, dRate, qMat, piProb, cList) # if cogUpdate: # alignsCogOnlyEst = est_align_cog(lans, pairsList, iRate, dRate, cRate, cRateVec, bDict, piProb, qMat, cList, aligns, criticalValue) # else: # alignsCogOnlyEst = alignsCogOnly # write_align(pairsList, alignsInSeg, bDict, inputLoc) qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc, outputLoc, javaDirectory, modelDirectory, eStepFile, parametersPath, execsLoc, lEstepTol=1.e-2, iterMax=100) qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat) qMatRelativeDif = qMatRelativeDifMat.max() # qMatDif = abs(qMatNew - qMat).max() qMat = qMatNew piProb = piProbNew # qMatDif = 0 else: print '### fixing rate matrix Q ###' qMatRelativeDif = 0 print '### updating branch lengths ###' bDictNew = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat, cList, qRates) # bDictDifVec = [abs(bDictNew[key] - bDict[key]) for key in bDict.keys()] bDictRelativeDifVec = [abs(bDictNew[key] - bDict[key]) / bDict[key] for key in bDict.keys()] bDictRelativeDif = np.array(bDictRelativeDifVec).max() # bDictDif = np.array(bDictDifVec).max() bDict = bDictNew # write_dist_from_bdict(bDict, dataLoc, suffix) tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted) print 'iter=%s: rates in seg diff = %s, rates in seg diff (fix dRate*tau) = %s, Q diff = %s, b diff = %s' % (iterNum, dRateRelativeDif, 
dRateRelativeDifFixdRateTimesTau, qMatRelativeDif, bDictRelativeDif) # print 'iter=%s: rates in seg diff = %s, Q diff = %s, b diff = %s' % (iterNum, segRateDif, qMatDif, bDictDif) dif = max(dRateRelativeDif, dRateRelativeDifFixdRateTimesTau, qMatRelativeDif, bDictRelativeDif) tree.reroot_at_midpoint() nllkNew = nllk_msa_geopip_final(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList, piProbRates, p, rateSegs) nllkDif = -nllkNew + nllk nllkOld = nllk nllk = nllkNew print 'llk increase =', nllkDif if nllkDif <= 0: print 'Log-Likelihood is decreasing! BREAK!' bDict = bDictOld qMat = qMatOld segRateDict = segRateDictOld alignsInSeg = alignsInSegOld p = pOld piProbRates = piProbRatesOld ratesList = ratesListOld lenSegs = lenSegsOld tree = treeOld nllk = nllkOld break # do not update fixing dRate*tau if changes is small if dRateRelativeDifFixdRateTimesTau < 0.05: updateRateFixdRateTimesTau = False write_tree(tree, outTreeFile) iterNum += 1 bDictOld = bDict qMatOld = qMat segRateDictOld = segRateDict alignsInSegOld = alignsInSeg pOld = p piProbRatesOld = piProbRates ratesListOld = ratesList lenSegsOld = lenSegs treeOld = tree if iterNum == iterMax: print 'maximum iteration %d reached' % (iterMax) else: print 'optimization sucess!' return bDict, qMat, segRateDict, alignsInSeg, p, piProbRates, ratesList, lenSegs, tree, nllk
def opt_geopip_full(m, p, qMat, segRateDict, piProbRates, ratesList, multiAlign, lenSegs, javaDirectory, modelDirectory, eStepFile, parametersPath, inputLoc, outputLoc, dataLoc, execsLoc, rFileLoc, cList, qRates=[1.], suffix='', updateQ=True, updateSeg=True, updateRate=True, updateRateFixdRateTimesTau=True, rooted=True, tol=1.e-2, bTol=1.e-3, iterMax=100): """ main function for optimization updating in segment rates, out segment rates, qMat, bDict iteratively we estimate bDict first, so bDict is not needed as input """ outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files( dataLoc, suffix) rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc) seqNames = multiAlign.keys() msaList = zip(*multiAlign.values()) print 'simulation run: %s' % (inputLoc) multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg( multiAlign, lenSegs) alignsInSeg = pair_align_from_multi_align(multiAlign, lenSegs) pairsList = alignsInSeg.keys() piProb = pi_from_qmat(qMat) print '### initialize branch lengths ###' bDict = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat, cList, qRates) # write_dist_from_bdict(bDict, dataLoc, suffix) # this function can be improved, by using outNameLoc, outDistLoc, outTreeLoc instead tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted) outAlignFile = inputLoc + '/' + 'all.align.txt' outTreeFile = inputLoc + '/' + 'all.tree.txt' dict_write_align_fasta(multiAlign, outAlignFile) write_tree(tree, outTreeFile) tree.reroot_at_midpoint() dif = 1.e10 bDictRelativeDif = 1 iterNum = 1 nllk = 1.e10 # nllkDif = 1 while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)): if updateSeg: print '### update segmentation ###' lenSegs, rateSegs = mle_seg_len_rate(p, msaList, ratesList, seqNames, tree, qMat, piProb, piProbRates, cList) nSeg = len(rateSegs) alignsInSeg, segRateDict = update_segmentation_in_align_in_seg( lenSegs, rateSegs, ratesList, multiAlign) 
multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg( multiAlign, lenSegs) piProbRates = update_piprobrates(rateSegs, m, True) p = update_p(nSeg) print lenSegs if updateRateFixdRateTimesTau: print '### updating insertion and deletion rate when dRate*tau is fixed ###' ratesListNew, segRateDictNew, treeNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate_fix_drate_times_b( ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList) print 'rate:', ratesList print 'new rate:', ratesListNew dRateRelativeDifFixdRateTimesTauList = [ abs(ratesListNew[index][1] - ratesList[index][1]) / ratesList[index][1] for index in xrange(m) if ratesListNew[index][1] > 1.e-3 ] dRateRelativeDifFixdRateTimesTau = np.array( dRateRelativeDifFixdRateTimesTauList).max() ratesList = ratesListNew segRateDict = segRateDictNew tree = treeNew else: dRateRelativeDifFixdRateTimesTau = 0 if updateRate: print '### updating in segment rates ###' # segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList) segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate( ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList) print 'rate:', ratesList print 'new rate:', ratesListNew dRateRelativeDifList = [ abs(ratesListNew[index][1] - ratesList[index][1]) / ratesList[index][1] for index in xrange(m) if ratesListNew[index][1] > 1.e-3 ] dRateRelativeDif = np.array(dRateRelativeDifList).max() segRateDict = segRateDictNew ratesList = ratesListNew else: dRateRelativeDif = 0 print 'rate is Fixed:', ratesList ##################### # another approach based on clustering # segRateDictNew = opt_nlists_rate_all_inseg_only(segRateDict, alignsInSeg, bDict, pairsList, piProb, qMat, cList) # # IMPROVE THIS LATER, scale each rate separately first maybe? 
# inSegRateDif = max([abs(segRateDictNew[segId] - np.array(segRateDict[segId])).max() for segId in segRateDictNew.keys()]) # # update ratesList # ratesList, segRateDictNew = ratesList_kmean(segRateDictNew, m) # print segRateDict # print segRateDictNew # segRateDict = segRateDictNew # display estiamteed in segment rates # for segId, segRate in segRateDict.iteritems(): # print 'seg id = %s, \t segRate = %s' %(segId, segRate) ###################### if updateQ: print '### updating rate matrix Q ###' # if alignUpdate: # aligns = align_mle(lans, pairsList, bDict, iRate, dRate, qMat, piProb, cList) # if cogUpdate: # alignsCogOnlyEst = est_align_cog(lans, pairsList, iRate, dRate, cRate, cRateVec, bDict, piProb, qMat, cList, aligns, criticalValue) # else: # alignsCogOnlyEst = alignsCogOnly # write_align(pairsList, alignsInSeg, bDict, inputLoc) qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc, outputLoc, javaDirectory, modelDirectory, eStepFile, parametersPath, execsLoc, lEstepTol=1.e-2, iterMax=100) qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat) qMatRelativeDif = qMatRelativeDifMat.max() # qMatDif = abs(qMatNew - qMat).max() qMat = qMatNew piProb = piProbNew # qMatDif = 0 else: print '### fixing rate matrix Q ###' qMatRelativeDif = 0 print '### updating branch lengths ###' bDictNew = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat, cList, qRates) # bDictDifVec = [abs(bDictNew[key] - bDict[key]) for key in bDict.keys()] bDictRelativeDifVec = [ abs(bDictNew[key] - bDict[key]) / bDict[key] for key in bDict.keys() ] bDictRelativeDif = np.array(bDictRelativeDifVec).max() # bDictDif = np.array(bDictDifVec).max() bDict = bDictNew # write_dist_from_bdict(bDict, dataLoc, suffix) tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted) print 'iter=%s: rates in seg diff = %s, rates in seg diff (fix dRate*tau) = %s, Q diff = %s, b diff = %s' % ( iterNum, dRateRelativeDif, 
dRateRelativeDifFixdRateTimesTau, qMatRelativeDif, bDictRelativeDif) # print 'iter=%s: rates in seg diff = %s, Q diff = %s, b diff = %s' % (iterNum, segRateDif, qMatDif, bDictDif) dif = max(dRateRelativeDif, dRateRelativeDifFixdRateTimesTau, qMatRelativeDif, bDictRelativeDif) tree.reroot_at_midpoint() nllkNew = nllk_msa_geopip_final(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList, piProbRates, p, rateSegs) nllkDif = -nllkNew + nllk nllkOld = nllk nllk = nllkNew print 'llk increase =', nllkDif if nllkDif <= 0: print 'Log-Likelihood is decreasing! BREAK!' bDict = bDictOld qMat = qMatOld segRateDict = segRateDictOld alignsInSeg = alignsInSegOld p = pOld piProbRates = piProbRatesOld ratesList = ratesListOld lenSegs = lenSegsOld tree = treeOld nllk = nllkOld break # do not update fixing dRate*tau if changes is small if dRateRelativeDifFixdRateTimesTau < 0.05: updateRateFixdRateTimesTau = False write_tree(tree, outTreeFile) iterNum += 1 bDictOld = bDict qMatOld = qMat segRateDictOld = segRateDict alignsInSegOld = alignsInSeg pOld = p piProbRatesOld = piProbRates ratesListOld = ratesList lenSegsOld = lenSegs treeOld = tree if iterNum == iterMax: print 'maximum iteration %d reached' % (iterMax) else: print 'optimization sucess!' return bDict, qMat, segRateDict, alignsInSeg, p, piProbRates, ratesList, lenSegs, tree, nllk