Example #1
0
def update_segmentation_in_align_in_seg(lenSegs, rateSegs, ratesList, multiAlign):
    """
    rebuild the per-segment data after the segmentation has changed
    a new segmentation changes the number of segments, so both the pairwise
    alignments per segment and the segment -> rate mapping are recomputed
    input:
        lenSegs: length of segments
        rateSegs: estimated rate index in each segment
        ratesList: list, all rates
        multiAlign: multiple alignment, seqName -> string
    output (tuple, in this order):
        pairwise alignments: dict, pair->segId->pairwise alignment (non empty pairs only, i.e., no ['-', '-'])
        segment rates: dict, segId->segRate
    """
    # tuple elements are evaluated left to right: alignments first, then rates
    return (
        pair_align_from_multi_align(multiAlign, lenSegs),
        segratedict_from_ratesegs(rateSegs, ratesList),
    )
Example #2
0
def update_segmentation_in_align_in_seg(lenSegs, rateSegs, ratesList,
                                        multiAlign):
    """
    recompute everything that depends on the segmentation
    the number of segments changes with a new segmentation, so the pairwise
    alignments in segments and the segment rate dictionary are both rebuilt
    input:
        lenSegs: length of segments
        rateSegs: estimated rate index in each segment
        ratesList: list, all rates
        multiAlign: multiple alignment, seqName -> string
    output:
        pairAlignsBySeg: dict, pair->segId->pairwise alignment (non empty pairs only, i.e., no ['-', '-'])
        rateBySeg: dict, segId->segRate
    """
    # split the multiple alignment into pairwise alignments per segment
    pairAlignsBySeg = pair_align_from_multi_align(multiAlign, lenSegs)
    # map every segment id to its rate under the new segmentation
    rateBySeg = segratedict_from_ratesegs(rateSegs, ratesList)
    return pairAlignsBySeg, rateBySeg
Example #3
0
def opt_geopip_full(m, p, qMat, segRateDict, piProbRates, ratesList, multiAlign, lenSegs, javaDirectory, modelDirectory, eStepFile, parametersPath, inputLoc, outputLoc, dataLoc, execsLoc, rFileLoc, cList, qRates=[1.], suffix='', updateQ=True, updateSeg=True, updateRate=True, updateRateFixdRateTimesTau=True, rooted=True, tol=1.e-2, bTol=1.e-3, iterMax=100):
    """
    main function for optimization
    updating in segment rates, out segment rates, qMat, bDict iteratively
    we estimate bDict first, so bDict is not needed as input
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    seqNames = multiAlign.keys()
    msaList = zip(*multiAlign.values())
    print 'simulation run: %s' % (inputLoc)
    multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(multiAlign, lenSegs)
    alignsInSeg = pair_align_from_multi_align(multiAlign, lenSegs)
    pairsList = alignsInSeg.keys()
    piProb = pi_from_qmat(qMat)
    print '###  initialize branch lengths  ###'
    bDict = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat, cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)    # this function can be improved, by using outNameLoc, outDistLoc, outTreeLoc instead
    tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    # nllkDif = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateSeg:
            print '### update segmentation ###'
            lenSegs, rateSegs = mle_seg_len_rate(p, msaList, ratesList, seqNames, tree, qMat, piProb, piProbRates, cList)
            nSeg = len(rateSegs)
            alignsInSeg, segRateDict = update_segmentation_in_align_in_seg(lenSegs, rateSegs, ratesList, multiAlign)
            multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(multiAlign, lenSegs)
            piProbRates = update_piprobrates(rateSegs, m, True)
            p = update_p(nSeg)
        print lenSegs
        if updateRateFixdRateTimesTau:
            print '### updating insertion and deletion rate when dRate*tau is fixed ###'
            ratesListNew, segRateDictNew, treeNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate_fix_drate_times_b(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList)
            print 'rate:', ratesList
            print 'new rate:', ratesListNew
            dRateRelativeDifFixdRateTimesTauList = [abs(ratesListNew[index][1] - ratesList[index][1]) / ratesList[index][1] for index in xrange(m) if ratesListNew[index][1] > 1.e-3]
            dRateRelativeDifFixdRateTimesTau = np.array(dRateRelativeDifFixdRateTimesTauList).max()
            ratesList = ratesListNew
            segRateDict = segRateDictNew
            tree = treeNew
        else:
            dRateRelativeDifFixdRateTimesTau = 0
        if updateRate:
            print '### updating in segment rates ###'
            # segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList)
            segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList)
            print 'rate:', ratesList
            print 'new rate:', ratesListNew
            dRateRelativeDifList = [abs(ratesListNew[index][1] - ratesList[index][1]) / ratesList[index][1] for index in xrange(m) if ratesListNew[index][1] > 1.e-3]
            dRateRelativeDif = np.array(dRateRelativeDifList).max()
            segRateDict = segRateDictNew
            ratesList = ratesListNew
        else:
            dRateRelativeDif = 0
            print 'rate is Fixed:', ratesList
        #####################
        # another approach based on clustering
        # segRateDictNew = opt_nlists_rate_all_inseg_only(segRateDict, alignsInSeg, bDict, pairsList, piProb, qMat, cList)
        # # IMPROVE THIS LATER, scale each rate separately first maybe?
        # inSegRateDif = max([abs(segRateDictNew[segId] - np.array(segRateDict[segId])).max() for segId in segRateDictNew.keys()])
        # # update ratesList
        # ratesList, segRateDictNew = ratesList_kmean(segRateDictNew, m)
        # print segRateDict
        # print segRateDictNew
        # segRateDict = segRateDictNew
        # display estiamteed in segment rates
        # for segId, segRate in segRateDict.iteritems():
        #     print 'seg id = %s, \t segRate = %s' %(segId, segRate)
        ######################
        if updateQ:
            print '### updating rate matrix Q ###'
            # if alignUpdate:
            #     aligns = align_mle(lans, pairsList, bDict, iRate, dRate, qMat, piProb, cList)
            # if cogUpdate:
            #     alignsCogOnlyEst = est_align_cog(lans, pairsList, iRate, dRate, cRate, cRateVec, bDict, piProb, qMat, cList, aligns, criticalValue)
            # else:
            #     alignsCogOnlyEst = alignsCogOnly
            # write_align(pairsList, alignsInSeg, bDict, inputLoc)
            qMatNew, piProbNew = opt_qmat_em_full(qMat, cList, inputLoc, outputLoc, javaDirectory, modelDirectory, eStepFile, parametersPath, execsLoc, lEstepTol=1.e-2, iterMax=100)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            # qMatDif = abs(qMatNew - qMat).max()
            qMat = qMatNew
            piProb = piProbNew
            # qMatDif = 0
        else:
            print '### fixing rate matrix Q ###'
            qMatRelativeDif = 0
        print '### updating branch lengths ###'
        bDictNew = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat, cList, qRates)
        # bDictDifVec = [abs(bDictNew[key] - bDict[key]) for key in bDict.keys()]
        bDictRelativeDifVec = [abs(bDictNew[key] - bDict[key]) / bDict[key] for key in bDict.keys()]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        # bDictDif = np.array(bDictDifVec).max()
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj, dataLoc, outTreeLoc, suffix, rooted)
        print 'iter=%s: rates in seg diff = %s, rates in seg diff (fix dRate*tau) = %s, Q diff = %s, b diff = %s' % (iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesTau, qMatRelativeDif, bDictRelativeDif)
        # print 'iter=%s: rates in seg diff = %s, Q diff = %s, b diff = %s' % (iterNum, segRateDif, qMatDif, bDictDif)
        dif = max(dRateRelativeDif, dRateRelativeDifFixdRateTimesTau, qMatRelativeDif, bDictRelativeDif)
        tree.reroot_at_midpoint()
        nllkNew = nllk_msa_geopip_final(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList, piProbRates, p, rateSegs)
        nllkDif = -nllkNew + nllk
        nllkOld = nllk
        nllk = nllkNew
        print 'llk increase =', nllkDif
        if nllkDif <= 0:
            print 'Log-Likelihood is decreasing! BREAK!'
            bDict = bDictOld
            qMat = qMatOld
            segRateDict = segRateDictOld
            alignsInSeg = alignsInSegOld
            p = pOld
            piProbRates = piProbRatesOld
            ratesList = ratesListOld
            lenSegs = lenSegsOld
            tree = treeOld
            nllk = nllkOld
            break
        # do not update fixing dRate*tau if changes is small
        if dRateRelativeDifFixdRateTimesTau < 0.05:
            updateRateFixdRateTimesTau = False
        write_tree(tree, outTreeFile)
        iterNum += 1
        bDictOld = bDict
        qMatOld = qMat
        segRateDictOld = segRateDict
        alignsInSegOld = alignsInSeg
        pOld = p
        piProbRatesOld = piProbRates
        ratesListOld = ratesList
        lenSegsOld = lenSegs
        treeOld = tree
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
    return bDict, qMat, segRateDict, alignsInSeg, p, piProbRates, ratesList, lenSegs, tree, nllk
Example #4
0
def opt_geopip_full(m,
                    p,
                    qMat,
                    segRateDict,
                    piProbRates,
                    ratesList,
                    multiAlign,
                    lenSegs,
                    javaDirectory,
                    modelDirectory,
                    eStepFile,
                    parametersPath,
                    inputLoc,
                    outputLoc,
                    dataLoc,
                    execsLoc,
                    rFileLoc,
                    cList,
                    qRates=[1.],
                    suffix='',
                    updateQ=True,
                    updateSeg=True,
                    updateRate=True,
                    updateRateFixdRateTimesTau=True,
                    rooted=True,
                    tol=1.e-2,
                    bTol=1.e-3,
                    iterMax=100):
    """
    main function for optimization
    updating in segment rates, out segment rates, qMat, bDict iteratively
    we estimate bDict first, so bDict is not needed as input
    """
    outNameLoc, outDistLoc, outTreeLoc = get_out_name_dist_tree_files(
        dataLoc, suffix)
    rCodeNj = get_rscript(outNameLoc, outDistLoc, outTreeLoc, rFileLoc)
    seqNames = multiAlign.keys()
    msaList = zip(*multiAlign.values())
    print 'simulation run: %s' % (inputLoc)
    multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(
        multiAlign, lenSegs)
    alignsInSeg = pair_align_from_multi_align(multiAlign, lenSegs)
    pairsList = alignsInSeg.keys()
    piProb = pi_from_qmat(qMat)
    print '###  initialize branch lengths  ###'
    bDict = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict, piProb, qMat,
                             cList, qRates)
    # write_dist_from_bdict(bDict, dataLoc, suffix)    # this function can be improved, by using outNameLoc, outDistLoc, outTreeLoc instead
    tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList, rCodeNj,
                                                   dataLoc, outTreeLoc, suffix,
                                                   rooted)
    outAlignFile = inputLoc + '/' + 'all.align.txt'
    outTreeFile = inputLoc + '/' + 'all.tree.txt'
    dict_write_align_fasta(multiAlign, outAlignFile)
    write_tree(tree, outTreeFile)
    tree.reroot_at_midpoint()
    dif = 1.e10
    bDictRelativeDif = 1
    iterNum = 1
    nllk = 1.e10
    # nllkDif = 1
    while ((dif > tol) and (bDictRelativeDif > bTol) and (iterNum < iterMax)):
        if updateSeg:
            print '### update segmentation ###'
            lenSegs, rateSegs = mle_seg_len_rate(p, msaList, ratesList,
                                                 seqNames, tree, qMat, piProb,
                                                 piProbRates, cList)
            nSeg = len(rateSegs)
            alignsInSeg, segRateDict = update_segmentation_in_align_in_seg(
                lenSegs, rateSegs, ratesList, multiAlign)
            multiAlignAllSeg = update_segmentation_in_multi_aling_all_seg(
                multiAlign, lenSegs)
            piProbRates = update_piprobrates(rateSegs, m, True)
            p = update_p(nSeg)
        print lenSegs
        if updateRateFixdRateTimesTau:
            print '### updating insertion and deletion rate when dRate*tau is fixed ###'
            ratesListNew, segRateDictNew, treeNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate_fix_drate_times_b(
                ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat,
                cList)
            print 'rate:', ratesList
            print 'new rate:', ratesListNew
            dRateRelativeDifFixdRateTimesTauList = [
                abs(ratesListNew[index][1] - ratesList[index][1]) /
                ratesList[index][1] for index in xrange(m)
                if ratesListNew[index][1] > 1.e-3
            ]
            dRateRelativeDifFixdRateTimesTau = np.array(
                dRateRelativeDifFixdRateTimesTauList).max()
            ratesList = ratesListNew
            segRateDict = segRateDictNew
            tree = treeNew
        else:
            dRateRelativeDifFixdRateTimesTau = 0
        if updateRate:
            print '### updating in segment rates ###'
            # segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa(ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat, cList)
            segRateDictNew, ratesListNew = opt_nlists_rate_all_cluster_inseg_only_msa_drate(
                ratesList, segRateDict, multiAlignAllSeg, tree, piProb, qMat,
                cList)
            print 'rate:', ratesList
            print 'new rate:', ratesListNew
            dRateRelativeDifList = [
                abs(ratesListNew[index][1] - ratesList[index][1]) /
                ratesList[index][1] for index in xrange(m)
                if ratesListNew[index][1] > 1.e-3
            ]
            dRateRelativeDif = np.array(dRateRelativeDifList).max()
            segRateDict = segRateDictNew
            ratesList = ratesListNew
        else:
            dRateRelativeDif = 0
            print 'rate is Fixed:', ratesList
        #####################
        # another approach based on clustering
        # segRateDictNew = opt_nlists_rate_all_inseg_only(segRateDict, alignsInSeg, bDict, pairsList, piProb, qMat, cList)
        # # IMPROVE THIS LATER, scale each rate separately first maybe?
        # inSegRateDif = max([abs(segRateDictNew[segId] - np.array(segRateDict[segId])).max() for segId in segRateDictNew.keys()])
        # # update ratesList
        # ratesList, segRateDictNew = ratesList_kmean(segRateDictNew, m)
        # print segRateDict
        # print segRateDictNew
        # segRateDict = segRateDictNew
        # display estiamteed in segment rates
        # for segId, segRate in segRateDict.iteritems():
        #     print 'seg id = %s, \t segRate = %s' %(segId, segRate)
        ######################
        if updateQ:
            print '### updating rate matrix Q ###'
            # if alignUpdate:
            #     aligns = align_mle(lans, pairsList, bDict, iRate, dRate, qMat, piProb, cList)
            # if cogUpdate:
            #     alignsCogOnlyEst = est_align_cog(lans, pairsList, iRate, dRate, cRate, cRateVec, bDict, piProb, qMat, cList, aligns, criticalValue)
            # else:
            #     alignsCogOnlyEst = alignsCogOnly
            # write_align(pairsList, alignsInSeg, bDict, inputLoc)
            qMatNew, piProbNew = opt_qmat_em_full(qMat,
                                                  cList,
                                                  inputLoc,
                                                  outputLoc,
                                                  javaDirectory,
                                                  modelDirectory,
                                                  eStepFile,
                                                  parametersPath,
                                                  execsLoc,
                                                  lEstepTol=1.e-2,
                                                  iterMax=100)
            qMatRelativeDifMat = abs(qMatNew - qMat) / abs(qMat)
            qMatRelativeDif = qMatRelativeDifMat.max()
            # qMatDif = abs(qMatNew - qMat).max()
            qMat = qMatNew
            piProb = piProbNew
            # qMatDif = 0
        else:
            print '### fixing rate matrix Q ###'
            qMatRelativeDif = 0
        print '### updating branch lengths ###'
        bDictNew = opt_nlists_bonly(pairsList, alignsInSeg, segRateDict,
                                    piProb, qMat, cList, qRates)
        # bDictDifVec = [abs(bDictNew[key] - bDict[key]) for key in bDict.keys()]
        bDictRelativeDifVec = [
            abs(bDictNew[key] - bDict[key]) / bDict[key]
            for key in bDict.keys()
        ]
        bDictRelativeDif = np.array(bDictRelativeDifVec).max()
        # bDictDif = np.array(bDictDifVec).max()
        bDict = bDictNew
        # write_dist_from_bdict(bDict, dataLoc, suffix)
        tree = tree_use_r_for_unknown_number_of_leaves(bDict, pairsList,
                                                       rCodeNj, dataLoc,
                                                       outTreeLoc, suffix,
                                                       rooted)
        print 'iter=%s: rates in seg diff = %s, rates in seg diff (fix dRate*tau) = %s, Q diff = %s, b diff = %s' % (
            iterNum, dRateRelativeDif, dRateRelativeDifFixdRateTimesTau,
            qMatRelativeDif, bDictRelativeDif)
        # print 'iter=%s: rates in seg diff = %s, Q diff = %s, b diff = %s' % (iterNum, segRateDif, qMatDif, bDictDif)
        dif = max(dRateRelativeDif, dRateRelativeDifFixdRateTimesTau,
                  qMatRelativeDif, bDictRelativeDif)
        tree.reroot_at_midpoint()
        nllkNew = nllk_msa_geopip_final(ratesList, segRateDict,
                                        multiAlignAllSeg, tree, piProb, qMat,
                                        cList, piProbRates, p, rateSegs)
        nllkDif = -nllkNew + nllk
        nllkOld = nllk
        nllk = nllkNew
        print 'llk increase =', nllkDif
        if nllkDif <= 0:
            print 'Log-Likelihood is decreasing! BREAK!'
            bDict = bDictOld
            qMat = qMatOld
            segRateDict = segRateDictOld
            alignsInSeg = alignsInSegOld
            p = pOld
            piProbRates = piProbRatesOld
            ratesList = ratesListOld
            lenSegs = lenSegsOld
            tree = treeOld
            nllk = nllkOld
            break
        # do not update fixing dRate*tau if changes is small
        if dRateRelativeDifFixdRateTimesTau < 0.05:
            updateRateFixdRateTimesTau = False
        write_tree(tree, outTreeFile)
        iterNum += 1
        bDictOld = bDict
        qMatOld = qMat
        segRateDictOld = segRateDict
        alignsInSegOld = alignsInSeg
        pOld = p
        piProbRatesOld = piProbRates
        ratesListOld = ratesList
        lenSegsOld = lenSegs
        treeOld = tree
    if iterNum == iterMax:
        print 'maximum iteration %d reached' % (iterMax)
    else:
        print 'optimization sucess!'
    return bDict, qMat, segRateDict, alignsInSeg, p, piProbRates, ratesList, lenSegs, tree, nllk