Ejemplo n.º 1
0
def outputRates(result, options):
    """Write the GTR distance of every terminal of the trained grammar.

    For each terminal key of the trained model's terminal frequencies the
    distance returned by ``RateEstimation.getDistanceGTR`` is formatted with
    ``options.value_format`` and written, preceded by a tab, to
    ``options.stdout``. No newline is written.
    """

    model = result.getModel()

    frequencies = model.evaluateTerminalFrequencies()
    rate_matrices = model.evaluateRateMatrix()

    for key in frequencies.keys():
        # getDistanceGTR returns a (matrix, distance) pair; only the
        # distance is reported.
        _, gtr_distance = RateEstimation.getDistanceGTR(
            frequencies[key], rate_matrices[key])
        formatted = options.value_format % gtr_distance
        options.stdout.write("\t%s" % formatted)
Ejemplo n.º 2
0
    def getQMatrix(pi, k, s, n):
        """Build a codon rate matrix and its normalisation constant.

        Only codon pairs that ``RateEstimation.evaluateCodonPair`` flags as
        single changes receive a rate. The rate is ``s`` for synonymous and
        ``n`` for non-synonymous pairs, multiplied by ``k`` when the pair is
        not flagged as a transition, and finally multiplied by the
        frequency ``pi`` of the target codon.

        Every diagonal element is the negative sum of its row. All entries
        are then divided by ``trace``, the frequency-weighted sum of the row
        sums.

        Returns the tuple ``(Q, trace)``.
        """

        codons = Bio.Data.CodonTable.standard_dna_table.forward_table.keys()

        Q = initializeQMatrix(codons)

        trace = 0.0
        for source in codons:
            total = 0.0
            for target in codons:
                if source == target:
                    continue

                flags = RateEstimation.evaluateCodonPair(source, target)
                is_single, is_synonymous, is_transition = flags

                # changes of more than one position get no rate
                if not is_single:
                    continue

                rate = s if is_synonymous else n
                if not is_transition:
                    rate = rate * k

                rate *= pi[target]
                Q[source][target] = rate
                total += rate

            Q[source][source] = -total
            trace += pi[source] * total

        # rescale the complete matrix by the normalisation constant
        for source in codons:
            for target in codons:
                Q[source][target] /= trace

        return Q, trace
Ejemplo n.º 3
0
def _getRatePrefixes(shared_rates, block):
    """Return the parameter name prefixes ``(rs, rn, ri, rv)`` for *block*.

    A rate that is shared between blocks carries no prefix; a block-specific
    rate is prefixed with the block name. Any unrecognised value of
    *shared_rates* means that no rate is shared.
    """
    # rates that are shared (and hence unprefixed) for each sharing mode
    shared = {
        "all": ("Rs", "Rn", "Ri", "Rv"),
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Rs", "Ri", "Rv"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rs", "Rn", "Rv"),
        "ds": ("Rs", ),
    }.get(shared_rates, ())
    return tuple("" if rate in shared else block
                 for rate in ("Rs", "Rn", "Ri", "Rv"))


def processMali(mali, options):
    """Train a block-wise codon grammar on *mali* and write one row per block.

    If ``options.block_size`` is set, a "STATE" annotation is added that
    labels the first columns ``N`` and the remainder ``C``; values below 1
    are interpreted as a fraction of the codons, other values as a number of
    codons. Identifiers containing ``options.separator`` are renamed to the
    part before the separator. The grammar is then trained through
    ``prepareGrammar``.

    For every block a tab-separated row is written to ``options.stdout``:
    annotation code, block prefix, dN, dS, omega, kappa, the log likelihood
    and the number of columns carrying the block's annotation (with "na"
    placeholders in between), optionally followed by the rho columns
    (``options.with_rho``) and always by a message. All rate-derived columns
    are "na" if a division by zero occurs during estimation.

    Raises:
        ValueError: if the alignment has no columns.
    """

    ncols = mali.getNumColumns()

    if ncols == 0:
        # string exceptions are not valid; raise a real exception instead
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # fraction of the codons in the alignment
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # absolute number of codons
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()

    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # the handle is kept open until the tree has been extracted
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
            tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set cannot be indexed; take its only element
        blocks = (("B0_", next(iter(chars))), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, _ = prepareGrammar(xgram, mali, tree, map_old2new, blocks,
                                     options)

    trained_model = result.getModel()

    pis, _ = RateEstimation.getRateMatrix(trained_model)

    # prepareGrammar returns the (clipped) alignment; re-read the annotation
    annotation = mali.getAnnotation("STATE")

    for block, code in blocks:

        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)

        pi = pis[terminals]

        prefix_rs, prefix_rn, prefix_ri, prefix_rv = _getRatePrefixes(
            options.shared_rates, block)

        grammar = trained_model.mGrammar
        rs = grammar.getParameter('%sRs' % prefix_rs)
        rn = grammar.getParameter('%sRn' % prefix_rn)
        ri = grammar.getParameter('%sRi' % prefix_ri)
        rv = grammar.getParameter('%sRv' % prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)

            # rate matrix with Rs and Rn replaced by their average
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # rate matrix with Ri and Rv replaced by their average
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            # NOTE(review): kappa is normalized with the Q0 counts here,
            # whereas the Q1 counts computed above are unused - confirm
            # which matrix is intended.
            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:

            o_kappa = o_omega = "na"
            o_dn = o_ds = "na"
            o_rn = o_rs = o_rn0 = o_rs0 = "na"
            o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(
            map(str, (code, block, o_dn, o_ds, o_omega, "na", "na", "na", "na",
                      o_kappa, result.getLogLikelihood(), "na", nchars))))

        if options.with_rho:
            options.stdout.write(
                "\t" +
                "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
Ejemplo n.º 4
0
def prepareGrammar(xgram, mali, tree, map_old2new, blocks, options):
    """Build a block-wise codon grammar, train it on *mali* and return it.

    *blocks* is a sequence of ``(prefix, code)`` pairs; the codes are the
    labels of the "STATE" annotation that select the columns of each block.
    The alignment is clipped to the annotated columns and written in
    Stockholm format, together with the tree, to a temporary file on which
    the grammar is trained. If no *tree* is given, a two-sequence alignment
    gets a fixed tree built from the values of *map_old2new*.

    Returns:
        The tuple ``(result, mali, ids)``: the training result, the clipped
        alignment and the identifiers read before clipping.

    Raises:
        ValueError: if no tree is supplied and the alignment does not have
            exactly two sequences.
    """

    # a real list (not a lazy ``map`` object) so that it can be indexed
    # and joined more than once
    labels = [block[1] for block in blocks]
    nblocks = len(blocks)

    annotate_terminals = {}
    for x, label in enumerate(labels):
        key = tuple("B%i_COD%i" % (x, c) for c in range(3))
        annotate_terminals[key] = [
            Annotation(row="STATE", column=terminal, label=label)
            for terminal in key]

    input_model = Codons.buildCodonML(
        codon_model="f3x4-fourproducts",
        num_blocks=nblocks,
        grammar_type="linear-blocks",
        annotate_terminals=annotate_terminals,
        shared_frequencies=options.shared_frequencies,
        shared_rates=False,
    )

    # manually share rates between blocks: a shared rate loses its block
    # prefix in every block
    shared_parameters = {
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Ri", "Rv", "Rs"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rv", "Rs", "Rn"),
        "ds": ("Rs", ),
        "all": ("Rv", "Rs", "Rn", "Ri"),
    }.get(options.shared_rates, ())

    for c in range(nblocks):
        for name in shared_parameters:
            input_model.renameParameter("B%i_%s" % (c, name), name)

    writeModel(input_model, "input", options)

    ids = mali.getIdentifiers()

    # clip mali by supplied blocks
    mali.clipByAnnotation("STATE", "".join(labels))

    # the tree is resolved before the temporary file is created so that the
    # error path below does not leave an open handle or a stray file behind
    if tree:
        tree.rescaleBranchLengths(1.0)
        tree_options = "#=GF NH %s" % tree.to_string(branchlengths_only=True,
                                                     format="nh")
    elif mali.getNumSequences() == 2:
        tree_options = "#=GF NH (%s:1.0)%s;" % tuple(map_old2new.values())
    else:
        raise ValueError("Please supply a tree.")

    # NOTE(review): the temporary file is never removed - confirm whether
    # the training result still needs it before adding a cleanup.
    fh, filename = tempfile.mkstemp()
    with os.fdopen(fh, "w") as outfile:
        mali.writeToFile(outfile,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    # prefix, code
    if options.shared_frequencies:
        frequency_codes = (("", ""), )
    else:
        frequency_codes = blocks

    if options.insert_frequencies:
        for prefix, code in frequency_codes:
            temp_mali = mali.getClone()
            temp_mali.clipByAnnotation("STATE", code)
            RateEstimation.setFrequencies(input_model, temp_mali, prefix)

    if options.fix_frequencies:
        for prefix, code in frequency_codes:
            for char in ('a', 'c', 'g', 't'):
                for x in (0, 1, 2):
                    param = "%sp%s%i" % (prefix, char, x)
                    input_model.mGrammar.moveVariableToConst(param)

    # written a second time: frequencies may have been inserted or fixed
    writeModel(input_model, "input", options)

    result = xgram.train(input_model, filename)

    if options.dump:
        options.stdlog.write("".join(result.mData))
        options.stdlog.write("".join(result.mLog))
        mali.writeToFile(options.stdlog,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    trained_model = result.getModel()

    writeModel(trained_model, "trained", options)

    return result, mali, ids
Ejemplo n.º 5
0
def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Write the result row of an xrate four-parameter grammar run.

    *rsi*, *rsv*, *rni* and *rnv* are the four estimated rates. If *rsi* is
    None, all derived columns are "na" and *msg* is replaced. Otherwise
    dN, dS, omega and kappa are derived from three rate matrices: the
    estimated one, one with the synonymous and non-synonymous rates
    averaged, and one with the ``i`` and ``v`` rates averaged.

    The row is written to the ``stdout`` of the module-level ``options``:
    the ids of the first two sequences of *mali*, dN, dS, omega, N, S,
    kappa and the log likelihood (with "na" placeholders), optionally the
    rho columns (``options.with_rho``) and pair counts
    (``options.with_counts``), and finally the message.
    """
    ids = mali.getIdentifiers()

    pi, _ = RateEstimation.getRateMatrix(result.getModel(),
                                         terminals=('COD0', 'COD1', 'COD2'))

    if rsi is None:
        o_dn = o_ds = o_omega = "na"
        o_rn = o_rn0 = o_rs = o_rs0 = "na"
        o_t = o_t0 = "na"
        o_N = o_S = "na"
        # a plain string: a trailing comma here would create a tuple and
        # print "('na',)" in the kappa column
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(pi,
                                         Rsi=rsi,
                                         Rsv=rsv,
                                         Rni=rni,
                                         Rnv=rnv)

        # get rate matrix as if omega was set to 1
        Q0, t0 = RateEstimation.getQMatrix(pi,
                                           Rsi=(rsi + rni) / 2.0,
                                           Rsv=(rsv + rnv) / 2.0,
                                           Rni=(rsi + rni) / 2.0,
                                           Rnv=(rsv + rnv) / 2.0)

        # get rate matrix as if kappa was set to 1
        Q1, _ = RateEstimation.getQMatrix(pi,
                                          Rsi=(rsi + rsv) / 2.0,
                                          Rsv=(rsi + rsv) / 2.0,
                                          Rni=(rni + rnv) / 2.0,
                                          Rnv=(rni + rnv) / 2.0)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1 = RateEstimation.countSubstitutions(pi, Q1)[:2]

        # rates are normalized by the rates of the omega=1 matrix
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (dN / dS)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_S = options.value_format % (mali.getNumColumns() * rS0)
        o_N = options.value_format % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = options.value_format % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by the
        # sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV, rI0 / rV0,
            options.value_format % ((rsi + rni) / (rsv + rnv)))

    options.stdout.write("\t".join(
        map(str, (mali.getEntry(ids[0]).mId, mali.getEntry(ids[1]).mId,
                  o_dn, o_ds, o_omega, o_N, o_S, "na", "na", o_kappa,
                  result.getLogLikelihood(), "na"))))

    if options.with_rho:
        options.stdout.write(
            "\t" + "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()
Ejemplo n.º 6
0
def outputXRateResult(mali, result, rsi, rsv, rni, rnv, msg):
    """Write the result row of an xrate four-parameter grammar run.

    *rsi*, *rsv*, *rni* and *rnv* are the four estimated rates. If *rsi* is
    None, all derived columns are "na" and *msg* is replaced. Otherwise
    dN, dS, omega and kappa are derived from three rate matrices: the
    estimated one, one with the synonymous and non-synonymous rates
    averaged, and one with the ``i`` and ``v`` rates averaged.

    The row is written to the ``stdout`` of the module-level ``options``:
    the ids of the first two sequences of *mali*, dN, dS, omega, N, S,
    kappa and the log likelihood (with "na" placeholders), optionally the
    rho columns (``options.with_rho``) and pair counts
    (``options.with_counts``), and finally the message.
    """
    ids = mali.getIdentifiers()

    pi, _ = RateEstimation.getRateMatrix(result.getModel(),
                                         terminals=('COD0', 'COD1', 'COD2'))

    if rsi is None:
        o_dn = o_ds = o_omega = "na"
        o_rn = o_rn0 = o_rs = o_rs0 = "na"
        o_t = o_t0 = "na"
        o_N = o_S = "na"
        # a plain string: a trailing comma here would create a tuple and
        # print "('na',)" in the kappa column
        o_kappa = "na"
        msg = "estimated rate parameters are zero"
    else:
        Q, t = RateEstimation.getQMatrix(pi,
                                         Rsi=rsi,
                                         Rsv=rsv,
                                         Rni=rni,
                                         Rnv=rnv)

        # get rate matrix as if omega was set to 1
        Q0, t0 = RateEstimation.getQMatrix(pi,
                                           Rsi=(rsi + rni) / 2.0,
                                           Rsv=(rsv + rnv) / 2.0,
                                           Rni=(rsi + rni) / 2.0,
                                           Rnv=(rsv + rnv) / 2.0)

        # get rate matrix as if kappa was set to 1
        Q1, _ = RateEstimation.getQMatrix(pi,
                                          Rsi=(rsi + rsv) / 2.0,
                                          Rsv=(rsi + rsv) / 2.0,
                                          Rni=(rni + rnv) / 2.0,
                                          Rnv=(rni + rnv) / 2.0)

        rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
        rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
        rI1, rV1 = RateEstimation.countSubstitutions(pi, Q1)[:2]

        # rates are normalized by the rates of the omega=1 matrix
        dS = rS / (3 * rS0) * t
        dN = rN / (3 * rN0) * t

        o_omega = options.value_format % (dN / dS)
        o_dn = options.value_format % dN
        o_ds = options.value_format % dS
        o_rn = options.value_format % rN
        o_rs = options.value_format % rS
        o_rn0 = options.value_format % rN0
        o_rs0 = options.value_format % rS0
        o_t = options.value_format % t
        o_t0 = options.value_format % t0
        o_S = options.value_format % (mali.getNumColumns() * rS0)
        o_N = options.value_format % (mali.getNumColumns() * rN0)

        # kappa is given normalized by sites like omega
        o_kappa = options.value_format % (rI / rI1 * rV1 / rV)

        # kappa1 is given by the ratio of the rates NOT normalized by the
        # sites.
        msg += " rI/rV=%f rI0/rV0=%f kappa1=%s" % (
            rI / rV, rI0 / rV0,
            options.value_format % ((rsi + rni) / (rsv + rnv)))

    options.stdout.write("\t".join(
        map(str, (mali.getEntry(ids[0]).mId, mali.getEntry(ids[1]).mId,
                  o_dn, o_ds, o_omega, o_N, o_S, "na", "na", o_kappa,
                  result.getLogLikelihood(), "na"))))

    if options.with_rho:
        options.stdout.write(
            "\t" + "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

    if options.with_counts:
        info = Genomics.CalculatePairIndices(mali[ids[0]], mali[ids[1]])
        options.stdout.write("\t%s" % (str(info)))

    options.stdout.write("\t%s\n" % msg)
    options.stdout.flush()
Ejemplo n.º 7
0
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write one row per pair.

    For every index pair in *pairs* the two sequences are written as a
    two-sequence Stockholm alignment with a fixed tree (branch length 1.0).
    The DNA model selected by ``options.distance`` ("K80", "JC69" or "REV")
    is fitted to it and a tab-separated row (sequence ids, distance, log
    likelihood, alpha, kappa, message) is written to ``options.stdout``.

    If ``options.test_xrate`` is set, the model is instead trained from a
    grid of starting values for alpha and beta and one row is written per
    starting point.

    Raises:
        ValueError: if ``options.distance`` names an unsupported model.
    """

    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    # besides loading the submodule this binds the name ``XGram`` used below
    import XGram.Run

    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}

    # validate first so that no temporary directory is left behind
    if options.distance not in substitution_models:
        raise ValueError("distance %s not implemented for xrate" %
                         (options.distance))

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    model = DNA.buildModel(
        substitution_model=substitution_models[options.distance])

    writeModel(model, "input", options)

    if options.output_format == "list":
        options.stdout.write("\t".join(
            ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

        if options.with_counts:
            options.stdout.write("\t%s" %
                                 Genomics.SequencePairInfo().getHeader())
        options.stdout.write("\n")

    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"

    try:
        for x, y in pairs:

            # NOTE(review): ``ids`` is not defined in this function and is
            # looked up as a module-level name - presumably
            # mali.getIdentifiers(); verify against the caller.
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # train from a grid of starting values and report each run
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha, "%f" % beta, o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha, o_kappa, msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)

                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = [
                    trained_model.mGrammar.getParameter(name)
                    for name in ('alpha', 'beta', 'gamma',
                                 'delta', 'epsilon', 'theta')]

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)

                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(
                        re.search(r"\(\S+:([0-9.]+)\)", tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance, options.format % result.getLogLikelihood(),
                 o_alpha, o_kappa, msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        # remove the scratch directory even if a pair fails
        shutil.rmtree(tempdir)
Ejemplo n.º 8
0
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write one row per pair.

    For every index pair in *pairs* the two sequences are written as a
    two-sequence Stockholm alignment with a fixed tree (branch length 1.0).
    The DNA model selected by ``options.distance`` ("K80", "JC69" or "REV")
    is fitted to it and a tab-separated row (sequence ids, distance, log
    likelihood, alpha, kappa, message) is written to ``options.stdout``.

    If ``options.test_xrate`` is set, the model is instead trained from a
    grid of starting values for alpha and beta and one row is written per
    starting point.

    Raises:
        ValueError: if ``options.distance`` names an unsupported model.
    """

    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    # besides loading the submodule this binds the name ``XGram`` used below
    import XGram.Run

    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}

    # validate first so that no temporary directory is left behind
    if options.distance not in substitution_models:
        raise ValueError("distance %s not implemented for xrate" %
                         (options.distance))

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    model = DNA.buildModel(
        substitution_model=substitution_models[options.distance])

    writeModel(model, "input", options)

    if options.output_format == "list":
        options.stdout.write("\t".join(
            ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

        if options.with_counts:
            options.stdout.write("\t%s" %
                                 Genomics.SequencePairInfo().getHeader())
        options.stdout.write("\n")

    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"

    try:
        for x, y in pairs:

            # NOTE(review): ``ids`` is not defined in this function and is
            # looked up as a module-level name - presumably
            # mali.getIdentifiers(); verify against the caller.
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # train from a grid of starting values and report each run
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha, "%f" % beta, o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha, o_kappa, msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)

                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = [
                    trained_model.mGrammar.getParameter(name)
                    for name in ('alpha', 'beta', 'gamma',
                                 'delta', 'epsilon', 'theta')]

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)

                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(
                        re.search(r"\(\S+:([0-9.]+)\)", tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance, options.format % result.getLogLikelihood(),
                 o_alpha, o_kappa, msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        # remove the scratch directory even if a pair fails
        shutil.rmtree(tempdir)
Ejemplo n.º 9
0
def processMali(mali, options):
    """train a codon grammar on an alignment and output rates per block.

    The alignment columns are optionally annotated with a ``STATE`` row
    (``N`` for the leading block, ``C`` for the remainder), identifiers
    are shortened to the part before ``options.separator``, the grammar
    is trained via :func:`prepareGrammar` and one tab-separated line per
    block is written to ``options.stdout``.

    Arguments
    ---------
    mali :
        the multiple alignment to process. It is modified in place
        (annotation, renaming, optional gap removal).
    options :
        option object. Attributes read here: ``block_size``,
        ``separator``, ``xrate_min_increment``, ``clean_mali``,
        ``input_filename_tree``, ``loglevel``, ``stdlog``, ``stdout``,
        ``shared_rates``, ``value_format`` and ``with_rho``.

    Raises
    ------
    ValueError
        if the alignment has no columns.
    """

    ncols = mali.getNumColumns()

    if ncols == 0:
        # string exceptions are not valid; raise a proper exception
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # values below 1 are a fraction of the column triplets
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            # otherwise the value is a number of column triplets
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()

    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # read the tree inside the context manager so that the file
        # handle is released even if parsing fails
        with open(options.input_filename_tree, "r") as infile:
            nexus = TreeTools.Newick2Nexus(infile)
            tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # sets can not be indexed, so fetch the only element by iteration
        blocks = (("B0_", next(iter(chars))), )
    else:
        blocks = (("B0_", "N"),
                  ("B1_", "C"))

    result, mali, ids = prepareGrammar(
        xgram, mali, tree, map_old2new, blocks, options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    # Rate parameters that are shared between blocks for each mode and
    # hence carry no block prefix. This mirrors the parameter renaming
    # done in prepareGrammar(); any other mode keeps all rates separate.
    shared_by_mode = {
        "all": ("Rs", "Rn", "Ri", "Rv"),
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Rs", "Ri", "Rv"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rs", "Rn", "Rv"),
        "ds": ("Rs",),
    }
    shared_rates = shared_by_mode.get(options.shared_rates, ())

    for block, code in blocks:

        terminals = ("%sCOD0" % block,
                     "%sCOD1" % block,
                     "%sCOD2" % block)

        pi = pis[terminals]

        rates = {}
        for rate in ("Rs", "Rn", "Ri", "Rv"):
            prefix = "" if rate in shared_rates else block
            rates[rate] = trained_model.mGrammar.getParameter(
                "%s%s" % (prefix, rate))
        rs, rn, ri, rv = rates["Rs"], rates["Rn"], rates["Ri"], rates["Rv"]

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)

            # reference matrix with synonymous and non-synonymous rates
            # replaced by their average
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            # NOTE(review): the results for the matrix with averaged
            # Ri/Rv are not used in the output below; the computation is
            # kept so that the behaviour of this section is unchanged.
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            # report the block with missing values instead of failing
            o_kappa = "na"
            o_omega = "na"
            o_dn = "na"
            o_ds = "na"
            o_rn = "na"
            o_rs = "na"
            o_rn0 = "na"
            o_rs0 = "na"
            o_t = "na"
            o_t0 = "na"
            Q = None
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write("\t" + "\t".join(map(str, (
                o_rn, o_rs, o_t,
                o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
Ejemplo n.º 10
0
def prepareGrammar(xgram, mali, tree, map_old2new, blocks, options):
    """prepare grammar for custom grammars.

    Builds a codon grammar with one block per entry in *blocks*, shares
    rate parameters between blocks according to ``options.shared_rates``,
    writes the alignment in stockholm format to a temporary file and
    trains the grammar on it.

    Arguments
    ---------
    xgram :
        object providing ``train(model, filename)``.
    mali :
        the multiple alignment. It is clipped in place to the columns
        whose ``STATE`` annotation is one of the block labels.
    tree :
        tree to use or None. If None, the alignment must contain exactly
        two sequences.
    map_old2new :
        dictionary whose values are used as taxon names for the
        two-sequence tree.
    blocks :
        sequence of ``(prefix, label)`` tuples.
    options :
        option object. Attributes read here: ``shared_frequencies``,
        ``shared_rates``, ``insert_frequencies``, ``fix_frequencies``,
        ``dump`` and ``stdlog``; it is also passed on to ``writeModel``.

    Returns
    -------
    tuple
        ``(result, mali, ids)``: the training result, the clipped
        alignment and the identifiers of the alignment collected before
        clipping.

    Raises
    ------
    ValueError
        if no tree is given and the alignment does not contain exactly
        two sequences.
    """

    # a list is required here: the labels are counted and indexed below,
    # which does not work with the iterator that map() yields on python 3
    labels = [block[1] for block in blocks]
    nblocks = len(blocks)

    annotate_terminals = {}
    for x, label in enumerate(labels):
        annotations = []
        key = []

        for c in range(0, 3):
            t = "B%i_COD%i" % (x, c)
            key.append(t)
            annotations.append(Annotation(row="STATE",
                                          column=t,
                                          label=label))

        annotate_terminals[tuple(key)] = annotations

    input_model = Codons.buildCodonML(codon_model="f3x4-fourproducts",
                                      num_blocks=nblocks,
                                      grammar_type="linear-blocks",
                                      annotate_terminals=annotate_terminals,
                                      shared_frequencies=options.shared_frequencies,
                                      shared_rates=False,
                                      )

    # manually share rates between blocks: the listed parameters lose
    # their block prefix and thus become a single parameter
    shared_by_mode = {
        "kappa": ("Ri", "Rv"),
        "kappa-ds": ("Ri", "Rv", "Rs"),
        "omega": ("Rs", "Rn"),
        "omega-ds": ("Rv", "Rs", "Rn"),
        "ds": ("Rs",),
        "all": ("Rv", "Rs", "Rn", "Ri"),
    }
    shared_rates = shared_by_mode.get(options.shared_rates, ())
    for c in range(0, nblocks):
        for rate in shared_rates:
            input_model.renameParameter("B%i_%s" % (c, rate), rate)

    writeModel(input_model, "input", options)

    ids = mali.getIdentifiers()

    # clip mali by supplied blocks
    mali.clipByAnnotation("STATE", "".join(labels))

    # Work out the tree line before creating the temporary file so that
    # a missing tree does not leave an open handle and a stray file.
    if tree:
        tree.rescaleBranchLengths(1.0)
        tree_options = "#=GF NH %s" % tree.to_string(
            branchlengths_only=True, format="nh")
    elif mali.getNumSequences() == 2:
        tree_options = "#=GF NH (%s:1.0)%s;" % tuple(map_old2new.values())
    else:
        # string exceptions are not valid; raise a proper exception
        raise ValueError("Please supply a tree.")

    # NOTE(review): the temporary file is not removed in this function -
    # confirm whether the training result still needs it afterwards.
    fh, filename = tempfile.mkstemp()
    os.close(fh)

    with open(filename, "w") as outfile:
        mali.writeToFile(outfile,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options, ))

    # prefix, code
    if options.shared_frequencies:
        frequency_codes = (("", ""), )
    else:
        frequency_codes = blocks

    if options.insert_frequencies:
        for prefix, code in frequency_codes:
            temp_mali = mali.getClone()
            temp_mali.clipByAnnotation("STATE", code)
            RateEstimation.setFrequencies(input_model, temp_mali, prefix)

    if options.fix_frequencies:
        for prefix, code in frequency_codes:
            for char in ('a', 'c', 'g', 't'):
                for x in (0, 1, 2):
                    param = "%sp%s%i" % (prefix, char, x)
                    input_model.mGrammar.moveVariableToConst(param)

    # written a second time, after the frequency handling above
    writeModel(input_model, "input", options)

    result = xgram.train(input_model, filename)

    if options.dump:
        options.stdlog.write("".join(result.mData))
        options.stdlog.write("".join(result.mLog))
        mali.writeToFile(options.stdlog,
                         format="stockholm",
                         write_ranges=False,
                         options=(tree_options,))

    trained_model = result.getModel()

    writeModel(trained_model, "trained", options)

    return result, mali, ids