Exemple #1
0
def prepareGrammar( options ):
    """Build, parameterise and write the input model for custom grammars.

    A ``gtr`` model with ``options.num_blocks`` blocks is built through
    ``DNA.buildModel`` with unshared rates and frequencies.  Each block
    ``B<x>`` carries one ``STATE`` annotation labelled with an upper-case
    letter.  Afterwards

    * the six rate parameters of every block are replaced by explicit
      parameters with a common value that starts at 0.2 for block 0 and
      grows by 0.1 per block, and
    * for every pseudo-nonterminal ``NT_B<x>*`` the second component of
      each rule's ``mRate`` is set to ``options.probability_block`` for
      the rule leading to ``NT_B<x>`` and to
      ``(1 - options.probability_block) / num_blocks`` for all others.

    The model is written with ``writeModel(model, "input", options)`` and
    then returned.

    Reads ``options.num_blocks``, ``options.grammar_type`` and
    ``options.probability_block``.
    """
    num_blocks = options.num_blocks

    # string.ascii_uppercase replaces string.letters.upper(): the latter is
    # locale-dependent and does not exist in Python 3.  Labels wrap around
    # after "Z".
    labels = string.ascii_uppercase
    annotate_terminals = {}
    for x in range(num_blocks):
        key = "B%i" % x
        annotate_terminals[key] = [
            Annotation(row="STATE",
                       column=key,
                       label=labels[x % len(labels)]),
        ]

    input_model = DNA.buildModel(substitution_model="gtr",
                                 num_blocks=num_blocks,
                                 grammar_type=options.grammar_type,
                                 shared_frequencies=False,
                                 shared_rates=False,
                                 annotate_terminals=annotate_terminals)

    # Re-register the rate parameters of each block as explicit parameters.
    # The rate is accumulated (not computed as 0.2 + 0.1 * x) so that the
    # floating point values stay exactly what they were before.
    rate = 0.2
    for x in range(num_blocks):
        for param in ("alpha", "beta", "gamma", "delta", "theta", "epsilon"):
            p = "B%i_%s" % (x, param)
            input_model.mGrammar.removeParameter(p)
            input_model.mGrammar.addParameter((p, rate), is_explicit=True)
        rate += 0.1

    grammar = input_model.mGrammar.mRules

    prob_same = options.probability_block
    prob_diff = (1.0 - prob_same) / num_blocks

    # Rules are keyed by 1-tuples of nonterminal names.
    for x in range(num_blocks):
        source = ("NT_B%i*" % x,)
        mapped_source = ("NT_B%i" % x,)
        for target, rule in grammar[source].items():
            if target == mapped_source:
                rule.mRate = (rule.mRate[0], prob_same)
            else:
                rule.mRate = (rule.mRate[0], prob_diff)

    writeModel(input_model, "input", options)

    return input_model
Exemple #2
0
    # Default option values.  Note that ``test`` defaults to True.
    parser.set_defaults(
                        loglevel = 1,
                        model = "jc69",
                        test = True,
                        write = [],
                        output_pattern = "%s.eg",
                        stdout = sys.stdout,
                        stdlog = sys.stdout,
                        value_format = "%6.4f",
                        )

    (options, args) = parser.parse_args()
    
    xgram = XGram.XGram()

    # Build the substitution model named by the ``model`` option.
    model = DNA.buildModel( substitution_model = options.model )

    # print model.getGrammar()

    # Input alignment: first positional argument if given, otherwise the
    # example file below XGram.PATH_DATA.
    if len(args) > 0:
        data = args[0]
    else:
        data = XGram.PATH_DATA + "/dpse_dmel.stk"
    
    # NOTE(review): with ``test`` defaulting to True this branch overrides
    # any file given on the command line with the example file, unless the
    # option definitions (not visible here) allow switching it off -- confirm.
    if options.test:
        
        xgram.setDebug()
        data = XGram.PATH_DATA + "/dpse_dmel.stk"    
        
        # print trained_model.getGrammar()
        # Python 2 print statement.
        print "result according to %s" % options.model
Exemple #3
0
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write one row per pair.

    For every ``(x, y)`` in *pairs* the two sequences are copied into a
    two-sequence alignment, written in Stockholm format (together with a
    ``#=GF NH`` tree line) to a file in a temporary directory and handed to
    XGram with the model selected by ``options.distance``:

    * ``"K80"``  -- model ``k80`` is trained; distance is ``2 * beta + alpha``
      and kappa is ``alpha / beta``.
    * ``"REV"``  -- model ``gtr`` is trained; distance comes from
      ``RateEstimation.getDistanceGTR``.
    * ``"JC69"`` -- ``buildTree`` is run on model ``jc69``; distance is three
      times the branch length parsed from the returned tree.

    Rows are tab-separated and written to ``options.stdout``; a header is
    written first when ``options.output_format == "list"``.  The untrained
    model is written with ``writeModel(..., "input", ...)`` and every
    result's model with ``writeModel(..., "trained", ...)``.

    If ``options.test_xrate`` is set, each pair is instead trained from a
    grid of alpha/beta start values and one row per start value is written.

    The temporary directory is removed when the function exits, also when
    an error occurs.

    Raises:
        ValueError: if ``options.distance`` is not ``"K80"``, ``"JC69"`` or
            ``"REV"``.

    NOTE(review): ``ids`` is not defined inside this function; it has to be
    supplied by the enclosing module -- confirm.
    """

    from XGram.Generator.Prebuilt import DNA
    # not used below; retained from the original import list
    from XGram.Model import Annotation
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    if options.distance == "K80":
        model = DNA.buildModel(substitution_model="k80")
    elif options.distance == "JC69":
        model = DNA.buildModel(substitution_model="jc69")
    elif options.distance == "REV":
        model = DNA.buildModel(substitution_model="gtr")
    else:
        # A real exception: raising a plain string is itself an error.
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    # Extracts the branch length from a tree of the form "(<name>:<length>)".
    branch_length_rx = re.compile(r"\(\S+:([0-9.]+)\)")

    tempdir = tempfile.mkdtemp()
    try:
        data = tempdir + "/data"

        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write(
                "\t".join(("seq1", "seq2", "distance", "lnL",
                           "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:

            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a filter skipping pairs narrower than options.min_overlap
            # used to sit here as commented-out code; it is not active.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()),))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # Train from a grid of start values; one output row each.
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1; this is the normalization
                        # constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join((
                            "%f" % alpha,
                            "%f" % beta,
                            o_distance,
                            options.format % result.getLogLikelihood(),
                            o_alpha,
                            o_kappa,
                            msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1; this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'),
                     trained_model.mGrammar.getParameter('gamma'),
                     trained_model.mGrammar.getParameter('delta'),
                     trained_model.mGrammar.getParameter('epsilon'),
                     trained_model.mGrammar.getParameter('theta'))

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0',))[('A0',)]
                matrix = trained_model.evaluateRateMatrix(('A0',))[('A0',)]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(branch_length_rx.search(tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join((
                o_distance,
                options.format % result.getLogLikelihood(),
                o_alpha,
                o_kappa,
                msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        # Always clean up, including when training or writing fails.
        shutil.rmtree(tempdir)
Exemple #4
0
                ## without omega, we have a plain nucleotide model. Because we work in codon space,
                ## the branch length needs to be (20 * 20) / (4 * 4) = 25 times as long
                omega = 25.0 * options.ds
                not_omega = 25.0 * options.ds * options.omega

                input_model.mGrammar.setParameter("omega", omega)
                input_model.mGrammar.setParameter("not_omega", not_omega)

    # NOTE(review): ("K80") is a plain string, not a tuple, so ``in`` is a
    # substring test here ("K8", "80" and "" match as well); ("K80",) was
    # probably intended.
    elif options.model in ("K80"):

        # NOTE(review): raising a plain string is not a valid exception; on
        # Python >= 2.6 this line itself fails with a TypeError.
        if not (options.omega == None or options.omega == 1.0):
            raise "can only accept 1.0 for omega using the kimura model."

        # NOTE(review): if this inner test fails, input_model is not
        # assigned in this branch before it is used below.
        if options.model == "K80":
            input_model = DNA.buildModel(substitution_model="k80",
                                         explicit_extension=True)

        # Derive the two K80 parameters from options.ds and options.kappa;
        # by construction alpha / beta == kappa and alpha + 2 * beta == ds.
        alpha = options.ds * options.kappa / (1.0 + 2.0 * options.kappa)
        beta = options.ds / (1.0 + 2.0 * options.kappa)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# computed parameters: alpha=%6.4f, beta=%6.4f\n" %
                (alpha, beta))

        input_model.mGrammar.setParameter("alpha", alpha)
        input_model.mGrammar.setParameter("beta", beta)

    ## set ext and not_ext to allow for long chains
    ## (these two values are passed as strings, unlike alpha/beta above)
    input_model.mGrammar.setParameter("ext", "0.999")
    input_model.mGrammar.setParameter("not_ext", "0.001")
Exemple #5
0
    def buildAndCheckModel(self, substitution_model):
        """Build *substitution_model* in several configurations and check each.

        Every model returned by ``DNA.buildModel`` is passed to
        ``self.checkModel``; a banner line is printed before each build.
        The configurations are, in order:

        * default arguments,
        * ``explicit_extension=True``,
        * for each of the grammars ``linear-blocks`` and ``multiple-blocks``
          with two blocks: shared rates only, shared frequencies only, both
          shared, and both shared with one ``STATE`` annotation per block.
        """
        # print(<single expression>) behaves the same as the Python 2 print
        # statement and is also valid Python 3 syntax.
        print("##### %s : default ##########" % (substitution_model))
        model = DNA.buildModel(substitution_model=substitution_model)
        self.checkModel(model)

        print("##### %s : explicit ##########" % (substitution_model))
        model = DNA.buildModel(substitution_model=substitution_model,
                               explicit_extension=True)
        self.checkModel(model)

        num_blocks = 2
        # (banner text, shared_rates, shared_frequencies)
        sharing_modes = (
            ("shared rates", True, False),
            ("shared freqs", False, True),
            ("shared all", True, True),
        )

        for grammar in ("linear-blocks", "multiple-blocks"):

            for title, shared_rates, shared_frequencies in sharing_modes:
                print("##### %s : %s : %s ##########" % (
                    substitution_model, grammar, title))
                model = DNA.buildModel(
                    substitution_model=substitution_model,
                    grammar_type=grammar,
                    shared_rates=shared_rates,
                    shared_frequencies=shared_frequencies,
                    num_blocks=num_blocks)
                self.checkModel(model)

            print("##### %s : %s : shared all with annotations ##########" % (
                substitution_model, grammar))
            ## test model with annotations
            ## build annotation
            # string.ascii_uppercase replaces string.letters.upper(), which
            # is locale-dependent and does not exist in Python 3.
            labels = string.ascii_uppercase
            annotate_terminals = {}
            for x in range(num_blocks):
                key = "B%i" % x
                annotate_terminals[key] = [
                    Annotation(row="STATE",
                               column=key,
                               label=labels[x % len(labels)]),
                ]

            model = DNA.buildModel(substitution_model=substitution_model,
                                   grammar_type=grammar,
                                   shared_rates=True,
                                   shared_frequencies=True,
                                   num_blocks=num_blocks,
                                   annotate_terminals=annotate_terminals)
            self.checkModel(model)
                ## without omega, we have a plain nucleotide model. Because we work in codon space,
                ## the branch length needs to be (20 * 20) / (4 * 4) = 25 times as long
                omega     = 25.0 * options.ds 
                not_omega = 25.0 * options.ds * options.omega

                input_model.mGrammar.setParameter( "omega", omega )
                input_model.mGrammar.setParameter( "not_omega", not_omega )

    # NOTE(review): ("K80" ) is a plain string, not a tuple, so ``in`` is a
    # substring test here ("K8", "80" and "" match as well); ("K80",) was
    # probably intended.
    elif options.model in ("K80" ):

        # NOTE(review): raising a plain string is not a valid exception; on
        # Python >= 2.6 this line itself fails with a TypeError.
        if not (options.omega == None or options.omega == 1.0):
            raise "can only accept 1.0 for omega using the kimura model."

        # NOTE(review): if this inner test fails, input_model is not
        # assigned in this branch before it is used below.
        if options.model == "K80":
            input_model = DNA.buildModel( substitution_model = "k80",
                                          explicit_extension = True )
            
        # Derive the two K80 parameters from options.ds and options.kappa;
        # by construction alpha / beta == kappa and alpha + 2 * beta == ds.
        alpha = options.ds * options.kappa / (1.0 + 2.0 * options.kappa )
        beta  = options.ds / (1.0 + 2.0 * options.kappa )        

        if options.loglevel >= 1:
            options.stdlog.write("# computed parameters: alpha=%6.4f, beta=%6.4f\n" % (alpha, beta) )

        input_model.mGrammar.setParameter( "alpha", alpha )
        input_model.mGrammar.setParameter( "beta", beta )

    ## set ext and not_ext to allow for long chains
    ## (these two values are passed as strings, unlike alpha/beta above)
    input_model.mGrammar.setParameter( "ext", "0.999" )
    input_model.mGrammar.setParameter( "not_ext", "0.001" )        
        
    # Write the configured model under the "input" section name.
    writeModel( input_model, "input", options )
Exemple #7
0
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write one row per pair.

    For every ``(x, y)`` in *pairs* the two sequences are copied into a
    two-sequence alignment, written in Stockholm format (together with a
    ``#=GF NH`` tree line) to a file in a temporary directory and handed to
    XGram with the model selected by ``options.distance``:

    * ``"K80"``  -- model ``k80`` is trained; distance is ``2 * beta + alpha``
      and kappa is ``alpha / beta``.
    * ``"REV"``  -- model ``gtr`` is trained; distance comes from
      ``RateEstimation.getDistanceGTR``.
    * ``"JC69"`` -- ``buildTree`` is run on model ``jc69``; distance is three
      times the branch length parsed from the returned tree.

    Rows are tab-separated and written to ``options.stdout``; a header is
    written first when ``options.output_format == "list"``.  The untrained
    model is written with ``writeModel(..., "input", ...)`` and every
    result's model with ``writeModel(..., "trained", ...)``.

    If ``options.test_xrate`` is set, each pair is instead trained from a
    grid of alpha/beta start values and one row per start value is written.

    The temporary directory is removed when the function exits, also when
    an error occurs.

    Raises:
        ValueError: if ``options.distance`` is not ``"K80"``, ``"JC69"`` or
            ``"REV"``.

    NOTE(review): ``ids`` is not defined inside this function; it has to be
    supplied by the enclosing module -- confirm.
    """

    from XGram.Generator.Prebuilt import DNA
    # not used below; retained from the original import list
    from XGram.Model import Annotation
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    if options.distance == "K80":
        model = DNA.buildModel(substitution_model="k80")
    elif options.distance == "JC69":
        model = DNA.buildModel(substitution_model="jc69")
    elif options.distance == "REV":
        model = DNA.buildModel(substitution_model="gtr")
    else:
        # A real exception: raising a plain string is itself an error.
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    # Extracts the branch length from a tree of the form "(<name>:<length>)".
    branch_length_rx = re.compile(r"\(\S+:([0-9.]+)\)")

    tempdir = tempfile.mkdtemp()
    try:
        data = tempdir + "/data"

        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa",
                 "msg")))

            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:

            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a filter skipping pairs narrower than options.min_overlap
            # used to sit here as commented-out code; it is not active.

            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()), ))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # Train from a grid of start values; one output row each.
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1; this is the normalization
                        # constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha, "%f" % beta, o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha, o_kappa, msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1; this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'),
                     trained_model.mGrammar.getParameter('gamma'),
                     trained_model.mGrammar.getParameter('delta'),
                     trained_model.mGrammar.getParameter('epsilon'),
                     trained_model.mGrammar.getParameter('theta'))

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(branch_length_rx.search(tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance, options.format % result.getLogLikelihood(),
                 o_alpha, o_kappa, msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]],
                    mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        # Always clean up, including when training or writing fails.
        shutil.rmtree(tempdir)
    def buildAndCheckModel(self, substitution_model):
        """Build *substitution_model* in several configurations and check each.

        Every model returned by ``DNA.buildModel`` is passed to
        ``self.checkModel``; a banner line is printed before each build.
        The configurations are, in order:

        * default arguments,
        * ``explicit_extension=True``,
        * for each of the grammars ``linear-blocks`` and ``multiple-blocks``
          with two blocks: shared rates only, shared frequencies only, both
          shared, and both shared with one ``STATE`` annotation per block.
        """
        # print(<single expression>) behaves the same as the Python 2 print
        # statement and is also valid Python 3 syntax.
        print("##### %s : default ##########" % (substitution_model))
        model = DNA.buildModel(substitution_model=substitution_model)
        self.checkModel(model)

        print("##### %s : explicit ##########" % (substitution_model))
        model = DNA.buildModel(substitution_model=substitution_model,
                               explicit_extension=True)
        self.checkModel(model)

        num_blocks = 2
        # (banner text, shared_rates, shared_frequencies)
        sharing_modes = (
            ("shared rates", True, False),
            ("shared freqs", False, True),
            ("shared all", True, True),
        )

        for grammar in ("linear-blocks", "multiple-blocks"):

            for title, shared_rates, shared_frequencies in sharing_modes:
                print("##### %s : %s : %s ##########" % (
                    substitution_model, grammar, title))
                model = DNA.buildModel(
                    substitution_model=substitution_model,
                    grammar_type=grammar,
                    shared_rates=shared_rates,
                    shared_frequencies=shared_frequencies,
                    num_blocks=num_blocks)
                self.checkModel(model)

            print("##### %s : %s : shared all with annotations ##########" % (
                substitution_model, grammar))
            ## test model with annotations
            ## build annotation
            # string.ascii_uppercase replaces string.letters.upper(), which
            # is locale-dependent and does not exist in Python 3.
            labels = string.ascii_uppercase
            annotate_terminals = {}
            for x in range(num_blocks):
                key = "B%i" % x
                annotate_terminals[key] = [
                    Annotation(row="STATE",
                               column=key,
                               label=labels[x % len(labels)]),
                ]

            model = DNA.buildModel(substitution_model=substitution_model,
                                   grammar_type=grammar,
                                   shared_rates=True,
                                   shared_frequencies=True,
                                   num_blocks=num_blocks,
                                   annotate_terminals=annotate_terminals)
            self.checkModel(model)