def prepareGrammar(options):
    """Build the multi-block "gtr" input model used for custom grammars.

    Reads ``options.num_blocks``, ``options.grammar_type`` and
    ``options.probability_block``.  Every block terminal ``B<i>`` gets one
    ``STATE`` annotation, the six per-block parameters are replaced by
    explicit ones with fixed values, and the second element of ``mRate`` of
    every rule leaving a ``NT_B<i>*`` pseudo-nonterminal is overwritten.
    The model is passed to ``writeModel`` under the name "input" and then
    returned.
    """
    num_blocks = options.num_blocks

    # One STATE annotation per block terminal, labelled by letter.
    alphabet = string.letters.upper()
    annotate_terminals = {}
    for block in range(num_blocks):
        terminal = "B%i" % block
        annotate_terminals[terminal] = [
            Annotation(row="STATE",
                       column=terminal,
                       label=alphabet[block % len(alphabet)]),
        ]

    input_model = DNA.buildModel(
        substitution_model="gtr",
        num_blocks=num_blocks,
        grammar_type=options.grammar_type,
        shared_frequencies=False,
        shared_rates=False,
        annotate_terminals=annotate_terminals,
    )

    # Replace each block's parameters by explicit ones with a fixed value.
    # NOTE(review): the source this was reconstructed from had lost its
    # indentation; the increment is assumed to happen once per block (not
    # once per parameter) - confirm against the original file.
    model_grammar = input_model.mGrammar
    rate = 0.2
    for block in range(num_blocks):
        for suffix in ("alpha", "beta", "gamma", "delta", "theta", "epsilon"):
            name = "B%i_%s" % (block, suffix)
            model_grammar.removeParameter(name)
            model_grammar.addParameter((name, rate), is_explicit=True)
        rate += 0.1

    rules = input_model.mGrammar.mRules

    # Maps each starred pseudo-nonterminal to its plain counterpart.
    pseudononterminals = dict(
        (("NT_B%i*" % block,), ("NT_B%i" % block,))
        for block in range(num_blocks))

    prob_same = options.probability_block
    prob_diff = (1.0 - prob_same) / len(pseudononterminals)

    for source, own_target in pseudononterminals.items():
        for target, rule in rules[source].items():
            # The rule into the block's own nonterminal gets prob_same,
            # every other rule from this source gets prob_diff.
            probability = prob_same if target == own_target else prob_diff
            rule.mRate = (rule.mRate[0], probability)

    writeModel(input_model, "input", options)

    return input_model
# Defaults for the command line options.
parser.set_defaults(
    loglevel=1,
    model="jc69",
    test=True,
    write=[],
    output_pattern="%s.eg",
    stdout=sys.stdout,
    stdlog=sys.stdout,
    value_format="%6.4f",
)

(options, args) = parser.parse_args()

xgram = XGram.XGram()
model = DNA.buildModel(substitution_model=options.model)

# print model.getGrammar()

# The first positional argument names the data file; otherwise fall back
# to the alignment under XGram.PATH_DATA.
data = args[0] if len(args) > 0 else XGram.PATH_DATA + "/dpse_dmel.stk"

# In test mode debugging is switched on and the data file is always the
# one under XGram.PATH_DATA, overriding any positional argument.
if options.test:
    xgram.setDebug()
    data = XGram.PATH_DATA + "/dpse_dmel.stk"

# print trained_model.getGrammar()

print("result according to %s" % options.model)
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write them to options.stdout.

    For every ``(x, y)`` in ``pairs`` the two sequences are written as a
    two-sequence stockholm alignment into a temporary file, the model
    selected by ``options.distance`` is fitted through XGram and one
    tab-separated row is written to ``options.stdout``.

    Options read: ``distance`` ("K80", "JC69" or "REV"),
    ``xrate_min_increment``, ``output_format``, ``with_counts``,
    ``is_codons``, ``test_xrate``, ``format``, ``stdout``.

    If ``options.test_xrate`` is set, each pair is instead trained from a
    4x4 grid of alpha/beta start values and one row per start point is
    written; the regular per-pair row is skipped.

    Raises:
        ValueError: if ``options.distance`` is not one of the supported
            values.

    The temporary directory is removed even if an error occurs.
    """
    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    # Binds the local name ``XGram`` used below.
    import XGram.Run

    # options.distance -> substitution model name understood by DNA.buildModel
    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}
    if options.distance not in substitution_models:
        # String exceptions are not valid; raise a real exception instead.
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    tempdir = tempfile.mkdtemp()
    try:
        data = tempdir + "/data"

        model = DNA.buildModel(
            substitution_model=substitution_models[options.distance])

        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        # NOTE(review): ``ids`` is not defined in this function; presumably a
        # module-level list of sequence identifiers - confirm against caller.
        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a filter skipping pairs with fewer than
            # options.min_overlap aligned residues was disabled here.

            # The "#=GF NH" line supplies a two-leaf tree with branch
            # length 1.0 for the first sequence.
            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()),))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # Train from a grid of start values, one output row each.
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha,
                             "%f" % beta,
                             o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha,
                             o_kappa,
                             msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                trained_grammar = trained_model.mGrammar
                alpha = trained_grammar.getParameter('alpha')
                beta = trained_grammar.getParameter('beta')
                gamma = trained_grammar.getParameter('gamma')
                delta = trained_grammar.getParameter('delta')
                epsilon = trained_grammar.getParameter('epsilon')
                theta = trained_grammar.getParameter('theta')

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0',))[('A0',)]
                matrix = trained_model.evaluateRateMatrix(('A0',))[('A0',)]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(
                        re.search(r"\(\S+:([0-9.]+)\)", tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance,
                 options.format % result.getLogLikelihood(),
                 o_alpha,
                 o_kappa,
                 msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        # Always clean up the scratch directory, also on errors.
        shutil.rmtree(tempdir)
## without omega, we have a plain nucleotide model. Because we work in codon space, ## the branch length needs to be 20 * 20 / 4 * 4 as long = 25 omega = 25.0 * options.ds not_omega = 25.0 * options.ds * options.omega input_model.mGrammar.setParameter("omega", omega) input_model.mGrammar.setParameter("not_omega", not_omega) elif options.model in ("K80"): if not (options.omega == None or options.omega == 1.0): raise "can only accept 1.0 for omega using the kimura model." if options.model == "K80": input_model = DNA.buildModel(substitution_model="k80", explicit_extension=True) alpha = options.ds * options.kappa / (1.0 + 2.0 * options.kappa) beta = options.ds / (1.0 + 2.0 * options.kappa) if options.loglevel >= 1: options.stdlog.write( "# computed parameters: alpha=%6.4f, beta=%6.4f\n" % (alpha, beta)) input_model.mGrammar.setParameter("alpha", alpha) input_model.mGrammar.setParameter("beta", beta) ## set ext and not_ext to allow for long chains input_model.mGrammar.setParameter("ext", "0.999") input_model.mGrammar.setParameter("not_ext", "0.001")
def buildAndCheckModel(self, substitution_model):
    """Build ``substitution_model`` in several configurations and check each.

    For every configuration a banner is printed, the model is built with
    ``DNA.buildModel`` and handed to ``self.checkModel``.  Covered are the
    default model, the model with ``explicit_extension=True`` and, for the
    "linear-blocks" and "multiple-blocks" grammars with two blocks, shared
    rates, shared frequencies, both shared, and both shared with
    annotated terminals.
    """
    print("##### %s : default ##########" % (substitution_model))
    self.checkModel(DNA.buildModel(substitution_model=substitution_model))

    print("##### %s : explicit ##########" % (substitution_model))
    self.checkModel(DNA.buildModel(substitution_model=substitution_model,
                                   explicit_extension=True))

    num_blocks = 2

    # (banner, shared_rates, shared_frequencies), in the order they are run
    sharing_modes = (
        ("##### %s : %s : shared rates ##########", True, False),
        ("##### %s : %s : shared freqs ##########", False, True),
        ("##### %s : %s : shared all ##########", True, True),
    )

    for grammar in ("linear-blocks", "multiple-blocks"):

        for banner, share_rates, share_freqs in sharing_modes:
            print(banner % (substitution_model, grammar))
            self.checkModel(
                DNA.buildModel(substitution_model=substitution_model,
                               grammar_type=grammar,
                               shared_rates=share_rates,
                               shared_frequencies=share_freqs,
                               num_blocks=num_blocks))

        print("##### %s : %s : shared all with annotations ##########" %
              (substitution_model, grammar))

        # One STATE annotation per block terminal, labelled by letter.
        # Built afresh for every grammar.
        alphabet = string.letters.upper()
        annotate_terminals = {}
        for block in range(num_blocks):
            terminal = "B%i" % block
            annotate_terminals[terminal] = [
                Annotation(row="STATE",
                           column=terminal,
                           label=alphabet[block % len(alphabet)]),
            ]

        self.checkModel(
            DNA.buildModel(substitution_model=substitution_model,
                           grammar_type=grammar,
                           shared_rates=True,
                           shared_frequencies=True,
                           num_blocks=num_blocks,
                           annotate_terminals=annotate_terminals))
## without omega, we have a plain nucleotide model. Because we work in codon space, ## the branch length needs to be 20 * 20 / 4 * 4 as long = 25 omega = 25.0 * options.ds not_omega = 25.0 * options.ds * options.omega input_model.mGrammar.setParameter( "omega", omega ) input_model.mGrammar.setParameter( "not_omega", not_omega ) elif options.model in ("K80" ): if not (options.omega == None or options.omega == 1.0): raise "can only accept 1.0 for omega using the kimura model." if options.model == "K80": input_model = DNA.buildModel( substitution_model = "k80", explicit_extension = True ) alpha = options.ds * options.kappa / (1.0 + 2.0 * options.kappa ) beta = options.ds / (1.0 + 2.0 * options.kappa ) if options.loglevel >= 1: options.stdlog.write("# computed parameters: alpha=%6.4f, beta=%6.4f\n" % (alpha, beta) ) input_model.mGrammar.setParameter( "alpha", alpha ) input_model.mGrammar.setParameter( "beta", beta ) ## set ext and not_ext to allow for long chains input_model.mGrammar.setParameter( "ext", "0.999" ) input_model.mGrammar.setParameter( "not_ext", "0.001" ) writeModel( input_model, "input", options )
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate and write them to options.stdout.

    For every ``(x, y)`` in ``pairs`` the two sequences are written as a
    two-sequence stockholm alignment into a temporary file, the model
    selected by ``options.distance`` is fitted through XGram and one
    tab-separated row is written to ``options.stdout``.

    Options read: ``distance`` ("K80", "JC69" or "REV"),
    ``xrate_min_increment``, ``output_format``, ``with_counts``,
    ``is_codons``, ``test_xrate``, ``format``, ``stdout``.

    If ``options.test_xrate`` is set, each pair is instead trained from a
    4x4 grid of alpha/beta start values and one row per start point is
    written; the regular per-pair row is skipped.

    Raises:
        ValueError: if ``options.distance`` is not one of the supported
            values.

    The temporary directory is removed even if an error occurs.
    """
    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    # Binds the local name ``XGram`` used below.
    import XGram.Run

    # options.distance -> substitution model name understood by DNA.buildModel
    substitution_models = {"K80": "k80", "JC69": "jc69", "REV": "gtr"}
    if options.distance not in substitution_models:
        # String exceptions are not valid; raise a real exception instead.
        raise ValueError(
            "distance %s not implemented for xrate" % (options.distance))

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    tempdir = tempfile.mkdtemp()
    try:
        data = tempdir + "/data"

        model = DNA.buildModel(
            substitution_model=substitution_models[options.distance])

        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        # NOTE(review): ``ids`` is not defined in this function; presumably a
        # module-level list of sequence identifiers - confirm against caller.
        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            m2 = mali.getSequence(ids[y])

            temp_mali = Mali.Mali()
            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            # NOTE: a filter skipping pairs with fewer than
            # options.min_overlap aligned residues was disabled here.

            # The "#=GF NH" line supplies a two-leaf tree with branch
            # length 1.0 for the first sequence.
            with open(data, "w") as outfile:
                temp_mali.writeToFile(
                    outfile,
                    format="stockholm",
                    write_ranges=False,
                    options=("#=GF NH (%s:1.0)%s;" %
                             tuple(temp_mali.getIdentifiers()),))

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # Train from a grid of start values, one output row each.
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha = trained_model.mGrammar.getParameter('alpha')
                        xbeta = trained_model.mGrammar.getParameter('beta')
                        # this assumes that the branch length in the input
                        # is normalized to 1
                        # this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)

                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha,
                             "%f" % beta,
                             o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha,
                             o_kappa,
                             msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance == "K80":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha = trained_model.mGrammar.getParameter('alpha')
                beta = trained_model.mGrammar.getParameter('beta')
                # this assumes that the branch length in the input is
                # normalized to 1
                # this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)

            elif options.distance == "REV":
                result = xgram.train(model, data)
                trained_model = result.getModel()
                trained_grammar = trained_model.mGrammar
                alpha = trained_grammar.getParameter('alpha')
                beta = trained_grammar.getParameter('beta')
                gamma = trained_grammar.getParameter('gamma')
                delta = trained_grammar.getParameter('delta')
                epsilon = trained_grammar.getParameter('epsilon')
                theta = trained_grammar.getParameter('theta')

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0',))[('A0',)]
                matrix = trained_model.evaluateRateMatrix(('A0',))[('A0',)]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)

            elif options.distance == "JC69":
                result = xgram.buildTree(model, data)
                tree = result.getTree()
                # multiply distance by tree, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 * float(
                        re.search(r"\(\S+:([0-9.]+)\)", tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance,
                 options.format % result.getLogLikelihood(),
                 o_alpha,
                 o_kappa,
                 msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        # Always clean up the scratch directory, also on errors.
        shutil.rmtree(tempdir)
def buildAndCheckModel(self, substitution_model):
    """Build ``substitution_model`` in several configurations and check each.

    Each configuration prints a banner, builds the model through
    ``DNA.buildModel`` and passes it to ``self.checkModel``: first the
    default model, then the one with ``explicit_extension=True``, then for
    each of the "linear-blocks" and "multiple-blocks" grammars (two blocks)
    the variants with shared rates, shared frequencies, both shared, and
    both shared plus annotated terminals.
    """
    print("##### %s : default ##########" % (substitution_model))
    plain_model = DNA.buildModel(substitution_model=substitution_model)
    self.checkModel(plain_model)

    print("##### %s : explicit ##########" % (substitution_model))
    explicit_model = DNA.buildModel(substitution_model=substitution_model,
                                    explicit_extension=True)
    self.checkModel(explicit_model)

    num_blocks = 2

    # Banner plus the (shared_rates, shared_frequencies) flags, in run order.
    variants = (
        ("##### %s : %s : shared rates ##########", (True, False)),
        ("##### %s : %s : shared freqs ##########", (False, True)),
        ("##### %s : %s : shared all ##########", (True, True)),
    )

    for grammar in ("linear-blocks", "multiple-blocks"):

        for banner, (rates_shared, freqs_shared) in variants:
            print(banner % (substitution_model, grammar))
            block_model = DNA.buildModel(
                substitution_model=substitution_model,
                grammar_type=grammar,
                shared_rates=rates_shared,
                shared_frequencies=freqs_shared,
                num_blocks=num_blocks)
            self.checkModel(block_model)

        print("##### %s : %s : shared all with annotations ##########" %
              (substitution_model, grammar))

        # test model with annotations: one STATE annotation per block
        # terminal, rebuilt for every grammar
        letters = string.letters.upper()
        annotate_terminals = {}
        for index in range(num_blocks):
            column = "B%i" % index
            state = Annotation(row="STATE",
                               column=column,
                               label=letters[index % len(letters)])
            annotate_terminals[column] = [state]

        annotated_model = DNA.buildModel(
            substitution_model=substitution_model,
            grammar_type=grammar,
            shared_rates=True,
            shared_frequencies=True,
            num_blocks=num_blocks,
            annotate_terminals=annotate_terminals)
        self.checkModel(annotated_model)