def run(self):
    #---+----|----+----|----+----|----+----|----+----|----+----|----+----|
    """
    Simulates a DNA dataset and stores it in NEXUS format in the supplied
    filename (self.opts.file_name).

    The starting tree is obtained via self.getStartingTree(); if the tree
    held by the likelihood core has no edge lengths, random edge lengths are
    drawn using self.opts.edgelen_dist. After the simulated matrix has been
    saved, the tree used for the simulation is appended to the same file as
    a bracketed NEXUS comment.

    Requirements checked with self.phycassert:
      - at least four labels in self.opts.taxon_labels
      - the number of tips in the starting tree equals the number of labels
    """
    self.starting_tree_source = self.opts.tree_source
    self.getStartingTree()

    ntax = len(self.opts.taxon_labels)
    self.phycassert(ntax > 3, 'Must specify labels for at least four taxa')

    ntips = self.starting_tree.getNTips()
    if self.starting_tree.isRooted():
        # One tip is discounted for rooted trees before comparing with the
        # number of labels (presumably the root is counted as a tip --
        # TODO confirm against the tree class).
        ntips -= 1
    self.phycassert(ntax == ntips, 'Number of tips in tree (%d) does not match number of taxon labels in taxon_labels (%d)' % (ntips,ntax))

    core = LikelihoodCore(self)
    core.setupCore()
    if not core.tree.hasEdgeLens():
        tm = phylogeny.TreeManip(core.tree)
        tm.setRandomEdgeLengths(self.opts.edgelen_dist)

    self.sim_model_tree = core.tree
    core.prepareForSimulation()
    sim_data = core.simulate()
    sim_data.saveToNexusFile(self.opts.file_name, self.opts.taxon_labels, 'dna', ('a','c','g','t'))

    # Build the tree description before reopening the file so that a failure
    # in makeNewick() cannot leave an open handle behind.
    newick = core.tree.makeNewick()

    # Append the tree as a NEXUS comment; try/finally guarantees the handle
    # is released even if the write fails.
    dataf = open(self.opts.file_name,'a')
    try:
        dataf.write('\n[\ntree used for simulation:\n%s\n]\n' % newick)
    finally:
        dataf.close()
def run(self): #---+----|----+----|----+----|----+----|----+----|----+----|----+----| """ Computes the log-likelihood based on the current tree and current model. """ ds = self.opts.data_source mat = ds and ds.getMatrix() or None self.phycassert(self.opts.data_source is not None, "specify data_source before calling like()") self._loadData(mat) self.starting_tree = self.getStartingTree() if self.opts.preorder_edgelens is not None: self.starting_tree.replaceEdgeLens(self.opts.preorder_edgelens) print '@@@@@@@@@@ self.starting_tree.makeNewick() =',self.starting_tree.makeNewick() core = LikelihoodCore(self) core.setupCore() core.prepareForLikelihood() if self.opts.store_site_likes: core.likelihood.storeSiteLikelihoods(True) self.opts.pattern_counts = None self.opts.char_to_pattern = None self.opts.site_likes = None self.opts.site_uf = None else: core.likelihood.storeSiteLikelihoods(False) lnL = core.calcLnLikelihood() if self.opts.store_site_likes: self.opts.pattern_counts = core.likelihood.getPatternCounts() self.opts.char_to_pattern = core.likelihood.getCharIndexToPatternIndex() self.opts.site_likes = core.likelihood.getSiteLikelihoods() self.opts.site_uf = core.likelihood.getSiteUF() return lnL
def run(self):
    #---+----|----+----|----+----|----+----|----+----|----+----|----+----|
    """
    Performs simulations and computes the Gelfand-Ghosh measures Pm, Gm,
    and Dm = Pm + Gm for the parameters and trees specified when this
    object was created.

    For every distinct tree topology (most frequently sampled first) and
    every sampled parameter vector on that topology, self.opts.nreps data
    sets are simulated and folded into the running statistics, after which
    self.ggCalculate() is called.

    Returns the tuple (self.gg_Pm, self.gg_Gm, self.gg_Dm).

    Requirements checked with phycassert:
      - self.opts.trees is a TreeCollection
      - self.opts.params is neither None nor empty
      - the parameter names of the current model match those in the
        parameter file (apart from a fixed list of bookkeeping columns)
    """
    self.outputHeader()

    # Check to make sure user specified an input tree file
    input_trees = self.opts.trees
    self.stdout.phycassert(input_trees is not None, 'trees cannot be None when gg is called')
    self.stdout.phycassert(input_trees.__class__.__name__ == 'TreeCollection', 'trees must be a TreeCollection object')

    # Check to make sure user specified an input params file
    input_params = self.opts.params
    self.stdout.phycassert(input_params is not None and len(input_params) > 0, 'params cannot be None or empty when gg is called')

    # Store transformed edge lengths and parameter values in dictionaries
    # self.edge_lengths and self.parameters, respectively
    print('Harvesting trees and parameters...')
    self.harvestTreesAndParams(input_trees, input_params)

    # Build list of tuples (t,n) where t is the tree id and n is the
    # frequency of that tree in the sample
    tree_id_samplesize_tuples = [(t, self.sample_size[t]) for t in self.parameters.keys()]

    # Sort list so that most commonly-encountered tree topology is first.
    # An ascending stable sort followed by reverse() is used (rather than
    # reverse=True) so that topologies with equal frequency keep the same
    # relative order as they always have.
    tree_id_samplesize_tuples.sort(key=lambda pair: pair[1])
    tree_id_samplesize_tuples.reverse()

    self.loadData()
    self.gg_mu.resizePatternVect(self.nchar)
    most_common_tid, most_common_n = tree_id_samplesize_tuples[0]
    self.curr_treeid, self.n = most_common_tid, most_common_n

    core = LikelihoodCore(self)
    core.setupCore()

    # Check to make sure model hasn't changed since creating the param file.
    # The first sample of the most common topology supplies the file's
    # parameter names.
    param_names_from_model = core.partition_model.getAllParameterNames()
    param_names_from_file = self.parameters[most_common_tid][0].keys()
    exclude_from_comparison = ['Gen','TL','lnL','lnPrior', '1_external_hyper', '1_internal_hyper', '1_edgelen_hyper']
    names_agree = self.loMismo(param_names_from_model, param_names_from_file, exclude_from_comparison)
    # self.last_error is read only after loMismo has run, as before.
    self.phycassert(names_agree, 'Model differs from the model used to generate parameter file (%s)' % self.last_error)

    # Let gg_y contain the observed pattern counts
    core.likelihood.addDataTo(self.gg_y)

    if self.gg_bin_patterns and self.opts.out.bincounts is not None:
        self._openBinCountsFile()

    try:
        #######################
        ###### TREE LOOP ######
        #######################
        prev_pct_done = 0.0
        self.gg_total = 0
        stopwatch = probdist.StopWatch()
        stopwatch.start()
        num_distinct_tree_topologies = len(tree_id_samplesize_tuples)
        self.output('Performing posterior-predictive simulations (%d distinct topologies):' % num_distinct_tree_topologies)
        for tid, sample_size in tree_id_samplesize_tuples:
            self.n = sample_size

            # Replace the tree in likelihood core
            core.setTree(self.tree_objects[tid])
            self.curr_tree = core.tree

            # Build up dictionary of nodes (self.curr_tree_node) in which
            # keys are split representations and values are node objects in
            # self.curr_tree. This will be used later to set edge lengths
            # in the tree.
            self.fillTreeNodeDict()

            # Prepare tree for likelihood calculation by allocating tip and
            # internal data structures
            core.likelihood.prepareForLikelihood(self.curr_tree)

            #########################
            ###### SAMPLE LOOP ######
            #########################
            for param_map, edgelen_map in zip(self.parameters[tid], self.edge_lengths[tid]):
                # Report progress so user doesn't give up. A report is only
                # issued once at least 10% more work has been done, so
                # proportion_finished is never zero below.
                pct_done = 100.0*float(self.gg_total)/float(self.opts.nreps*self.nsamples)
                if pct_done - prev_pct_done >= 10.0:
                    prev_pct_done = pct_done
                    secs = stopwatch.elapsedSeconds()
                    proportion_finished = pct_done/100.0
                    proportion_remaining = 1.0 - proportion_finished
                    eta = secs*proportion_remaining/proportion_finished
                    self.output(' %.0f%% done (%.1fs remaining)...' % (pct_done, eta))

                self.setupModel(tid, core, param_map, edgelen_map)

                # The returned log-likelihood is not used here; the call is
                # kept because it was always made before simulating
                # (presumably it primes the likelihood machinery for this
                # parameter vector -- TODO confirm).
                core.likelihood.calcLnL(self.curr_tree)

                for j in range(self.opts.nreps):
                    self._ggSimulateReplicate(core)

        self.ggCalculate()

        if self.gg_bin_patterns and self.bincountsf is not None:
            self._ggWriteBinCountSummary()
    finally:
        # Close the bin counts file even if something above failed.
        if self.gg_bin_patterns and self.bincountsf is not None:
            self._closeBinCountsFile()

    return (self.gg_Pm, self.gg_Gm, self.gg_Dm)

def _ggSimulateReplicate(self, core):
    #---+----|----+----|----+----|----+----|----+----|----+----|----+----|
    """
    Simulates one posterior-predictive data set using `core` (the
    likelihood core already configured by run()) and folds it into the
    Gelfand-Ghosh bookkeeping: self.gg_t, self.gg_npatterns, the running
    mean self.gg_mu and the counters self.gg_num_post_pred_reps and
    self.gg_total.
    """
    self.gg_num_post_pred_reps += 1.0
    self.gg_simdata = core.simulate()

    # Save the simulated data set if desired
    if self.opts.out.postpred.__bool__():
        self._openPostPredFile()
        try:
            self.gg_simdata.saveToNexusFilePython(self.postpredf, self.taxon_labels, 'dna', ('a','c','g','t'))
        finally:
            # Make sure the file is closed even if saving fails.
            self._closePostPredFile()

    # Compute the t function for the simulated dataset
    if self.gg_bin_patterns:
        curr_t = self.gg_simdata.calctBinned(4,self.minbins)   # 4 = number of states
        self.gg_binned_simdata = self.gg_simdata.getBinnedCounts()
        if self.bincountsf is not None:
            bstr = ['%.1f' % x for x in self.gg_binned_simdata]
            self.bincountsf.write('%s\tposterior predictive replicate\n' % '\t'.join(bstr))
    else:
        curr_t = self.gg_simdata.calct(4)   # 4 = number of states

    # Add this value of t to the list (later the mean t will be computed)
    self.gg_t.append(curr_t)

    # Add the number of patterns in self.gg_simdata to the gg_npatterns list
    self.gg_npatterns.append(self.gg_simdata.getNUniquePatterns())

    # Update running mean vector gg_mu. A running mean is maintained because
    # it is easy for the number of counts of constant patterns to overflow
    # if you wait until the end of the MCMC run to divide by the total.
    # Here is how the running mean is kept. Assume there will be four numbers
    # (a, b, c, d) averaged. Thus, the desired quantity is (a+b+c+d)/4.
    #
    # gg_num_post_pred_reps   self.gg_mu
    # ------------------------------------------------------------
    #           1             a                      = (a)/1
    #           2             (1/2)a + b/2           = (a+b)/2
    #           3             (2/3)[(a+b)/2] + c/3   = (a+b+c)/3
    #           4             (3/4)[(a+b+c)/3] + d/4 = (a+b+c+d)/4
    # ------------------------------------------------------------
    #
    # Note that it is ok if gg_num_post_pred_reps = 1 (in which case
    # gg_mu is multiplied by zero). Because gg_mu is empty, multBy is
    # a no-op in this case
    p = 1.0/self.gg_num_post_pred_reps
    not_p = 1.0 - p
    self.gg_mu.multBy(not_p)
    self.gg_simdata.multBy(p)
    self.gg_simdata.addDataTo(self.gg_mu, 1.0)

    # Increment count of the total number of simulated datasets created
    # This value is used to later compute the mean t for all simulated
    # datasets and the mean counts for all simulated data sets
    self.gg_total += 1

def _ggWriteBinCountSummary(self):
    #---+----|----+----|----+----|----+----|----+----|----+----|----+----|
    """
    Writes the observed bin counts, the mean (mu) bin counts and, for each
    value in self.opts.kvalues, the compromise action bin counts to
    self.bincountsf as tab-separated lines. The caller is responsible for
    checking that the file is open and for closing it afterwards.
    """
    # write observed bin counts
    bstr = ['%.1f' % x for x in self.gg_binned_y]
    self.bincountsf.write('%s\tobserved\n' % '\t'.join(bstr))

    # write mu bin counts
    bstr = ['%.1f' % x for x in self.gg_binned_mu]
    self.bincountsf.write('%s\tmu\n' % '\t'.join(bstr))

    # write compromise action bin counts
    for i,k in enumerate(self.opts.kvalues):
        bstr = ['%.1f' % x for x in self.gg_a[i]]
        self.bincountsf.write('%s\ta for k=%.1f\n' % ('\t'.join(bstr),k))