Example #1
    def run(self):
        #---+----|----+----|----+----|----+----|----+----|----+----|----+----|
        """
        Simulates a DNA data set and saves it in NEXUS format to the file
        named by self.opts.file_name.

        """
        self.starting_tree_source = self.opts.tree_source
        self.getStartingTree()
        ntax = len(self.opts.taxon_labels)
        self.phycassert(ntax > 3, 'Must specify labels for at least four taxa')

        ntips = self.starting_tree.getNTips()
        if self.starting_tree.isRooted():
            ntips -= 1
        self.phycassert(ntax == ntips, 'Number of tips in tree (%d) does not match number of taxon labels in taxon_labels (%d)' % (ntips,ntax))

        core = LikelihoodCore(self)
        core.setupCore()
        if not core.tree.hasEdgeLens():
            tm = phylogeny.TreeManip(core.tree)
            tm.setRandomEdgeLengths(self.opts.edgelen_dist)
        self.sim_model_tree = core.tree
        core.prepareForSimulation()
        sim_data = core.simulate()
        sim_data.saveToNexusFile(self.opts.file_name, self.opts.taxon_labels, 'dna', ('a','c','g','t'))
        with open(self.opts.file_name, 'a') as dataf:
            dataf.write('\n[\ntree used for simulation:\n%s\n]\n' % core.tree.makeNewick())
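For reference, the `saveToNexusFile` call above writes the simulated matrix as a NEXUS data file. Below is a minimal sketch of that file layout in plain Python; the `write_nexus` helper and its toy sequences are illustrative placeholders, not part of Phycas (which also emits a TAXA block, state symbols, and the appended tree comment).

# Minimal sketch of a NEXUS DATA block for simulated DNA sequences.
def write_nexus(filename, taxon_labels, sequences):
    with open(filename, 'w') as f:
        f.write('#NEXUS\n\nbegin data;\n')
        f.write('  dimensions ntax=%d nchar=%d;\n' % (len(taxon_labels), len(sequences[0])))
        f.write('  format datatype=dna;\n')
        f.write('  matrix\n')
        for label, seq in zip(taxon_labels, sequences):
            f.write('    %s  %s\n' % (label, seq))
        f.write('  ;\nend;\n')

write_nexus('simulated.nex', ['taxon1', 'taxon2', 'taxon3', 'taxon4'],
            ['acgt', 'acga', 'gcgt', 'acct'])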
Example #2
    def run(self):
        #---+----|----+----|----+----|----+----|----+----|----+----|----+----|
        """
        Computes the log-likelihood based on the current tree and current
        model.

        """
        self.phycassert(self.opts.data_source is not None, "specify data_source before calling like()")
        ds = self.opts.data_source
        mat = ds and ds.getMatrix() or None
        self._loadData(mat)

        self.starting_tree = self.getStartingTree()
        if self.opts.preorder_edgelens is not None:
            self.starting_tree.replaceEdgeLens(self.opts.preorder_edgelens)
            print 'DEBUG: self.starting_tree.makeNewick() =', self.starting_tree.makeNewick()
        core = LikelihoodCore(self)
        core.setupCore()
        core.prepareForLikelihood()
        if self.opts.store_site_likes:
            core.likelihood.storeSiteLikelihoods(True)
            self.opts.pattern_counts = None
            self.opts.char_to_pattern = None
            self.opts.site_likes = None
            self.opts.site_uf = None
        else:
            core.likelihood.storeSiteLikelihoods(False)
        lnL = core.calcLnLikelihood()
        if self.opts.store_site_likes:
            self.opts.pattern_counts = core.likelihood.getPatternCounts()
            self.opts.char_to_pattern = core.likelihood.getCharIndexToPatternIndex()
            self.opts.site_likes = core.likelihood.getSiteLikelihoods()
            self.opts.site_uf = core.likelihood.getSiteUF()
        return lnL
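The line `mat = ds and ds.getMatrix() or None` in Example #2 uses the old Python `and/or` conditional idiom. A minimal standalone sketch of the idiom and its well-known pitfall follows; the `Source` class is a made-up stand-in, not a Phycas type.

class Source(object):
    """Hypothetical stand-in for a data source with a getMatrix() method."""
    def __init__(self, matrix):
        self.matrix = matrix
    def getMatrix(self):
        return self.matrix

ds = Source([['a', 'c'], ['g', 't']])
mat = ds and ds.getMatrix() or None     # old and/or conditional idiom
print(mat)                              # [['a', 'c'], ['g', 't']]

# Pitfall: if getMatrix() returns something falsy (e.g. an empty
# matrix), the idiom falls through to the `or` branch and yields
# None even though ds itself was not None.
ds = Source([])
print(ds and ds.getMatrix() or None)    # None, not []

# The unambiguous modern spelling is a conditional expression:
print(ds.getMatrix() if ds is not None else None)   # []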
Example #3
    def run(self):
        #---+----|----+----|----+----|----+----|----+----|----+----|----+----|
        """
        Performs simulations and computes the Gelfand-Ghosh measures Pm, Gm,
        and Dm = Pm + Gm for the parameters and trees specified when this
        object was created.

        """
        self.outputHeader()

        # Check to make sure user specified an input tree file
        input_trees = self.opts.trees
        self.stdout.phycassert(input_trees is not None, 'trees cannot be None when gg is called')
        self.stdout.phycassert(input_trees.__class__.__name__ == 'TreeCollection', 'trees must be a TreeCollection object')

        # Check to make sure user specified an input params file
        input_params = self.opts.params
        self.stdout.phycassert(input_params is not None and len(input_params) > 0, 'params cannot be None or empty when gg is called')

        # Store transformed edge lengths and parameter values in dictionaries self.edge_lengths and self.parameters, respectively
        self.output('Harvesting trees and parameters...')
        self.harvestTreesAndParams(input_trees, input_params)

        # Build list of tuples (t,n) where t is the tree id and n is the frequency of that tree in the sample
        tree_id_samplesize_tuples = []
        for tid in self.parameters.keys():
            tree_id_samplesize_tuples.append((tid,self.sample_size[tid]))

        # Sort list so that most commonly-encountered tree topology is first
        tree_id_samplesize_tuples.sort(key=lambda x: x[1], reverse=True)

        self.loadData()
        self.gg_mu.resizePatternVect(self.nchar)

        self.curr_treeid, self.n = tree_id_samplesize_tuples[0]

        if False:
            # Disabled alternative: create a chain manager and (single) chain; no MCMC
            # is done, but the chain knows how to interact with the model and compute
            # likelihoods and priors
            mcmc_manager = MCMCManager(self)
            mcmc_manager.createChains()
            cc = mcmc_manager.getColdChain()
            core = cc.likelihood
        core = LikelihoodCore(self)
        core.setupCore()

        # Check to make sure model hasn't changed since creating the param file
        param_names_from_model = core.partition_model.getAllParameterNames()
        param_names_from_file  = self.parameters[self.curr_treeid][0].keys()
        exclude_from_comparison = ['Gen','TL','lnL','lnPrior', '1_external_hyper', '1_internal_hyper', '1_edgelen_hyper']
        #print '\n*******************\nparam_names_from_model:',param_names_from_model
        #print '\n*******************\nparam_names_from_file:',param_names_from_file
        self.phycassert(self.loMismo(param_names_from_model, param_names_from_file, exclude_from_comparison), 'Model differs from the model used to generate parameter file (%s)' % self.last_error)

        # Let gg_y contain the observed pattern counts
        core.likelihood.addDataTo(self.gg_y)

        #POLOLD
        #if self.gg_bin_patterns:
        #    self.gg_binned_y = self.gg_y.getBinnedCounts()

        if self.gg_bin_patterns and self.opts.out.bincounts is not None:
            self._openBinCountsFile()

        #######################
        ###### TREE LOOP ######
        #######################
        prev_pct_done = 0.0
        prev_secs = 0.0
        self.gg_total = 0
        stopwatch = probdist.StopWatch()
        stopwatch.start()
        num_distinct_tree_topologies = len(tree_id_samplesize_tuples)
        self.output('Performing posterior-predictive simulations (%d distinct topologies):' % num_distinct_tree_topologies)
        for tnum,(tid,sample_size) in enumerate(tree_id_samplesize_tuples):
            self.n = sample_size
            #self.output('\nEvaluating tree = %d (%s)...' % (tnum+1,sample_size == 1 and '1 sample' or '%d samples' % sample_size))

            # Replace the tree in likelihood core
            core.setTree(self.tree_objects[tid])
            self.curr_tree = core.tree

            # Build up dictionary of nodes (self.curr_tree_node) in which keys are split representations and values
            # are node objects in self.curr_tree. This will be used later to set edge lengths in the tree.
            self.fillTreeNodeDict()

            # Prepare tree for likelihood calculation by allocating tip and internal data structures
            core.likelihood.prepareForLikelihood(self.curr_tree)
            #core.likelihood.prepareForSimulation(self.curr_tree)

            #########################
            ###### SAMPLE LOOP ######
            #########################
            for param_map,edgelen_map in zip(self.parameters[tid],self.edge_lengths[tid]):

                # Report progress so user doesn't give up
                pct_done = 100.0*float(self.gg_total)/float(self.opts.nreps*self.nsamples)
                if pct_done - prev_pct_done >= 10.0:
                    prev_pct_done = pct_done
                    secs = stopwatch.elapsedSeconds()
                    proportion_finished = pct_done/100.0
                    proportion_remaining = 1.0 - proportion_finished
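                    # ETA = elapsed time scaled by (work remaining)/(work done)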
                    eta = secs*proportion_remaining/proportion_finished
                    self.output('  %.0f%% done (%.1fs remaining)...' % (pct_done, eta))
                    prev_secs = secs
                    cum_caltbinned_secs = 0.0

                self.setupModel(tid, core, param_map, edgelen_map)

                #core.likelihood.storeSiteLikelihoods(True)
                #core.likelihood.storeAllCLAs(self.curr_tree)

                lnL = core.likelihood.calcLnL(self.curr_tree)

                #print '-->',lnL,param_map['lnL']
                #siteLnLs = core.likelihood.getSiteLikelihoods()
                #counts = core.likelihood.getPatternCounts()
                #patterns = core.likelihood.listPatterns()
                #doof = open('doof_site_likes.txt','w')
                #doof.write('Patterns:\n%s\n\n' % patterns)
                #for s,c in zip(siteLnLs,counts):
                #    doof.write('%g\t%g\n' % (s,c))
                #doof.close()
                #core.likelihood.debugUncompressedDataInfo("all-site-patterns.txt");

                for j in range(self.opts.nreps):
                    self.gg_num_post_pred_reps += 1.0

                    self.gg_simdata = core.simulate()

                    # Save the simulated data set if desired
                    if self.opts.out.postpred.__bool__():
                        self._openPostPredFile()
                        self.gg_simdata.saveToNexusFilePython(self.postpredf, self.taxon_labels, 'dna', ('a','c','g','t'))
                        #self.gg_simdata.saveToNexusFile('doof.nex', self.taxon_labels, 'dna', ('a','c','g','t'))
                        #self.debugPAUPNexus(self.opts.out.postpred._getFilename(), core)
                        self._closePostPredFile()

                    if self.gg_bin_patterns:
                        # Compute the t function for the simulated dataset

                        curr_t = self.gg_simdata.calctBinned(4,self.minbins) # 4 = number of states

                        self.gg_binned_simdata = self.gg_simdata.getBinnedCounts()

                        if self.bincountsf is not None:
                            bstr = ['%.1f' % x for x in self.gg_binned_simdata]
                            self.bincountsf.write('%s\tposterior predictive replicate\n' % '\t'.join(bstr))
                    else:
                        # Compute the t function for the simulated dataset
                        curr_t = self.gg_simdata.calct(4)

                    # Add this value of t to the list (later the mean t will be computed)
                    self.gg_t.append(curr_t)

                    # Add the number of patterns in self.gg_simdata to the gg_npatterns list
                    self.gg_npatterns.append(self.gg_simdata.getNUniquePatterns())

                    # Update running mean vector gg_mu. A running mean is maintained because
                    # it is easy for the number of counts of constant patterns to overflow
                    # if you wait until the end of the MCMC run to divide by the total.
                    # Here is how the running mean is kept. Assume there will be four numbers
                    # (a, b, c, d) averaged. Thus, the desired quantity is (a+b+c+d)/4.
                    #
                    # gg_num_post_pred_reps   self.gg_mu
                    # ------------------------------------------------------------
                    #           1             a                      = (a)/1
                    #           2             (1/2)a + b/2           = (a+b)/2
                    #           3             (2/3)[(a+b)/2] + c/3   = (a+b+c)/3
                    #           4             (3/4)[(a+b+c)/3] + d/4 = (a+b+c+d)/4
                    # ------------------------------------------------------------
                    #
                    # Note that it is ok if gg_num_post_pred_reps = 1 (in which
                    # case gg_mu is multiplied by zero); because gg_mu is still
                    # empty at that point, multBy is a no-op.
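                    # Concretely, with (a,b,c,d) = (2,4,6,8) the fourth update
                    # gives (3/4)*4 + 8/4 = 5, which equals (2+4+6+8)/4, so the
                    # running mean matches the plain mean without ever storing
                    # the raw sum.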
                    p = 1.0/self.gg_num_post_pred_reps
                    not_p = 1.0 - p

                    self.gg_mu.multBy(not_p)
                    self.gg_simdata.multBy(p)
                    self.gg_simdata.addDataTo(self.gg_mu, 1.0)

                    # Increment count of the total number of simulated datasets created
                    # This value is used to later compute the mean t for all simulated datasets
                    # and the mean counts for all simulated data sets
                    self.gg_total += 1

        self.ggCalculate()

        if self.gg_bin_patterns and self.bincountsf is not None:
            # write observed bin counts
            bstr = ['%.1f' % x for x in self.gg_binned_y]
            self.bincountsf.write('%s\tobserved\n' % '\t'.join(bstr))

            # write mu bin counts
            bstr = ['%.1f' % x for x in self.gg_binned_mu]
            self.bincountsf.write('%s\tmu\n' % '\t'.join(bstr))

            # write compromise action bin counts
            for i,k in enumerate(self.opts.kvalues):
                bstr = ['%.1f' % x for x in self.gg_a[i]]
                self.bincountsf.write('%s\ta for k=%.1f\n' % ('\t'.join(bstr),k))

            self._closeBinCountsFile()

        return (self.gg_Pm, self.gg_Gm, self.gg_Dm)
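The running-mean update in the sample loop above is a self-contained numerical trick. Below is a minimal sketch in plain Python (no Phycas objects, data is random placeholder values) showing that the recurrence mu = (1 - 1/n)*mu + x/n reproduces the ordinary arithmetic mean.

import random

def running_mean(values):
    """Update mu via mu = (1 - 1/n)*mu + x/n, mirroring the gg_mu update:
    multBy(1 - p) on the accumulator, multBy(p) on the new data, then add."""
    mu = 0.0
    for n, x in enumerate(values, 1):
        p = 1.0 / n
        mu = (1.0 - p) * mu + p * x
    return mu

xs = [random.random() for _ in range(1000)]
assert abs(running_mean(xs) - sum(xs) / len(xs)) < 1e-9
print('running mean matches plain mean')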