def _log_likelihood(self, phi, alleleConfig, baseline, maxCopyNumber, update_tree=True):
    """Return the read-depth/BAF log likelihood of this object for ``phi``.

    The work is delegated to ``self.__log_likelihood_RD_BAF``.  When
    ``self.fixedC`` is negative the helper is called without a copy number;
    when it is zero or positive, ``self.fixedC`` is forwarded as an extra
    trailing argument.

    Args:
        phi: Forwarded unchanged to the helper.
        alleleConfig: Forwarded unchanged to the helper.
        baseline: Forwarded unchanged to the helper.
        maxCopyNumber: Forwarded unchanged to the helper.
        update_tree: When true, refresh the cached tree bookkeeping on
            ``self.tssb`` (node heights, root-to-node paths, datum-to-node
            map) before evaluating the likelihood.

    Returns:
        Whatever ``self.__log_likelihood_RD_BAF`` returns.

    Raises:
        Exception: If ``self.fixedC`` is neither ``< 0`` nor ``>= 0``
            (for instance a NaN).
    """
    if update_tree:
        # Refresh tree information used by the CNV-related computations.
        set_node_height(self.tssb)
        set_path_from_root_to_node(self.tssb)
        map_datum_to_node(self.tssb)

    # Notes carried over from the original author (translated):
    # - This evaluation should not be constrained by CN/genotype, and the
    #   target's CN and genotype are not recorded here.
    # - The parameter used here is not the exact phi, so the optimum cannot
    #   be reached; CN and genotype are left free so that the best structure
    #   can still be sampled.
    # - The temporal ordering can affect the position of the sequencing
    #   data; this only happens where positions overlap.
    # - Default parameters could be set here.
    fixed_c = self.fixedC

    if fixed_c < 0:
        # No fixed copy number: let the helper use its own default.
        return self.__log_likelihood_RD_BAF(
            phi, alleleConfig, baseline, maxCopyNumber)

    if fixed_c >= 0:
        return self.__log_likelihood_RD_BAF(
            phi, alleleConfig, baseline, maxCopyNumber, fixed_c)

    # Only reachable when both comparisons above are false.
    raise Exception("fixedC is abnormal")
def _log_likelihood_GivenC(self, phi, C, alleleConfig, baseline, maxCopyNumber, update_tree=False):
    """Return the log likelihood of this object for a given copy number ``C``.

    ``C`` is first checked against the copy numbers permitted for this
    object:

    * ``[2]`` when ``self.tag == "BASELINE"``;
    * ``3 .. maxCopyNumber`` when ``get_loga(self) > baseline``;
    * ``0 .. 1`` otherwise.

    If ``C`` is not permitted the result is ``-inf`` and ``self`` is left
    untouched.  Otherwise the likelihood comes from ``self._getLLStripe`` and
    ``self.copyNumber``, ``self.genotype`` and ``self.phi`` are overwritten
    with ``C``, the genotype returned by that call, and ``phi``.

    Args:
        phi: Forwarded to ``self._getLLStripe`` and stored on ``self.phi``.
        C: Candidate copy number.
        alleleConfig: Forwarded to ``self._getLLStripe``.
        baseline: Threshold compared against ``get_loga(self)``; also
            forwarded to ``self._getLLStripe``.
        maxCopyNumber: Inclusive upper bound of the permitted range when
            ``get_loga(self) > baseline``.
        update_tree: When true, refresh the cached tree bookkeeping on
            ``self.tssb`` before evaluating the likelihood.

    Returns:
        The log likelihood from ``self._getLLStripe``, or ``-inf`` when ``C``
        is not a permitted copy number.
    """
    if update_tree:
        # Refresh tree information used by the CNV-related computations.
        set_node_height(self.tssb)
        set_path_from_root_to_node(self.tssb)
        map_datum_to_node(self.tssb)

    if self.tag == "BASELINE":
        permitted = [2]
    else:
        above_baseline = get_loga(self) > baseline
        permitted = (range(3, maxCopyNumber + 1) if above_baseline
                     else range(0, 2))

    if C not in permitted:
        return -float("Inf")

    log_lik, genotype = self._getLLStripe(C, phi, baseline, alleleConfig)

    # Remember the configuration that produced this likelihood.
    self.copyNumber = C
    self.genotype = genotype
    self.phi = phi
    return log_lik
def do_mcmc(stateManager, backupManager, safeToExit, runSucceeded, config, state, treeWriter, stripes, stripeNum, tmpDir):
    """Run the MCMC sampling loop and persist samples, state and summaries.

    Iterations run from ``state['last_iteration'] + 1`` up to (but excluding)
    ``state['sample_number']``.  Negative iteration numbers are recorded in
    ``state['burnin_cd_llh_traces']``; non-negative ones in
    ``state['cd_llh_traces']``.  Every sampled tree is pickled and queued, and
    the queue is flushed to ``mcmc_samples.txt`` / ``treeWriter`` /
    ``stateManager`` on the schedule given by ``state['write_state_every']``
    and ``state['write_backups_every']`` and on the final iteration.

    Args:
        stateManager: Object whose ``write_state(state)`` persists ``state``.
        backupManager: Object providing ``save_backup()`` and
            ``remove_backup()``.
        safeToExit: Event-like object; set while interruption is acceptable
            and cleared around file I/O.
        runSucceeded: Event-like object; set once the run has completed.
        config: Mutable mapping; ``config['tmp_dir']`` is overwritten with a
            freshly created temporary directory.
        state: Mutable mapping holding the sampler state; updated in place.
        treeWriter: Object whose ``write_trees(list)`` stores queued samples.
        stripes: Not used by this function.
        stripeNum: Forwarded to ``metropolis``.
        tmpDir: Parent directory for the temporary directory, or ``None`` to
            use the system default.

    Returns:
        None.
    """
    startIter = state['last_iteration'] + 1
    unwrittenTreeL = []
    mcmcSampleTimesL = []
    lastMcmcSampleTime = time.time()

    # If --tmp-dir is not specified on the command line, it will by default be
    # None, which will cause mkdtemp() to place this directory under the
    # system's temporary directory. This is the desired behaviour.
    config['tmp_dir'] = tempfile.mkdtemp(prefix='pwgsdataexchange.', dir=tmpDir)

    for iteration in range(startIter, state['sample_number']):
        safeToExit.set()
        if iteration < 0:
            logmsg(iteration)

        # Referring to tssb as local variable instead of dictionary element is
        # much faster.
        tssb = state['tssb']
        tssb.resample_assignments()
        tssb.cull_tree()

        # Assign node ids.
        _, nodes = tssb.get_mixture()
        for i, node in enumerate(nodes):
            node.id = i

        # Some useful info about the tree, used by CNV related computations;
        # to be called only after resampling assignments.
        set_node_height(tssb)
        set_path_from_root_to_node(tssb)
        map_datum_to_node(tssb)

        state['mh_acc'] = metropolis(
            tssb, state['mh_itr'], state['mh_std'], state['mh_burnin'],
            stripeNum, state['stripes_file'], state['rand_seed'],
            config['tmp_dir'])

        # Adapt the proposal parameter from the acceptance rate.
        # NOTE(review): the log text says "Shrinking" where mh_std is doubled
        # and "Growing" where it is halved -- presumably mh_std acts as an
        # inverse scale inside metropolis(); confirm before rewording.
        acceptance = float(state['mh_acc'])
        if acceptance < 0.08 and state['mh_std'] < 10000:
            state['mh_std'] = state['mh_std'] * 2.0
            logmsg("Shrinking MH proposals. Now %f" % state['mh_std'])
        if acceptance > 0.5 and acceptance < 0.99:
            state['mh_std'] = state['mh_std'] / 2.0
            logmsg("Growing MH proposals. Now %f" % state['mh_std'])

        tssb.resample_sticks()
        tssb.resample_stick_orders()
        tssb.resample_hypers(dpAlpha=True, alphaDecay=True, dpGamma=True)

        lastLlh = tssb.complete_data_log_likelihood()
        if iteration >= 0:
            state['cd_llh_traces'][iteration] = lastLlh
            # Progress is logged on every non-negative iteration.
            _, nodes = tssb.get_mixture()
            logmsg(' '.join([
                str(v) for v in (iteration, len(nodes),
                                 state['cd_llh_traces'][iteration],
                                 state['mh_acc'], tssb.dpAlpha, tssb.dpGamma,
                                 tssb.alphaDecay)
            ]))
            if argmax(state['cd_llh_traces'][:iteration + 1]) == iteration:
                logmsg("%f is best per-data complete data likelihood so far."
                       % (state['cd_llh_traces'][iteration]))
        else:
            # Negative iterations are stored at a non-negative offset.
            state['burnin_cd_llh_traces'][
                iteration + state['burnin_sample_number']] = lastLlh

        # Can't just put tssb in unwrittenTreeL, as this object will be
        # modified on subsequent iterations, meaning any stored references in
        # unwrittenTreeL will all point to the same sample.
        serialized = pickle.dumps(tssb, protocol=pickle.HIGHEST_PROTOCOL)
        unwrittenTreeL.append((serialized, iteration, lastLlh))

        state['tssb'] = tssb
        state['rand_state'] = get_state()
        state['last_iteration'] = iteration

        if len([
                C for C in state['tssb'].root['children']
                if C['node'].has_data()
        ]) > 1:
            # NOTE(review): the reported count includes root children without
            # data, while the test above counts only those with data.
            logmsg('Polyclonal tree detected with %s clones.'
                   % len(state['tssb'].root['children']))

        newMcmcSampleTime = time.time()
        mcmcSampleTimesL.append(newMcmcSampleTime - lastMcmcSampleTime)
        lastMcmcSampleTime = newMcmcSampleTime

        # It's not safe to exit while performing file IO, as we don't want
        # trees.zip or the computation state file to become corrupted from an
        # interrupted write.
        safeToExit.clear()

        shouldWriteBackup = (iteration % state['write_backups_every'] == 0
                             and iteration != startIter)
        shouldWriteState = iteration % state['write_state_every'] == 0
        isLastIteration = (iteration == state['sample_number'] - 1)

        # If backup is scheduled to be written, write both it and full program
        # state regardless of whether we're scheduled to write state this
        # iteration.
        if shouldWriteBackup or shouldWriteState or isLastIteration:
            with open('mcmc_samples.txt', 'a') as mcmcf:
                # The pickled tree is deliberately bound to a throwaway name
                # so it can never shadow or rebind the live ``tssb`` local.
                llhsAndTimes = '\n'.join(
                    '%s\t%s\t%s' % (itr, llh, itrTime)
                    for (_pickled, itr, llh), itrTime
                    in zip(unwrittenTreeL, mcmcSampleTimesL))
                mcmcf.write(llhsAndTimes + '\n')
            treeWriter.write_trees(unwrittenTreeL)
            stateManager.write_state(state)
            unwrittenTreeL = []
            mcmcSampleTimesL = []

            if shouldWriteBackup:
                backupManager.save_backup()

    backupManager.remove_backup()
    safeToExit.clear()

    # Save the best tree.
    print_top_trees(TreeWriter.defaultArchiveFn, state['top_k_trees_file'],
                    state['top_k'])

    # Save clonal frequencies.
    freq = {g: [] for g in state['stripe_list']}
    # Materialise the keys once: a dict view handed straight to array() yields
    # a 0-d array on Python 3, which has no len(). Reusing the same list also
    # keeps the header row and the data columns in the same order.
    stripeNames = list(freq)
    stripeL = array(stripeNames, str)
    stripeL.shape = (1, len(stripeL))
    savetxt(state['clonal_freqs_file'],
            vstack((stripeL, array([freq[g] for g in stripeNames]).T)),
            fmt='%s',
            delimiter=', ')

    safeToExit.set()
    runSucceeded.set()