def assertPriorIsPrecise(self, msbayes_workers, places=2):
    """Assert that samples drawn by finished msbayes workers match the
    configured prior distributions.

    `msbayes_workers` is an iterable of finished workers that must share
    one header; `places` is the decimal precision forwarded to the
    distribution comparisons.
    """
    msbayes_workers = list(msbayes_workers)
    self.assertWorkersFinished(msbayes_workers)
    param_sums = self.get_parameter_summaries_from_msbayes_workers(
            msbayes_workers)
    # Every parameter summary must have seen one sample per prior row.
    sample_size = sum(w.sample_size for w in msbayes_workers)
    for s in param_sums.values():
        self.assertEqual(s.n, sample_size)
    cfg = self.get_config_from_msbayes_workers(msbayes_workers)
    header = msbayes_workers[0].header
    psi_indices = get_indices_of_patterns(header, PSI_PATTERNS)
    self.assertEqual(len(psi_indices), 1)
    model_indices = get_indices_of_patterns(header, MODEL_PATTERNS)
    # A model column is only expected when the worker has a model index.
    if msbayes_workers[0].model_index is not None:
        self.assertEqual(len(model_indices), 1)
    else:
        self.assertEqual(len(model_indices), 0)
    tau_indices = get_indices_of_patterns(header, TAU_PATTERNS)
    a_theta_indices = get_indices_of_patterns(header, A_THETA_PATTERNS)
    d_theta_indices = get_indices_of_patterns(header, D_THETA_PATTERNS)
    if msbayes_workers[0].report_parameters:
        self.assertEqual(len(tau_indices), cfg.npairs)
        self.assertEqual(len(a_theta_indices), cfg.npairs)
        # Two descendant thetas per population pair.
        self.assertEqual(len(d_theta_indices), 2 * cfg.npairs)
    else:
        self.assertEqual(len(tau_indices), 0)
        self.assertEqual(len(a_theta_indices), 0)
        self.assertEqual(len(d_theta_indices), 0)
    _LOG.debug('\n{0}\n'.format('\n'.join(
            [str(param_sums[i]) for i in sorted(param_sums.keys())])))
    for i in psi_indices:
        self.assertSampleIsFromDistribution(param_sums[i], cfg.psi,
                places=places)
    for i in tau_indices:
        self.assertSampleIsFromDistribution(param_sums[i], cfg.tau,
                places=places)
    for i in a_theta_indices:
        self.assertSampleIsFromDistribution(param_sums[i], cfg.a_theta,
                places=places)
    for i in d_theta_indices:
        # Descendant thetas are scaled relative to the base theta prior,
        # so adjust mean/max before comparing; variance is not comparable.
        self.assertSampleIsFromDistribution(param_sums[i], cfg.d_theta,
                mean_adj=cfg.theta.mean,
                max_adj=cfg.theta.maximum,
                compare_variance=False,
                places=places)
def assertPriorIsPrecise(self, msbayes_workers, places=2):
    """Assert that samples drawn by finished msbayes workers match the
    configured prior distributions.

    `msbayes_workers` is an iterable of finished workers that must share
    one header; `places` is the decimal precision forwarded to the
    distribution comparisons.
    """
    msbayes_workers = list(msbayes_workers)
    self.assertWorkersFinished(msbayes_workers)
    param_sums = self.get_parameter_summaries_from_msbayes_workers(
            msbayes_workers)
    # Every parameter summary must have seen one sample per prior row.
    sample_size = sum(w.sample_size for w in msbayes_workers)
    for s in param_sums.values():
        self.assertEqual(s.n, sample_size)
    cfg = self.get_config_from_msbayes_workers(msbayes_workers)
    header = msbayes_workers[0].header
    psi_indices = get_indices_of_patterns(header, PSI_PATTERNS)
    self.assertEqual(len(psi_indices), 1)
    model_indices = get_indices_of_patterns(header, MODEL_PATTERNS)
    # A model column is only expected when the worker has a model index.
    if msbayes_workers[0].model_index is not None:
        self.assertEqual(len(model_indices), 1)
    else:
        self.assertEqual(len(model_indices), 0)
    tau_indices = get_indices_of_patterns(header, TAU_PATTERNS)
    a_theta_indices = get_indices_of_patterns(header, A_THETA_PATTERNS)
    d_theta_indices = get_indices_of_patterns(header, D_THETA_PATTERNS)
    if msbayes_workers[0].report_parameters:
        self.assertEqual(len(tau_indices), cfg.npairs)
        self.assertEqual(len(a_theta_indices), cfg.npairs)
        # Two descendant thetas per population pair.
        self.assertEqual(len(d_theta_indices), 2 * cfg.npairs)
    else:
        self.assertEqual(len(tau_indices), 0)
        self.assertEqual(len(a_theta_indices), 0)
        self.assertEqual(len(d_theta_indices), 0)
    _LOG.debug('\n{0}\n'.format('\n'.join(
            [str(param_sums[i]) for i in sorted(param_sums.keys())])))
    for i in psi_indices:
        self.assertSampleIsFromDistribution(param_sums[i], cfg.psi,
                places=places)
    for i in tau_indices:
        self.assertSampleIsFromDistribution(param_sums[i], cfg.tau,
                places=places)
    for i in a_theta_indices:
        self.assertSampleIsFromDistribution(param_sums[i], cfg.a_theta,
                places=places)
    for i in d_theta_indices:
        # Descendant thetas are scaled relative to the base theta prior,
        # so adjust mean/max before comparing; variance is not comparable.
        self.assertSampleIsFromDistribution(param_sums[i], cfg.d_theta,
                mean_adj=cfg.theta.mean,
                max_adj=cfg.theta.maximum,
                compare_variance=False,
                places=places)
def parameter_density_iter(parameter_density_file,
        parameter_patterns = DIV_MODEL_PATTERNS + MODEL_PATTERNS + \
                PSI_PATTERNS + MEAN_TAU_PATTERNS + OMEGA_PATTERNS + \
                CV_PATTERNS):
    """Iterate over the rows of a parameter density file.

    For each non-blank line, yields a dict mapping each matched
    parameter column header to a `(value, density)` tuple parsed from
    that column and the column immediately to its right.

    NOTE: the same dict object is yielded (and mutated in place) on
    every iteration; copy it if you need to keep past rows.

    Raises `errors.ParameterParsingError` if any parameter pattern
    matches more than one column.  The file is closed on exit only if
    this function opened it (per `process_file_arg`'s ownership flag).
    """
    dens_file, close = process_file_arg(parameter_density_file)
    try:
        header = parse_header(dens_file, seek=False)
        parameter_indices = functions.get_indices_of_patterns(header,
                parameter_patterns)
        indices_to_heads = dict(zip(parameter_indices,
                [header[i] for i in parameter_indices]))
        heads_to_dens_tups = dict.fromkeys(
                [header[i] for i in parameter_indices])
        # Each parameter header must map to exactly one column.
        if len(parameter_indices) != len(set(indices_to_heads.values())):
            raise errors.ParameterParsingError('some parameters were found in '
                    'multiple columns in density file {0!r}'.format(
                        dens_file.name))
        for i, line in enumerate(dens_file):
            l = line.strip().split()
            if l:  # skip blank lines
                for idx in parameter_indices:
                    # Column idx holds the parameter value; idx + 1 its
                    # estimated density.
                    heads_to_dens_tups[indices_to_heads[idx]] = (
                            float(l[idx]), float(l[idx + 1]))
                yield heads_to_dens_tups
    finally:
        # A bare `except: raise` added nothing here; `finally` alone
        # guarantees cleanup on both success and failure.
        if close:
            dens_file.close()
def result_path_iter(self, observed_index, prior_index):
    """Iterate over expected result-file paths for each row of an
    observed data file.

    For each data row, yields `(true_params, paths)` where
    `true_params` maps parameter headers (plus 'PRI.model') to their
    true values as strings, and `paths` maps result categories
    ('summary', 'psi', 'omega', 'cv', 'div-model', 'model') to the
    expected result file paths.

    Raises `Exception` if the expected result directory is missing.
    """
    true_model = self.observed_index_to_prior_index[observed_index]
    out_dir = self.get_result_dir(observed_index, prior_index)
    if not os.path.isdir(out_dir):
        raise Exception('expected result directory {0!r} does not '
                'exist'.format(out_dir))
    observed_stream, close = process_file_arg(
            self.observed_index_to_path[observed_index])
    try:
        header = parsing.parse_header(observed_stream, sep='\t',
                strict=True, seek=False)
        parameter_indices = functions.get_indices_of_patterns(header,
                parsing.PARAMETER_PATTERNS)
        for i, line in enumerate(observed_stream):
            l = line.strip().split()
            true_params = dict(zip(
                    [header[x] for x in parameter_indices],
                    [l[x] for x in parameter_indices]))
            true_params['PRI.model'] = str(true_model)
            result_prefix = '{0}{1}-'.format(self.get_result_path_prefix(
                    observed_index, prior_index, i + 1),
                    self.final_result_index)
            paths = {
                    'summary': result_prefix + 'posterior-summary.txt',
                    'psi': result_prefix + 'psi-results.txt',
                    'omega': result_prefix + 'omega-results.txt',
                    'cv': result_prefix + 'cv-results.txt',
                    'div-model': result_prefix + 'div-model-results.txt',
                    'model': result_prefix + 'model-results.txt'}
            yield true_params, paths
    finally:
        # Only close streams this method opened: process_file_arg
        # reports ownership via `close`.  Previously a caller-owned
        # stream was closed unconditionally, and nothing was closed if
        # the generator was abandoned early.
        if close:
            observed_stream.close()
def parameter_density_iter(parameter_density_file,
        parameter_patterns = DIV_MODEL_PATTERNS + MODEL_PATTERNS + \
                PSI_PATTERNS + MEAN_TAU_PATTERNS + OMEGA_PATTERNS + \
                CV_PATTERNS):
    """Iterate over the rows of a parameter density file.

    For each non-blank line, yields a dict mapping each matched
    parameter column header to a `(value, density)` tuple parsed from
    that column and the column immediately to its right.

    NOTE: the same dict object is yielded (and mutated in place) on
    every iteration; copy it if you need to keep past rows.

    Raises `errors.ParameterParsingError` if any parameter pattern
    matches more than one column.  The file is closed on exit only if
    this function opened it (per `process_file_arg`'s ownership flag).
    """
    dens_file, close = process_file_arg(parameter_density_file)
    try:
        header = parse_header(dens_file, seek=False)
        parameter_indices = functions.get_indices_of_patterns(header,
                parameter_patterns)
        indices_to_heads = dict(zip(parameter_indices,
                [header[i] for i in parameter_indices]))
        heads_to_dens_tups = dict.fromkeys(
                [header[i] for i in parameter_indices])
        # Each parameter header must map to exactly one column.
        if len(parameter_indices) != len(set(indices_to_heads.values())):
            raise errors.ParameterParsingError('some parameters were found in '
                    'multiple columns in density file {0!r}'.format(
                        dens_file.name))
        for i, line in enumerate(dens_file):
            l = line.strip().split()
            if l:  # skip blank lines
                for idx in parameter_indices:
                    # Column idx holds the parameter value; idx + 1 its
                    # estimated density.
                    heads_to_dens_tups[indices_to_heads[idx]] = (
                            float(l[idx]), float(l[idx + 1]))
                yield heads_to_dens_tups
    finally:
        # A bare `except: raise` added nothing here; `finally` alone
        # guarantees cleanup on both success and failure.
        if close:
            dens_file.close()
def get_parameter_summaries_from_msbayes_workers(self, msbayes_workers,
        shuffle_taus=True):
    """Build a `SampleSummarizer` for every parameter column across the
    prior files of the given workers.

    All workers must share the same header and parameter indices.  If
    `shuffle_taus` is True, each row's tau values are shuffled before
    being added, because taus are written to the prior files in sorted
    order (which would bias the per-column summaries).

    Returns a dict mapping parameter column index -> SampleSummarizer.
    """
    msbayes_workers = list(msbayes_workers)
    header = msbayes_workers[0].header
    pi = msbayes_workers[0].parameter_indices
    s = dict(zip(pi, [SampleSummarizer(tag=header[i]) for i in pi]))
    ncols = None
    for w in msbayes_workers:
        self.assertEqual(w.header, header)
        self.assertEqual(w.parameter_indices, pi)
        if shuffle_taus:
            # Hoisted out of the per-row loop: these depend only on the
            # worker's header.
            psi_index = get_indices_of_patterns(w.header, PSI_PATTERNS)[0]
            tau_indices = get_indices_of_patterns(w.header, TAU_PATTERNS)
            non_tau_indices = sorted(
                    set(w.parameter_indices) - set(tau_indices))
        # `with` guarantees the prior file is closed even if an
        # assertion below fails.  Mode 'r' replaces the deprecated
        # 'rU' (removed in Python 3.11+).
        with open(w.prior_path, 'r') as f:
            for row in f:
                if not ncols:
                    ncols = len(row.strip().split())
                if HEADER_PATTERN.match(row.strip()):
                    continue
                r = row.strip().split()
                assert len(r) == ncols
                if shuffle_taus:  # because taus are sorted in prior files
                    psi = int(r[psi_index])
                    taus = [float(r[i]) for i in tau_indices]
                    # psi is the number of distinct divergence times.
                    self.assertEqual(psi, len(set(taus)))
                    random.shuffle(taus)
                    for n, i in enumerate(tau_indices):
                        s[i].add_sample(taus[n])
                    for i in non_tau_indices:
                        s[i].add_sample(float(r[i]))
                else:
                    for i in w.parameter_indices:
                        s[i].add_sample(float(r[i]))
    return s
def get_parameter_summaries_from_msbayes_workers(self, msbayes_workers,
        shuffle_taus=True):
    """Build a `SampleSummarizer` for every parameter column across the
    prior files of the given workers.

    All workers must share the same header and parameter indices.  If
    `shuffle_taus` is True, each row's tau values are shuffled before
    being added, because taus are written to the prior files in sorted
    order (which would bias the per-column summaries).

    Returns a dict mapping parameter column index -> SampleSummarizer.
    """
    msbayes_workers = list(msbayes_workers)
    header = msbayes_workers[0].header
    pi = msbayes_workers[0].parameter_indices
    s = dict(zip(pi, [SampleSummarizer(tag=header[i]) for i in pi]))
    ncols = None
    for w in msbayes_workers:
        self.assertEqual(w.header, header)
        self.assertEqual(w.parameter_indices, pi)
        if shuffle_taus:
            # Hoisted out of the per-row loop: these depend only on the
            # worker's header.
            psi_index = get_indices_of_patterns(w.header, PSI_PATTERNS)[0]
            tau_indices = get_indices_of_patterns(w.header, TAU_PATTERNS)
            non_tau_indices = sorted(
                    set(w.parameter_indices) - set(tau_indices))
        # `with` guarantees the prior file is closed even if an
        # assertion below fails.  Mode 'r' replaces the deprecated
        # 'rU' (removed in Python 3.11+).
        with open(w.prior_path, 'r') as f:
            for row in f:
                if not ncols:
                    ncols = len(row.strip().split())
                if HEADER_PATTERN.match(row.strip()):
                    continue
                r = row.strip().split()
                assert len(r) == ncols
                if shuffle_taus:  # because taus are sorted in prior files
                    psi = int(r[psi_index])
                    taus = [float(r[i]) for i in tau_indices]
                    # psi is the number of distinct divergence times.
                    self.assertEqual(psi, len(set(taus)))
                    random.shuffle(taus)
                    for n, i in enumerate(tau_indices):
                        s[i].add_sample(taus[n])
                    for i in non_tau_indices:
                        s[i].add_sample(float(r[i]))
                else:
                    for i in w.parameter_indices:
                        s[i].add_sample(float(r[i]))
    return s
def result_path_iter(self, observed_index, prior_index):
    """Iterate over expected result-file paths for each row of an
    observed data file.

    For each data row, yields `(true_params, paths)` where
    `true_params` maps parameter headers (plus 'PRI.model') to their
    true values as strings, and `paths` maps result categories
    ('summary', 'psi', 'omega', 'cv', 'div-model', 'model') to the
    expected result file paths.

    Raises `Exception` if the expected result directory is missing.
    """
    true_model = self.observed_index_to_prior_index[observed_index]
    out_dir = self.get_result_dir(observed_index, prior_index)
    if not os.path.isdir(out_dir):
        raise Exception('expected result directory {0!r} does not '
                'exist'.format(out_dir))
    observed_stream, close = process_file_arg(
            self.observed_index_to_path[observed_index])
    try:
        header = parsing.parse_header(observed_stream, sep='\t',
                strict=True, seek=False)
        parameter_indices = functions.get_indices_of_patterns(header,
                parsing.PARAMETER_PATTERNS)
        for i, line in enumerate(observed_stream):
            l = line.strip().split()
            true_params = dict(zip(
                    [header[x] for x in parameter_indices],
                    [l[x] for x in parameter_indices]))
            true_params['PRI.model'] = str(true_model)
            result_prefix = '{0}{1}-'.format(self.get_result_path_prefix(
                    observed_index, prior_index, i + 1),
                    self.final_result_index)
            paths = {
                    'summary': result_prefix + 'posterior-summary.txt',
                    'psi': result_prefix + 'psi-results.txt',
                    'omega': result_prefix + 'omega-results.txt',
                    'cv': result_prefix + 'cv-results.txt',
                    'div-model': result_prefix + 'div-model-results.txt',
                    'model': result_prefix + 'model-results.txt'}
            yield true_params, paths
    finally:
        # Only close streams this method opened: process_file_arg
        # reports ownership via `close`.  Previously a caller-owned
        # stream was closed unconditionally, and nothing was closed if
        # the generator was abandoned early.
        if close:
            observed_stream.close()
def parameter_iter(file_obj, include_line=False, include_thetas=False):
    """Iterate over parameter samples in a posterior sample file.

    Yields one dict per non-blank data row, mapping parameter keys
    ('mean_tau', 'omega', 'cv', 'taus', 'psi', 'model', 'div_model',
    and the theta keys when `include_thetas` is True) to lists of
    parsed values.  If `include_line` is True, yields
    `(samples, raw_columns)` tuples instead.

    Raises `errors.ParameterParsingError` on duplicate parameter
    columns or malformed rows.  Closes the file on exit only if this
    function opened it.
    """
    indices = {}
    post_file, close = process_file_arg(file_obj)
    header = parse_header(post_file, seek=False)

    def _single_column_indices(patterns, label):
        # Find at most one column matching `patterns`; more than one
        # means the posterior file is malformed.
        idx = functions.get_indices_of_patterns(header, patterns)
        if len(idx) > 1:
            post_file.close()
            raise errors.ParameterParsingError(
                    'posterior file {0} has {1} {2} columns'.format(
                        post_file.name, len(idx), label))
        return idx

    mean_t_indices = _single_column_indices(MEAN_TAU_PATTERNS, 'mean tau')
    if mean_t_indices:
        indices['mean_tau'] = mean_t_indices
    omega_indices = _single_column_indices(OMEGA_PATTERNS, 'omega')
    if omega_indices:
        indices['omega'] = omega_indices
    cv_indices = _single_column_indices(CV_PATTERNS, 'cv')
    if cv_indices:
        indices['cv'] = cv_indices
    t_indices = functions.get_indices_of_patterns(header, TAU_PATTERNS)
    if t_indices:
        indices['taus'] = t_indices
    if include_thetas:
        a_theta_indices = functions.get_indices_of_patterns(header,
                A_THETA_PATTERNS)
        d1_theta_indices = functions.get_indices_of_patterns(header,
                D1_THETA_PATTERNS)
        d2_theta_indices = functions.get_indices_of_patterns(header,
                D2_THETA_PATTERNS)
        if a_theta_indices:
            indices['a_thetas'] = a_theta_indices
        if d1_theta_indices:
            indices['d1_thetas'] = d1_theta_indices
        if d2_theta_indices:
            indices['d2_thetas'] = d2_theta_indices
    psi_indices = _single_column_indices(PSI_PATTERNS, 'psi')
    if psi_indices:
        indices['psi'] = psi_indices
    model_indices = _single_column_indices(MODEL_PATTERNS, 'model')
    if model_indices:
        indices['model'] = model_indices
    div_model_indices = _single_column_indices(DIV_MODEL_PATTERNS,
            'div model')
    if div_model_indices:
        indices['div_model'] = div_model_indices
    samples = dict.fromkeys(indices)
    # `line_num` (not `i`) so the comprehensions below cannot clobber
    # the line counter: under Python 2 list-comprehension variables
    # leak into the enclosing scope, so the original `i`-named
    # comprehensions corrupted the line number reported in errors.
    for line_num, line in enumerate(post_file):
        l = line.strip().split()
        if l:
            if len(l) != len(header):
                post_file.close()
                raise errors.ParameterParsingError('posterior file {0} has '
                        '{1} columns at line {2}; expecting {3}'.format(
                            post_file.name, len(l), line_num + 2,
                            len(header)))
            for k, idx_list in indices.items():
                if k in ['mean_tau', 'omega', 'cv']:
                    samples[k] = [float(l[idx]) for idx in idx_list]
                elif k in ['psi', 'model', 'div_model']:
                    samples[k] = [int(l[idx]) for idx in idx_list]
                elif k in ['taus', 'a_thetas', 'd1_thetas', 'd2_thetas']:
                    samples[k] = [[float(l[idx]) for idx in idx_list]]
                else:
                    post_file.close()
                    raise errors.ParameterParsingError('unexpected key '
                            '{0!r}; posterior file {1}, line {2}'.format(
                                k, post_file.name, line_num + 2))
            if include_line:
                yield samples, l
            else:
                yield samples
    if close:
        post_file.close()
def get_dummy_indices(header_list, dummy_patterns=DUMMY_PATTERNS):
    """Return the indices of columns in `header_list` whose names match
    any of `dummy_patterns`."""
    matches = functions.get_indices_of_patterns(header_list, dummy_patterns)
    return matches
def get_stat_indices(header_list, stat_patterns=DEFAULT_STAT_PATTERNS):
    """Return the indices of columns in `header_list` whose names match
    any of the summary-statistic patterns in `stat_patterns`."""
    matches = functions.get_indices_of_patterns(header_list, stat_patterns)
    return matches
def get_parameter_indices(header_list, parameter_patterns=PARAMETER_PATTERNS):
    """Return the indices of columns in `header_list` whose names match
    any of the parameter patterns in `parameter_patterns`."""
    matches = functions.get_indices_of_patterns(header_list,
            parameter_patterns)
    return matches
def parameter_iter(file_obj, include_line=False, include_thetas=False):
    """Iterate over parameter samples in a posterior sample file.

    Yields one dict per non-blank data row, mapping parameter keys
    ('mean_tau', 'omega', 'cv', 'taus', 'psi', 'model', 'div_model',
    and the theta keys when `include_thetas` is True) to lists of
    parsed values.  If `include_line` is True, yields
    `(samples, raw_columns)` tuples instead.

    Raises `errors.ParameterParsingError` on duplicate parameter
    columns or malformed rows.  Closes the file on exit only if this
    function opened it.
    """
    indices = {}
    post_file, close = process_file_arg(file_obj)
    header = parse_header(post_file, seek=False)

    def _single_column_indices(patterns, label):
        # Find at most one column matching `patterns`; more than one
        # means the posterior file is malformed.
        idx = functions.get_indices_of_patterns(header, patterns)
        if len(idx) > 1:
            post_file.close()
            raise errors.ParameterParsingError(
                    'posterior file {0} has {1} {2} columns'.format(
                        post_file.name, len(idx), label))
        return idx

    mean_t_indices = _single_column_indices(MEAN_TAU_PATTERNS, 'mean tau')
    if mean_t_indices:
        indices['mean_tau'] = mean_t_indices
    omega_indices = _single_column_indices(OMEGA_PATTERNS, 'omega')
    if omega_indices:
        indices['omega'] = omega_indices
    cv_indices = _single_column_indices(CV_PATTERNS, 'cv')
    if cv_indices:
        indices['cv'] = cv_indices
    t_indices = functions.get_indices_of_patterns(header, TAU_PATTERNS)
    if t_indices:
        indices['taus'] = t_indices
    if include_thetas:
        a_theta_indices = functions.get_indices_of_patterns(header,
                A_THETA_PATTERNS)
        d1_theta_indices = functions.get_indices_of_patterns(header,
                D1_THETA_PATTERNS)
        d2_theta_indices = functions.get_indices_of_patterns(header,
                D2_THETA_PATTERNS)
        if a_theta_indices:
            indices['a_thetas'] = a_theta_indices
        if d1_theta_indices:
            indices['d1_thetas'] = d1_theta_indices
        if d2_theta_indices:
            indices['d2_thetas'] = d2_theta_indices
    psi_indices = _single_column_indices(PSI_PATTERNS, 'psi')
    if psi_indices:
        indices['psi'] = psi_indices
    model_indices = _single_column_indices(MODEL_PATTERNS, 'model')
    if model_indices:
        indices['model'] = model_indices
    div_model_indices = _single_column_indices(DIV_MODEL_PATTERNS,
            'div model')
    if div_model_indices:
        indices['div_model'] = div_model_indices
    samples = dict.fromkeys(indices)
    # `line_num` (not `i`) so the comprehensions below cannot clobber
    # the line counter: under Python 2 list-comprehension variables
    # leak into the enclosing scope, so the original `i`-named
    # comprehensions corrupted the line number reported in errors.
    for line_num, line in enumerate(post_file):
        l = line.strip().split()
        if l:
            if len(l) != len(header):
                post_file.close()
                raise errors.ParameterParsingError('posterior file {0} has '
                        '{1} columns at line {2}; expecting {3}'.format(
                            post_file.name, len(l), line_num + 2,
                            len(header)))
            for k, idx_list in indices.items():
                if k in ['mean_tau', 'omega', 'cv']:
                    samples[k] = [float(l[idx]) for idx in idx_list]
                elif k in ['psi', 'model', 'div_model']:
                    samples[k] = [int(l[idx]) for idx in idx_list]
                elif k in ['taus', 'a_thetas', 'd1_thetas', 'd2_thetas']:
                    samples[k] = [[float(l[idx]) for idx in idx_list]]
                else:
                    post_file.close()
                    raise errors.ParameterParsingError('unexpected key '
                            '{0!r}; posterior file {1}, line {2}'.format(
                                k, post_file.name, line_num + 2))
            if include_line:
                yield samples, l
            else:
                yield samples
    if close:
        post_file.close()