def evaluate_decomp(name, level, init_structure, split_id, sample_id, root):
    """Given a posterior sample, evaluate the predictive likelihood on the test rows and columns."""
    params = storage.load(params_file(name))
    data_matrix = storage.load(data_file(name))
    splits = storage.load(splits_file(name))
    train_rows, train_cols, test_rows, test_cols = splits[split_id]

    X_train = data_matrix[train_rows[:, nax], train_cols[nax, :]]
    X_row_test = data_matrix[test_rows[:, nax], train_cols[nax, :]]
    X_col_test = data_matrix[train_rows[:, nax], test_cols[nax, :]]

    if level == 1:
        init_row_loglik = init_col_loglik = None
    else:
        if params.save_samples:
            init_row_loglik, init_col_loglik = storage.load(
                scores_file(name, level-1, init_structure, split_id, sample_id))
        else:
            init_row_loglik, init_col_loglik = storage.load(
                init_scores_file(name, level, init_structure, split_id, sample_id))

    row_loglik, col_loglik = scoring.evaluate_model(
        X_train, root, X_row_test, X_col_test,
        init_row_loglik=init_row_loglik, init_col_loglik=init_col_loglik,
        num_steps_ais=params.num_steps_ais, max_dim=params.max_dim_predictive)
    return row_loglik, col_loglik
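# A minimal sketch of the cross-validation block structure that evaluate_decomp
# relies on, using plain numpy and made-up index arrays rather than the
# project's DataMatrix class: held-out rows are scored against the training
# columns, and held-out columns against the training rows, so the test/test
# corner is never touched.
def _sketch_cv_blocks():
    import numpy as np
    nax = np.newaxis
    X = np.arange(36).reshape(6, 6)
    train_rows, test_rows = np.array([0, 1, 2, 3]), np.array([4, 5])
    train_cols, test_cols = np.array([0, 1, 2]), np.array([3, 4, 5])
    X_train = X[train_rows[:, nax], train_cols[nax, :]]       # 4 x 3 training block
    X_row_test = X[test_rows[:, nax], train_cols[nax, :]]     # 2 x 3 held-out rows
    X_col_test = X[train_rows[:, nax], test_cols[nax, :]]     # 4 x 3 held-out columns
    return X_train, X_row_test, X_col_test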
def collect_scores(name, level, structure):
    """Collect the held-out predictive log-likelihood scores for all CV splits and
    order them according to the indices of the original data matrix."""
    params = storage.load(params_file(name))
    splits = storage.load(splits_file(name))

    row_loglik_all = []
    col_loglik_all = []
    failed = False

    for split_id, (train_rows, train_cols, test_rows, test_cols) in enumerate(splits):
        row_loglik_curr, col_loglik_curr = [], []
        num_samples = params.num_samples
        for sample_id in range(num_samples):
            try:
                row_loglik_single, col_loglik_single = storage.load(
                    scores_file(name, level, structure, split_id, sample_id))
            except:
                # missing or unreadable scores are recorded as NaN so that
                # downstream code can detect the failure
                row_loglik_single = np.nan * np.ones(len(test_rows))
                col_loglik_single = np.nan * np.ones(len(test_cols))
                failed = True
            row_loglik_curr.append(row_loglik_single)
            col_loglik_curr.append(col_loglik_single)

        row_loglik_all.append(np.array(row_loglik_curr))
        col_loglik_all.append(np.array(col_loglik_curr))

    if failed:
        print termcolor.colored(' failed: %s' % grammar.pretty_print(structure), 'red')

    storage.dump((row_loglik_all, col_loglik_all), collected_scores_file(name, level, structure))
def structureless_scores(name):
    """Evaluate the probability of the structureless model G on held-out data."""
    data_matrix = storage.load(data_file(name))
    if isinstance(data_matrix, recursive.Decomp):
        data_matrix = observations.DataMatrix.from_real_values(data_matrix.root.value())
    splits = storage.load(splits_file(name))
    params = storage.load(params_file(name))

    row_loglik = np.array([])
    col_loglik = np.array([])
    num_entries = 0
    for train_rows, train_cols, test_rows, test_cols in splits:
        X_train = data_matrix[train_rows[:, nax], train_cols[nax, :]]
        X_row_test = data_matrix[test_rows[:, nax], train_cols[nax, :]]
        X_col_test = data_matrix[train_rows[:, nax], test_cols[nax, :]]

        curr_row_loglik = scoring.no_structure_row_loglik(
            X_train[:, :params.max_dim_predictive], X_row_test[:, :params.max_dim_predictive])
        row_loglik = np.concatenate([row_loglik, curr_row_loglik])

        curr_col_loglik = scoring.no_structure_col_loglik(
            X_train[:params.max_dim_predictive, :], X_col_test[:params.max_dim_predictive, :])
        col_loglik = np.concatenate([col_loglik, curr_col_loglik])

        num_entries += train_cols.size * test_rows.size + train_rows.size * test_cols.size

    return PredictiveLikelihoodScores(row_loglik, col_loglik, num_entries)
def compute_scores(name, level, structure):
    """Average together the predictive likelihood scores over all the posterior samples,
    and return a PredictiveLikelihoodScores instance."""
    if level == 0:
        if structure != 'g':
            raise RuntimeError('Invalid structure for level 0: %s' % structure)
        return structureless_scores(name)

    params = storage.load(params_file(name))
    num_samples = params.num_samples
    splits = storage.load(splits_file(name))
    row_loglik_all, col_loglik_all = storage.load(collected_scores_file(name, level, structure))

    # treat errors as zeros (assume I've already checked that all samples for
    # valid models are completed)
    row_loglik_all = [np.where(np.isnan(rl), -np.infty, rl) for rl in row_loglik_all]
    col_loglik_all = [np.where(np.isnan(cl), -np.infty, cl) for cl in col_loglik_all]

    row_loglik_vec, col_loglik_vec = np.array([]), np.array([])
    num_entries = 0
    for split_id, (train_rows, train_cols, test_rows, test_cols) in enumerate(splits):
        row_loglik_curr, col_loglik_curr = row_loglik_all[split_id], col_loglik_all[split_id]
        row_loglik_vec = np.concatenate(
            [row_loglik_vec, np.logaddexp.reduce(row_loglik_curr, axis=0) - np.log(num_samples)])
        col_loglik_vec = np.concatenate(
            [col_loglik_vec, np.logaddexp.reduce(col_loglik_curr, axis=0) - np.log(num_samples)])
        num_entries += train_cols.size * test_rows.size + train_rows.size * test_cols.size

    return PredictiveLikelihoodScores(row_loglik_vec, col_loglik_vec, num_entries)
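# A small self-contained check (with made-up log-likelihood values) that the
# reduction used in compute_scores is the log of the average predictive
# likelihood over posterior samples:
#     log((1/S) * sum_s exp(l_s)) = logaddexp.reduce(l) - log(S)
def _sketch_log_mean_exp():
    import numpy as np
    loglik = np.array([[-3.0, -5.0],    # sample 0, two test rows
                       [-4.0, -4.5]])   # sample 1
    S = loglik.shape[0]
    averaged = np.logaddexp.reduce(loglik, axis=0) - np.log(S)
    direct = np.log(np.exp(loglik).mean(axis=0))
    assert np.allclose(averaged, direct)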
def print_failures(prefix, outfile=sys.stdout):
    params = load_params(prefix)
    failures = []
    for level in range(1, params.search_depth + 1):
        ok_counts = collections.defaultdict(int)
        fail_counts = collections.defaultdict(int)
        for expt_name in all_experiment_names(prefix):
            for _, structure in storage.load(experiments.structures_file(expt_name, level)):
                for split_id in range(params.num_splits):
                    for sample_id in range(params.num_samples):
                        ok = False
                        fname = experiments.scores_file(expt_name, level, structure, split_id, sample_id)
                        if storage.exists(fname):
                            row_loglik, col_loglik = storage.load(fname)
                            if np.all(np.isfinite(row_loglik)) and np.all(np.isfinite(col_loglik)):
                                ok = True
                        if ok:
                            ok_counts[structure] += 1
                        else:
                            fail_counts[structure] += 1

        for structure in fail_counts:
            if ok_counts[structure] > 0:
                failures.append(presentation.Failure(structure, level, False))
            else:
                failures.append(presentation.Failure(structure, level, True))

    presentation.print_failed_structures(failures, outfile)
def fit_winning_sequence(name, sample_id):
    """After the sequence of models is identified, sample factorizations from each
    of the models on the full data matrix."""
    data_matrix = storage.load(data_file(name))
    sequence = sequence_of_structures(name)
    params = storage.load(params_file(name))
    decomps = recursive.fit_sequence(sequence, data_matrix, gibbs_steps=params.gibbs_steps)
    storage.dump(decomps, winning_samples_file(name, sample_id))
def evaluation_jobs(name, level):
    params = storage.load(params_file(name))
    structures = storage.load(structures_file(name, level))
    return [('eval_job', name, level, pretty_print(init_s), pretty_print(s), split_id, sample_id)
            for init_s, s in structures
            for split_id in range(params.num_splits)
            for sample_id in range(params.num_samples)]
def winning_structures(name, level):
    """Determine the set of structures to expand."""
    if level == 0:
        return ['g']
    params = storage.load(params_file(name))
    structures = storage.load(structures_file(name, level))
    structures = [s for _, s in structures]
    structures = filter(lambda s: compute_scores(name, level, s) is not None,
                        structures)   # ignore failures
    structures.sort(key=lambda s: compute_scores(name, level, s).total(), reverse=True)
    return structures[:params.num_expand]
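# A minimal sketch of the selection step above, with hypothetical structure
# strings and totals: drop failed candidates, sort by total predictive score,
# and keep the top num_expand. Caching the score per structure, as done here,
# avoids calling compute_scores twice per candidate.
def _sketch_select_winners(num_expand=2):
    totals = {'bg+g': 4.2, 'gg+g': 5.1, 'cg+g': None}   # made-up scores
    valid = [(s, t) for s, t in totals.items() if t is not None]
    valid.sort(key=lambda st: st[1], reverse=True)
    return [s for s, _ in valid[:num_expand]]           # ['gg+g', 'bg+g']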
def print_model_sequence(name, outfile=sys.stdout):
    params = storage.load(params_file(name))
    prev_structure = 'g'
    model_scores = []
    for level in range(1, params.search_depth + 1):
        curr_structure = storage.load(winning_structure_file(name, level))[0]
        result = compute_scores(name, level, curr_structure)
        prev_result = compute_scores(name, level-1, prev_structure)
        model_scores.append(get_model_score(curr_structure, result, prev_result))
        prev_structure = curr_structure
    presentation.print_model_sequence(model_scores, outfile)
def initial_samples_jobs(name, level):
    if level == 1:
        raise RuntimeError('No need for initialization in level 1.')
    winning_structures = storage.load(winning_structure_file(name, level-1))
    params = storage.load(params_file(name))
    return [('init_job', name, level, pretty_print(s), split_id, sample_id)
            for s in winning_structures
            for split_id in range(params.num_splits)
            for sample_id in range(params.num_samples)]
def init_level(name, level):
    """Initialize a given level of the search by saving all of the structures
    which need to be evaluated."""
    if not storage.exists(experiment_dir(name)):
        raise RuntimeError('Experiment %s not yet initialized.' % name)
    params = storage.load(params_file(name))
    if level == 1:
        init_structures = ['g']
    else:
        init_structures = storage.load(winning_structure_file(name, level - 1))
    structure_pairs = list_structure_pairs(init_structures, params.rules, params.expand_noise)
    storage.dump(structure_pairs, structures_file(name, level))
def final_structure(name):
    params = storage.load(params_file(name))
    stop_at = 0
    for level in range(1, params.search_depth + 1):
        # keep deepening only while the level improves the average held-out
        # log-likelihood by more than 1 nat
        if compute_improvement(name, level) > 1.:
            stop_at = level
        else:
            break

    if stop_at == 0:
        return 'g', 0
    else:
        return storage.load(winning_structure_file(name, stop_at))[0], stop_at
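# A toy illustration (made-up improvement values) of the stopping rule in
# final_structure: levels are accepted greedily until the first one whose
# improvement drops to 1 nat or below, and everything after that level is
# discarded even if a deeper level happened to improve again.
def _sketch_stopping_rule():
    improvements = {1: 12.4, 2: 3.1, 3: 0.2}   # hypothetical values
    stop_at = 0
    for level in sorted(improvements):
        if improvements[level] > 1.:
            stop_at = level
        else:
            break
    assert stop_at == 2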
def sequence_of_structures(name):
    """Get the sequence of structures corresponding to the final model chosen,
    i.e. a list of structures where each one was used to initialize the next one."""
    params = storage.load(params_file(name))
    structure = storage.load(winning_structure_file(name, params.search_depth))[0]
    sequence = [structure]
    # walk the initialization chain backwards from the deepest level
    for level in range(1, params.search_depth)[::-1]:
        structure = init_structure_for(name, level + 1, structure)
        sequence = [structure] + sequence
    return sequence
def print_components(name, outfile=sys.stdout):
    structure, level = final_structure(name)
    if level == 0:
        return
    seq = storage.load(winning_samples_file(name, 0))
    decomp = seq[level]
    print_components_for_decomp(name, structure, decomp, outfile)
def summarize_results(name, outfile=sys.stdout):
    params = storage.load(params_file(name))
    print_model_sequence(name, outfile)
    print_failures(name, outfile)
    print_running_times(name, outfile)
    for level in range(1, params.search_depth+1):
        print_scores(name, level, outfile)
    print_components(name, outfile)
def collect_scores_for_level(name, level):
    """Collect the held-out predictive log-likelihood scores for all CV splits and
    order them according to the indices of the original data matrix."""
    structures = storage.load(structures_file(name, level))
    structures = [s for _, s in structures]
    for s in structures:
        collect_scores(name, level, s)
    save_winning_structures(name, level)
def compute_improvement(name, level, structure=None):
    """Compute the improvement in predictive likelihood score from one level to the next."""
    if structure is None:
        structure = storage.load(winning_structure_file(name, level))
        if isinstance(structure, list):
            structure = structure[0]
    prev_structure = init_structure_for(name, level, structure)
    curr_scores = compute_scores(name, level, structure)
    prev_scores = compute_scores(name, level-1, prev_structure)
    return (curr_scores.row_avg() - prev_scores.row_avg() +
            curr_scores.col_avg() - prev_scores.col_avg()) / 2.
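# A worked example (made-up averages) of the improvement measure: the mean of
# the row-wise and column-wise gains in average held-out log-likelihood.
def _sketch_improvement():
    curr_row_avg, curr_col_avg = -10.2, -8.4
    prev_row_avg, prev_col_avg = -11.0, -9.0
    improvement = (curr_row_avg - prev_row_avg +
                   curr_col_avg - prev_col_avg) / 2.
    assert abs(improvement - 0.7) < 1e-12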
def init_structure_for(name, level, structure):
    """Determine which of the previous level's structures was used to initialize
    a given structure."""
    if level == 1:
        return 'g'
    structure_pairs = storage.load(structures_file(name, level))
    init_structure = None
    for init_s, s in structure_pairs:
        if s == structure:
            init_structure = init_s
    assert init_structure is not None
    return init_structure
def load_trained_rbm(fname):
    """Load a previously trained RBM."""
    if fname.endswith('pk'):
        return storage.load(fname)
    elif fname.endswith('mat'):
        mat_vars = scipy.io.loadmat(fname)
        return binary_rbms.RBM(gnp.garray(mat_vars['visbiases'].ravel()),
                               gnp.garray(mat_vars['hidbiases'].ravel()),
                               gnp.garray(mat_vars['vishid']))
    else:
        raise RuntimeError('Unknown format: {}'.format(fname))
def run_everything(name, args, email=None):
    params = storage.load(params_file(name))
    init_level(name, 1)
    run_jobs(evaluation_jobs(name, 1), args, evaluation_key(name, 1))
    collect_scores_for_level(name, 1)
    for level in range(2, params.search_depth + 1):
        init_level(name, level)
        run_jobs(initial_samples_jobs(name, level), args, initial_samples_key(name, level))
        run_jobs(evaluation_jobs(name, level), args, evaluation_key(name, level))
        collect_scores_for_level(name, level)
    run_jobs(final_model_jobs(name), args, final_model_key(name))
    save_report(name, email)
def sample_from_model(name, level, init_structure, structure, split_id, sample_id):
    """Run an MCMC sampler to approximately sample from the posterior."""
    params = storage.load(params_file(name))
    data_matrix = storage.load(data_file(name))
    splits = storage.load(splits_file(name))
    train_rows, train_cols, test_rows, test_cols = splits[split_id]
    X_train = data_matrix[train_rows[:, nax], train_cols[nax, :]]

    if level == 1:
        init = X_train.sample_latent_values(np.zeros((X_train.m, X_train.n)), 1.)
        prev_model = recursive.GaussianNode(init, 'scalar', 1.)
    else:
        if params.save_samples:
            prev_model = storage.load(samples_file(name, level-1, init_structure, split_id, sample_id))
        else:
            prev_model = storage.load(init_samples_file(name, level, init_structure, split_id, sample_id))
        if isinstance(prev_model, recursive.Decomp):
            prev_model = prev_model.root

    return recursive.fit_model(structure, X_train, prev_model, gibbs_steps=params.gibbs_steps)
def print_failures(name, outfile=sys.stdout):
    params = storage.load(params_file(name))
    failures = []
    for level in range(1, params.search_depth + 1):
        for _, structure in storage.load(structures_file(name, level)):
            total_ok = 0
            for split_id in range(params.num_splits):
                for sample_id in range(params.num_samples):
                    fname = scores_file(name, level, structure, split_id, sample_id)
                    if storage.exists(fname):
                        row_loglik, col_loglik = storage.load(fname)
                        if np.all(np.isfinite(row_loglik)) and np.all(np.isfinite(col_loglik)):
                            total_ok += 1
            if total_ok == 0:
                failures.append(presentation.Failure(structure, level, True))
            elif total_ok < params.num_splits * params.num_samples:
                failures.append(presentation.Failure(structure, level, False))
    presentation.print_failed_structures(failures, outfile)
def print_running_times(name, outfile=sys.stdout):
    params = storage.load(params_file(name))
    running_times = []
    for level in range(1, params.search_depth+1):
        structures = storage.load(structures_file(name, level))
        structures = [s[1] for s in structures]
        for structure in structures:
            total = 0.
            num_samples = 0
            for split in range(params.num_splits):
                for sample_id in range(params.num_samples):
                    rtf = running_time_file(name, level, structure, split, sample_id)
                    try:
                        total += float(storage.load(rtf))
                        num_samples += 1
                    except IOError:
                        pass
            if num_samples > 0:
                running_times.append(presentation.RunningTime(level, structure, num_samples, total))
    presentation.print_running_times(running_times, outfile)
def run_gibbs(expt, save=True, show_progress=False):
    """Run Gibbs chains starting from the AIS particles (sampled proportionally
    to their weights), and save the final particles."""
    if isinstance(expt, str):
        expt = get_experiment(expt)
    tr_expt = get_training_expt(expt)

    for it in tr_expt.save_after:
        for avg in AVG_VALS:
            print 'Iteration', it, avg
            try:
                rbm = load_rbm(expt, it, avg)
            except:
                continue
            log_Z = storage.load(expt.log_Z_file(it, avg)).as_numpy_array()
            final_states = storage.load(expt.final_states_file(it, avg))

            # sample the states proportionally to the Z estimates
            p = np.exp(log_Z - np.logaddexp.reduce(log_Z))
            p /= p.sum()   # not needed in theory, but numpy complains if it doesn't sum exactly to 1
            idxs = np.random.multinomial(1, p, size=expt.annealing.num_particles).argmax(1)
            states = binary_rbms.RBMState(final_states.v[idxs, :], final_states.h[idxs, :])

            if show_progress:
                pbar = misc.pbar(expt.gibbs_steps)

            for st in range(expt.gibbs_steps):
                states = rbm.step(states)
                if show_progress:
                    pbar.update(st)

            if show_progress:
                pbar.finish()

            if save:
                storage.dump(states, expt.gibbs_states_file(it, avg))
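# A standalone sketch of the resampling trick used in run_gibbs: draw particle
# indices proportionally to normalized weights by taking the argmax of one-hot
# multinomial draws. The log_Z values below are made up.
def _sketch_weighted_resampling(num_draws=100):
    import numpy as np
    log_Z = np.array([-1.0, -2.0, -0.5])            # assumed log-weights
    p = np.exp(log_Z - np.logaddexp.reduce(log_Z))  # normalize in log space first
    p /= p.sum()                                    # guard against round-off
    return np.random.multinomial(1, p, size=num_draws).argmax(1)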
def run_model(name, level, init_structure, structure, split_id, sample_id, save=True, save_sample=False):
    """Sample from the posterior given the training data, and evaluate on held-out rows/columns."""
    params = storage.load(params_file(name))
    t0 = time.time()

    root = sample_from_model(name, level, init_structure, structure, split_id, sample_id)

    if save and (save_sample or params.save_samples):
        storage.dump(root, samples_file(name, level, structure, split_id, sample_id))
        print 'Saved.'

    row_loglik, col_loglik = evaluate_decomp(name, level, init_structure, split_id, sample_id, root)
    print 'Row:', row_loglik.mean()
    print 'Col:', col_loglik.mean()

    if save:
        storage.dump((row_loglik, col_loglik), scores_file(name, level, structure, split_id, sample_id))
        storage.dump(time.time() - t0, running_time_file(name, level, structure, split_id, sample_id))
def print_scores(name, level, outfile=sys.stdout):
    structures = storage.load(structures_file(name, level))
    structures = [s for _, s in structures]
    model_scores = []
    for s in structures:
        result = compute_scores(name, level, s)
        # skip structures whose scores are missing or non-finite
        if result is None or not result.all_finite():
            continue
        prev_structure = init_structure_for(name, level, s)
        prev_result = compute_scores(name, level-1, prev_structure)
        model_scores.append(get_model_score(s, result, prev_result))
    model_scores.sort(key=lambda ms: ms.total, reverse=True)
    presentation.print_scores(level, model_scores, outfile)
def sample_from_model(name, level, init_structure, structure, split_id, sample_id):
    """Run an MCMC sampler to approximately sample from the posterior."""
    params = storage.load(params_file(name))
    data_matrix = storage.load(data_file(name))
    splits = storage.load(splits_file(name))
    train_rows, train_cols, test_rows, test_cols = splits[split_id]
    X_train = data_matrix[train_rows[:, nax], train_cols[nax, :]]

    if level == 1:
        init = X_train.sample_latent_values(np.zeros((X_train.m, X_train.n)), 1.)
        prev_model = recursive.GaussianNode(init, 'scalar', 1.)
    else:
        try:
            if params.save_samples:
                prev_model = storage.load(samples_file(name, level-1, init_structure, split_id, sample_id))
            else:
                prev_model = storage.load(init_samples_file(name, level, init_structure, split_id, sample_id))
            if isinstance(prev_model, recursive.Decomp):
                prev_model = prev_model.root
        except:
            # the initialization from the previous level is missing; signal failure
            return None

    return recursive.fit_model(structure, X_train, prev_model, gibbs_steps=params.gibbs_steps)
def run_model(name, level, init_structure, structure, split_id, sample_id, save=True, save_sample=False):
    """Sample from the posterior given the training data, and evaluate on held-out rows/columns."""
    params = storage.load(params_file(name))
    t0 = time.time()

    try:
        root = sample_from_model(name, level, init_structure, structure, split_id, sample_id)
    except:
        return

    # if save and (save_sample or params.save_samples):
    #     storage.dump(root, samples_file(name, level, structure, split_id, sample_id))
    #     print('Saved.')

    try:
        row_loglik, col_loglik = evaluate_decomp(name, level, init_structure, split_id, sample_id, root)
    except:
        return

    print('Row:', row_loglik.mean())
    print('Col:', col_loglik.mean())

    if save:
        storage.dump((row_loglik, col_loglik), scores_file(name, level, structure, split_id, sample_id))
        storage.dump(time.time() - t0, running_time_file(name, level, structure, split_id, sample_id))
def print_components_for_decomp(name, structure, decomp, outfile=sys.stdout):
    data_matrix = storage.load(data_file(name))

    for model in ['clustering', 'binary']:
        # distribution codes: lowercase for row-wise latent factors,
        # uppercase for column-wise ones
        if model == 'clustering':
            left_dist, right_dist = 'm', 'M'
        else:
            left_dist, right_dist = 'b', 'B'

        if data_matrix.row_labels is not None:
            nodes = recursive.find_nodes(decomp, lambda node: isinstance(node, recursive.LeafNode)
                                         and node.distribution() == left_dist and node.m == data_matrix.m)
            for node in nodes:
                items = [presentation.LatentVariables(row_label, node.value()[i, :])
                         for i, row_label in enumerate(data_matrix.row_labels)]
                presentation.print_components(model, structure, 'row', items, outfile)

        if data_matrix.col_labels is not None:
            nodes = recursive.find_nodes(decomp, lambda node: isinstance(node, recursive.LeafNode)
                                         and node.distribution() == right_dist and node.n == data_matrix.n)
            for node in nodes:
                items = [presentation.LatentVariables(col_label, node.value()[:, i])
                         for i, col_label in enumerate(data_matrix.col_labels)]
                presentation.print_components(model, structure, 'col', items, outfile)
def load_small_rbm():
    return storage.load(SMALL_RBM_FILE)
def load_params(prefix):
    expt_name = all_experiment_names(prefix)[0]
    return storage.load(experiments.params_file(expt_name))
def final_model_jobs(name):
    params = storage.load(params_file(name))
    return [('final_job', name, i) for i in range(params.num_samples)]
def sample_matrix(name, sample_id, level, size, output_file):
    decomps = storage.load(winning_samples_file(name, sample_id))
    data = decomps[level].sample_matrix(size)
    storage.dump(data, output_file)
    print('Samples saved at: "{}".'.format(output_file))
def load_wall_clock_time(expt_name):
    eval_expt = evaluation.get_experiment(expt_name)
    tr_expt = from_scratch.get_experiment(eval_expt.rbm_source.expt_name)
    return [storage.load(tr_expt.time_file(it))
            for it in tr_expt.save_after
            if os.path.exists(tr_expt.time_file(it))]