def estimate_mh_time(RIPL, n, initial_mh_iter, ids, max_runtime=600, verbose=True):
    '''Estimate the time taken per MH iteration using a doubling strategy.

    Rounds of ``RIPL.infer`` are run until ``max_runtime`` (measured with
    ``time.clock()``) has been used up.  While fewer than 10 seconds have
    elapsed the number of steps per round is simply doubled; after that the
    next round is additionally capped so that it is expected to fit within
    the remaining time budget.

    Args:
        RIPL: Object providing ``infer(steps)`` and ``report_value(an_id)``.
        n: Not used by this function; kept so existing callers keep working.
        initial_mh_iter: Number of MH steps for the first round.  Values
            below 1 are treated as 1 because doubling cannot grow them.
        ids: Identifiers passed to ``RIPL.report_value`` after every round.
        max_runtime: Time budget, in ``time.clock()`` seconds.
        verbose: When true, print the step count tried in each round.

    Returns:
        dict with
        ``'time_per_mh_iter'`` - elapsed time (floored at 0.01) divided by
        the total number of MH steps run, or NaN if no round was run at all
        (e.g. ``max_runtime <= 0``), and
        ``'max_memory'`` - the largest value returned by ``memory()`` during
        the run.
    '''
    start = time.clock()
    # Samples are recorded exactly as a normal run would record them so the
    # measured time includes that overhead; they are not returned.
    samples = np.zeros((len(ids), 0))
    iteration = 0
    # A non-positive step count would stay non-positive under doubling and
    # leave mh_sum at zero, so start from at least one step.
    mh_iter = max(1, initial_mh_iter)
    mh_sum = 0
    max_mem = memory()
    while time.clock() - start < max_runtime:
        iteration += 1
        if verbose:
            print('Sample %d : trying %d steps' % (iteration, mh_iter))
        RIPL.infer(mh_iter)
        mh_sum += mh_iter
        samples = np.column_stack([samples, [RIPL.report_value(an_id) for an_id in ids]])
        time_elapsed = time.clock() - start
        if time_elapsed < 10:
            # Not much time has passed - estimates will be unreliable, so
            # just double the number of iterations.
            mh_iter = mh_iter * 2
        else:
            # Some time has passed - make sure that the next round will not
            # overrun the remaining budget.
            time_remaining = max_runtime - time_elapsed
            time_per_mh_iter = time_elapsed / mh_sum
            max_possible_iters = int(round(time_remaining / time_per_mh_iter))
            mh_iter = min(max(1, max_possible_iters), mh_iter * 2)
        max_mem = max(max_mem, memory())
    finish = time.clock()
    if mh_sum > 0:
        time_per_mh_iter = max(finish - start, 0.01) / mh_sum
    else:
        # No inference was run, so there is nothing to base an estimate on.
        time_per_mh_iter = float('nan')
    max_mem = max(max_mem, memory())
    return {'time_per_mh_iter' : time_per_mh_iter, 'max_memory' : max_mem}
def cold_start_single_run(data, model_class, exp_params, model_params):
    """Build a model from scratch, burn in, sample and score (sent to picloud).

    Args:
        data: Mapping with an ``"observations"`` entry passed to
            ``model.observe_data`` and a ``"missing"`` entry passed to
            ``model.set_predictions``.
        model_class: Callable invoked as ``model_class(**model_params)``; the
            result must provide ``create_RIPL``, ``observe_data``,
            ``set_predictions`` and a ``RIPL`` attribute.
        exp_params: Mapping providing ``"n_samples"``,
            ``"intermediate_iter"``, ``"max_burn_time"`` and
            ``"max_sample_time"``.
        model_params: Keyword arguments for ``model_class``.

    Returns:
        dict with keys ``"predictions"`` (per-link mean of the samples),
        ``"ess"``, ``"AUC"``, ``"runtime"`` (``time.clock()`` difference),
        ``"max_memory"`` (largest ``memory()`` reading taken) and
        ``"n_samples"`` (number of sample columns actually collected).
    """
    start = time.clock()

    # Create the model and condition it on the observed data
    model = model_class(**model_params)
    model.create_RIPL()
    model.observe_data(data["observations"])
    truth, missing_links = model.set_predictions(data["missing"])

    # Burn in - same settings as the real run apart from the time limit;
    # the returned samples are discarded.
    sample.collect_n_samples(
        model.RIPL,
        n=exp_params["n_samples"],
        mh_iter=exp_params["intermediate_iter"],
        ids=missing_links,
        max_runtime=exp_params["max_burn_time"],
        verbose=True,
        callback=return_AUC,
        callback_kwargs={"truth": truth},
    )

    # Collect samples
    max_memory = memory()
    mcmc_output = sample.collect_n_samples(
        model.RIPL,
        n=exp_params["n_samples"],
        mh_iter=exp_params["intermediate_iter"],
        ids=missing_links,
        max_runtime=exp_params["max_sample_time"],
        verbose=True,
        callback=return_AUC,
        callback_kwargs={"truth": truth},
    )
    samples = mcmc_output["samples"]
    n_samples = samples.shape[1]
    sample_ess = mcmc_output["ess"]
    # The result must be stored: a bare max(...) call would silently drop
    # the post-sampling reading from the reported peak.
    max_memory = max(max_memory, memory())

    # Score samples: average each link's samples and compare with the truth
    predictions = list(samples.mean(axis=1))
    roc_data = list(zip(truth, predictions))
    AUC = ROCData(roc_data).auc()
    max_memory = max(max_memory, memory())

    return {
        "predictions": predictions,
        "ess": sample_ess,
        "AUC": AUC,
        "runtime": time.clock() - start,
        "max_memory": max(max_memory, memory()),
        "n_samples": n_samples,
    }
def IRM(fold=1,burn=50,n_samples=100,mh_iter=10,verbose=True):
    '''Run one link-prediction experiment on the synthetic IRM data set.

    The model is defined on ``venture_engine`` (cleared first), the training
    links are observed in both directions, the link probabilities of the test
    pairs are predicted, and samples are collected after a burn-in phase.
    Both phases are limited to 60 ``time.clock()`` seconds.

    Args:
        fold: Currently unused - the synthetic data file is always loaded
            (the per-fold high school files would be
            "../../data/hs/hs_%dof5.mat").
        burn: Number of burn-in samples requested.
        n_samples: Number of samples requested for the predictions.
        mh_iter: Initial number of MH steps between samples.
        verbose: When true, print progress and the final AUC.

    Returns:
        dict with keys 'truth', 'predictions', 'samples', 'ess', 'AUC',
        'runtime' and 'max_mem'.
    '''
    # Start timing
    t_begin = time.clock()

    # The shared engine acts as the RIPL; wipe any previous session
    engine = venture_engine
    engine.clear()

    # Load the data set as lists of (i, j, value) triples
    mat = scipy.io.loadmat("../../data/irm_synth/irm_synth_20.mat", squeeze_me=True)
    observed = list(zip(mat['train_i'].flat, mat['train_j'].flat, mat['train_v'].flat))
    missing = list(zip(mat['test_i'].flat, mat['test_j'].flat, mat['test_v'].flat))

    # Model definition, issued to the engine in this order
    model_program = (
        # Convenience functions
        ('min-2', '(lambda (x y) (if (> x y) y x))'),
        ('max-2', '(lambda (x y) (if (> x y) x y))'),
        # CRP over classes
        ('alpha', '(uniform-continuous 0.0001 2.0)'),
        ('cluster-crp', '(CRP/make alpha)'),
        # Class assignment lookup
        ('node->class', '(mem (lambda (nodes) (cluster-crp)))'),
        # Class interaction probability lookup
        ('classes->parameters', '(mem (lambda (class1 class2) (beta 0.5 0.5)))'),
        # Relation evaluation
        ('p-friends', '(lambda (node1 node2) (classes->parameters (node->class node1) (node->class node2)))'),
        ('friends', '(lambda (node1 node2) (bernoulli (p-friends node1 node2)))'),
    )
    for symbol, expression in model_program:
        engine.assume(symbol, parse(expression))

    # Tell Venture about observations; every link is observed both ways round
    for (i, j, v) in observed:
        if verbose:
            print('Observing (%3d, %3d) = %d' % (i, j, v))
        outcome = 'true' if v else 'false'
        for (a, b) in ((i, j), (j, i)):
            engine.observe(parse('(friends %d %d)' % (a, b)), outcome)

    # Tell Venture that we want to predict P(unobserved link)
    truth = []
    missing_links = []
    for (i, j, v) in missing:
        if verbose:
            print('Predicting (%3d, %3d) = %d' % (i, j, v))
        truth.append(int(v))
        missing_links.append(engine.predict(parse('(p-friends %d %d)' % (i, j))))

    # Each prediction is a pair whose first element is the directive id
    link_ids = [an_id for (an_id, _) in missing_links]

    # Burn in (result discarded)
    sample.collect_n_samples_before_timeout(
        engine, n=burn, initial_mh_iter=mh_iter, ids=link_ids,
        max_runtime=60, verbose=verbose)

    # Collect samples
    mcmc_output = sample.collect_n_samples_before_timeout(
        engine, n=n_samples, initial_mh_iter=mh_iter, ids=link_ids,
        max_runtime=60, verbose=verbose)
    samples = mcmc_output['samples']
    sample_ess = mcmc_output['ess']

    if verbose:
        print('Fold complete')

    # Score: mean sampled probability per link against the held-out truth
    predictions = list(samples.mean(axis=1))
    AUC = ROCData(list(zip(truth, predictions))).auc()
    if verbose:
        print('AUC = %f' % AUC)

    max_mem = memory()
    result = {'truth' : truth,
              'predictions' : predictions,
              'samples' : samples,
              'ess' : sample_ess,
              'AUC' : AUC,
              'runtime' : time.clock() - t_begin,
              'max_mem' : max_mem}
    return result
def collect_n_samples_before_timeout(RIPL, n, initial_mh_iter, ids, max_runtime=600, verbose=True):
    '''Collect up to n samples, adapting mh_iter to use up max_runtime.

    The first 10 samples are drawn with ``initial_mh_iter`` MH steps each.
    From then on the number of steps between samples is re-estimated before
    every sample so that the samples still to be drawn share the remaining
    time budget equally.  Collection stops early if ``max_runtime`` (measured
    with ``time.clock()``) is exceeded.

    Args:
        RIPL: Object providing ``infer(steps)`` and ``report_value(an_id)``.
        n: Maximum number of samples to collect.
        initial_mh_iter: MH steps between samples before adaptation starts.
        ids: Identifiers passed to ``RIPL.report_value`` for every sample.
        max_runtime: Time budget, in ``time.clock()`` seconds.
        verbose: When true, print timing diagnostics.

    Returns:
        dict with
        ``'samples'`` - array with one row per id and one column per sample,
        ``'ess'`` - mean over rows of (number of samples /
        ``act.batch_means(row)``), replaced by 1 when that is NaN or infinite
        or when no sample was collected, and
        ``'max_memory'`` - the value of ``memory()`` at the end of the run.
    '''
    # FIXME: dangerous - thinning depends on the state; a trial run is needed.
    # Maybe this needs a linear regression to work well?
    # TODO: a filtering approach would be more adaptive.
    samples = np.zeros((len(ids), 0))
    start = time.clock()
    iteration = 0
    mh_iter = initial_mh_iter
    mh_sum = 0

    # For a maximum of n iterations
    for _ in range(n):
        if time.clock() - start >= max_runtime:
            break
        iteration += 1
        time_elapsed = time.clock() - start
        # Only adapt once a few samples exist and a per-step cost can
        # actually be estimated (guards both divisions below against zero).
        if iteration > 10 and mh_sum > 0 and time_elapsed > 0:
            time_remaining = max_runtime - time_elapsed
            time_per_mh_iter = time_elapsed / mh_sum
            # `iteration` already counts the sample about to be drawn, so it
            # is still outstanding.  Leaving it out would hand the whole
            # remaining budget to sample n-1 and starve the last sample.
            samples_remaining = n - iteration + 1
            time_remaining_per_sample = time_remaining / samples_remaining
            mh_iter = max(1, int(round(time_remaining_per_sample / time_per_mh_iter)))
            if verbose:
                diagnostics = (
                    ('Time elapsed ', time_elapsed),
                    ('Time remaining', time_remaining),
                    ('Time per mh ', time_per_mh_iter),
                    ('Samples to go ', samples_remaining),
                    # Average over the iteration - 1 samples drawn so far
                    ('Time per samp.', time_elapsed / (iteration - 1)),
                    ('Remaining per.', time_remaining_per_sample),
                    ('mh_iter', mh_iter),
                )
                for label, value in diagnostics:
                    print('%s %s' % (label, value))
        RIPL.infer(mh_iter)
        mh_sum += mh_iter
        if verbose:
            print('Iteration %d' % iteration)
        # Record sample
        samples = np.column_stack([samples, [RIPL.report_value(an_id) for an_id in ids]])

    finish = time.clock()
    total_time = max(finish - start, 0.01)
    # With n == 0 or an exhausted budget no iteration runs at all
    time_per_sample = total_time / iteration if iteration else float('nan')
    if verbose:
        print('%d iterations : %f seconds : %f seconds per iteration' % (iteration, finish - start, time_per_sample))

    # Compute average ess
    ess_start = time.clock()
    n_collected = samples.shape[1]
    if n_collected == 0:
        # Nothing to analyse; use the same fallback as for a degenerate ESS
        ess = 1
    else:
        ess = np.mean([n_collected / act.batch_means(row) for row in samples])
        if np.isnan(ess) or np.isinf(ess):
            ess = 1
    time_to_compute_ess = max(time.clock() - ess_start, 0.01)
    if verbose:
        print('ESS = %f : %f seconds' % (ess, time_to_compute_ess))

    # Finished - return samples, ess and memory usage
    max_mem = memory()
    return {'samples' : samples, 'ess' : ess, 'max_memory' : max_mem}