import cPickle import os from pyglm.utils.io import segment_data data_dir = '/Users/scott/Projects/pyglm/data/synth/dist/N16T300/2014_07_22-10_01/' with open(os.path.join(data_dir, 'data.pkl')) as f: data = cPickle.load(f) data_test = segment_data(data, (240,300)) with open(os.path.join(data_dir, 'data_test.pkl'), 'w') as f: cPickle.dump(data_test, f, protocol=-1) ts = [15,30,60,120,180, 240] datas = [] data_test for t in ts: datas.append(segment_data(data, (0,t))) for (t,d) in zip(ts, datas): with open(os.path.join(data_dir, 'data_%d.pkl' % t), 'w') as f: cPickle.dump(d, f, protocol=-1)
def run_synth_test(): """ Run a test with synthetic data and MAP inference with cross validation """ options, popn, data, popn_true, x_true = initialize_test_harness() # Get the list of models for cross validation base_model = make_model(options.model, N=data['N'], dt=0.001) models = get_xv_models(base_model) # TODO Segment data into training and cross validation sets train_frac = 0.75 T_split = data['T'] * train_frac train_data = segment_data(data, (0, T_split)) xv_data = segment_data(data, (T_split, data['T'])) # Preprocess the data sequences train_data = popn.preprocess_data(train_data) xv_data = popn.preprocess_data(xv_data) # Sample random initial state x0 = popn.sample() # Track the best model and parameters best_ind = -1 best_xv_ll = -np.Inf best_x = x0 best_model = None # Fit each model using the optimum of the previous models train_lls = np.zeros(len(models)) xv_lls = np.zeros(len(models)) total_lls = np.zeros(len(models)) for (i, model) in enumerate(models): print "Training model %d" % i x0 = copy.deepcopy(best_x) popn.set_hyperparameters(model) popn.set_data(train_data) ll0 = popn.compute_log_p(x0) print "Training LL0: %f" % ll0 # Perform inference x_inf = coord_descent(popn, x0=x0, maxiter=1) ll_train = popn.compute_log_p(x_inf) print "Training LP_inf: %f" % ll_train train_lls[i] = ll_train # Compute log lkhd on xv data popn.set_data(xv_data) ll_xv = popn.compute_ll(x_inf) print "Cross Validation LL: %f" % ll_xv xv_lls[i] = ll_xv # Compute log lkhd on total dataset popn.set_data(data) ll_total = popn.compute_ll(x_inf) print "Total LL: %f" % ll_total total_lls[i] = ll_total # Update best model if ll_xv > best_xv_ll: best_ind = i best_xv_ll = ll_xv best_x = copy.deepcopy(x_inf) best_model = copy.deepcopy(model) # Create a population with the best model popn.set_hyperparameters(best_model) popn.set_data(data) # Fit the best model on the full training data best_x = coord_descent(popn, data, x0=x0, maxiter=1, use_hessian=False, use_rop=False) # Print results summary for i in np.arange(len(models)): print "Model %d:\tTrain LL: %.1f\tXV LL: %.1f\tTotal LL: %.1f" % ( i, train_lls[i], xv_lls[i], total_lls[i]) print "Best model: %d" % best_ind print "Best Total LL: %f" % popn.compute_ll(best_x) print "True LL: %f" % popn_true.compute_ll(x_true) # Save results results_file = os.path.join(options.resultsDir, 'results.pkl') print "Saving results to %s" % results_file with open(results_file, 'w') as f: cPickle.dump(best_x, f) # Plot results plot_results(popn, best_x, popn_true, x_true, resdir=options.resultsDir)
def run_parallel_map(): """ Run a test with synthetic data and MCMC inference """ options, popn, data, client, popn_true, x_true = initialize_parallel_test_harness() # Get the list of models for cross validation base_model = make_model(options.model, N=data['N']) models = get_xv_models(base_model) # Segment data into training and cross validation sets train_frac = 0.75 T_split = data['T'] * train_frac train_data = segment_data(data, (0,T_split)) xv_data = segment_data(data, (T_split,data['T'])) # Sample random initial state x0 = popn.sample(None) # Track the best model and parameters best_ind = -1 best_xv_ll = -np.Inf best_x = x0 best_model = None use_existing = False start_time = time.clock() # Fit each model using the optimum of the previous models train_lls = np.zeros(len(models)) xv_lls = np.zeros(len(models)) total_lls = np.zeros(len(models)) for (i,model) in enumerate(models): print "Evaluating model %d" % i set_hyperparameters_on_engines(client[:], model) add_data_on_engines(client[:], train_data) if use_existing and \ os.path.exists(os.path.join(options.resultsDir, 'results.partial.%d.pkl' % i)): print "Found existing results for model %d" % i with open(os.path.join(options.resultsDir, 'results.partial.%d.pkl' % i)) as f: (x_inf, ll_train, ll_xv, ll_total) = cPickle.load(f) train_lls[i] = ll_train xv_lls[i] = ll_xv total_lls[i] = ll_total else: x0 = copy.deepcopy(best_x) # set_data_on_engines(client[:], train_data) ll0 = parallel_compute_ll(client[:], x0, data['N']) print "Training LL0: %f" % ll0 # Perform inference x_inf = parallel_coord_descent(client, data['N'], x0=x0, maxiter=1, use_hessian=False, use_rop=False) ll_train = parallel_compute_ll(client[:], x_inf, data['N']) print "Training LL_inf: %f" % ll_train train_lls[i] = ll_train # Compute log lkhd on xv data add_data_on_engines(client[:], xv_data) ll_xv = parallel_compute_ll(client[:], x_inf, data['N']) print "Cross Validation LL: %f" % ll_xv xv_lls[i] = ll_xv # Compute log lkhd on total dataset add_data_on_engines(client[:], data) ll_total = parallel_compute_ll(client[:], x_inf, data['N']) print "Total LL: %f" % ll_total total_lls[i] = ll_total print "Saving partial results" with open(os.path.join(options.resultsDir, 'results.partial.%d.pkl' % i),'w') as f: cPickle.dump((x_inf, ll_train, ll_xv, ll_total) ,f, protocol=-1) # Update best model if ll_xv > best_xv_ll: best_ind = i best_xv_ll = ll_xv best_x = copy.deepcopy(x_inf) best_model = copy.deepcopy(model) print "Training the best model (%d) with the full dataset" % best_ind # Set the best hyperparameters set_hyperparameters_on_engines(client[:], best_model) add_data_on_engines(client[:], data) # Fit the best model on the full training data best_x = parallel_coord_descent(client, data['N'], x0=best_x, maxiter=1, use_hessian=False, use_rop=False) # Print results summary for i in np.arange(len(models)): print "Model %d:\tTrain LL: %.1f\tXV LL: %.1f\tTotal LL: %.1f" % (i, train_lls[i], xv_lls[i], total_lls[i]) print "Best model: %d" % best_ind print "Best Total LL: %f" % parallel_compute_ll(client[:], best_x, data['N']) print "True LL: %f" % popn_true.compute_ll(x_true) stop_time = time.clock() # Save results with open(os.path.join(options.resultsDir, 'results.pkl'),'w') as f: cPickle.dump(best_x, f, protocol=-1) # Save runtime with open(os.path.join(options.resultsDir, 'runtime.pkl'),'w') as f: cPickle.dump(stop_time-start_time, f, protocol=-1)
def run_synth_test(): """ Run a test with synthetic data and MAP inference with cross validation """ options, popn, data, popn_true, x_true = initialize_test_harness() # Get the list of models for cross validation base_model = make_model(options.model, N=data['N'], dt=0.001) models = get_xv_models(base_model) # TODO Segment data into training and cross validation sets train_frac = 0.75 T_split = data['T'] * train_frac train_data = segment_data(data, (0,T_split)) xv_data = segment_data(data, (T_split,data['T'])) # Preprocess the data sequences train_data = popn.preprocess_data(train_data) xv_data = popn.preprocess_data(xv_data) # Sample random initial state x0 = popn.sample() # Track the best model and parameters best_ind = -1 best_xv_ll = -np.Inf best_x = x0 best_model = None # Fit each model using the optimum of the previous models train_lls = np.zeros(len(models)) xv_lls = np.zeros(len(models)) total_lls = np.zeros(len(models)) for (i,model) in enumerate(models): print "Training model %d" % i x0 = copy.deepcopy(best_x) popn.set_hyperparameters(model) popn.set_data(train_data) ll0 = popn.compute_log_p(x0) print "Training LL0: %f" % ll0 # Perform inference x_inf = coord_descent(popn, x0=x0, maxiter=1) ll_train = popn.compute_log_p(x_inf) print "Training LP_inf: %f" % ll_train train_lls[i] = ll_train # Compute log lkhd on xv data popn.set_data(xv_data) ll_xv = popn.compute_ll(x_inf) print "Cross Validation LL: %f" % ll_xv xv_lls[i] = ll_xv # Compute log lkhd on total dataset popn.set_data(data) ll_total = popn.compute_ll(x_inf) print "Total LL: %f" % ll_total total_lls[i] = ll_total # Update best model if ll_xv > best_xv_ll: best_ind = i best_xv_ll = ll_xv best_x = copy.deepcopy(x_inf) best_model = copy.deepcopy(model) # Create a population with the best model popn.set_hyperparameters(best_model) popn.set_data(data) # Fit the best model on the full training data best_x = coord_descent(popn, data, x0=x0, maxiter=1, use_hessian=False, use_rop=False) # Print results summary for i in np.arange(len(models)): print "Model %d:\tTrain LL: %.1f\tXV LL: %.1f\tTotal LL: %.1f" % (i, train_lls[i], xv_lls[i], total_lls[i]) print "Best model: %d" % best_ind print "Best Total LL: %f" % popn.compute_ll(best_x) print "True LL: %f" % popn_true.compute_ll(x_true) # Save results results_file = os.path.join(options.resultsDir, 'results.pkl') print "Saving results to %s" % results_file with open(results_file, 'w') as f: cPickle.dump(best_x, f) # Plot results plot_results(popn, best_x, popn_true, x_true, resdir=options.resultsDir)
import cPickle import os from pyglm.utils.io import segment_data data_dir = '/Users/scott/Projects/pyglm/data/synth/dist/N16T300/2014_07_22-10_01/' with open(os.path.join(data_dir, 'data.pkl')) as f: data = cPickle.load(f) data_test = segment_data(data, (240, 300)) with open(os.path.join(data_dir, 'data_test.pkl'), 'w') as f: cPickle.dump(data_test, f, protocol=-1) ts = [15, 30, 60, 120, 180, 240] datas = [] data_test for t in ts: datas.append(segment_data(data, (0, t))) for (t, d) in zip(ts, datas): with open(os.path.join(data_dir, 'data_%d.pkl' % t), 'w') as f: cPickle.dump(d, f, protocol=-1)
def run_parallel_map(): """ Run a test with synthetic data and MCMC inference """ options, popn, data, client, popn_true, x_true = initialize_parallel_test_harness( ) # Get the list of models for cross validation base_model = make_model(options.model, N=data['N']) models = get_xv_models(base_model) # Segment data into training and cross validation sets train_frac = 0.75 T_split = data['T'] * train_frac train_data = segment_data(data, (0, T_split)) xv_data = segment_data(data, (T_split, data['T'])) # Sample random initial state x0 = popn.sample(None) # Track the best model and parameters best_ind = -1 best_xv_ll = -np.Inf best_x = x0 best_model = None use_existing = False start_time = time.clock() # Fit each model using the optimum of the previous models train_lls = np.zeros(len(models)) xv_lls = np.zeros(len(models)) total_lls = np.zeros(len(models)) for (i, model) in enumerate(models): print "Evaluating model %d" % i set_hyperparameters_on_engines(client[:], model) add_data_on_engines(client[:], train_data) if use_existing and \ os.path.exists(os.path.join(options.resultsDir, 'results.partial.%d.pkl' % i)): print "Found existing results for model %d" % i with open( os.path.join(options.resultsDir, 'results.partial.%d.pkl' % i)) as f: (x_inf, ll_train, ll_xv, ll_total) = cPickle.load(f) train_lls[i] = ll_train xv_lls[i] = ll_xv total_lls[i] = ll_total else: x0 = copy.deepcopy(best_x) # set_data_on_engines(client[:], train_data) ll0 = parallel_compute_ll(client[:], x0, data['N']) print "Training LL0: %f" % ll0 # Perform inference x_inf = parallel_coord_descent(client, data['N'], x0=x0, maxiter=1, use_hessian=False, use_rop=False) ll_train = parallel_compute_ll(client[:], x_inf, data['N']) print "Training LL_inf: %f" % ll_train train_lls[i] = ll_train # Compute log lkhd on xv data add_data_on_engines(client[:], xv_data) ll_xv = parallel_compute_ll(client[:], x_inf, data['N']) print "Cross Validation LL: %f" % ll_xv xv_lls[i] = ll_xv # Compute log lkhd on total dataset add_data_on_engines(client[:], data) ll_total = parallel_compute_ll(client[:], x_inf, data['N']) print "Total LL: %f" % ll_total total_lls[i] = ll_total print "Saving partial results" with open( os.path.join(options.resultsDir, 'results.partial.%d.pkl' % i), 'w') as f: cPickle.dump((x_inf, ll_train, ll_xv, ll_total), f, protocol=-1) # Update best model if ll_xv > best_xv_ll: best_ind = i best_xv_ll = ll_xv best_x = copy.deepcopy(x_inf) best_model = copy.deepcopy(model) print "Training the best model (%d) with the full dataset" % best_ind # Set the best hyperparameters set_hyperparameters_on_engines(client[:], best_model) add_data_on_engines(client[:], data) # Fit the best model on the full training data best_x = parallel_coord_descent(client, data['N'], x0=best_x, maxiter=1, use_hessian=False, use_rop=False) # Print results summary for i in np.arange(len(models)): print "Model %d:\tTrain LL: %.1f\tXV LL: %.1f\tTotal LL: %.1f" % ( i, train_lls[i], xv_lls[i], total_lls[i]) print "Best model: %d" % best_ind print "Best Total LL: %f" % parallel_compute_ll(client[:], best_x, data['N']) print "True LL: %f" % popn_true.compute_ll(x_true) stop_time = time.clock() # Save results with open(os.path.join(options.resultsDir, 'results.pkl'), 'w') as f: cPickle.dump(best_x, f, protocol=-1) # Save runtime with open(os.path.join(options.resultsDir, 'runtime.pkl'), 'w') as f: cPickle.dump(stop_time - start_time, f, protocol=-1)