# Imports assumed by the snippets below. The project-local modules (cblparallel,
# gpml, the kernel module imported as fk, and the GPModel / ScoredKernel classes),
# as well as the helpers mkstemp_safe, reduced_tree_code, matlab_code and RMSE_y,
# are assumed to come from the surrounding package.
import os
import re
import pickle
import shutil

import numpy as np
import scipy.io
from sklearn.datasets import make_friedman1

nax = np.newaxis  # shorthand used throughout these snippets


def debug_laplace():
    # Load data set
    X, y, D, Xtest, ytest = gpml.load_mat('../data/kfold_data/r_concrete_500_fold_10_of_10.mat', y_dim=1)
    # Load the suspicious kernel
    sk = fk.repr_string_to_kernel('ScoredKernel(k_opt=ProductKernel([ MaskKernel(ndim=8, active_dimension=0, base_kernel=CubicKernel(offset=1.757755, output_variance=7.084045)), MaskKernel(ndim=8, active_dimension=7, base_kernel=SqExpPeriodicKernel(lengthscale=-2.701080, period=-0.380918, output_variance=-0.071214)) ]), nll=6348.096611, laplace_nle=-184450132.068237, bic_nle=12720.630212, noise=[-1.77276072])')
    # Create some code to evaluate it
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Create data file
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data
    # Move to fear
    cblparallel.copy_to_remote(data_file)
    scripts = [gpml.OPTIMIZE_KERNEL_CODE % {'datafile': data_file.split('/')[-1],
                                            'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
                                            'gpml_path': cblparallel.gpml_path(local_computation=False),
                                            'kernel_family': sk.k_opt.gpml_kernel_expression(),
                                            'kernel_params': '[ %s ]' % ' '.join(str(p) for p in sk.k_opt.param_vector()),
                                            'noise': str(sk.noise),
                                            'iters': str(300)}]
    #### Need to be careful with % signs
    #### For the moment, cblparallel expects no single % signs - FIXME
    scripts[0] = re.sub('% ', '%% ', scripts[0])
    # Test (no-op substitution left in from debugging)
    scripts[0] = re.sub('delta = 1e-6', 'delta = 1e-6', scripts[0])
    #scripts[0] = re.sub('hyp.lik = [-1.77276072]', 'hyp.lik = [-0.77276072]', scripts[0])
    output_file = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=600)[0]
    # Read in results
    output = gpml.read_outputs(output_file)
    result = ScoredKernel.from_matlab_output(output, sk.k_opt.family(), ndata)
    print result
    print output.hessian
    os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation=False)
def make_predictions(X, y, Xtest, ytest, best_scored_kernel, local_computation=False, max_jobs=500,
                     verbose=True, zero_mean=False, random_seed=0):
    '''
    Evaluates a kernel on held out data

    Input:
     - X                  - A matrix (data_points x dimensions) of input locations
     - y                  - A matrix (data_points x 1) of output values
     - Xtest              - Held out X data
     - ytest              - Held out y data
     - best_scored_kernel - A ScoredKernel object to be evaluated on the held out data
     - ...

    Return:
     - A dictionary of results from the MATLAB script containing:
        - loglik      - an array of log likelihoods of test data
        - predictions - an array of mean predictions for the held out data
        - actuals     - ytest
        - model       - I'm not sure FIXME
        - timestamp   - A time stamp of some sort
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y, 'Xtest': Xtest, 'ytest': ytest})
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    # Create prediction code
    parameters = {'datafile': data_file.split('/')[-1],
                  'writefile': '%(output_file)s',
                  'gpml_path': cblparallel.gpml_path(local_computation),
                  'kernel_family': best_scored_kernel.k_opt.gpml_kernel_expression(),
                  'kernel_params': '[ %s ]' % ' '.join(str(p) for p in best_scored_kernel.k_opt.param_vector()),
                  'noise': str(best_scored_kernel.noise),
                  'iters': str(30),
                  'seed': str(random_seed)}
    if zero_mean:
        code = gpml.PREDICT_AND_SAVE_CODE_ZERO_MEAN % parameters
    else:
        code = gpml.PREDICT_AND_SAVE_CODE % parameters
    code = re.sub('% ', '%% ', code)  # HACK - cblparallel currently does not like % signs
    # Evaluate code - potentially on cluster
    if local_computation:
        temp_results_file = cblparallel.run_batch_locally([code], language='matlab', max_cpu=1.1, max_mem=1.1, verbose=verbose)[0]
    else:
        temp_results_file = cblparallel.run_batch_on_fear([code], language='matlab', max_jobs=max_jobs, verbose=verbose)[0]
    results = scipy.io.loadmat(temp_results_file)
    # Remove temporary files (perhaps on the cluster server)
    cblparallel.remove_temp_file(temp_results_file, local_computation)
    cblparallel.remove_temp_file(data_file, local_computation)
    # Return dictionary of MATLAB results
    return results
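# A minimal usage sketch (hypothetical helper and data): the dictionary returned by
# make_predictions comes straight from scipy.io.loadmat, so each entry is a MATLAB
# array. 'best_sk' is assumed to be a ScoredKernel, e.g. produced by
# evaluate_kernels further below.
def _summarise_predictions(X, y, Xtest, ytest, best_sk):
    results = make_predictions(X, y, Xtest, ytest, best_sk, local_computation=True)
    mean_loglik = np.mean(results['loglik'])
    rmse = np.sqrt(np.mean((results['predictions'].ravel() - results['actuals'].ravel()) ** 2))
    print 'Mean test log likelihood = %f, RMSE = %f' % (mean_loglik, rmse)
    return results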
def make_predictions(X, y, Xtest, ytest, model, local_computation=False, max_jobs=500,
                     verbose=True, random_seed=0, no_noise=False):
    '''
    Variant of make_predictions that evaluates a GPModel (mean, kernel and
    likelihood components) on held out data, rather than a ScoredKernel.
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y, 'Xtest': Xtest, 'ytest': ytest})
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    # Create prediction code
    parameters = {'datafile': data_file.split('/')[-1],
                  'writefile': '%(output_file)s',
                  'gpml_path': cblparallel.gpml_path(local_computation),
                  'mean_syntax': model.mean.get_gpml_expression(dimensions=X.shape[1]),
                  'mean_params': '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
                  'kernel_syntax': model.kernel.get_gpml_expression(dimensions=X.shape[1]),
                  'kernel_params': '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
                  'lik_syntax': model.likelihood.get_gpml_expression(dimensions=X.shape[1]),
                  'lik_params': '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
                  'inference': model.likelihood.gpml_inference_method,
                  'iters': str(30),
                  'seed': str(random_seed)}
    code = gpml.PREDICT_AND_SAVE_CODE % parameters
    code = re.sub('% ', '%% ', code)  # HACK - cblparallel currently does not like % signs
    # Evaluate code - potentially on cluster
    if local_computation:
        temp_results_file = cblparallel.run_batch_locally([code], language='matlab', max_cpu=1.1, max_mem=1.1, verbose=verbose)[0]
    else:
        temp_results_file = cblparallel.run_batch_on_fear([code], language='matlab', max_jobs=max_jobs, verbose=verbose)[0]
    results = scipy.io.loadmat(temp_results_file)
    # Remove temporary files (perhaps on the cluster server)
    cblparallel.remove_temp_file(temp_results_file, local_computation)
    cblparallel.remove_temp_file(data_file, local_computation)
    # Return dictionary of MATLAB results
    return results
def rf_fear_test_home(n=10, n_trees=10):
    cblparallel.start_port_forwarding()
    # Data
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    # Params
    #local_temp_path = os.path.abspath('../temp/')
    #remote_temp_path = 'python/'
    # Write data file locally
    #data_file = mkstemp_safe(cblparallel.config.LOCAL_TEMP_PATH, '.p')
    data_file = mkstemp_safe(cblparallel.config.HOME_TEMP_PATH, '.p')
    with open(data_file, 'w') as f:
        pickle.dump((X_train, y_train, X_test), f)
    # Prepare code
    scripts = [reduced_tree_code % {'data_file': os.path.join(cblparallel.config.REMOTE_TEMP_PATH, os.path.split(data_file)[-1]),
                                    'n_trees': n_trees,
                                    'random_state': i * n_trees,
                                    'output_file': '%(output_file)s',
                                    'flag_file': '%(flag_file)s'} for i in range(n)]
    # Submit to fear
    with cblparallel.fear(via_gate=True) as fear:
        fear.copy_to(data_file, os.path.join(cblparallel.config.REMOTE_TEMP_PATH, os.path.split(data_file)[-1]))
        output_files = cblparallel.run_batch_on_fear(scripts, max_jobs=1000)
        fear.rm(os.path.join(cblparallel.config.REMOTE_TEMP_PATH, os.path.split(data_file)[-1]))
    # Kill local data file
    os.remove(data_file)
    # Now do something with the output
    estimators = []
    predictions = []
    for output_file in output_files:
        with open(output_file, 'r') as f:
            #(estimator, prediction) = pickle.load(f)
            prediction = np.genfromtxt(output_file, delimiter=',')
        os.remove(output_file)
        #estimators.append(estimator)
        predictions.append(prediction)
    #ens = EnsembleRegressor(estimators)
    #return RMSE(X_test, y_test, ens)
    ens_pred = np.mean(predictions, axis=0)
    return RMSE_y(y_test, ens_pred)
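# 'RMSE_y' is not defined in these snippets; a minimal sketch, assuming it simply
# computes the root-mean-square error between held-out targets and predictions:
def RMSE_y(y_true, y_pred):
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    return np.sqrt(np.mean((y_true - y_pred) ** 2))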
def remote_matlab_test(n=10):
    cblparallel.start_port_forwarding()
    # Prepare code
    scripts = [matlab_code] * n
    # Run batch in parallel
    output_files = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=1000)
    # Now do something with the output
    estimators = []
    for output_file in output_files:
        with open(output_file, 'r') as f:
            estimator = np.genfromtxt(output_file, delimiter=',')
        os.remove(output_file)
        estimators.append(estimator)
    ens_pred = np.mean(estimators)
    return ens_pred
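# 'matlab_code' is assumed to be a MATLAB script template defined elsewhere that
# writes a comma-separated result to the output path cblparallel substitutes in for
# %(output_file)s (which is then read back with np.genfromtxt above). A hypothetical
# minimal definition:
matlab_code = """
estimate = mean(rand(1000, 1));
csvwrite('%(output_file)s', estimate);
"""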
def covariance_distance(kernels, X, local_computation=True, verbose=True):
    '''
    Evaluate a distance matrix of kernels, in terms of their covariance matrices evaluated on the training inputs

    Input:
     - kernels           - A list of fk.ScoredKernel
     - X                 - A matrix (data_points x dimensions) of input locations
     - local_computation - Boolean indicating whether computation should be performed on the local machine (True) or on the cluster (False)

    Return:
     - A matrix of similarities between the input kernels
    '''
    assert len(kernels) > 0  #### FIXME - This sort of check should happen earlier
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    # Save temporary data file in standard temporary directory
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X})
    # Copy onto cluster server if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    # Construct testing code
    code = gpml.DISTANCE_CODE_HEADER % {'datafile': data_file.split('/')[-1],
                                        'gpml_path': cblparallel.gpml_path(local_computation)}
    for (i, kernel) in enumerate([k.k_opt for k in kernels]):
        code = code + gpml.DISTANCE_CODE_COV % {'iter': i + 1,
                                                'kernel_family': kernel.gpml_kernel_expression(),
                                                'kernel_params': '[ %s ]' % ' '.join(str(p) for p in kernel.param_vector())}
    code = code + gpml.DISTANCE_CODE_FOOTER_HIGH_MEM % {'writefile': '%(output_file)s'}  # N.B. cblparallel manages output files
    code = re.sub('% ', '%% ', code)  # HACK - cblparallel not fond of % signs at the moment
    # Run code - either locally or on cluster - returning location of output file
    if local_computation:
        output_file = cblparallel.run_batch_locally([code], language='matlab', max_cpu=1.1, max_mem=1.1, job_check_sleep=30, verbose=verbose, single_thread=False)[0]
    else:
        output_file = cblparallel.run_batch_on_fear([code], language='matlab', max_jobs=500, verbose=verbose)[0]
    # Read in results from experiment
    gpml_result = scipy.io.loadmat(output_file)
    distance = gpml_result['sim_matrix']
    # Remove temporary files (perhaps on the cluster server)
    cblparallel.remove_temp_file(output_file, local_computation)
    cblparallel.remove_temp_file(data_file, local_computation)
    # Return distance matrix
    return distance
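# A minimal usage sketch (hypothetical helper): 'scored_kernels' is assumed to be a
# list of fk.ScoredKernel objects, e.g. the output of evaluate_kernels further below.
# The returned matrix is square, with one row/column per kernel.
def _kernel_similarity_summary(scored_kernels, X):
    sim = covariance_distance(scored_kernels, X, local_computation=True)
    print 'Similarity matrix is %d x %d' % (sim.shape[0], sim.shape[1])
    return sim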
def evaluate_models(models, X, y, verbose=True, iters=300, local_computation=False, zip_files=False, max_jobs=500,
                    random_seed=0, subset=False, subset_size=250, full_iters=0, bundle_size=1):
    # (Note: random_seed is currently unused here; each script draws its own seed below.)
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Create data file
    if verbose:
        print 'Creating data file locally'
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})
    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print 'Creating scripts'
    scripts = [None] * len(models)
    for (i, model) in enumerate(models):
        parameters = {'datafile': data_file.split('/')[-1],
                      'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
                      'gpml_path': cblparallel.gpml_path(local_computation),
                      'mean_syntax': model.mean.get_gpml_expression(dimensions=X.shape[1]),
                      'mean_params': '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
                      'kernel_syntax': model.kernel.get_gpml_expression(dimensions=X.shape[1]),
                      'kernel_params': '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
                      'lik_syntax': model.likelihood.get_gpml_expression(dimensions=X.shape[1]),
                      'lik_params': '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
                      'inference': model.likelihood.gpml_inference_method,
                      'iters': str(iters),
                      'seed': str(np.random.randint(2 ** 31)),
                      'subset': 'true' if subset else 'false',
                      'subset_size': str(subset_size),
                      'full_iters': str(full_iters)}
        scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])
    # Send to cblparallel and save output_files
    if verbose:
        print 'Sending scripts to cblparallel'
    if local_computation:
        output_files = cblparallel.run_batch_locally(scripts, language='matlab', max_cpu=1.1, job_check_sleep=5,
                                                     submit_sleep=0.1, max_running_jobs=10, verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=max_jobs, verbose=verbose,
                                                     zip_files=zip_files, bundle_size=bundle_size)
    # Read in results
    results = [None] * len(models)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Reading output file %d of %d' % (i + 1, len(models))
        results[i] = GPModel.from_matlab_output(gpml.read_outputs(output_file), models[i], ndata)
    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Removing output file %d of %d' % (i + 1, len(models))
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)
    # Return results i.e. list of GPModel objects
    return results
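# A minimal usage sketch (hypothetical helper): 'candidate_models' is assumed to be a
# list of GPModel objects whose parameters should be optimised against (X, y). The
# returned list is aligned with the input list, so the fitted model for candidate i is
# fitted[i]; here the first fitted model is passed to the GPModel-based
# make_predictions defined above.
def _fit_and_predict(candidate_models, X, y, Xtest, ytest):
    fitted = evaluate_models(candidate_models, X, y, local_computation=True, iters=100)
    # Selecting a "best" model would need whatever score attribute GPModel exposes,
    # which these snippets do not show, so just use the first fitted model.
    return make_predictions(X, y, Xtest, ytest, fitted[0], local_computation=True)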
# Fragment from a batch-running script: 'scripts', 'script_names' and
# 'temp_data_file_name' are assumed to have been set up earlier (see the sketch below).
used_script_names = []
for file_name in script_names:
    if file_name[-2:] == '.m':
        # print 'Reading %s' % file_name
        used_script_names.append(file_name)
        with open(os.path.join('scripts', file_name)) as script_file:
            scripts.append(script_file.read() % {'data_file': os.path.split(temp_data_file_name)[-1],
                                                 'output_file': '%(output_file)s'})
script_names = used_script_names
# Send to cluster
# print 'Found %d scripts' % len(scripts)
output_files = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=1000, verbose=False, zip_files=False, bundle_size=1)
# print '%d output files returned' % len(output_files)
# Move output
for (src, name) in zip(output_files, script_names):
    # print 'Moving %s output' % name.split('.')[0]
    dest = os.path.join('outputs', name.split('.')[0] + '.mat')
    shutil.move(src, dest)
# print 'Success'
# Delete local data
#for file_name in os.listdir('data'):
#    if file_name[-4:] == '.mat':
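# A hypothetical sketch of the setup the fragment above assumes: list the MATLAB
# scripts, save the regression data to a temporary .mat file and copy it to the
# cluster. The variable names ('script_names', 'scripts', 'temp_data_file_name')
# match the fragment; everything else is an assumption.
script_names = sorted(os.listdir('scripts'))
scripts = []
temp_data_file_name = cblparallel.create_temp_file('.mat')
scipy.io.savemat(temp_data_file_name, {'X': X, 'y': y})  # X, y assumed loaded earlier
cblparallel.copy_to_remote(temp_data_file_name)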
def evaluate_kernels(kernels, X, y, verbose=True, noise=None, iters=300, local_computation=False,
                     zip_files=False, max_jobs=500, zero_mean=False, random_seed=0):
    '''
    Sets up the kernel optimisation and nll calculation experiments and returns the results as scored kernels.
    This version additionally separates list-valued entries of the kernel parameter vector into
    'eff_dimensions' and 'dim_positions' (cf. the plainer version below).

    Input:
     - kernels - A list of kernels (i.e. not scored kernels)
     - X       - A matrix (data_points x dimensions) of input locations
     - y       - A matrix (data_points x 1) of output values
     - ...

    Return:
     - A list of ScoredKernel objects
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Set default noise using a heuristic.
    if noise is None:
        noise = np.log(np.var(y) / 10)
    # Create data file
    if verbose:
        print 'Creating data file locally'
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data
    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print 'Creating scripts'
    scripts = [None] * len(kernels)
    for (i, kernel) in enumerate(kernels):
        x = kernel.param_vector()
        parameters = {'datafile': data_file.split('/')[-1],
                      'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
                      'gpml_path': cblparallel.gpml_path(local_computation),
                      'kernel_family': kernel.gpml_kernel_expression(),
                      'kernel_params': '[ %s ]' % ' '.join(str(p) for p in x if type(p) != list),
                      'eff_dimensions': '[ %s ]' % ';'.join(str(p) for p in x if type(p) == list),
                      'dim_positions': '[ %s ]' % ' '.join(str(i) for i in range(len(x)) if type(x[i]) == list),
                      'noise': str(noise),
                      'iters': str(iters),
                      'seed': str(random_seed)}
        # Debug output left in by the author
        print parameters['kernel_params'], parameters['eff_dimensions'], parameters['dim_positions']
        if zero_mean:
            scripts[i] = gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN % parameters
        else:
            scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])
    # Send to cblparallel and save output_files
    if verbose:
        print 'Sending scripts to cblparallel'
    if local_computation:
        output_files = cblparallel.run_batch_locally(scripts, language='matlab', max_cpu=1.1, job_check_sleep=5,
                                                     submit_sleep=0.1, max_running_jobs=10, verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=max_jobs, verbose=verbose, zip_files=zip_files)
    # Read in results
    results = [None] * len(kernels)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Reading output file %d of %d: %s' % (i + 1, len(kernels), output_file)
        results[i] = ScoredKernel.from_matlab_output(gpml.read_outputs(output_file), kernels[i].family(), ndata)
    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Removing output file %d of %d' % (i + 1, len(kernels))
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)
    # Return results i.e. list of ScoredKernel objects
    return results
def evaluate_kernels(kernels, X, y, verbose=True, noise=None, iters=300, local_computation=False,
                     zip_files=False, max_jobs=500, zero_mean=False, random_seed=0):
    '''
    Sets up the kernel optimisation and nll calculation experiments and returns the results as scored kernels

    Input:
     - kernels - A list of kernels (i.e. not scored kernels)
     - X       - A matrix (data_points x dimensions) of input locations
     - y       - A matrix (data_points x 1) of output values
     - ...

    Return:
     - A list of ScoredKernel objects
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Set default noise using a heuristic.
    if noise is None:
        noise = np.log(np.var(y) / 10)
    # Create data file
    if verbose:
        print 'Creating data file locally'
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data
    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print 'Creating scripts'
    scripts = [None] * len(kernels)
    for (i, kernel) in enumerate(kernels):
        parameters = {'datafile': data_file.split('/')[-1],
                      'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
                      'gpml_path': cblparallel.gpml_path(local_computation),
                      'kernel_family': kernel.gpml_kernel_expression(),
                      'kernel_params': '[ %s ]' % ' '.join(str(p) for p in kernel.param_vector()),
                      'noise': str(noise),
                      'iters': str(iters),
                      'seed': str(random_seed)}
        if zero_mean:
            scripts[i] = gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN % parameters
        else:
            scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])
    # Send to cblparallel and save output_files
    if verbose:
        print 'Sending scripts to cblparallel'
    if local_computation:
        output_files = cblparallel.run_batch_locally(scripts, language='matlab', max_cpu=1.1, job_check_sleep=5,
                                                     submit_sleep=0.1, max_running_jobs=10, verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=max_jobs, verbose=verbose, zip_files=zip_files)
    # Read in results
    results = [None] * len(kernels)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Reading output file %d of %d' % (i + 1, len(kernels))
        results[i] = ScoredKernel.from_matlab_output(gpml.read_outputs(output_file), kernels[i].family(), ndata)
    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Removing output file %d of %d' % (i + 1, len(kernels))
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)
    # Return results i.e. list of ScoredKernel objects
    return results
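# A minimal end-to-end sketch (hypothetical helper): score a list of candidate
# kernels, pick the one with the lowest BIC (ScoredKernel exposes 'bic_nle', as in
# the repr string used by debug_laplace above), and evaluate it on held-out data
# with the ScoredKernel-based variant of make_predictions (the first one defined
# above).
def _score_and_evaluate(candidate_kernels, X, y, Xtest, ytest):
    scored = evaluate_kernels(candidate_kernels, X, y, local_computation=True, iters=100)
    best = min(scored, key=lambda sk: sk.bic_nle)
    return make_predictions(X, y, Xtest, ytest, best, local_computation=True)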