# Assumed imports for this section: standard-library and third-party modules
# inferred from the code below. Project modules (mp, cblparallel, gpml,
# flexiblekernel / fk, ScoredKernel, GPModel) belong to this code base and
# are assumed to be imported elsewhere in the file.
import os
import re

import numpy as np
import scipy.io

nax = np.newaxis


def evaluate_kernel(kernel, X, y, noise=None, iters=300):
    '''
    Sets up a kernel optimisation and nll calculation experiment and returns
    the result as a scored kernel.

    Input:
     - kernel - A kernel (i.e. not a scored kernel)
     - X      - A matrix (data_points x dimensions) of input locations
     - y      - A matrix (data_points x 1) of output values
     - ...

    Return:
     - A ScoredKernel object
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Set default noise using a heuristic: one tenth of the data variance,
    # on the log scale expected by GPML.
    if noise is None:
        noise = np.log(np.var(y) / 10)
    data_file = mp.util.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data
    output_file = mp.util.create_temp_file('.mat')
    script = evaluate_kernel_code(kernel, data_file, output_file, noise, iters)
    mp.matlab.run(script)
    result = flexiblekernel.ScoredKernel.from_matlab_output(
        gpml.read_outputs(output_file), kernel.family(), ndata)
    os.remove(data_file)
    os.remove(output_file)
    return result
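# Minimal usage sketch for evaluate_kernel, assuming fk.SqExpKernel is one of
# the base kernel constructors in this code base (hypothetical example; any
# kernel exposing family() and param_vector() should work the same way).
def _example_evaluate_kernel():
    X = np.linspace(0, 10, 100)                 # 100 one-dimensional inputs
    y = np.sin(X) + 0.1 * np.random.randn(100)  # noisy targets
    kernel = fk.SqExpKernel(lengthscale=0.0, output_variance=0.0)  # assumed constructor
    scored = evaluate_kernel(kernel, X, y)      # noise defaults to log(var(y) / 10)
    print scored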
def debug_laplace():
    # Load data set
    X, y, D, Xtest, ytest = gpml.load_mat(
        '../data/kfold_data/r_concrete_500_fold_10_of_10.mat', y_dim=1)
    # Load the suspicious kernel
    sk = fk.repr_string_to_kernel(
        'ScoredKernel(k_opt=ProductKernel([ MaskKernel(ndim=8, active_dimension=0, '
        'base_kernel=CubicKernel(offset=1.757755, output_variance=7.084045)), '
        'MaskKernel(ndim=8, active_dimension=7, '
        'base_kernel=SqExpPeriodicKernel(lengthscale=-2.701080, period=-0.380918, '
        'output_variance=-0.071214)) ]), nll=6348.096611, '
        'laplace_nle=-184450132.068237, bic_nle=12720.630212, noise=[-1.77276072])')
    # Create some code to evaluate it
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Create data file
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data
    # Move to fear
    cblparallel.copy_to_remote(data_file)
    scripts = [gpml.OPTIMIZE_KERNEL_CODE % {
        'datafile': data_file.split('/')[-1],
        'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
        'gpml_path': cblparallel.gpml_path(local_computation=False),
        'kernel_family': sk.k_opt.gpml_kernel_expression(),
        'kernel_params': '[ %s ]' % ' '.join(str(p) for p in sk.k_opt.param_vector()),
        'noise': str(sk.noise),
        'iters': str(300)}]
    #### Need to be careful with % signs
    #### For the moment, cblparallel expects no single % signs - FIXME
    scripts[0] = re.sub('% ', '%% ', scripts[0])
    # Test substitution (currently a no-op, left in place for experimentation)
    scripts[0] = re.sub('delta = 1e-6', 'delta = 1e-6', scripts[0])
    #scripts[0] = re.sub('hyp.lik = [-1.77276072]', 'hyp.lik = [-0.77276072]', scripts[0])
    output_file = cblparallel.run_batch_on_fear(scripts, language='matlab', max_jobs=600)[0]
    # Read in results
    output = gpml.read_outputs(output_file)
    result = ScoredKernel.from_matlab_output(output, sk.k_opt.family(), ndata)
    print result
    print output.hessian
    os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation=False)
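# Sketch of why the re.sub('% ', '%% ', ...) escaping above is necessary
# (illustrative only; the script string here is made up). The generated MATLAB
# source is later passed through Python %-interpolation when cblparallel fills
# in %(output_file)s, so any literal percent sign - e.g. a MATLAB comment -
# must be doubled first or the interpolation fails.
def _example_percent_escaping():
    script = "disp('hi') % a MATLAB comment\nsave('%(output_file)s')"
    escaped = re.sub('% ', '%% ', script)
    print escaped % {'output_file': '/tmp/out.mat'}  # interpolates cleanly
    # Without the escaping, the same interpolation raises
    # "ValueError: unsupported format character ' '".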
def evaluate_models(models, X, y, verbose=True, iters=300, local_computation=False,
                    zip_files=False, max_jobs=500, random_seed=0, subset=False,
                    subset_size=250, full_iters=0, bundle_size=1):
    '''
    Sets up parameter optimisation and nll calculation experiments for each
    model and returns the results as a list of GPModel objects.
    N.B. the random_seed argument is currently unused; each script draws a
    fresh seed from np.random instead.
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Create data file
    if verbose:
        print 'Creating data file locally'
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})
    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print 'Creating scripts'
    scripts = [None] * len(models)
    for (i, model) in enumerate(models):
        parameters = {
            'datafile': data_file.split('/')[-1],
            'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
            'gpml_path': cblparallel.gpml_path(local_computation),
            'mean_syntax': model.mean.get_gpml_expression(dimensions=X.shape[1]),
            'mean_params': '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
            'kernel_syntax': model.kernel.get_gpml_expression(dimensions=X.shape[1]),
            'kernel_params': '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
            'lik_syntax': model.likelihood.get_gpml_expression(dimensions=X.shape[1]),
            'lik_params': '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
            'inference': model.likelihood.gpml_inference_method,
            'iters': str(iters),
            'seed': str(np.random.randint(2**31)),
            'subset': 'true' if subset else 'false',
            'subset_size': str(subset_size),
            'full_iters': str(full_iters)}
        scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])
    # Send to cblparallel and save output_files
    if verbose:
        print 'Sending scripts to cblparallel'
    if local_computation:
        output_files = cblparallel.run_batch_locally(
            scripts, language='matlab', max_cpu=1.1, job_check_sleep=5,
            submit_sleep=0.1, max_running_jobs=10, verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(
            scripts, language='matlab', max_jobs=max_jobs, verbose=verbose,
            zip_files=zip_files, bundle_size=bundle_size)
    # Read in results
    results = [None] * len(models)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Reading output file %d of %d' % (i + 1, len(models))
        results[i] = GPModel.from_matlab_output(gpml.read_outputs(output_file), models[i], ndata)
    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Removing output file %d of %d' % (i + 1, len(models))
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)
    # Return results, i.e. a list of GPModel objects
    return results
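# Minimal usage sketch for evaluate_models (hedged: 'models' is assumed to be
# a list of GPModel objects exposing the mean/kernel/likelihood attributes
# used above; construction of such models is left to the model grammar).
def _example_evaluate_models(models):
    X = np.random.randn(50, 2)  # 50 points in 2 input dimensions
    y = np.random.randn(50, 1)
    results = evaluate_models(models, X, y, local_computation=True, verbose=False)
    for result in results:
        print result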
def evaluate_kernels(kernels, X, y, verbose=True, noise=None, iters=300,
                     local_computation=False, zip_files=False, max_jobs=500,
                     zero_mean=False, random_seed=0):
    '''
    Sets up the kernel optimisation and nll calculation experiments and
    returns the results as scored kernels.

    Input:
     - kernels - A list of kernels (i.e. not scored kernels)
     - X       - A matrix (data_points x dimensions) of input locations
     - y       - A matrix (data_points x 1) of output values
     - ...

    Return:
     - A list of ScoredKernel objects
    '''
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]
    # Set default noise using a heuristic.
    if noise is None:
        noise = np.log(np.var(y) / 10)
    # Create data file
    if verbose:
        print 'Creating data file locally'
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})  # Save regression data
    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)
    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print 'Creating scripts'
    scripts = [None] * len(kernels)
    for (i, kernel) in enumerate(kernels):
        # List-valued entries in the parameter vector are passed to MATLAB
        # separately from the scalars, as effective dimensions.
        x = kernel.param_vector()
        parameters = {
            'datafile': data_file.split('/')[-1],
            'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
            'gpml_path': cblparallel.gpml_path(local_computation),
            'kernel_family': kernel.gpml_kernel_expression(),
            'kernel_params': '[ %s ]' % ' '.join(str(p) for p in x if type(p) != list),
            'eff_dimensions': '[ %s ]' % ';'.join(str(p) for p in x if type(p) == list),
            'dim_positions': '[ %s ]' % ' '.join(str(j) for j in range(len(x)) if type(x[j]) == list),
            'noise': str(noise),
            'iters': str(iters),
            'seed': str(random_seed)}
        # Debug output for the multidimensional parameter handling
        print parameters['kernel_params'], parameters['eff_dimensions'], parameters['dim_positions']
        if zero_mean:
            scripts[i] = gpml.OPTIMIZE_KERNEL_CODE_ZERO_MEAN % parameters
        else:
            scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])
    # Send to cblparallel and save output_files
    if verbose:
        print 'Sending scripts to cblparallel'
    if local_computation:
        output_files = cblparallel.run_batch_locally(
            scripts, language='matlab', max_cpu=1.1, job_check_sleep=5,
            submit_sleep=0.1, max_running_jobs=10, verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(
            scripts, language='matlab', max_jobs=max_jobs, verbose=verbose,
            zip_files=zip_files)
    # Read in results
    results = [None] * len(kernels)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Reading output file %d of %d: %s' % (i + 1, len(kernels), output_file)
        results[i] = ScoredKernel.from_matlab_output(
            gpml.read_outputs(output_file), kernels[i].family(), ndata)
    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Removing output file %d of %d' % (i + 1, len(kernels))
        os.remove(output_file)
    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)
    # Return results, i.e. a list of ScoredKernel objects
    return results
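# Minimal usage sketch for evaluate_kernels, assuming fk.base_kernels is the
# generator of starting kernels used elsewhere in this code base (hedged:
# substitute whichever kernel-enumeration helper the project provides).
def _example_evaluate_kernels():
    X = np.linspace(0, 10, 200)
    y = np.cos(X) + 0.1 * np.random.randn(200)
    kernels = list(fk.base_kernels(ndim=1))  # assumed helper
    scored = evaluate_kernels(kernels, X, y, local_computation=True, verbose=False)
    for sk in sorted(scored, key=lambda k: k.bic_nle):  # best (lowest) BIC first
        print sk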