# NOTE: the imports below are assumed from the surrounding gpss-research-style
# codebase: flexible_function (aliased ff) provides GPModel and the kernel
# utilities, cblparallel handles job submission, gpml holds the MATLAB code
# templates, grammar performs the model expansions, and job_controller
# (aliased jc) is taken to be the module containing evaluate_models below.
# remove_nan_scored_models and experiment_fields_to_str are assumed to be
# defined elsewhere in this module.
import os
import re
import warnings

import numpy as np
import scipy.io

nax = np.newaxis

import cblparallel
import flexible_function as ff
from flexible_function import GPModel
import gpml
import grammar
import job_controller as jc
import utils.misc


def parse_results(results_filenames, max_level=None):
    '''Returns the best kernel in an experiment output file as a ScoredKernel.'''
    if not isinstance(results_filenames, list):
        # Backward compatibility with specifying a single file
        results_filenames = [results_filenames]
    # Read relevant lines of file(s)
    result_tuples = []
    for results_filename in results_filenames:
        lines = []
        with open(results_filename) as results_file:
            score = None
            for line in results_file:
                if line.startswith('score = '):
                    score = line[8:-2]
                elif line.startswith('GPModel'):
                    lines.append(line)
                elif (max_level is not None) and \
                        (len(re.findall('Level [0-9]+', line)) > 0):
                    level = int(line.split(' ')[2])
                    if level > max_level:
                        break
        result_tuples += [ff.repr_to_model(line.strip()) for line in lines]
    if score is not None:
        best_tuple = sorted(result_tuples,
                            key=lambda a_model: GPModel.score(a_model, score))[0]
    else:
        best_tuple = sorted(result_tuples, key=GPModel.score)[0]
    return best_tuple
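def _example_parse_results():
    # A minimal usage sketch (not in the original source): pick out the best
    # model recorded in the first three levels of a search. The file name is
    # hypothetical; the file must follow the 'score = ...' / 'GPModel...'
    # format written by perform_kernel_search below.
    best_model = parse_results(['results/01-airline_result.txt'], max_level=2)
    print best_model.pretty_print()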
def evaluate_models(models, X, y, verbose=True, iters=300, local_computation=False,
                    zip_files=False, max_jobs=500, random_seed=0, subset=False,
                    subset_size=250, full_iters=0, bundle_size=1):
    # Make data into matrices in case they're unidimensional.
    if X.ndim == 1:
        X = X[:, nax]
    if y.ndim == 1:
        y = y[:, nax]
    ndata = y.shape[0]

    # Create data file
    if verbose:
        print 'Creating data file locally'
    data_file = cblparallel.create_temp_file('.mat')
    scipy.io.savemat(data_file, {'X': X, 'y': y})

    # Move to fear if necessary
    if not local_computation:
        if verbose:
            print 'Moving data file to fear'
        cblparallel.copy_to_remote(data_file)

    # Create a list of MATLAB scripts to assess and optimise parameters for each kernel
    if verbose:
        print 'Creating scripts'
    scripts = [None] * len(models)
    for (i, model) in enumerate(models):
        parameters = {'datafile': data_file.split('/')[-1],
                      'writefile': '%(output_file)s',  # N.B. cblparallel manages output files
                      'gpml_path': cblparallel.gpml_path(local_computation),
                      'mean_syntax': model.mean.get_gpml_expression(dimensions=X.shape[1]),
                      'mean_params': '[ %s ]' % ' '.join(str(p) for p in model.mean.param_vector),
                      'kernel_syntax': model.kernel.get_gpml_expression(dimensions=X.shape[1]),
                      'kernel_params': '[ %s ]' % ' '.join(str(p) for p in model.kernel.param_vector),
                      'lik_syntax': model.likelihood.get_gpml_expression(dimensions=X.shape[1]),
                      'lik_params': '[ %s ]' % ' '.join(str(p) for p in model.likelihood.param_vector),
                      'inference': model.likelihood.gpml_inference_method,
                      'iters': str(iters),
                      'seed': str(np.random.randint(2**31)),
                      'subset': 'true' if subset else 'false',
                      'subset_size': str(subset_size),
                      'full_iters': str(full_iters)}
        scripts[i] = gpml.OPTIMIZE_KERNEL_CODE % parameters
        #### Need to be careful with % signs
        #### For the moment, cblparallel expects no single % signs - FIXME
        scripts[i] = re.sub('% ', '%% ', scripts[i])

    # Send to cblparallel and save output_files
    if verbose:
        print 'Sending scripts to cblparallel'
    if local_computation:
        output_files = cblparallel.run_batch_locally(scripts, language='matlab',
                                                     max_cpu=1.1, job_check_sleep=5,
                                                     submit_sleep=0.1, max_running_jobs=10,
                                                     verbose=verbose)
    else:
        output_files = cblparallel.run_batch_on_fear(scripts, language='matlab',
                                                     max_jobs=max_jobs, verbose=verbose,
                                                     zip_files=zip_files,
                                                     bundle_size=bundle_size)

    # Read in results
    results = [None] * len(models)
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Reading output file %d of %d' % (i + 1, len(models))
        results[i] = GPModel.from_matlab_output(gpml.read_outputs(output_file),
                                                models[i], ndata)

    # Tidy up local output files
    for (i, output_file) in enumerate(output_files):
        if verbose:
            print 'Removing output file %d of %d' % (i + 1, len(models))
        os.remove(output_file)

    # Remove temporary data file (perhaps on the cluster server)
    cblparallel.remove_temp_file(data_file, local_computation)

    # Return results i.e. list of ScoredKernel objects
    return results
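def _example_evaluate_models(models):
    # A minimal usage sketch (not in the original source): score a list of
    # ff.GPModel objects on a toy 1-D dataset, running the MATLAB jobs locally
    # rather than on the fear cluster. evaluate_models reshapes 1-D arrays
    # itself, so X and y can be passed as vectors.
    X = np.linspace(0.0, 10.0, 100)
    y = np.sin(X) + 0.1 * np.random.randn(100)
    return evaluate_models(models, X, y, verbose=True,
                           local_computation=True, iters=100)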
def perform_kernel_search(X, y, D, experiment_data_file_name, results_filename, exp):
    '''Search for the best kernel, in parallel on fear or local machine.'''
    # Initialise random seeds - randomness may be used in e.g. data subsetting
    utils.misc.set_all_random_seeds(exp.random_seed)

    # Create location, scale and minimum period parameters to pass around for initialisations
    data_shape = {}
    data_shape['x_mean'] = [np.mean(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['y_mean'] = np.mean(y)  # TODO - should this be modified for non real valued data
    data_shape['x_sd'] = np.log([np.std(X[:, dim]) for dim in range(X.shape[1])])
    data_shape['y_sd'] = np.log(np.std(y))  # TODO - should this be modified for non real valued data
    data_shape['y_min'] = np.min(y)
    data_shape['y_max'] = np.max(y)
    data_shape['x_min'] = [np.min(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['x_max'] = [np.max(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['x_min_abs_diff'] = np.log([utils.misc.min_abs_diff(X[:, i])
                                           for i in range(X.shape[1])])

    # Initialise period at a multiple of the shortest / average distance
    # between points, to prevent Nyquist problems.
    if exp.period_heuristic_type == 'none':
        data_shape['min_period'] = None
    elif exp.period_heuristic_type == 'min':
        data_shape['min_period'] = np.log([exp.period_heuristic * utils.misc.min_abs_diff(X[:, i])
                                           for i in range(X.shape[1])])
    elif exp.period_heuristic_type == 'average':
        data_shape['min_period'] = np.log([exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0]
                                           for i in range(X.shape[1])])
    elif exp.period_heuristic_type == 'both':
        data_shape['min_period'] = np.log([max(exp.period_heuristic * utils.misc.min_abs_diff(X[:, i]),
                                               exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0])
                                           for i in range(X.shape[1])])
    else:
        warnings.warn('Unrecognised period heuristic type: using most conservative heuristic')
        data_shape['min_period'] = np.log([max(exp.period_heuristic * utils.misc.min_abs_diff(X[:, i]),
                                               exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0])
                                           for i in range(X.shape[1])])

    data_shape['max_period'] = [np.log((1.0 / exp.max_period_heuristic) *
                                       (data_shape['x_max'][i] - data_shape['x_min'][i]))
                                for i in range(X.shape[1])]

    # Initialise mean, kernel and likelihood
    m = eval(exp.mean)
    k = eval(exp.kernel)
    l = eval(exp.lik)
    current_models = [ff.GPModel(mean=m, kernel=k, likelihood=l, ndata=y.size)]

    print '\n\nStarting search with this model:\n'
    print current_models[0].pretty_print()
    print ''

    # Perform the initial expansion
    current_models = grammar.expand_models(D=D, models=current_models,
                                           base_kernels=exp.base_kernels,
                                           rules=exp.search_operators)

    # Convert to additive form if desired
    if exp.additive_form:
        current_models = [model.additive_form() for model in current_models]
        current_models = ff.remove_duplicates(current_models)

    # Set up lists to record search
    all_results = []       # List of scored kernels
    results_sequence = []  # List of lists of results, indexed by level of expansion
    nan_sequence = []      # List of lists of nan-scored results
    oob_sequence = []      # List of lists of out-of-bounds results
    best_models = None

    # Other setup
    best_score = np.Inf

    # Perform search
    for depth in range(exp.max_depth):

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Add random restarts to kernels
        current_models = ff.add_random_restarts(current_models, exp.n_rand, exp.sd,
                                                data_shape=data_shape)

        # Print result of expansion
        if exp.debug:
            print '\nRandomly restarted kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Remove any redundancy introduced into kernel expressions
        current_models = [model.simplified() for model in current_models]

        # Print result of simplification
        if exp.debug:
            print '\nSimplified kernels\n'
            for model in current_models:
                print model.pretty_print()

        current_models = ff.remove_duplicates(current_models)

        # Print result of duplicate removal
        if exp.debug:
            print '\nDuplicate removed kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Add jitter to parameter values (empirically discovered to help the optimiser)
        current_models = ff.add_jitter(current_models, exp.jitter_sd)

        # Print result of jitter
        if exp.debug:
            print '\nJittered kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Add the previous best models - in case we just need to optimise more
        # rather than changing structure
        if best_models is not None:
            for a_model in best_models:
                current_models = (current_models + [a_model.copy()] +
                                  ff.add_jitter_to_models([a_model.copy() for dummy in range(exp.n_rand)],
                                                          exp.jitter_sd))

        # Randomise the order of the models to distribute computational load evenly
        np.random.shuffle(current_models)

        # Print current models
        if exp.debug:
            print '\nKernels to be evaluated\n'
            for model in current_models:
                print model.pretty_print()

        # Optimise parameters of and score the kernels
        new_results = jc.evaluate_models(current_models, X, y,
                                         verbose=exp.verbose,
                                         local_computation=exp.local_computation,
                                         zip_files=True, max_jobs=exp.max_jobs,
                                         iters=exp.iters, random_seed=exp.random_seed,
                                         subset=exp.subset, subset_size=exp.subset_size,
                                         full_iters=exp.full_iters,
                                         bundle_size=exp.bundle_size)

        # Remove models that were optimised to be out of bounds (this is similar
        # to a 0-1 prior). N.B. the out-of-bounds models must be collected
        # *before* new_results is filtered, otherwise oob_results is always empty.
        oob_results = [a_model for a_model in new_results
                       if a_model.out_of_bounds(data_shape)]
        new_results = [a_model for a_model in new_results
                       if not a_model.out_of_bounds(data_shape)]
        oob_results = sorted(oob_results,
                             key=lambda a_model: GPModel.score(a_model, exp.score),
                             reverse=True)
        oob_sequence.append(oob_results)

        # Some of the scores may have failed - remove nans to prevent sorting algorithms messing up
        (new_results, nan_results) = remove_nan_scored_models(new_results, exp.score)
        nan_sequence.append(nan_results)
        assert len(new_results) > 0  # FIXME - need correct control flow if this happens

        # Sort the new results, worst first, so that the best are printed last
        new_results = sorted(new_results,
                             key=lambda a_model: GPModel.score(a_model, exp.score),
                             reverse=True)

        print '\nAll new results\n'
        for result in new_results:
            print 'NLL=%0.1f' % result.nll, 'BIC=%0.1f' % result.bic, 'AIC=%0.1f' % result.aic, 'PL2=%0.3f' % result.pl2, result.pretty_print()

        all_results = all_results + new_results
        all_results = sorted(all_results,
                             key=lambda a_model: GPModel.score(a_model, exp.score),
                             reverse=True)
        results_sequence.append(all_results)

        # Extract the best k kernels from the new results
        best_results = sorted(new_results,
                              key=lambda a_model: GPModel.score(a_model, exp.score))[0:exp.k]
        # Remember this level's best models so that the 'previous best models'
        # step above can re-optimise them at the next level.
        best_models = best_results

        # Print best kernels
        if exp.debug:
            print '\nBest models\n'
            for model in best_results:
                print model.pretty_print()

        # Expand the best models
        current_models = grammar.expand_models(D=D, models=best_results,
                                               base_kernels=exp.base_kernels,
                                               rules=exp.search_operators)

        # Print expansion
        if exp.debug:
            print '\nExpanded models\n'
            for model in current_models:
                print model.pretty_print()

        # Convert to additive form if desired
        if exp.additive_form:
            current_models = [model.additive_form() for model in current_models]
            current_models = ff.remove_duplicates(current_models)

            # Print conversion
            if exp.debug:
                print '\nConverted into additive\n'
                for model in current_models:
                    print model.pretty_print()

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Write all_results to a temporary file at each level
        all_results = sorted(all_results,
                             key=lambda a_model: GPModel.score(a_model, exp.score),
                             reverse=True)
        with open(results_filename + '.unfinished', 'w') as outfile:
            outfile.write('Experiment all_results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, level_results) in enumerate(results_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                if exp.verbose_results:
                    for result in level_results:
                        print >> outfile, result
                else:
                    # Only print the top k kernels - i.e. those used to seed
                    # the next level of the search
                    for result in sorted(level_results,
                                         key=lambda a_model: GPModel.score(a_model, exp.score))[0:exp.k]:
                        print >> outfile, result

        # Write nan scored kernels to a log file
        with open(results_filename + '.nans', 'w') as outfile:
            outfile.write('Experiment nan results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, level_nan_results) in enumerate(nan_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in level_nan_results:
                    print >> outfile, result

        # Write out-of-bounds kernels to a log file
        with open(results_filename + '.oob', 'w') as outfile:
            outfile.write('Experiment oob results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, level_oob_results) in enumerate(oob_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in level_oob_results:
                    print >> outfile, result

        # Have we hit a stopping criterion?
        if 'no_improvement' in exp.stopping_criteria:
            new_best_score = min(GPModel.score(a_model, exp.score)
                                 for a_model in new_results)
            if new_best_score < best_score - exp.improvement_tolerance:
                best_score = new_best_score
            else:
                # Insufficient improvement
                print 'Insufficient improvement to score - stopping search'
                break

    # Rename temporary results file to actual results file
    os.rename(results_filename + '.unfinished', results_filename)
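def _example_perform_kernel_search(X, y, exp):
    # A minimal usage sketch (not in the original source). 'exp' is assumed to
    # be an experiment-description object carrying the fields referenced in
    # perform_kernel_search (mean, kernel, lik, base_kernels, search_operators,
    # max_depth, k, score, random_seed, etc.); both file paths are hypothetical.
    results_filename = 'results/01-airline_result.txt'
    perform_kernel_search(X, y, D=X.shape[1],
                          experiment_data_file_name='data/01-airline.mat',
                          results_filename=results_filename,
                          exp=exp)
    # The written results file can then be read back with parse_results above.
    return parse_results(results_filename)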