def test_expand_model(self):
    print 'expand model'
    print '2d'
    k = ff.SqExpKernel(dimension=0, lengthscale=0, sf=0)
    m = ff.GPModel(mean=ff.MeanZero(), kernel=k, likelihood=ff.LikGauss())
    expanded = grammar.expand_models(2, [m], base_kernels='SE', rules=None)
    for expanded_model in expanded:
        print '\n', expanded_model.pretty_print(), '\n'
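# Hedged usage sketch (not part of the original test suite). It assumes the same ff and
# grammar modules used above, and only calls that already appear in this file
# (SqExpKernel, GPModel, expand_models, remove_duplicates, pretty_print). It shows how
# a one-dimensional expansion can be deduplicated before inspection or scoring.
def example_expand_and_deduplicate():
    k = ff.SqExpKernel(dimension=0, lengthscale=0, sf=0)
    m = ff.GPModel(mean=ff.MeanZero(), kernel=k, likelihood=ff.LikGauss())
    # Expand a single SE model over one input dimension using the default rules
    expanded = grammar.expand_models(1, [m], base_kernels='SE', rules=None)
    # Remove structurally identical expressions before they are scored
    unique_models = ff.remove_duplicates(expanded)
    print 'Expanded to %d models, %d after duplicate removal' % (len(expanded), len(unique_models))
    for model in unique_models:
        print model.pretty_print()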
def perform_kernel_search(X, y, D, experiment_data_file_name, results_filename, exp):
    '''Search for the best kernel, in parallel on fear or local machine.'''
    # Initialise random seeds - randomness may be used in e.g. data subsetting
    utils.misc.set_all_random_seeds(exp.random_seed)

    # Create location, scale and minimum period parameters to pass around for initialisations
    data_shape = {}
    data_shape['x_mean'] = [np.mean(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['y_mean'] = np.mean(y)  #### TODO - should this be modified for non real valued data
    data_shape['x_sd'] = np.log([np.std(X[:, dim]) for dim in range(X.shape[1])])
    data_shape['y_sd'] = np.log(np.std(y))  #### TODO - should this be modified for non real valued data
    data_shape['y_min'] = np.min(y)
    data_shape['y_max'] = np.max(y)
    data_shape['x_min'] = [np.min(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['x_max'] = [np.max(X[:, dim]) for dim in range(X.shape[1])]

    # Initialise period at a multiple of the shortest / average distance between points, to prevent Nyquist problems.
    if exp.period_heuristic_type == 'none':
        data_shape['min_period'] = None
    elif exp.period_heuristic_type == 'min':
        data_shape['min_period'] = np.log([exp.period_heuristic * utils.misc.min_abs_diff(X[:, i])
                                           for i in range(X.shape[1])])
    elif exp.period_heuristic_type == 'average':
        data_shape['min_period'] = np.log([exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0]
                                           for i in range(X.shape[1])])
    elif exp.period_heuristic_type == 'both':
        data_shape['min_period'] = np.log([max(exp.period_heuristic * utils.misc.min_abs_diff(X[:, i]),
                                               exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0])
                                           for i in range(X.shape[1])])
    else:
        warnings.warn('Unrecognised period heuristic type : using most conservative heuristic')
        data_shape['min_period'] = np.log([max(exp.period_heuristic * utils.misc.min_abs_diff(X[:, i]),
                                               exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0])
                                           for i in range(X.shape[1])])

    data_shape['max_period'] = [np.log((1.0 / exp.max_period_heuristic) *
                                       (data_shape['x_max'][i] - data_shape['x_min'][i]))
                                for i in range(X.shape[1])]

    # Initialise mean, kernel and likelihood
    m = eval(exp.mean)
    k = eval(exp.kernel)
    l = eval(exp.lik)
    current_models = [ff.GPModel(mean=m, kernel=k, likelihood=l, ndata=y.size)]

    print '\n\nStarting search with this model:\n'
    print current_models[0].pretty_print()
    print ''

    # Perform the initial expansion
    current_models = grammar.expand_models(D=D, models=current_models,
                                           base_kernels=exp.base_kernels, rules=exp.search_operators)

    # Convert to additive form if desired
    if exp.additive_form:
        current_models = [model.additive_form() for model in current_models]
        current_models = ff.remove_duplicates(current_models)

    # Set up lists to record search
    all_results = []       # List of scored kernels
    results_sequence = []  # List of lists of results, indexed by level of expansion
    nan_sequence = []      # List of lists of nan scored results
    oob_sequence = []      # List of lists of out of bounds results
    best_models = None

    # Other setup
    best_score = np.Inf

    # Perform search
    for depth in range(exp.max_depth):

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Add random restarts to kernels
        current_models = ff.add_random_restarts(current_models, exp.n_rand, exp.sd, data_shape=data_shape)

        # Print result of expansion
        if exp.debug:
            print '\nRandomly restarted kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Remove any redundancy introduced into kernel expressions
        current_models = [model.simplified() for model in current_models]

        # Print result of simplification
        if exp.debug:
            print '\nSimplified kernels\n'
            for model in current_models:
                print model.pretty_print()

        current_models = ff.remove_duplicates(current_models)

        # Print result of duplicate removal
        if exp.debug:
            print '\nDuplicate removed kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Add jitter to parameter values (empirically discovered to help optimiser)
        current_models = ff.add_jitter(current_models, exp.jitter_sd)

        # Print result of jitter
        if exp.debug:
            print '\nJittered kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Add the previous best models - in case we just need to optimise more rather than changing structure
        if best_models is not None:
            for a_model in best_models:
                current_models = current_models + [a_model.copy()] + \
                    ff.add_jitter_to_models([a_model.copy() for dummy in range(exp.n_rand)], exp.jitter_sd)

        # Randomise the order of the models to distribute computational load evenly
        np.random.shuffle(current_models)

        # Print current models
        if exp.debug:
            print '\nKernels to be evaluated\n'
            for model in current_models:
                print model.pretty_print()

        # Optimise parameters of and score the kernels
        new_results = jc.evaluate_models(current_models, X, y, verbose=exp.verbose,
                                         local_computation=exp.local_computation, zip_files=True,
                                         max_jobs=exp.max_jobs, iters=exp.iters, random_seed=exp.random_seed,
                                         subset=exp.subset, subset_size=exp.subset_size,
                                         full_iters=exp.full_iters, bundle_size=exp.bundle_size)

        # Remove models that were optimised to be out of bounds (this is similar to a 0-1 prior)
        # Note: collect the out of bounds models before new_results is overwritten with the filtered list
        oob_results = [a_model for a_model in new_results if a_model.out_of_bounds(data_shape)]
        new_results = [a_model for a_model in new_results if not a_model.out_of_bounds(data_shape)]
        oob_results = sorted(oob_results, key=lambda a_model: GPModel.score(a_model, exp.score), reverse=True)
        oob_sequence.append(oob_results)

        # Some of the scores may have failed - remove nans to prevent sorting algorithms messing up
        (new_results, nan_results) = remove_nan_scored_models(new_results, exp.score)
        nan_sequence.append(nan_results)
        assert len(new_results) > 0  # FIXME - Need correct control flow if this happens

        # Sort the new results
        new_results = sorted(new_results, key=lambda a_model: GPModel.score(a_model, exp.score), reverse=True)

        print '\nAll new results\n'
        for result in new_results:
            print 'NLL=%0.1f' % result.nll, 'BIC=%0.1f' % result.bic, 'AIC=%0.1f' % result.aic, 'PL2=%0.3f' % result.pl2, result.pretty_print()

        all_results = all_results + new_results
        all_results = sorted(all_results, key=lambda a_model: GPModel.score(a_model, exp.score), reverse=True)

        results_sequence.append(all_results)

        # Extract the best k kernels from the new results
        best_results = sorted(new_results, key=lambda a_model: GPModel.score(a_model, exp.score))[0:exp.k]

        # Print best kernels
        if exp.debug:
            print '\nBest models\n'
            for model in best_results:
                print model.pretty_print()

        # Expand the best models
        current_models = grammar.expand_models(D=D, models=best_results,
                                               base_kernels=exp.base_kernels, rules=exp.search_operators)

        # Print expansion
        if exp.debug:
            print '\nExpanded models\n'
            for model in current_models:
                print model.pretty_print()

        # Convert to additive form if desired
        if exp.additive_form:
            current_models = [model.additive_form() for model in current_models]
            current_models = ff.remove_duplicates(current_models)

            # Print conversion to additive form
            if exp.debug:
                print '\nConverted into additive\n'
                for model in current_models:
                    print model.pretty_print()

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Write all_results to a temporary file at each level
        all_results = sorted(all_results, key=lambda a_model: GPModel.score(a_model, exp.score), reverse=True)
        with open(results_filename + '.unfinished', 'w') as outfile:
            outfile.write('Experiment all_results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, level_results) in enumerate(results_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                if exp.verbose_results:
                    for result in level_results:
                        print >> outfile, result
                else:
                    # Only print top k kernels - i.e. those used to seed the next level of the search
                    for result in sorted(level_results, key=lambda a_model: GPModel.score(a_model, exp.score))[0:exp.k]:
                        print >> outfile, result

        # Write nan scored kernels to a log file
        with open(results_filename + '.nans', 'w') as outfile:
            outfile.write('Experiment nan results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, level_nan_results) in enumerate(nan_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in level_nan_results:
                    print >> outfile, result

        # Write oob kernels to a log file
        with open(results_filename + '.oob', 'w') as outfile:
            outfile.write('Experiment oob results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, level_oob_results) in enumerate(oob_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in level_oob_results:
                    print >> outfile, result

        # Have we hit a stopping criterion?
        if 'no_improvement' in exp.stopping_criteria:
            new_best_score = min(GPModel.score(a_model, exp.score) for a_model in new_results)
            if new_best_score < best_score - exp.improvement_tolerance:
                best_score = new_best_score
            else:
                # Insufficient improvement
                print 'Insufficient improvement to score - stopping search'
                break

    # Rename temporary results file to actual results file
    os.rename(results_filename + '.unfinished', results_filename)
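# Hedged sketch of the minimum-period heuristic used when building data_shape above, shown
# in isolation. 'min_abs_diff' is a stand-in for utils.misc.min_abs_diff (assumed to return
# the smallest absolute gap between sorted values); the heuristic names and the log-scale
# convention follow the data_shape construction in perform_kernel_search.
import numpy as np  # matches the np alias used throughout this module


def min_abs_diff(x):
    # Smallest absolute difference between neighbouring sorted points
    return np.min(np.abs(np.diff(np.sort(x))))


def min_period_heuristic(X, period_heuristic, heuristic_type='both'):
    '''Return the log minimum period per dimension, guarding against Nyquist problems.'''
    per_dim = []
    for i in range(X.shape[1]):
        shortest = period_heuristic * min_abs_diff(X[:, i])         # 'min' heuristic
        average = period_heuristic * np.ptp(X[:, i]) / X.shape[0]   # 'average' heuristic
        if heuristic_type == 'min':
            per_dim.append(shortest)
        elif heuristic_type == 'average':
            per_dim.append(average)
        else:  # 'both' - the more conservative of the two
            per_dim.append(max(shortest, average))
    return np.log(per_dim)

# Example: X = np.linspace(0, 10, 101)[:, None]; min_period_heuristic(X, 5) is approximately log(0.5)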
def perform_kernel_search(X, y, D, experiment_data_file_name, results_filename, exp):
    '''Search for the best kernel, in parallel on fear or local machine.'''
    # Initialise random seeds - randomness may be used in e.g. data subsetting
    utils.misc.set_all_random_seeds(exp.random_seed)

    # Create location, scale and minimum period parameters to pass around for initialisations
    data_shape = {}
    data_shape['x_mean'] = [np.mean(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['y_mean'] = np.mean(y)  #### TODO - should this be modified for non real valued data
    data_shape['x_sd'] = np.log([np.std(X[:, dim]) for dim in range(X.shape[1])])
    data_shape['y_sd'] = np.log(np.std(y))  #### TODO - should this be modified for non real valued data
    data_shape['y_min'] = np.min(y)
    data_shape['y_max'] = np.max(y)
    data_shape['x_min'] = [np.min(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['x_max'] = [np.max(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['x_min_abs_diff'] = np.log([utils.misc.min_abs_diff(X[:, i]) for i in range(X.shape[1])])

    # Initialise period at a multiple of the shortest / average distance between points, to prevent Nyquist problems.
    if exp.period_heuristic_type == 'none':
        data_shape['min_period'] = None
    elif exp.period_heuristic_type == 'min':
        data_shape['min_period'] = np.log([exp.period_heuristic * utils.misc.min_abs_diff(X[:, i])
                                           for i in range(X.shape[1])])
    elif exp.period_heuristic_type == 'average':
        data_shape['min_period'] = np.log([exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0]
                                           for i in range(X.shape[1])])
    elif exp.period_heuristic_type == 'both':
        data_shape['min_period'] = np.log([max(exp.period_heuristic * utils.misc.min_abs_diff(X[:, i]),
                                               exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0])
                                           for i in range(X.shape[1])])
    else:
        warnings.warn('Unrecognised period heuristic type : using most conservative heuristic')
        data_shape['min_period'] = np.log([max(exp.period_heuristic * utils.misc.min_abs_diff(X[:, i]),
                                               exp.period_heuristic * np.ptp(X[:, i]) / X.shape[0])
                                           for i in range(X.shape[1])])

    data_shape['max_period'] = [np.log((1.0 / exp.max_period_heuristic) *
                                       (data_shape['x_max'][i] - data_shape['x_min'][i]))
                                for i in range(X.shape[1])]

    # Initialise mean, kernel and likelihood
    m = eval(exp.mean)
    k = eval(exp.kernel)
    l = eval(exp.lik)
    current_models = [ff.GPModel(mean=m, kernel=k, likelihood=l, ndata=y.size)]

    print '\n\nStarting search with this model:\n'
    print current_models[0].pretty_print()
    print ''

    # Perform the initial expansion
    current_models = grammar.expand_models(D=D, models=current_models,
                                           base_kernels=exp.base_kernels, rules=exp.search_operators)

    # Convert to additive form if desired
    if exp.additive_form:
        current_models = [model.additive_form() for model in current_models]
        current_models = ff.remove_duplicates(current_models)

    # Set up lists to record search
    all_results = []       # List of scored kernels
    results_sequence = []  # List of lists of results, indexed by level of expansion
    nan_sequence = []      # List of lists of nan scored results
    oob_sequence = []      # List of lists of out of bounds results
    best_models = None

    # Other setup
    best_score = np.Inf

    # Perform search
    for depth in range(exp.max_depth):

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Add random restarts to kernels
        current_models = ff.add_random_restarts(current_models, exp.n_rand, exp.sd, data_shape=data_shape)

        # Print result of expansion
        if exp.debug:
            print '\nRandomly restarted kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Remove any redundancy introduced into kernel expressions
        current_models = [model.simplified() for model in current_models]

        # Print result of simplification
        if exp.debug:
            print '\nSimplified kernels\n'
            for model in current_models:
                print model.pretty_print()

        current_models = ff.remove_duplicates(current_models)

        # Print result of duplicate removal
        if exp.debug:
            print '\nDuplicate removed kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Add jitter to parameter values (empirically discovered to help optimiser)
        current_models = ff.add_jitter(current_models, exp.jitter_sd)

        # Print result of jitter
        if exp.debug:
            print '\nJittered kernels\n'
            for model in current_models:
                print model.pretty_print()

        # Add the previous best models - in case we just need to optimise more rather than changing structure
        if best_models is not None:
            for a_model in best_models:
                current_models = current_models + [a_model.copy()] + \
                    ff.add_jitter_to_models([a_model.copy() for dummy in range(exp.n_rand)], exp.jitter_sd)

        # Randomise the order of the models to distribute computational load evenly
        np.random.shuffle(current_models)

        # Print current models
        if exp.debug:
            print '\nKernels to be evaluated\n'
            for model in current_models:
                print model.pretty_print()

        # Optimise parameters of and score the kernels
        new_results = jc.my_evaluate_models(current_models, X, y, verbose=exp.verbose,
                                            local_computation=exp.local_computation, zip_files=True,
                                            max_jobs=exp.max_jobs, iters=exp.iters, random_seed=exp.random_seed,
                                            subset=exp.subset, subset_size=exp.subset_size,
                                            full_iters=exp.full_iters, bundle_size=exp.bundle_size)

        # Remove models that were optimised to be out of bounds (this is similar to a 0-1 prior)
        # Note: collect the out of bounds models before new_results is overwritten with the filtered list
        oob_results = [a_model for a_model in new_results if a_model.out_of_bounds(data_shape)]
        new_results = [a_model for a_model in new_results if not a_model.out_of_bounds(data_shape)]
        oob_results = sorted(oob_results, key=lambda a_model: GPModel.score(a_model, exp.score), reverse=True)
        oob_sequence.append(oob_results)

        # Some of the scores may have failed - remove nans to prevent sorting algorithms messing up
        (new_results, nan_results) = remove_nan_scored_models(new_results, exp.score)
        nan_sequence.append(nan_results)
        assert len(new_results) > 0  # FIXME - Need correct control flow if this happens

        # Sort the new results
        new_results = sorted(new_results, key=lambda a_model: GPModel.score(a_model, exp.score), reverse=True)

        print '\nAll new results\n'
        for result in new_results:
            print 'NLL=%0.1f' % result.nll, 'BIC=%0.1f' % result.bic, 'AIC=%0.1f' % result.aic, 'PL2=%0.3f' % result.pl2, result.pretty_print()

        all_results = all_results + new_results
        all_results = sorted(all_results, key=lambda a_model: GPModel.score(a_model, exp.score), reverse=True)

        results_sequence.append(all_results)

        # Extract the best k kernels from the new results
        best_results = sorted(new_results, key=lambda a_model: GPModel.score(a_model, exp.score))[0:exp.k]

        # Print best kernels
        if exp.debug:
            print '\nBest models\n'
            for model in best_results:
                print model.pretty_print()

        # Expand the best models
        current_models = grammar.expand_models(D=D, models=best_results,
                                               base_kernels=exp.base_kernels, rules=exp.search_operators)

        # Print expansion
        if exp.debug:
            print '\nExpanded models\n'
            for model in current_models:
                print model.pretty_print()

        # Convert to additive form if desired
        if exp.additive_form:
            current_models = [model.additive_form() for model in current_models]
            current_models = ff.remove_duplicates(current_models)

            # Print conversion to additive form
            if exp.debug:
                print '\nConverted into additive\n'
                for model in current_models:
                    print model.pretty_print()

        # Reduce number of kernels when in debug mode
        if exp.debug:
            current_models = current_models[0:4]

        # Write all_results to a temporary file at each level
        all_results = sorted(all_results, key=lambda a_model: GPModel.score(a_model, exp.score), reverse=True)
        with open(results_filename + '.unfinished', 'w') as outfile:
            outfile.write('Experiment all_results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (level, level_results) in enumerate(results_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % level)
                if exp.verbose_results:
                    for result in level_results:
                        print >> outfile, result
                else:
                    # Only print top k kernels - i.e. those used to seed the next level of the search
                    i = 0
                    for result in sorted(level_results, key=lambda a_model: GPModel.score(a_model, exp.score))[0:exp.k]:
                        print >> outfile, result
                        # Also save the GPML result of each top kernel to a .mat file
                        scipy.io.savemat(results_filename + 'lvl_' + str(depth) + '_' + str(i) + '.mat1',
                                         result.gpml_result)
                        i += 1

        # Write nan scored kernels to a log file
        with open(results_filename + '.nans', 'w') as outfile:
            outfile.write('Experiment nan results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, level_nan_results) in enumerate(nan_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in level_nan_results:
                    print >> outfile, result

        # Write oob kernels to a log file
        with open(results_filename + '.oob', 'w') as outfile:
            outfile.write('Experiment oob results for\n datafile = %s\n\n %s \n\n'
                          % (experiment_data_file_name, experiment_fields_to_str(exp)))
            for (i, level_oob_results) in enumerate(oob_sequence):
                outfile.write('\n%%%%%%%%%% Level %d %%%%%%%%%%\n\n' % i)
                for result in level_oob_results:
                    print >> outfile, result

        # Have we hit a stopping criterion?
        if 'no_improvement' in exp.stopping_criteria:
            new_best_score = min(GPModel.score(a_model, exp.score) for a_model in new_results)
            if new_best_score < best_score - exp.improvement_tolerance:
                best_score = new_best_score
            else:
                # Insufficient improvement
                print 'Insufficient improvement to score - stopping search'
                break

    # Rename temporary results file to actual results file
    os.rename(results_filename + '.unfinished', results_filename)
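# Hedged illustration of the out-of-bounds filtering order used in the search loops above:
# the out of bounds models must be collected before new_results is overwritten with the
# filtered list, otherwise the .oob log is always empty. 'partition_out_of_bounds' is a
# hypothetical helper, not part of the original code; it assumes the out_of_bounds(data_shape)
# method already used on scored models in this file.
def partition_out_of_bounds(models, data_shape):
    '''Split models into (in_bounds, out_of_bounds) without losing either list.'''
    out_of_bounds = [m for m in models if m.out_of_bounds(data_shape)]
    in_bounds = [m for m in models if not m.out_of_bounds(data_shape)]
    return in_bounds, out_of_bounds

# Usage inside the search loop would then be:
#     new_results, oob_results = partition_out_of_bounds(new_results, data_shape)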
def perform_kernel_search(X, Y, exp):
    """Search for the best kernel"""
    # Initialise random seeds - randomness may be used in e.g. data subsetting
    utils.misc.set_all_random_seeds(exp['random_seed'])

    # Create location, scale and minimum period parameters to pass around for parameter initialisations
    data_shape = dict()
    data_shape['x_mean'] = [np.mean(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['y_mean'] = np.mean(Y)  # TODO - need to rethink this for non real valued data
    data_shape['x_sd'] = log([np.std(X[:, dim]) for dim in range(X.shape[1])])
    data_shape['y_sd'] = log(np.std(Y))  # TODO - need to rethink this for non real valued data
    data_shape['y_min'] = np.min(Y)
    data_shape['y_max'] = np.max(Y)
    data_shape['x_min'] = [np.min(X[:, dim]) for dim in range(X.shape[1])]
    data_shape['x_max'] = [np.max(X[:, dim]) for dim in range(X.shape[1])]

    # Initialise period at a multiple of the shortest / average distance between points, to prevent Nyquist problems.
    # This is ultimately a little hacky and is avoiding more fundamental decisions
    if exp['period_heuristic_type'] == 'none':
        data_shape['min_period'] = None
    elif exp['period_heuristic_type'] == 'min':
        data_shape['min_period'] = log([exp['period_heuristic'] * utils.misc.min_abs_diff(X[:, i])
                                        for i in range(X.shape[1])])
    elif exp['period_heuristic_type'] == 'average':
        data_shape['min_period'] = log([exp['period_heuristic'] * np.ptp(X[:, i]) / X.shape[0]
                                        for i in range(X.shape[1])])
    elif exp['period_heuristic_type'] == 'both':
        data_shape['min_period'] = log([max(exp['period_heuristic'] * utils.misc.min_abs_diff(X[:, i]),
                                            exp['period_heuristic'] * np.ptp(X[:, i]) / X.shape[0])
                                        for i in range(X.shape[1])])
    else:
        warnings.warn('Unrecognised period heuristic type : using most conservative heuristic')
        data_shape['min_period'] = log([max(exp['period_heuristic'] * utils.misc.min_abs_diff(X[:, i]),
                                            exp['period_heuristic'] * np.ptp(X[:, i]) / X.shape[0])
                                        for i in range(X.shape[1])])

    data_shape['max_period'] = [log((1.0 / exp['max_period_heuristic']) *
                                    (data_shape['x_max'][i] - data_shape['x_min'][i]))
                                for i in range(X.shape[1])]

    # Initialise mean, kernel and likelihood
    m = eval(exp['mean'])
    k = eval(exp['kernel'])
    l = eval(exp['lik'])
    current_models = [gpm.GPModel(mean=m, kernel=k, likelihood=l, ndata=Y.size)]

    print('\n\nStarting search with this model:\n')
    print(current_models[0].pretty_print())
    print('')

    # Perform the initial expansion
    # current_models = grammar.expand_models(D=X.shape[1],
    #                                        models=current_models,
    #                                        base_kernels=exp['base_kernels'],
    #                                        rules=exp['search_operators'])

    # Convert to additive form if desired
    if exp['additive_form']:
        current_models = [model.additive_form() for model in current_models]
        current_models = gpm.remove_duplicates(current_models)

    # Setup lists etc to record search and current state
    all_results = []       # List of scored kernels
    results_sequence = []  # List of lists of results, indexed by level of expansion
    nan_sequence = []      # List of lists of nan scored results
    oob_sequence = []      # List of lists of out of bounds results
    best_models = None
    best_score = np.Inf

    # Setup multiprocessing pool
    processing_pool = Pool(processes=exp['n_processes'], maxtasksperchild=exp['max_tasks_per_process'])
    try:
        # Perform search
        for depth in range(exp['max_depth']):

            # If debug reduce number of models for fast evaluation
            if exp['debug']:
                current_models = current_models[0:4]

            # Add random restarts to kernels
            current_models = gpm.add_random_restarts(current_models, exp['n_rand'], exp['sd'], data_shape=data_shape)

            # Print result of expansion if debugging
            if exp['debug']:
                print('\nRandomly restarted kernels\n')
                for model in current_models:
                    print(model.pretty_print())

            # Remove any redundancy introduced into kernel expressions
            current_models = [model.simplified() for model in current_models]

            # Print result of simplification
            if exp['debug']:
                print('\nSimplified kernels\n')
                for model in current_models:
                    print(model.pretty_print())

            # Remove duplicate kernels
            current_models = gpm.remove_duplicates(current_models)

            # Print result of duplicate removal
            if exp['debug']:
                print('\nDuplicate removed kernels\n')
                for model in current_models:
                    print(model.pretty_print())

            # Add jitter to parameter values (helps sticky optimisers)
            current_models = gpm.add_jitter(current_models, exp['jitter_sd'])

            # Print result of jitter
            if exp['debug']:
                print('\nJittered kernels\n')
                for model in current_models:
                    print(model.pretty_print())

            # Add the previous best models - in case we just need to optimise more rather than changing structure
            if best_models is not None:
                for a_model in best_models:
                    # noinspection PyUnusedLocal
                    current_models = current_models + [a_model.copy()] + \
                        gpm.add_jitter([a_model.copy() for dummy in range(exp['n_rand'])], exp['jitter_sd'])

            # Randomise the order of the models to distribute computational load evenly if running on cluster
            np.random.shuffle(current_models)

            # Print current models
            if exp['debug']:
                print('\nKernels to be evaluated\n')
                for model in current_models:
                    print(model.pretty_print())

            if exp['strategy'] == 'vanilla':
                subset_n = X.shape[0]  # No subset
            elif exp['strategy'] == 'subset':
                subset_n = min(exp['starting_subset'], X.shape[0])

            while subset_n <= X.shape[0]:
                # Subset data
                X_subset = X[:subset_n]
                Y_subset = Y[:subset_n]

                # Use multiprocessing pool to optimise models
                kwargs = dict(inference='exact', messages=exp['verbose'], max_iters=exp['iters'])
                new_results = processing_pool.map(optimise_single_model,
                                                  ((model, X_subset, Y_subset, kwargs) for model in current_models))

                # Remove models that were optimised to be out of bounds (this is similar to a 0-1 prior)
                # TODO - put priors on hyperparameters
                # Note: collect the out of bounds models before new_results is overwritten with the filtered list
                oob_results = [a_model for a_model in new_results if a_model.out_of_bounds(data_shape)]
                new_results = [a_model for a_model in new_results if not a_model.out_of_bounds(data_shape)]
                oob_results = sorted(oob_results, key=lambda a_model: GPModel.score(a_model, exp['score']), reverse=True)
                oob_sequence.append(oob_results)

                # Some of the scores may have failed - remove nans to prevent sorting algorithms messing up
                (new_results, nan_results) = remove_nan_scored_models(new_results, exp['score'])
                nan_sequence.append(nan_results)
                assert len(new_results) > 0  # FIXME - Need correct control flow if this happens

                # Sort the new results
                new_results = sorted(new_results, key=lambda a_model: GPModel.score(a_model, exp['score']), reverse=True)

                # Keep only the top models
                if exp['strategy'] == 'subset':
                    new_results = new_results[int(np.floor(len(new_results) * exp['subset_pruning'])):]

                # Current = new
                current_models = new_results

                # Double the subset size, or exit loop if finished
                if subset_n == X.shape[0]:
                    break
                else:
                    subset_n = min(subset_n * 2, X.shape[0])

            # Update user
            print('\nAll new results\n')
            for model in new_results:
                print('BIC=%0.1f' % model.bic,
                      # 'NLL=%0.1f' % model.nll,
                      # 'AIC=%0.1f' % model.aic,
                      # 'PL2=%0.3f' % model.pl2,
                      model.pretty_print())

            all_results = all_results + new_results
            all_results = sorted(all_results, key=lambda a_model: GPModel.score(a_model, exp['score']), reverse=True)

            results_sequence.append(all_results)

            # Extract the best k kernels from the new results
            best_results = sorted(new_results, key=lambda a_model: GPModel.score(a_model, exp['score']))[0:exp['k']]

            # Print best kernels if debugging
            if exp['debug']:
                print('\nBest models\n')
                for model in best_results:
                    print(model.pretty_print())

            # Expand the best models
            current_models = grammar.expand_models(D=X.shape[1],
                                                   models=best_results,
                                                   base_kernels=exp['base_kernels'],
                                                   rules=exp['search_operators'])

            # Print expansion if debugging
            if exp['debug']:
                print('\nExpanded models\n')
                for model in current_models:
                    print(model.pretty_print())

            # Convert to additive form if desired
            if exp['additive_form']:
                current_models = [model.additive_form() for model in current_models]
                current_models = gpm.remove_duplicates(current_models)

                # Print conversion to additive form
                if exp['debug']:
                    print('\nConverted into additive\n')
                    for model in current_models:
                        print(model.pretty_print())

            # Reduce number of kernels when in debug mode
            if exp['debug']:
                current_models = current_models[0:4]

            # Have we hit a stopping criterion?
            if 'no_improvement' in exp['stopping_criteria']:
                new_best_score = min(GPModel.score(a_model, exp['score']) for a_model in new_results)
                if new_best_score < best_score - exp['improvement_tolerance']:
                    best_score = new_best_score
                else:
                    # Insufficient improvement
                    print('Insufficient improvement to score - stopping search')
                    break
    finally:
        processing_pool.close()
        processing_pool.join()

    return all_results
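# Hedged sketch of the subset-doubling schedule used by the 'subset' strategy above: start
# from a small prefix of the data, double it after each round of optimisation, and always
# finish with one pass over the full data set. The helper name and stand-alone form are
# illustrative only; the loop logic mirrors the while loop in perform_kernel_search.
def subset_schedule(n_data, starting_subset):
    '''Return the successive subset sizes visited by the doubling loop.'''
    sizes = []
    subset_n = min(starting_subset, n_data)
    while subset_n <= n_data:
        sizes.append(subset_n)
        if subset_n == n_data:
            break
        subset_n = min(subset_n * 2, n_data)
    return sizes

# Example: subset_schedule(1000, 100) -> [100, 200, 400, 800, 1000]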