def __init__(self, pgenerator, project_name, experiment_name):
    """Keep only the parameter sets produced by `pgenerator` that are not
    yet recorded as jobs in the experiment's JobsDB."""
    jobs_db = JobsDB(project_name, experiment_name)
    self.pgenerator = pgenerator
    # A candidate is "new" when the DB lookup finds no job for it.
    self.params = [candidate for candidate in pgenerator
                   if jobs_db.search_by_param(candidate) is None]
def __init__(self, project_name, experiment_name, param_space, param_types=None):
    """Load all completed trials for the experiment once and unpack their
    losses and (possibly label-encoded) parameter values into plain lists."""
    self.project_name = project_name
    self.experiment_name = experiment_name
    # Open the MongoDB-backed job store for this experiment.
    self.jobs = JobsDB(project_name, experiment_name,
                       param_space=param_space, param_types=param_types)
    # Completed jobs are consumed several times below, so materialise them
    # once (TODO: catch OOM for very large experiments).
    self.completed = list(self.jobs.get_completed_jobs())
    # Parameter names drive the per-parameter value lists below.
    self.param_names = self.jobs.get_param_names()
    self.losses = []
    self.param_values = {}
    for trial in self.completed:
        self.losses.append(trial['loss'])
        for name in self.param_names:
            value = trial['params'][name]  # raw value used for this param
            encoder = self.jobs.param_value_encoder[name]
            # Categorical (string) parameters are label-encoded to numbers
            # so they can be correlated/plotted against the loss.
            if encoder:
                value = encoder.transform(value)
            # Create the per-parameter list on first sight, then append.
            self.param_values.setdefault(name, []).append(value)
def __init__(self, pgenerator, project_name, experiment_name, n_best=0.1,
             n_samples=None, param_space=None):
    """Rank candidate parameter sets with Gradient-Boosting surrogate models
    trained on already-completed jobs, and keep the most promising ones.

    n_best: if >= 1, the absolute number of candidates to keep; otherwise
            the fraction of the candidate list to keep.
    """
    jobsDB = JobsDB(project_name, experiment_name)
    param_values = jobsDB.get_param_values(encode_labels=True)
    # BUG FIX: `losses` was referenced below but never defined (NameError).
    # Use the losses of the completed jobs as training targets.
    # NOTE(review): assumes get_param_values() rows align with
    # get_completed_jobs() order — confirm against JobsDB.
    losses = [job['loss'] for job in jobsDB.get_completed_jobs()]
    df = pd.DataFrame(param_values)

    # Three surrogate models: 5%/95% quantile bounds plus a least-squares
    # point estimate. Shared hyper-parameters declared once.
    common = dict(n_estimators=500, learning_rate=0.1, max_depth=3,
                  max_features=None, min_samples_leaf=9, min_samples_split=9)
    gb_up = GB(loss='quantile', alpha=0.95, **common)
    gb_up.fit(df, losses)
    gb_dn = GB(loss='quantile', alpha=0.05, **common)
    gb_dn.fit(df, losses)
    gb = GB(loss='ls', **common)
    gb.fit(df, losses)

    # `pgenerator` may be a generator factory (a function) or an iterable
    # of parameter dicts.
    if hasattr(pgenerator, "__name__"):
        self.pgenerator = pgenerator(param_space, n_samples)
    else:
        self.pgenerator = pgenerator
    trial_params_list = list(self.pgenerator)
    trial_params_df = pd.DataFrame(trial_params_list)

    # The point estimate and upper bound are currently unused; kept for
    # inspection. Ranking uses the optimistic (5% quantile) lower bound.
    predicted_loss = gb.predict(trial_params_df)
    predicted_loss_up = gb_up.predict(trial_params_df)
    predicted_loss_dn = gb_dn.predict(trial_params_df)
    idx = np.argsort(predicted_loss_dn)
    sorted_trial_params = np.array(trial_params_list)[idx]

    if n_best >= 1:  # absolute number of candidates
        thresh_idx = int(np.round(n_best))
    else:            # fraction of the candidate list
        thresh_idx = int(np.round(n_best * len(trial_params_list)))
    # BUG FIX: previously sliced the UNSORTED candidate list, which ignored
    # the surrogate-model ranking computed just above.
    self.params = list(sorted_trial_params[:thresh_idx])
class Worker:
    """Worker process: repeatedly pulls queued jobs from the experiment's
    JobsDB, evaluates them with the supplied objective, and reports the
    resulting loss back to the database."""

    def __init__(self, project_name, experiment_name, clf, X, y, objective,
                 host='localhost', port=27017, check_every=1, loop_forever=True):
        self.jobsDB = JobsDB(project_name, experiment_name, host, port)
        self.n_trials = -1  # -1 => loop forever
        if not loop_forever:
            # Multi-experiment mode: compute everything currently queued in
            # this experiment, then exit so the next experiment can run.
            # NOTE(review): Cursor.count() is deprecated/removed in modern
            # pymongo — confirm what get_queued_jobs() actually returns.
            self.n_trials = self.jobsDB.get_queued_jobs().count()
        self.clf = clf    # estimator
        self.X = X        # X features
        self.y = y        # y labels
        self.objective = objective
        self.check_every = check_every  # delay in seconds between checking for jobs

    def start_worker(self):
        """Run compute() n_trials times, or forever when n_trials == -1."""
        if self.n_trials > -1:
            # Consistency fix: use the module `logger` like every other
            # method in this class (was `logging.info`).
            logger.info('Worker will close after the {} jobs in this experiment.'.format(self.n_trials))
            for _ in range(self.n_trials):
                self.compute()
            # print some stats
            self.jobsDB.print_job_stats()
        else:
            while True:
                self.compute()

    def get_next_params(self):
        """Block until a queued job is available, then return it."""
        job = None
        while job is None:
            job = self.jobsDB.get_next_job_from_queue()
            if job is not None:
                logger.info(job)
            else:
                logger.info('No queued job. Waiting {}s for new jobs...'.format(self.check_every))
                time.sleep(self.check_every)
        return job

    @staticmethod
    def _coerce_text_params(clf_params):
        """On Python 2, convert `unicode` parameter values to `str` in place
        (some estimators reject unicode kwargs). On Python 3 every str is
        already unicode, so this is a no-op. Returns the same dict.

        BUG FIX: the original `isinstance(x, unicode)` raised NameError on
        Python 3, where the name `unicode` does not exist.
        """
        try:
            text_type = unicode  # noqa: F821 — exists on Python 2 only
        except NameError:
            text_type = None     # Python 3
        if text_type is not None:
            for p in clf_params:
                if isinstance(clf_params[p], text_type):
                    clf_params[p] = str(clf_params[p])
        return clf_params

    def compute(self):
        """Fetch one job, evaluate it, and report mean/std of the scores."""
        job = self.get_next_params()
        clf_params = self._coerce_text_params(job['params'])
        scores = self.objective(self.clf, clf_params, self.X, self.y)
        logger.debug("scores from objective: {}".format(scores))
        loss = np.mean(scores)
        std = np.std(scores)
        # then report these results back in the db...
        aux_data = {'loss': loss, 'std': std}
        self.jobsDB.report_job_completion(job['_id'], loss, aux_data=aux_data)
def __init__(self, project_name, experiment_name, clf, X, y, objective,
             host='localhost', port=27017, check_every=1, loop_forever=True):
    """Connect to the experiment's job store and record everything the
    worker needs to evaluate queued jobs."""
    self.jobsDB = JobsDB(project_name, experiment_name, host, port)
    # -1 means "poll forever"; otherwise stop after the jobs currently
    # queued in this experiment (multi-experiment mode: exit so the next
    # experiment can run).
    self.n_trials = -1 if loop_forever else self.jobsDB.get_queued_jobs().count()
    self.clf = clf                  # estimator under evaluation
    self.X = X                      # feature matrix
    self.y = y                      # labels
    self.objective = objective      # callable that scores (clf, params, X, y)
    self.check_every = check_every  # seconds between queue polls
class Plotting:
    """Visualisation helpers for an experiment: running-best loss curves,
    loss-vs-parameter scatter plots and 2-d parameter heatmaps, all built
    from the completed jobs stored in the experiment's JobsDB.

    Cleanup: removed a large block of commented-out legacy code (the old
    hand-rolled LabelEncoder bookkeeping superseded by
    jobs.param_value_encoder)."""

    def __init__(self, project_name, experiment_name, param_space, param_types=None):
        """Cache the completed jobs once and unpack losses and parameter
        values into plain lists keyed by parameter name."""
        self.project_name = project_name
        self.experiment_name = experiment_name
        # set-up connection to mongodb...
        self.jobs = JobsDB(project_name, experiment_name,
                           param_space=param_space, param_types=param_types)
        # we'll use the results multiple times, so cache it (TODO: catch OOM)
        self.completed = list(self.jobs.get_completed_jobs())
        # set up the parameter-names for the plotters...
        self.param_names = self.jobs.get_param_names()
        self.losses = []
        self.param_values = {}
        for trials in self.completed:
            self.losses.append(trials['loss'])
            for pname in self.param_names:
                pvalues = trials['params'][pname]  # the raw value used for this param
                # Categorical (string) parameters are label-encoded so they
                # can be correlated/plotted against the loss.
                # NOTE(review): transform() is called on a scalar here —
                # confirm JobsDB's encoder accepts scalar input.
                if self.jobs.param_value_encoder[pname]:
                    pvalues = self.jobs.param_value_encoder[pname].transform(pvalues)
                # setdefault(): initialise an empty list on first sight of
                # pname, then just append each value.
                self.param_values.setdefault(pname, []).append(pvalues)

    def mkfilename(self, prefix, extension="png"):
        """Return '<prefix>_<project>-<experiment>.<extension>'."""
        return "{}_{}-{}.{}".format(prefix, self.project_name, self.experiment_name, extension)

    def plot_loss_vs_time(self):
        """Plot the running best (minimum) loss over trials and save it."""
        best_loss = np.inf
        loss_log = []
        for trial in self.completed:
            trial_loss = float(trial['loss'])
            if trial_loss < best_loss:
                best_loss = trial_loss
            loss_log.append(best_loss)
        # NOTE(review): `sns.plt` was removed in modern seaborn; this relies
        # on an old seaborn that re-exported matplotlib.pyplot.
        sns.plt.plot(loss_log)
        sns.plt.savefig(self.mkfilename('loss_vs_time'))
        sns.plt.close()

    def get_best_two_params(self):
        """Return (up to) the two parameter names whose values correlate
        most strongly, by absolute Pearson r, with the loss."""
        param_names = self.jobs.get_param_names()
        if len(param_names) == 2:
            return param_names  # there can be only two.
        # how much does each parameter correlate with the achieved loss...
        param_losscorr = {}
        for name in self.param_names:
            corr_coef, pval = pearsonr(self.losses, self.param_values[name])
            logging.info('Correlation of {} with loss: {}'.format(name, corr_coef))
            param_losscorr[name] = abs(corr_coef)  # abs, since we don't care about the direction
        sorted_by_corr = sorted(param_losscorr.items(), key=lambda x: x[1], reverse=True)
        best_params = []
        for i in sorted_by_corr:
            # constant parameters produce NaN correlations — skip them
            if math.isnan(i[1]):
                continue
            best_params.append(i[0])
            if len(best_params) == 2:
                return best_params
        return best_params

    # TODO: could be made more general/robust
    def plot_heatmap(self, param1, param2):
        '''Plot the loss as a 2-d heatmap over two parameters.

        Duplicate (param1, param2) pairs are collapsed to their minimum
        loss first, since pivot() requires unique index/column pairs.
        '''
        dat = []
        for i in range(len(self.losses)):
            dat.append([self.param_values[param1][i], self.param_values[param2][i], self.losses[i]])
        logging.info('Plotting 2d heatmap, {} and {} vs loss'.format(param1, param2))
        # must not have duplicates before doing a pivot
        df = pd.DataFrame(dat, columns=[param1, param2, 'loss'])
        df = df.groupby([param1, param2], as_index=False).min()
        # NOTE(review): positional pivot() arguments were removed in
        # pandas >= 2.0 — use index=/columns=/values= when upgrading.
        df = df.pivot(param1, param2, 'loss')
        sns.heatmap(df)

    def plot_loss_vs_param(self, params=None):
        """Plot loss against one parameter (scatter) or two (heatmap), then
        save the figure. With params=None, the parameters best correlated
        with the loss are chosen automatically."""
        if params is None:
            best_params = self.get_best_two_params()
        else:
            if isinstance(params, list):
                best_params = params[-2:]
            else:
                best_params = [params]  # just one param
        fig = sns.plt.figure()
        if len(best_params) == 1:
            logging.info("Plotting loss vs. {}".format(best_params[0]))
            ax = fig.add_subplot(111)
            ax.set_ylabel('Log loss')
            ax.set_xlabel(best_params[0])
            if self.jobs.param_value_encoder[best_params[0]]:  ### !!!
                # NOTE(review): assumes a binary categorical parameter —
                # only two tick labels are placed.
                label_names = self.jobs.param_value_encoder[best_params[0]].classes_
                sns.plt.xticks([0, 1], [label_names[0], label_names[1]])
            ax.scatter(self.param_values[best_params[0]], self.losses)
        if len(best_params) == 2:
            self.plot_heatmap(best_params[0], best_params[1])
        plot_filename = "loss_vs_params"
        for param in best_params:
            plot_filename += "-{}".format(param)
        sns.plt.savefig(self.mkfilename(plot_filename))
        sns.plt.close()