Code Example #1
    def __init__(self, pgenerator, project_name, experiment_name):
        jobsDB = JobsDB(project_name, experiment_name)
        self.params = []
        self.pgenerator = pgenerator
        # Iterate over every parameter set that pgenerator yields and
        # keep only those not already recorded in JobsDB.
        for params in pgenerator:
            if jobsDB.search_by_param(params) is None:
                self.params.append(params)
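
A minimal sketch of how this de-duplicating wrapper behaves. The FilteredParams class name and the in-memory StubJobsDB are hypothetical stand-ins for the snippet above and for optomatic's real JobsDB; only the filtering pattern itself is taken from the code.

class StubJobsDB:
    """Hypothetical in-memory stand-in for optomatic's JobsDB."""
    def __init__(self, already_run):
        self.already_run = already_run

    def search_by_param(self, params):
        # mimic JobsDB.search_by_param: return a hit for known params, else None
        return params if params in self.already_run else None

class FilteredParams:
    """Same filtering logic as the __init__ above, minus the DB setup."""
    def __init__(self, pgenerator, jobsDB):
        self.params = [p for p in pgenerator
                       if jobsDB.search_by_param(p) is None]

db = StubJobsDB(already_run=[{'C': 1.0}])
fp = FilteredParams(iter([{'C': 1.0}, {'C': 10.0}]), db)
print(fp.params)  # -> [{'C': 10.0}], the only params not yet in the DB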
Code Example #2
    def __init__(self, pgenerator, project_name, experiment_name, n_best=0.1, n_samples=None, param_space=None):
        jobsDB = JobsDB(project_name, experiment_name)
        param_values = jobsDB.get_param_values(encode_labels=True)
        df = pd.DataFrame(param_values)
        # NOTE: `losses` (one recorded loss per completed job) is the
        # regression target below; this snippet does not show where it is
        # loaded from JobsDB.

        # upper 95% prediction bound, via the quantile loss...
        gb_up = GB(n_estimators=500, learning_rate=0.1,
                   loss='quantile', alpha=0.95,
                   max_depth=3, max_features=None,
                   min_samples_leaf=9, min_samples_split=9)

        gb_up.fit(df, losses)

        # ...and the lower 5% bound
        gb_dn = GB(n_estimators=500, learning_rate=0.1,
                   loss='quantile', alpha=0.05,
                   max_depth=3, max_features=None,
                   min_samples_leaf=9, min_samples_split=9)

        gb_dn.fit(df, losses)

        # mean prediction, via the least-squares loss
        gb = GB(n_estimators=500, learning_rate=0.1,
                loss='ls',
                max_depth=3, max_features=None,
                min_samples_leaf=9, min_samples_split=9)

        gb.fit(df, losses)

        # pgenerator may be a generator factory (a class or function) or an
        # already-constructed iterable of parameter sets
        if hasattr(pgenerator, "__name__"):
            self.pgenerator = pgenerator(param_space, n_samples)
        else:
            self.pgenerator = pgenerator

        trial_params_list = list(self.pgenerator)
        trial_params_df = pd.DataFrame( trial_params_list )
        predicted_loss = gb.predict(trial_params_df)
        predicted_loss_up = gb_up.predict(trial_params_df)
        predicted_loss_dn = gb_dn.predict(trial_params_df)
        # rank the candidates optimistically, by lower-bound predicted loss
        idx = np.argsort(predicted_loss_dn)
        sorted_trial_params = np.array(trial_params_list)[idx]

        if n_best >= 1: # n_best is an absolute count
            thresh_idx = int(np.round(n_best))
        else: # n_best is a fraction of the candidate list
            thresh_idx = int( np.round(n_best * len(trial_params_list)) )
        self.params = list(sorted_trial_params[:thresh_idx])
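
To make the ranking step above concrete, here is a self-contained sketch on synthetic data: fit a lower-quantile model and keep the candidates whose optimistic (5th-percentile) predicted loss is smallest. All names are local to the sketch; note that newer scikit-learn spells the least-squares loss 'squared_error' instead of 'ls'.

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.RandomState(0)
df = pd.DataFrame({'C': rng.uniform(0.1, 10.0, 200)})
losses = (df['C'] - 3.0) ** 2 + rng.normal(0.0, 0.5, 200)  # synthetic losses

kw = dict(n_estimators=500, learning_rate=0.1, max_depth=3,
          min_samples_leaf=9, min_samples_split=9)
gb_dn = GradientBoostingRegressor(loss='quantile', alpha=0.05, **kw)
gb_dn.fit(df, losses)  # lower 5% bound, analogous to gb_dn above

candidates = pd.DataFrame({'C': rng.uniform(0.1, 10.0, 50)})
lower_bound = gb_dn.predict(candidates)  # optimistic loss per candidate
best5 = candidates.iloc[np.argsort(lower_bound)[:5]]
print(best5)  # the five most promising candidates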
Code Example #3
File: worker.py Project: neverspill/optomatic
class Worker:

    def __init__(self, project_name, experiment_name, clf, X, y, objective, 
                   host='localhost', port=27017, check_every=1, loop_forever=True):
        self.jobsDB = JobsDB(project_name, experiment_name, host, port)

        self.n_trials = -1 # loop forever
        if not loop_forever:
            # We're probably running in multi-experiment mode, so once
            # we've computed everything in this experiment we exit to
            # let the next one run.
            self.n_trials = self.jobsDB.get_queued_jobs().count()

        self.clf = clf # estimator
        self.X = X # X features
        self.y = y # y labels
        self.objective = objective
        self.check_every = check_every # delay in seconds between checking for jobs

    def start_worker(self):
        if self.n_trials > -1:
            logger.info('Worker will close after the {} jobs in this experiment.'.format(self.n_trials))
            for i in range(self.n_trials):
                self.compute()
            # print some stats
            self.jobsDB.print_job_stats()

        else:
            while True:
                self.compute()

    def get_next_params(self):
        job = None
        while job is None:
            job = self.jobsDB.get_next_job_from_queue()
            if job is not None:
                logger.info(job)
            else:
                logger.info('No queued job. Waiting {}s for new jobs...'.format(self.check_every))
                time.sleep(self.check_every)
        return job

    def compute(self):
        job = self.get_next_params()
        clf_params = job['params']
        for p in clf_params:
            # `unicode` only exists on Python 2; this coercion won't be
            # necessary once Python 3 is ubiquitous...
            if isinstance( clf_params[p], unicode ):
                clf_params[p] = str(clf_params[p])

        scores = self.objective(self.clf, clf_params, self.X, self.y)
        logger.debug("scores from objective: {}".format(scores))

        loss = np.mean(scores)
        std = np.std(scores)

        # then report these results back in the db...
        aux_data = {'loss': loss, 'std': std}
        self.jobsDB.report_job_completion(job['_id'], loss, aux_data=aux_data)
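
A hypothetical driver for this Worker. The cross-validation objective, the project/experiment names, and passing the estimator class itself as clf are all assumptions; a MongoDB instance with queued jobs is also required on localhost.

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

def objective(clf, params, X, y):
    # Worker.compute() averages whatever this returns into the reported
    # loss, so return per-fold error rates (lower is better).
    return 1.0 - cross_val_score(clf(**params), X, y, cv=3)

X, y = load_iris(return_X_y=True)
worker = Worker('myproject', 'svc-experiment', SVC, X, y, objective,
                loop_forever=False)
worker.start_worker()  # drain this experiment's queue, print stats, exit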
Code Example #4
File: plotting.py Project: neverspill/optomatic
class Plotting:

    def __init__(self, project_name, experiment_name, param_space, param_types=None):
        self.project_name = project_name
        self.experiment_name = experiment_name

        # set-up connection to mongodb...
        self.jobs = JobsDB(project_name, experiment_name, param_space=param_space, param_types=param_types)

        # we'll use the results multiple times, so cache it (TODO: catch OOM)
        self.completed = list(self.jobs.get_completed_jobs())

        # set up the parameter-names for the plotters...
        self.param_names = self.jobs.get_param_names()

        self.losses = []
        self.param_values = {}
        for trials in self.completed:
            self.losses.append( trials['loss'] )

            for pname in self.param_names:
                pvalues = trials['params'][pname] # the raw value used for this param
                # check whether this parameter's values need label-encoding...
                if self.jobs.param_value_encoder[pname]:
                    pvalues = self.jobs.param_value_encoder[pname].transform( pvalues )
                # look up key=pname in param_values; if missing, initialise it
                # to an empty list, then append pvalue.
                self.param_values.setdefault(pname, []).append( pvalues )
        
        # # some parameters are words, like 'linear' or 'rbf', we can't
        # # find the correlation coefficient between a word and a loss (float),
        # # so we'll use LabelEncoder() from sklearn to map these parameters
        # # to numeric integer values...
        # self.param_value_encoder = {}
        # for name in self.param_names:
        #     self.param_value_encoder[name] = False    

        # self.param_values = {}
        # for name in self.param_names:
        #     self.param_values[name] = []

        # for trials in self.completed: # foreach row
        #     for name in self.param_names: # foreach column
        #         self.param_values[name].append( trials['params'][name] )
        #         # this seems like overkill but it makes sure there are no
        #         # non-numeric parameters that go unnoticed...
        #         if isinstance( trials['params'][name], unicode ):
        #             self.param_value_encoder[name] = LabelEncoder()

        # logging.debug(self.param_value_encoder)
        # for name in self.param_value_encoder:
        #     if self.param_value_encoder[name]:
        #         logging.info('Using LabelEncoder on param: {}'.format(name))
        #         self.param_values[name] = self.param_value_encoder[name].fit_transform(self.param_values[name])

    def mkfilename(self, prefix, extension="png"):
        return "{}_{}-{}.{}".format(prefix, self.project_name, self.experiment_name, extension)

    def plot_loss_vs_time(self):
        best_loss = np.inf
        loss_log = []
        for trial in self.completed:
            trial_loss = float( trial['loss'] )
            if trial_loss < best_loss:
                best_loss = trial_loss
            loss_log.append(best_loss)
        
        sns.plt.plot(loss_log)
        sns.plt.savefig( self.mkfilename('loss_vs_time') )
        sns.plt.close()

    def get_best_two_params(self):
        param_names = self.jobs.get_param_names()
        if len(param_names) == 2:
            return param_names # there can be only two.

        # how much does each parameter correlate with the achieved loss...
        param_losscorr = {}
        for name in self.param_names:
            corr_coef, pval = pearsonr( self.losses, self.param_values[name] )
            logging.info('Correlation of {} with loss: {}'.format(name, corr_coef))
            param_losscorr[name] = abs(corr_coef) # abs, since we don't care about the direction

        sorted_by_corr = sorted(param_losscorr.items(), key=lambda x:x[1], reverse=True)
        best_params  = []
        for i in sorted_by_corr:
            if math.isnan( i[1] ): continue
            best_params.append(i[0])
            if len(best_params) == 2: return best_params
        return best_params
        #return sorted_by_corr[0][0], sorted_by_corr[1][0] # TODO: could be made more general/robust

    def plot_heatmap(self, param1, param2):
        '''
        Method for plotting the loss vs two parameters.
        '''

        dat = []
        for i in range(len(self.losses)):
            dat.append( [self.param_values[param1][i],
                         self.param_values[param2][i],
                         self.losses[i]] )
        logging.info('Plotting 2d heatmap, {} and {} vs loss'.format(param1, param2))
        # must not have duplicates before doing a pivot
        df = pd.DataFrame( dat, columns=[param1, param2, 'loss'] )
        df = df.groupby( [param1, param2], as_index=False ).min()
        df = df.pivot( param1, param2, 'loss' )
        sns.heatmap(df)

    def plot_loss_vs_param(self, params=None):

        if params is None:
            best_params = self.get_best_two_params()
        else:
            if isinstance(params, list):
                best_params = params[-2:]
            else:
                best_params = [params] # just one param

        fig = sns.plt.figure()

        if len(best_params) == 1:
            logging.info("Plotting loss vs. {}".format(best_params[0]))
            ax = fig.add_subplot(111)
            ax.set_ylabel('Log loss')
            ax.set_xlabel( best_params[0] )

            if self.jobs.param_value_encoder[best_params[0]]: # param was label-encoded
                label_names = self.jobs.param_value_encoder[best_params[0]].classes_
                sns.plt.xticks( [0, 1], [label_names[0], label_names[1]] ) # show the original labels
            ax.scatter( self.param_values[best_params[0]], self.losses )

        if len(best_params) == 2:
            self.plot_heatmap(best_params[0], best_params[1])

        plot_filename = "loss_vs_params"
        for param in best_params:
            plot_filename += "-{}".format(param)

        sns.plt.savefig( self.mkfilename(plot_filename) )
        sns.plt.close()
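
Finally, a hypothetical driver for the Plotting class. The param_space below is invented, and the experiment must already have completed jobs recorded in MongoDB.

param_space = {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']}
plots = Plotting('myproject', 'svc-experiment', param_space)
plots.plot_loss_vs_time()   # writes loss_vs_time_myproject-svc-experiment.png
plots.plot_loss_vs_param()  # heatmap of the two most loss-correlated params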