def _fit_ranker(self, xtrain, ytrain, xtest, ytest, next_point):
    """Fit the learner at one hyper-parameter point and score it on a split.

    Applies ``next_point`` to the learner via ``_set_new_parameters``, fits
    it on the training split, predicts the test split and computes the mean
    validation loss. If anything goes wrong during fitting/evaluation the
    error is logged and the worst loss (1.0) is reported so the optimizer
    steers away from this parameter point instead of crashing the search.

    Parameters
    ----------
    xtrain, ytrain : array-like
        Training split inputs and targets.
    xtest, ytest : array-like
        Held-out split inputs and targets.
    next_point : list
        Parameter vector proposed by the optimizer.

    Returns
    -------
    tuple of (float, float)
        The mean validation loss (1.0 on failure) and the time taken, as
        measured by ``duration_till_now``.
    """
    start = datetime.now()
    self._set_new_parameters(next_point)
    try:
        self.learner.fit(xtrain, ytrain, **self._fit_params)
        ypred = self.learner(xtest)
        loss = get_mean_loss(self.validation_loss, ytest, ypred)
        time_taken = duration_till_now(start)
    # Fix: the original bare ``except:`` also swallowed KeyboardInterrupt and
    # SystemExit, making the search impossible to interrupt cleanly. Catching
    # Exception keeps the best-effort behaviour for real errors only.
    except Exception:
        self.logger.error(traceback.format_exc())
        self.logger.info(
            "For current parameter error occurred so taking loss as maximum value"
        )
        # Worst-case loss: all validation metrics used here are in [0, 1].
        loss = 1.00
        time_taken = duration_till_now(start)
    return loss, time_taken
    # NOTE(review): this span is the interior of a larger routine — the `if`
    # matching the `else:` below (presumably testing whether the reader keeps
    # its settings in `kwargs`) lies outside this view; indentation of the
    # fragment is reconstructed and should be confirmed against the caller.
    dataset_reader.kwargs['n_objects'] = n_objects
else:
    dataset_reader.n_objects = n_objects
X_train, Y_train, X_test, Y_test = dataset_reader.get_single_train_test_split()
log_test_train_data(X_train, X_test, logger)
# X_train.shape[1:] supplies (n_objects, n_object_features) for the learner.
learner_params['n_objects'], learner_params['n_object_features'] = X_train.shape[1:]
if 'n_nests' in hp_ranges[learner_name].keys():
    # Bound the nest count search range by the number of objects per set.
    hp_ranges[learner_name]['n_nests'] = [2, np.max([3, int(n_objects / 2) + 1])]
# NOTE(review): `all` here shadows the builtin; presumably a dict mapping
# object-set size to a learner class/instance defined elsewhere — verify.
learner = all[n_objects]
hp_params = create_optimizer_parameters2(fit_params, hp_ranges, learner, learner_name, hash_file)
# One optimizer checkpoint file per object-set size.
hp_params['optimizer_path'] = optimizer_path + 'objects{}'.format(n_objects)
hp_params['random_state'] = random_state
hp_params['learning_problem'] = learning_problem
hp_params['validation_loss'] = lp_metric_dict[learning_problem].get(validation_loss, None)
time_taken = duration_till_now(start)
logger.info("Time Taken till now: {} milliseconds".format(seconds_to_time(time_taken)))
# Reserve a fixed 5-hour slice of the budget for out-of-sample evaluation.
time_eout_eval = get_duration_seconds('5H')
logger.info(
    "Time spared for the out of sample evaluation : {} ".format(seconds_to_time(time_eout_eval)))
total_duration = duration - time_taken - time_eout_eval
hp_fit_params['n_iter'] = 10
hp_fit_params['total_duration'] = total_duration
hp_fit_params['cv_iter'] = inner_cv
optimizer_model = ParameterOptimizer(**hp_params)
optimizer_model.fit(X_train, Y_train, **hp_fit_params)
batch_size = X_test.shape[0]
s_pred, y_pred = get_scores(optimizer_model, batch_size, X_test, logger)
metric_loss = categorical_accuracy_np(Y_test, y_pred)
logger.info(ERROR_OUTPUT_STRING.format("CategoricalAccuracy", str(np.mean(metric_loss)), n_objects))
def fit(self, X, Y, total_duration=600, n_iter=100, cv_iter=None, acq_func='gp_hedge', **kwargs):
    """Run Bayesian hyper-parameter search, then refit on the full data.

    Repeatedly asks the scikit-optimize optimizer (``self.opt``) for a
    parameter point, cross-validates the learner at that point via
    ``_fit_ranker``, reports the mean validation loss back with ``tell``,
    and finally refits ``self.model`` on the complete (X, Y) with the best
    point found. The optimizer state is dumped to ``self.optimizer_path``
    after each iteration and at the end.

    Parameters
    ----------
    X, Y : array-like or dict
        Training inputs/targets; when dicts, keys are object-set sizes and
        each value is split independently (folds are aligned by index).
    total_duration : int
        Overall time budget; iterations stop once it is exhausted.
        (Units are whatever ``duration_till_now`` returns — presumably
        seconds; TODO confirm against that helper.)
    n_iter : int
        Maximum number of optimizer iterations (may be reduced by
        ``set_optimizer``, e.g. when resuming from a saved optimizer).
    cv_iter : cross-validator or None
        Defaults to ``ShuffleSplit(n_splits=3, test_size=0.1)``.
    acq_func : str
        Acquisition function handed to the optimizer (e.g. ``'gp_hedge'``).
    **kwargs
        Forwarded to ``self.set_optimizer``.
    """
    start = datetime.now()

    def splitter(itr):
        # Yield (X_train, Y_train, X_test, Y_test) per fold for array data.
        for train_idx, test_idx in itr:
            yield X[train_idx], Y[train_idx], X[test_idx], Y[test_idx]

    def splitter_dict(itr_dict):
        # Yield aligned folds for dict-valued data: fold i is assembled
        # across every object-set size n_obj at once.
        n_splits = len(list(itr_dict.values())[0])
        for i in range(n_splits):
            X_train = dict()
            Y_train = dict()
            X_test = dict()
            Y_test = dict()
            for n_obj, itr in itr_dict.items():
                train_idx = itr[i][0]
                test_idx = itr[i][1]
                # np.copy so the folds do not alias the original arrays.
                X_train[n_obj] = np.copy(X[n_obj][train_idx])
                X_test[n_obj] = np.copy(X[n_obj][test_idx])
                Y_train[n_obj] = np.copy(Y[n_obj][train_idx])
                Y_test[n_obj] = np.copy(Y[n_obj][test_idx])
            yield X_train, Y_train, X_test, Y_test

    if cv_iter is None:
        cv_iter = ShuffleSplit(n_splits=3, test_size=0.1, random_state=self.random_state)
    if isinstance(X, dict):
        splits = dict()
        for n_obj, arr in X.items():
            if arr.shape[0] == 1:
                # A single sample cannot be split: train and test on it.
                splits[n_obj] = [([0], [0]) for i in range(cv_iter.n_splits)]
            else:
                splits[n_obj] = list(cv_iter.split(arr))
    else:
        splits = list(cv_iter.split(X))
    # Pre-compute splits for reuse
    # Here we fix a random seed for all simulations to correlate the random
    # streams:
    seed = self.random_state.randint(2**32, dtype='uint32')
    self.logger.debug(
        'Random seed for the ranking algorithm: {}'.format(seed))
    opt_seed = self.random_state.randint(2**32, dtype='uint32')
    self.logger.debug('Random seed for the optimizer: {}'.format(opt_seed))
    gp_seed = self.random_state.randint(2**32, dtype='uint32')
    self.logger.debug(
        'Random seed for the GP surrogate: {}'.format(gp_seed))
    # set_optimizer may shrink n_iter (e.g. when resuming a saved run).
    n_iter = self.set_optimizer(n_iter, opt_seed, acq_func, gp_seed, **kwargs)
    self._callbacks_set_optimizer(self.opt)
    self._callbacks_on_optimization_begin()
    # Charge the setup time already spent against the overall budget.
    time_taken = duration_till_now(start)
    total_duration -= time_taken
    max_fit_duration = -np.inf
    self.logger.info('Time left for {} iterations is {}'.format(
        n_iter, seconds_to_time(total_duration)))
    try:
        for t in range(n_iter):
            if total_duration <= 0:
                break
            # Restart the clock per iteration; only per-iteration cost is
            # subtracted from total_duration below.
            start = datetime.now()
            self._callbacks_on_iteration_begin(t)
            self.logger.info(
                'Starting optimization iteration: {}'.format(t))
            if t > 0:
                self.log_best_params()
            next_point = self.opt.ask()
            self.logger.info('Next parameters:\n{}'.format(next_point))
            results = []
            running_times = []
            if isinstance(X, dict):
                for X_train, Y_train, X_test, Y_test in splitter_dict(
                        splits):
                    result, time_taken = self._fit_ranker(
                        X_train, Y_train, X_test, Y_test, next_point)
                    running_times.append(time_taken)
                    results.append(result)
            else:
                for X_train, Y_train, X_test, Y_test in splitter(splits):
                    result, time_taken = self._fit_ranker(
                        X_train, Y_train, X_test, Y_test, next_point)
                    running_times.append(time_taken)
                    results.append(result)
            results = np.array(results)
            running_times = np.array(running_times)
            mean_result = np.mean(results)
            mean_fitting_duration = np.mean(running_times)
            # Storing the maximum time to run the splitting model and adding the time for out of sample evaluation
            if max_fit_duration < np.sum(running_times):
                max_fit_duration = np.sum(running_times)
            self.logger.info(
                'Validation error for the parameters is {:.4f}'.format(
                    mean_result))
            self.logger.info('Time taken for the parameters is {}'.format(
                seconds_to_time(np.sum(running_times))))
            # With the "ps" acquisition variants the optimizer expects the
            # objective AND its evaluation time; otherwise just the loss.
            if "ps" in self.opt.acq_func:
                self.opt.tell(next_point, [mean_result, mean_fitting_duration])
            else:
                self.opt.tell(next_point, mean_result)
            self._callbacks_on_iteration_end(t)
            self.logger.info(
                "Main optimizer iterations done {} and saving the model".
                format(np.array(self.opt.yi).shape[0]))
            # Checkpoint the optimizer state every iteration.
            dump(self.opt, self.optimizer_path)
            time_taken = duration_till_now(start)
            total_duration -= time_taken
            self.logger.info('Time left for simulations is {} '.format(
                seconds_to_time(total_duration)))
            # Delete Tensorflow graph, to prevent memory leaks:
            K.clear_session()
            sess = tf.Session()
            K.set_session(sess)
            # Stop early if the remaining budget cannot cover even the most
            # expensive parameter evaluation observed so far.
            if (total_duration - max_fit_duration) < 0:
                self.logger.info(
                    'Maximum time required by model to validate parameter values {}'
                    .format(seconds_to_time(max_fit_duration)))
                self.logger.info(
                    'At iteration {} simulation stops, due to time deficiency'
                    .format(t))
                break
    except KeyboardInterrupt:
        # User interruption: keep whatever has been learned so far.
        self.logger.debug(
            'Optimizer interrupted saving the model at {}'.format(
                self.optimizer_path))
        self.log_best_params()
    else:
        self.logger.debug(
            'Finally, fit a model on the complete training set and storing the model at {}'
            .format(self.optimizer_path))
    finally:
        # Fresh TF session for the final full-data fit below.
        K.clear_session()
        sess = tf.Session()
        K.set_session(sess)
        self._callbacks_on_optimization_end()
    # self._fit_params["epochs"] = np.min([self._fit_params.get("epochs", 500) * 2, 1000])
    if "ps" in self.opt.acq_func:
        # yi rows are (loss, time) pairs here; minimize by loss (column 0).
        best_point = self.opt.Xi[np.argmin(
            np.array(self.opt.yi)[:, 0])]
    else:
        best_point = self.opt.Xi[np.argmin(self.opt.yi)]
    self._set_new_parameters(best_point)
    # Shallow copy: keeps the tuned learner configuration as self.model and
    # refits it on the complete training data.
    self.model = copy.copy(self.learner)
    self.model.fit(X, Y, **self._fit_params)
    if np.array(self.opt.yi).shape[0] != 0:
        dump(self.opt, self.optimizer_path)