def test_early_stop_no_progress_loss():
    """fmin should halt after 10 stagnant evaluations, not run all 500."""
    # Seed the history with a point far outside the search space so the very
    # first loss is never beaten and the no-progress counter ticks every eval.
    history = generate_trials_to_calculate([{'x': -100}])
    identity = lambda x: x
    fmin(fn=identity,
         space=hp.uniform("x", -5, 5),
         algo=rand.suggest,
         max_evals=500,
         trials=history,
         early_stop_fn=no_progress_loss(10))
    assert len(history) == 10
def _find_optimal_model(train_ds, val_ds, test_ds, data_props, examples):
    """Run a TPE hyper-parameter search over an XGBoost-style space.

    Returns a tuple ``(best_result, best_params)`` where ``best_result`` is a
    DataFrame of the winning trial's results (reformatted for the current
    iteration step) and ``best_params`` is a Series of its parameters.
    Note: ``examples`` is accepted for interface compatibility but unused here.
    """
    space = {
        'backlooking_period': hp.choice('backlooking_period', [1, 2, 3, 4]),
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        # hp.quniform would hand max_depth over as a float, so draw it from an
        # explicit integer range via hp.choice instead.
        'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        'eval_metric': 'mae',
        # Increase this number if you have more cores. Otherwise, remove it
        # and it will default to the maximum number.
        'nthread': None,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 0,
        'seed': 42,
    }
    # Piggy-back the datasets and the iteration step on the search space so
    # the objective receives them alongside the sampled hyper-parameters.
    space['train'] = train_ds
    space['val'] = val_ds
    space['test'] = test_ds
    space['iter_step'] = data_props['iter_step']

    trials = Trials()
    fmin(_optimize_obj,
         space,
         algo=tpe.suggest,
         trials=trials,
         early_stop_fn=no_progress_loss(iteration_stop_count=25,
                                        percent_increase=0.025),
         max_evals=100)

    winner = trials.best_trial['result']
    best_result = _reformat_DF(pd.DataFrame(winner['results']),
                               data_props['iter_step'])
    best_params = pd.Series(winner['params'], name=data_props['iter_step'])
    return best_result, best_params
def _early_stop_fn() -> Any:
    """Build the early-stop callback handed to hyperopt's ``fmin``.

    Combines the configured no-progress-loss criterion with an optional
    wall-clock timeout. When the timeout option is <= 0, the plain
    no-progress-loss callback is returned unchanged.

    Returns:
        A callable with hyperopt's early-stop signature
        ``(trials, best_loss=None, iteration_no_progress=0) -> (bool, meta)``.
    """
    no_progress_loss_fn = no_progress_loss(
        int(_get_option_value(*_opt_no_progress_loss)))
    timeout = int(_get_option_value(*_opt_timeout))
    if timeout <= 0:
        return no_progress_loss_fn

    # Set base time for budget mechanism
    start_time = time.time()

    def timeout_fn(trials, best_loss=None, iteration_no_progress=0):  # type: ignore
        # Fix: the local result variable used to be named `no_progress_loss`,
        # shadowing the imported hyperopt helper of the same name inside this
        # closure. Renamed so the outer name stays unambiguous.
        should_stop, meta = no_progress_loss_fn(
            trials, best_loss, iteration_no_progress)
        timed_out = time.time() - start_time > timeout
        return should_stop or timed_out, meta

    return timeout_fn
def start_opt(self):
    """Launch the hyperopt search for this pipeline's parameter space."""
    # objective to optimize, pre-bound to the pipeline, data and metric
    objective = partial(self.objective,
                        pipeline=self.pipeline.get_model(),
                        X_train=self.data.get_Xy()['X'],
                        y_train=self.data.get_Xy()['y'],
                        metric=self.metric)
    best = fmin(
        fn=objective,
        # hyper-parameter search space
        space=self.params.get_opt_space(),
        # search algorithm
        algo=tpe.suggest,
        # iteration budget (a wall-clock budget could also be given)
        max_evals=250,
        # where the search history is stored
        trials=self.trials,
        # random state
        rstate=np.random.RandomState(1),
        # early stop
        early_stop_fn=no_progress_loss(**self.early_stop),
        # progressbar
        show_progressbar=True
    )
def _add_conf(func, trial):
    """Extra ``fmin`` configuration: stop after 50 iterations w/o progress.

    ``func`` and ``trial`` are accepted for interface compatibility only.
    """
    return {'early_stop_fn': no_progress_loss(50)}
def main_run_linear_models(train_ds, val_ds, test_ds, data_props,
                           max_backlooking=None, layer_type='dense',
                           activation_funcs=['sigmoid', 'relu', 'tanh'],
                           max_serach_iterations=200, NN_max_depth=3,
                           MAX_EPOCHS=800, patience=25, model_name='linear',
                           examples=None, return_permutation_importances=True,
                           redo_serach_best_model=False):
    """Hyperopt-search, train and document dense/linear Keras models.

    Searches (or reloads from a JSON cache) the best architecture for the
    given datasets, logs every candidate to MLflow, reloads the winning
    model and optionally computes example predictions, coefficients,
    p-values and permutation feature importances.

    Args:
        train_ds / val_ds / test_ds: tf.data datasets (project format).
        data_props: dict describing the dataset pipeline — assumes the
            'first_step' / 'second_step' / 'third_filter' / 'iter_step' /
            'look_ups' schema used throughout this module.
        max_backlooking: max input window to search; defaults to the
            dataset's own 'window_input_width'.
        layer_type: 'dense' flattens the input window, anything else keeps it.
        activation_funcs: list default is kept deliberately — the hp
            transformation below dispatches on `type(value) == list`.
        max_serach_iterations: [sic] hyperopt eval budget; early stop kicks in
            after a quarter of it without progress.
        redo_serach_best_model: [sic] force a fresh search even if cached.

    Returns:
        dict with the best model's params/metrics plus optional
        'examples_pred_y', 'coef_', 'p_values', 'feature_importance',
        and 'status' == 'ok'.
    """
    mlflow.set_experiment(model_name)
    experiment_date_time = int(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    # Dense layers take a flat vector; other layer types keep the window dim.
    flatten_input = layer_type == 'dense'

    def _extract_just_important_data_props(data_props):
        # Flatten the nested data_props dict into MLflow-friendly scalars.
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter'][
            'cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter'][
            'cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter'][
            'y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step'][
            'split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step'][
            'split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step'][
            'split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step'][
            'split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step'][
            'normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step'][
            'window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step'][
            'window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step'][
            'window_shift']
        return kwargs

    def _hp_tranform_param_dict(param_dict):
        # Map plain python containers to hyperopt sampling expressions:
        # list -> hp.choice, set -> hp.uniform(min, max), anything else passes
        # through unchanged.
        new_param_dict = {}
        for key, value in param_dict.items():
            if type(value) == list:
                new_param_dict[key] = hp.choice(key, value)
            elif type(value) == set:
                # Fix: was `hp.uniform(key, *values)` — `values` is undefined
                # (NameError whenever a set appears in the grid).
                new_param_dict[key] = hp.uniform(key, *value)
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = data_props['first_step'][
        'window_input_width'] if max_backlooking is None else max_backlooking

    param_grid = dict(
        n_layers=list(range(1, NN_max_depth + 1)),
        first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
        last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
        activation_func=activation_funcs,
        backlooking_window=list(range(1, max_backlooking + 1)))

    hp_param_dict = _hp_tranform_param_dict(param_dict=param_grid)
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        # Hyperopt objective: build, fit and evaluate one candidate model.
        if args != ():
            # if positional arguments expect first to be dictionary with all kwargs
            kwargs = args[0]
        if type(kwargs) != dict:
            raise Exception(
                f'kwargs is not dict - it is {type(kwargs)} with values: {kwargs}'
            )

        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds, val_ds, test_ds,
                                 flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(
            model_name=model_name,
            n_layers=n_layers,
            first_layer_nodes=first_layer_nodes,
            last_layer_nodes=last_layer_nodes,
            activation_func=activation_func,
            input_size=dataset['input_shape'] if layer_type == 'dense' else
            tuple(list(train_ds.element_spec[0].shape)[1:]),
            output_size=dataset['output_shape'],
            backlooking_window=backlooking_window,
            layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(
            model=model,
            train=dataset['train_ds'],
            val=dataset['val_ds'],
            MAX_EPOCHS=MAX_EPOCHS,
            patience=patience,
            model_name=model_name,
            verbose=verbose)

        # Get all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(
            zip(
                model.metrics_names,
                evaluate_model(
                    model=model,
                    tf_data=dataset['test_ds'],
                    mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save model if close to 15% best models
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            save_model = current_loss <= best_loss * (1 + 0.15)
        except Exception:
            # Best-effort: first trial (no best yet) or missing keys — save.
            # (Was a bare `except:`; narrowed so Ctrl-C still propagates.)
            save_model = True

        mlflow_saved = my_helpers.mlflow_last_run_add_param(
            param_dict=mlflow_additional_params, save_model=save_model)

        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={
                                  'train': train_performance,
                                  'val': val_performance,
                                  'test': test_performance
                              },
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)

        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history

        return return_metrics

    ###### Get old best model records ######
    storage_file_path = os.path.join(
        my_helpers.get_project_directories(key='cache_dir'),
        'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########
    if redo_serach_best_model or model_name not in best_model_storage or data_props[
            'iter_step'] not in best_model_storage[model_name]:
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_serach_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(
                        max_serach_iterations / 4), percent_increase=0.025))
        warnings.simplefilter('always')

        # hp.choice returns indices; map them back to the concrete grid values
        best_params = {key: param_grid[key][idx] for key, idx in best.items()}

        coef_names_ = list(
            data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [
            col + f'_sft_{i}'
            for i in range(1, best_params['backlooking_window'])
            for col in coef_names_
        ]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {
                    'result': {
                        'loss': 10**10  # sentinel: any real loss beats this
                    }
                },
                'history': {}
            }

        best_model_param = dict(
            result={
                'loss': trials.best_trial['result']['loss'],
                'all_metrics': trials.best_trial['result']['all_metrics']
            },
            model_name=trials.best_trial['result']['model_name'],
            model_id=trials.best_trial['result']['mlflow']['model_id'],
            run_id=experiment_date_time,
            input_coefs=coef_names_,
            path_saved_model=trials.best_trial['result']['mlflow']
            ['saved_model_path'],
            status=trials.best_trial['result']['status'],
            params=best_params,
            data=_extract_just_important_data_props(data_props))

        best_model_storage[model_name][data_props['iter_step']]['history'][
            experiment_date_time] = best_model_param
        if trials.best_trial['result']['loss'] < best_model_storage[
                model_name][data_props['iter_step']]['best_model']['result'][
                    'loss']:
            best_model_storage[model_name][
                data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)
    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][
            data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(
        best_model_param['path_saved_model'])
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[
                           tf.metrics.MeanAbsoluteError(),
                           CustomMeanDirectionalAccuracy(),
                           tf.losses.Huber(),
                           tf.metrics.MeanAbsolutePercentageError(),
                           tf.metrics.MeanSquaredError(),
                           tf.metrics.MeanSquaredLogarithmicError()
                       ])

    print('Best model is:', best_model_param)
    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        # NOTE(review): assumes examples['X'] is (batch, time, features) —
        # confirm against the caller.
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :],
                           (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1 layer dense/linear models get coef & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0],
                                        tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(
            dict(
                zip(['intercept_'] + best_model_param['input_coefs'],
                    intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds, val_ds, test_ds,
                                 flatten=True,
                                 keep_last_n_periods=best_model_param['params']
                                 ['backlooking_window'])

        # get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib

        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'],
                                              dataset[f'{data_set}_y'], coef_,
                                              intercept_, y_pred)
                p_values = pd.Series(
                    dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except Exception:
                # Was a bare `except:`; narrowed. Fall back to 'error' markers.
                warnings.warn(
                    "P-Values: ValueError: Input contains infinity or nan.")
                out['p_values'][data_set] = pd.Series(
                    dict(
                        zip(best_model_param['input_coefs'],
                            ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get Column Feature Importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            out['feature_importance'] = best_model_param['feature_importance']
        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model
            dataset = _get_prep_data(
                train_ds, val_ds, test_ds,
                flatten=flatten_input,
                keep_last_n_periods=best_model_param['params']
                ['backlooking_window'])
            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual FeatureImporttance
                try:
                    perm = PermutationImportance(
                        sklearn_model, cv='prefit').fit(
                            dataset[f'{data_set}_X'].numpy(),
                            np.reshape(dataset[f'{data_set}_y'].numpy(),
                                       (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(
                            perm,
                            feature_names=best_model_param['input_coefs'],
                            top=10**10))
                    out['feature_importance'][
                        data_set] = feature_importances.set_index(
                            'feature').to_dict()
                except Exception:
                    # Was a bare `except:`; narrowed. Best-effort: skip split.
                    warnings.warn(
                        "PermutationImportance: ValueError: Input contains infinity or a value too large for dtype('float16')."
                    )

            if out['feature_importance'] != {}:
                # Persist freshly computed importances back to the cache.
                best_model_param['feature_importance'] = out[
                    'feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['best_model'][
                        'feature_importance'] = out['feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['history'][experiment_date_time][
                        'feature_importance'] = out['feature_importance']
                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'
    return out