Beispiel #1
0
def main_run_linear_models(train_ds,
                           val_ds,
                           test_ds,
                           data_props,
                           max_backlooking=None,
                           layer_type='dense',
                           activation_funcs=None,
                           max_serach_iterations=200,
                           NN_max_depth=3,
                           MAX_EPOCHS=800,
                           patience=25,
                           model_name='linear',
                           examples=None,
                           return_permutation_importances=True,
                           redo_serach_best_model=False):
    """Find (or reload) the best model for a dataset and report on it.

    The function sets the MLflow experiment to ``model_name`` and then either

    * runs a hyperopt TPE search over network depth, first/last layer width,
      activation function and backlooking window, stores the winning
      configuration in ``storage_best_model.json`` inside the project cache
      directory, or
    * reuses the ``'best_model'`` entry already stored for ``model_name`` and
      ``data_props['iter_step']``.

    A search is run when ``redo_serach_best_model`` is true or when the
    storage has no entry for this model name / iteration step. The selected
    model is then loaded from its saved path, recompiled, and optionally used
    for example predictions, coefficient / p-value extraction and permutation
    feature importance.

    Args:
        train_ds, val_ds, test_ds: Datasets handed to ``_get_prep_data``;
            ``train_ds.element_spec`` is read for non-dense layer types.
        data_props: Nested dict describing the dataset. The keys
            ``'first_step'``, ``'second_step'``, ``'third_filter'``,
            ``'first_step_data_hash'``, ``'second_step_data_hash'``,
            ``'iter_step'`` and ``'look_ups'`` are read.
        max_backlooking: Largest backlooking window included in the search.
            Defaults to ``data_props['first_step']['window_input_width']``.
        layer_type: ``'dense'`` flattens the model input; any other value
            keeps the input shape of ``train_ds``.
        activation_funcs: List of activation names to search over. Defaults
            to ``['sigmoid', 'relu', 'tanh']``.
        max_serach_iterations: ``max_evals`` for the hyperopt search; a
            quarter of it is used as the no-progress early-stop count.
        NN_max_depth: Maximum number of layers searched. A value of 1 also
            enables the coefficient / p-value output for dense models.
        MAX_EPOCHS: Forwarded to ``compile_and_fit``.
        patience: Forwarded to ``compile_and_fit``.
        model_name: Experiment name and first-level key in the storage file.
        examples: Optional dict with key ``'X'``; when given, predictions for
            it are returned under ``'examples_pred_y'``. For dense models
            ``examples['X']`` is indexed as a 3-D array.
        return_permutation_importances: When true, permutation importances
            are returned (reusing stored ones if available).
        redo_serach_best_model: Force a new search even if a stored best
            model exists.

    Returns:
        dict: A shallow copy of the best-model record, extended with
        ``'examples_pred_y'``, ``'coef_'``, ``'p_values'`` and
        ``'feature_importance'`` where applicable, and ``'status'`` set to
        ``'ok'``.
    """
    if activation_funcs is None:
        # Built per call so the default list is never shared between calls.
        activation_funcs = ['sigmoid', 'relu', 'tanh']

    mlflow.set_experiment(model_name)
    # Integer timestamp identifying this call; used as run id and history key.
    experiment_date_time = int(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    flatten_input = layer_type == 'dense'

    def _extract_just_important_data_props(data_props):
        """Flatten the documented subset of ``data_props`` into one dict."""
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter'][
            'cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter'][
            'cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter'][
            'y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step'][
            'split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step'][
            'split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step'][
            'split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step'][
            'split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step'][
            'normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step'][
            'window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step'][
            'window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step'][
            'window_shift']
        return kwargs

    def _hp_tranform_param_dict(param_dict):
        """Translate a plain grid into a hyperopt search space.

        Lists become ``hp.choice``, sets become ``hp.uniform`` and every
        other value is passed through unchanged.
        """
        new_param_dict = {}
        for key, value in param_dict.items():
            if isinstance(value, list):
                new_param_dict[key] = hp.choice(key, value)
            elif isinstance(value, set):
                # Sets are unordered; sort so the bounds arrive as low, high.
                new_param_dict[key] = hp.uniform(key, *sorted(value))
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = data_props['first_step'][
        'window_input_width'] if max_backlooking is None else max_backlooking

    param_grid = dict(
        n_layers=list(range(1, NN_max_depth + 1)),
        first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
        last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
        activation_func=activation_funcs,
        backlooking_window=list(range(1, max_backlooking + 1)))
    hp_param_dict = _hp_tranform_param_dict(param_dict=param_grid)
    # Constant (non-searched) entries are handed through the search space.
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        """Objective for ``fmin``: build, fit and evaluate one candidate.

        Accepts either keyword arguments or a single positional dict holding
        them. Returns a dict whose ``'loss'`` is the validation loss.
        """
        if args != ():
            kwargs = args[
                0]  # if positional arguments expect first to be dictionary with all kwargs
        if type(kwargs) != dict:
            raise Exception(
                f'kwargs is not  dict - it is {type(kwargs)} with values: {kwargs}'
            )

        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(
            model_name=model_name,
            n_layers=n_layers,
            first_layer_nodes=first_layer_nodes,
            last_layer_nodes=last_layer_nodes,
            activation_func=activation_func,
            input_size=dataset['input_shape'] if layer_type == 'dense' else
            tuple(list(train_ds.element_spec[0].shape)[1:]),
            output_size=dataset['output_shape'],
            backlooking_window=backlooking_window,
            layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(
            model=model,
            train=dataset['train_ds'],
            val=dataset['val_ds'],
            MAX_EPOCHS=MAX_EPOCHS,
            patience=patience,
            model_name=model_name,
            verbose=verbose)

        # Get all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(
            zip(
                model.metrics_names,
                evaluate_model(
                    model=model,
                    tf_data=dataset['test_ds'],
                    mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save the model if its best validation loss is within 15% of the
        # best trial so far. ``trials`` is bound by the enclosing function
        # before ``fmin`` runs; if no best trial is available yet the lookup
        # fails and the model is saved.
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            save_model = current_loss <= best_loss * (1 + 0.15)
        except Exception:
            save_model = True
        mlflow_saved = my_helpers.mlflow_last_run_add_param(
            param_dict=mlflow_additional_params, save_model=save_model)

        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={
                                  'train': train_performance,
                                  'val': val_performance,
                                  'test': test_performance
                              },
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)

        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history

        return return_metrics

    ###### Get old best model records ######

    storage_file_path = os.path.join(
        my_helpers.get_project_directories(key='cache_dir'),
        'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########

    # NOTE(review): JSON object keys are always strings after a round trip.
    # If data_props['iter_step'] is not a str, the membership test below can
    # never match an entry loaded from disk - confirm the type with callers.
    if redo_serach_best_model or model_name not in best_model_storage or data_props[
            'iter_step'] not in best_model_storage[model_name]:
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_serach_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(
                        max_serach_iterations / 4),
                                                   percent_increase=0.025))
        warnings.simplefilter('always')

        # getting all parameters for best model storage
        mlflow_best_model = trials.best_trial['result']['mlflow']
        # ``best`` holds an index into each grid list; map it back to values.
        best_params = {}
        for key, idx in best.items():
            best_params[key] = param_grid[key][idx]

        # One name per input column, plus a ``_sft_<i>`` variant for every
        # additional period of the chosen backlooking window.
        coef_names_ = list(
            data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [
            col + f'_sft_{i}'
            for i in range(1, best_params['backlooking_window'])
            for col in coef_names_
        ]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            # Placeholder with a huge loss so the first real result wins.
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {
                    'result': {
                        'loss': 10**10
                    }
                },
                'history': {}
            }

        best_model_param = dict(
            result={
                'loss': trials.best_trial['result']['loss'],
                'all_metrics': trials.best_trial['result']['all_metrics']
            },
            model_name=trials.best_trial['result']['model_name'],
            model_id=trials.best_trial['result']['mlflow']['model_id'],
            run_id=experiment_date_time,
            input_coefs=coef_names_,
            path_saved_model=trials.best_trial['result']['mlflow']
            ['saved_model_path'],
            status=trials.best_trial['result']['status'],
            params=best_params,
            data=_extract_just_important_data_props(data_props))

        # Every search is recorded in the history; the 'best_model' entry is
        # only replaced when this search beat the stored loss.
        best_model_storage[model_name][data_props['iter_step']]['history'][
            experiment_date_time] = best_model_param
        if trials.best_trial['result']['loss'] < best_model_storage[model_name][
                data_props['iter_step']]['best_model']['result']['loss']:
            best_model_storage[model_name][
                data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)

    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][
            data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(
        best_model_param['path_saved_model'])
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[
                           tf.metrics.MeanAbsoluteError(),
                           CustomMeanDirectionalAccuracy(),
                           tf.losses.Huber(),
                           tf.metrics.MeanAbsolutePercentageError(),
                           tf.metrics.MeanSquaredError(),
                           tf.metrics.MeanSquaredLogarithmicError()
                       ])
    print('Best model is:', best_model_param)

    # Shallow copy: extra report entries must not leak into the storage dict.
    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            # Keep the last ``periods`` steps and flatten each sample.
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :],
                           (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1 layer dense/linear models get coef & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0],
                                        tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(
            dict(
                zip(['intercept_'] + best_model_param['input_coefs'],
                    intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=True,
                                 keep_last_n_periods=best_model_param['params']
                                 ['backlooking_window'])

        # get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib

        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'],
                                              dataset[f'{data_set}_y'], coef_,
                                              intercept_, y_pred)
                p_values = pd.Series(
                    dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except Exception:
                # Best effort: a failed p-value computation is reported as an
                # 'error' column instead of aborting the whole run.
                warnings.warn(
                    "P-Values: ValueError: Input contains infinity or nan.")
                out['p_values'][data_set] = pd.Series(
                    dict(
                        zip(best_model_param['input_coefs'],
                            ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get Column Feature Importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            out['feature_importance'] = best_model_param['feature_importance']

        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            # Wrap the already fitted Keras model for use with eli5.
            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model

            dataset = _get_prep_data(
                train_ds,
                val_ds,
                test_ds,
                flatten=flatten_input,
                keep_last_n_periods=best_model_param['params']
                ['backlooking_window'])

            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual FeatureImporttance
                try:
                    perm = PermutationImportance(
                        sklearn_model, cv='prefit').fit(
                            dataset[f'{data_set}_X'].numpy(),
                            np.reshape(dataset[f'{data_set}_y'].numpy(),
                                       (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(
                            perm,
                            feature_names=best_model_param['input_coefs'],
                            top=10**10))
                    out['feature_importance'][
                        data_set] = feature_importances.set_index(
                            'feature').to_dict()
                except Exception:
                    # Best effort: skip this split, keep the others.
                    warnings.warn(
                        "PermutationImportance: ValueError: Input contains infinity or a value too large for dtype('float16')."
                    )

            if out['feature_importance'] != {}:
                # ``best_model_param`` is the same dict object the storage
                # refers to: the 'best_model' entry when it was loaded from
                # storage or promoted by this search, and this run's history
                # entry when a search ran. Updating it in place therefore
                # reaches exactly the storage entries that describe this
                # model, and never touches an entry belonging to another
                # model or a history key that does not exist.
                best_model_param['feature_importance'] = out[
                    'feature_importance']

                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'
    return out
def TrainNetwork(model,
                 modelfile,
                 x_train=None,
                 y_train=None,
                 x_valid=None,
                 y_valid=None,
                 sample_weight=None,
                 callbacks=None,
                 epochs=20,
                 batch_size=200,
                 verbose=1,
                 overwriteModel=False,
                 finishTraining=True):
    """Train (or resume / just load) a Keras regressor stored at ``modelfile``.

    If ``modelfile`` exists and ``overwriteModel`` is false, the saved model
    is loaded and training resumes from the epoch count derived from the CSV
    log next to the model file. Training runs when ``finishTraining`` is true
    or when no saved model exists.

    Args:
        model: Object exposing ``.model`` (passed as ``build_fn`` to
            ``KerasRegressor``) and ``.custom_objects`` (passed to
            ``load_model``).
        modelfile: Path string of the saved model. For names containing
            ``'.h5'`` the log file is the same path with the last extension
            replaced by ``.csv``; otherwise ``.csv`` is appended.
        x_train, y_train: Training data forwarded to ``fit``.
        x_valid, y_valid: Validation data forwarded to ``fit``.
        sample_weight: Forwarded to ``fit``.
        callbacks: List of Keras callbacks forwarded to ``fit``. Defaults to
            an empty list.
        epochs: Total number of epochs the model should be trained for.
        batch_size: Batch size for the regressor wrapper.
        verbose: Verbosity for the regressor wrapper.
        overwriteModel: When true, an existing ``modelfile`` is ignored and
            the model is trained from scratch.
        finishTraining: When false, an existing model is only loaded, not
            trained further, and nothing is saved.

    Returns:
        tuple: ``(regressor, history)``. ``history`` is a dict built from the
        CSV log if it exists, otherwise the history of the fit performed in
        this call (empty if no fit ran).
    """
    if callbacks is None:
        callbacks = []

    model, custom_objects = model.model, model.custom_objects

    # Set up our KerasRegressor wrapper.
    # I'm not 100% sure why we do this for our regressors (but not our classifiers),
    # but as we use this in the original training code I'll keep it for now.
    regressor = KerasRegressor(build_fn=model,
                               batch_size=batch_size,
                               epochs=epochs,
                               verbose=verbose)

    # Make the model directory if it does not already exist. A bare file name
    # has no directory part, so there is nothing to create in that case.
    model_dir = '/'.join(modelfile.split('/')[:-1])
    if model_dir:
        try:
            os.makedirs(model_dir, exist_ok=True)
        except OSError as err:
            # Best effort, as before: a real problem resurfaces when saving.
            print('Warning: Could not create model directory {}: {}'.format(
                model_dir, err))

    # Derive the name of the CSVLogger file that belongs to this model.
    if '.h5' in modelfile:
        history_filename = '.'.join(modelfile.split('.')[:-1]) + '.csv'
    else:
        # if using .tf format, there won't be a file extension on the string at all.
        history_filename = modelfile + '.csv'

    # Nothing below creates the model file before the training decision, so
    # its existence only needs to be checked once.
    model_exists = pathlib.Path(modelfile).exists()

    # Check if the model exists -- and load it if not overwriting.
    initial_epoch = 0
    if model_exists and not overwriteModel:
        regressor.model = load_model(modelfile, custom_objects=custom_objects)

        # Now we want to figure out for how many epochs the loaded model was already trained,
        # so that it's trained, in total, for the requested number of epochs.
        # keras models don't seem to hold on to an epoch attribute for whatever reason,
        # so we will figure out the current epoch based on CSVLogger output if it exists.
        if pathlib.Path(history_filename).exists():
            with open(history_filename) as f:
                n_lines = sum(1 for _ in f)
            # The first line holds the headers; an empty log means 0 epochs.
            initial_epoch = max(n_lines - 1, 0)
        if not finishTraining:
            initial_epoch = regressor.get_params()['epochs']
        regressor.set_params(initial_epoch=initial_epoch)

    # Train the model if we've specified "finishTraining", or if we don't even
    # have a model yet. Setting finishTraining=False lets one immediately skip
    # to evaluating the model, which is especially helpful if EarlyStopping was used
    # and the final model didn't reach the specified last epoch.
    history = {}
    if finishTraining or not model_exists:
        fit_result = regressor.fit(x=x_train,
                                   y=y_train,
                                   validation_data=(x_valid, y_valid),
                                   sample_weight=sample_weight,
                                   callbacks=callbacks)
        # Fallback history, used only when no CSV log is available below.
        history = getattr(fit_result, 'history', {})

    # NOTE(review): with finishTraining=False a model trained from scratch in
    # this call is not saved - confirm this is intended.
    saveModel = not (initial_epoch == epochs or not finishTraining)
    if saveModel:
        print('  Saving model to {}.'.format(modelfile))
        regressor.model.save(modelfile)

    # Now get the history from the log file, if it exists.
    # This is a better method than using the results of model.fit(),
    # since this will give us the whole history (not just whatever
    # was fitted right now). However, it relies on us having passed
    # a CSVLogger as one of our callbacks, which we normally do
    # but might not do in some specific circumstances.
    if pathlib.Path(history_filename).exists():
        df = pd.read_csv(history_filename)
        history = {key: df[key].to_numpy() for key in df.keys()}
    else:
        print('Warning: No log file found for model {}.'.format(modelfile))
        print('This may result in an empty/incomplete history being returned.')
        print(
            'Please provide a CSVLogger callback to prevent this in the future.'
        )

    return regressor, history