Example 1
def get_clean_data(data_version,
                   recache_raw_data=False,
                   redo_data_cleaning=False,
                   comp_col='ric',
                   time_cols=['data_year', 'data_qrt'],
                   industry_col='industry',
                   required_filled_cols_before_filling=[],
                   required_filled_cols_after_filling=[],
                   drop_threshold_row_pct=0.25,
                   drop_threshold_row_quantile=0.2,
                   drop_threshold_col_pct=0,
                   append_data_quality_col=False):

    cache_folder = os.path.join(my.get_project_directories(key='cache_dir'),
                                'cleaned_data')
    if not os.path.exists(cache_folder):
        os.makedirs(cache_folder)
    my_hash = my.data_hash(data_version, comp_col, time_cols, industry_col,
                           required_filled_cols_before_filling,
                           required_filled_cols_after_filling,
                           drop_threshold_row_pct, drop_threshold_row_quantile,
                           drop_threshold_col_pct, append_data_quality_col)
    cache_file = os.path.join(cache_folder, my_hash + '.csv')

    if redo_data_cleaning or not os.path.exists(cache_file):
        print('Cleaned data not cached...')

        df, data_file, data_props, fillnan_formulas = _download_data_from_sql(
            data_version=data_version, recache=recache_raw_data)

        info_text = f'Initial dataset: {df.shape[0]} rows and {df.shape[1]} columns.\n'
        print(info_text)
        initial_len_df = len(df)

        df.replace([np.inf, -np.inf], np.nan, inplace=True)

        # Count filled columns per row as data-quality metric
        df, tmp_info_text = _data_quality_filter(
            df,
            drop_threshold_row_pct=drop_threshold_row_pct,
            drop_threshold_row_quantile=drop_threshold_row_quantile,
            drop_threshold_col_pct=drop_threshold_col_pct,
            required_filled_cols=required_filled_cols_before_filling,
            append_data_quality_col=append_data_quality_col)
        print(tmp_info_text)

        info_text += tmp_info_text

        def highlight_diff(data, color='yellow'):
            attr = 'background-color: {}'.format(color)
            other = data.xs('First', axis='columns', level=-1)
            return pd.DataFrame(np.where(data.ne(other, level=0), attr, ''),
                                index=data.index,
                                columns=data.columns)

        pre_df = df.copy().reset_index()
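        # dataset_nan_fill mutates df in place; the pre_df/post_df snapshots allow a before/after comparison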
        dataset_nan_fill(df,
                         company_col=comp_col,
                         time_cols=time_cols,
                         industry_col=industry_col,
                         data_props=data_props,
                         fillnan_formulas=fillnan_formulas,
                         formula_iterations=3)
        post_df = df.copy().reset_index()

        df_all = pd.concat([
            pre_df.set_index(['ric', 'data_year', 'data_qrt']),
            post_df.set_index(['ric', 'data_year', 'data_qrt'])
        ],
                           axis='columns',
                           keys=['First', 'Second'])
        df_final = df_all.swaplevel(axis='columns')[[
            col for col in pre_df.columns.to_list()
            if col not in ['ric', 'data_year', 'data_qrt']
        ]]
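        # Styler highlighting cells changed by the fill (for visual inspection, e.g. in a notebook; result is not stored)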
        df_final.style.apply(highlight_diff, axis=None)

        len_df = len(df)
        df = df.loc[df[required_filled_cols_after_filling].notna().all(
            axis=1)]  # drop row based on required filled columns

        tmp_info_text = f'Dropped {len_df - len(df)} rows because required columns ({str(required_filled_cols_after_filling)[1:-1]}) were still NaN after filling; {len(df)} rows remain ({int(len(df) / initial_len_df * 100)}% of the initial dataset).'
        print(tmp_info_text)
        info_text += tmp_info_text

        with open(cache_file[:-3] + 'info', "w") as text_file:
            text_file.write(info_text)
        df.to_csv(cache_file)

    else:
        print('Cleaned data already cached.')
        with open(cache_file[:-3] + 'info', "r") as text_file:
            info_text = text_file.read()
        df = pd.read_csv(cache_file)
        print(info_text)

    return df
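The pre/post comparison above follows the common pandas recipe of stacking two snapshots under 'First'/'Second' column levels and highlighting changed cells with a Styler. A minimal, self-contained sketch of that recipe on toy data (all names and values below are hypothetical):

import numpy as np
import pandas as pd

# Toy 'before' and 'after' frames standing in for pre_df / post_df
pre = pd.DataFrame({'a': [1.0, np.nan], 'b': [np.nan, 4.0]}, index=['x', 'y'])
post = pre.fillna(0.0)

# Stack both versions side by side under 'First'/'Second' column levels
both = pd.concat([pre, post], axis='columns', keys=['First', 'Second'])
both = both.swaplevel(axis='columns')[pre.columns.to_list()]

def highlight_diff(data, color='yellow'):
    # Mark cells that differ from the 'First' snapshot
    attr = 'background-color: {}'.format(color)
    first = data.xs('First', axis='columns', level=-1)
    return pd.DataFrame(np.where(data.ne(first, level=0), attr, ''),
                        index=data.index, columns=data.columns)

styled = both.style.apply(highlight_diff, axis=None)  # render in a notebook to see the highlights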
Example 2
def _download_data_from_sql(data_version='final_data', recache=False):
    from app.b_data_cleaning import get_dataset_registry
    sql_table_name = get_dataset_registry()[data_version]['sql_table']
    query = "SELECT * FROM {}".format(sql_table_name)

    param_dic = my.get_credentials(credential='aws_databases')['aws']

    cache_folder = os.path.join(my.get_project_directories(key='cache_dir'),
                                'raw_data')
    data_file = os.path.join(cache_folder, (data_version + '.csv'))
    if not os.path.exists(cache_folder):
        os.makedirs(cache_folder)

    if recache or not os.path.exists(data_file):
        print('Getting raw data via sql...')

        with my.postgresql_connect(param_dic) as conn:
            df = pd.read_sql_query(query, con=conn)
            obj_cols = df.select_dtypes(include='object').columns
            df[obj_cols] = df[obj_cols].astype(str)
            df.to_csv(data_file, index=False)
            with open(data_file[:-4] + '.dtypes', 'wb') as f:
                dtypes = df.dtypes.to_dict()
                dtypes = dict(
                    zip(dtypes.keys(), [
                        str if i == object else i for i in dtypes.values()
                    ]))
                pickle.dump(dtypes, f)
        print('Raw data cached.')

    else:
        print('Raw data already cached.')
        with open(data_file[:-4] + '.dtypes', 'rb') as f:
            dtypes = pickle.load(f)

        df = pd.read_csv(data_file, dtype=dtypes, index_col=False)

    if data_version == 'handpicked_dataset':
        app_dir = my.get_project_directories(key='app_dir')
        file_path = os.path.join(app_dir, 'a_get_data', 'reuters_eikon',
                                 'key_reuters_fields.csv')
        data_dict = pd.read_csv(file_path)
        data_dict['Clear Name'] = data_dict['Clear Name'].str.lower()
        data_dict = data_dict.set_index('Clear Name')
        new_data_dict = data_dict[['Data Type',
                                   'Variable Type']].to_dict(orient='index')

        fillnan_cols = []
        formula_methods = []
        for col in data_dict.columns.tolist():
            if col[:8] == 'fillnan_':
                fillnan_cols.append(col)
        fillnan_cols = sorted(fillnan_cols, key=str.lower)

        for index, row in data_dict[fillnan_cols].iterrows():
            tmp = row.tolist()
            tmp = [x for x in tmp if str(x) != 'nan']
            new_data_dict[index]['Fill NaN Rules'] = tmp
            for j in [
                    i.split(':')[1] for i in tmp
                    if i.split(':')[0] == 'formula'
            ]:
                formula_methods.append((index, j))

    else:
        new_data_dict = None
        formula_methods = None

    return df, data_file, new_data_dict, formula_methods
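The caching above stores the frame as CSV together with a pickled dtype map so that a later read_csv restores the original column types. A minimal, self-contained sketch of that round trip (the tempfile-based file locations are hypothetical):

import os
import pickle
import tempfile

import pandas as pd

# Hypothetical cache location for illustration
cache_dir = tempfile.mkdtemp()
data_file = os.path.join(cache_dir, 'demo.csv')

df = pd.DataFrame({'ric': ['A', 'B'], 'value': [1.5, 2.5], 'year': [2020, 2021]})

# Cache the frame plus its dtypes (object columns are read back as str)
df.to_csv(data_file, index=False)
with open(data_file[:-4] + '.dtypes', 'wb') as f:
    dtypes = {col: (str if dt == object else dt) for col, dt in df.dtypes.items()}
    pickle.dump(dtypes, f)

# Reload with the cached dtypes
with open(data_file[:-4] + '.dtypes', 'rb') as f:
    dtypes = pickle.load(f)
restored = pd.read_csv(data_file, dtype=dtypes, index_col=False)
print(restored.dtypes)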
Example 3
def main_run_linear_models(train_ds,
                           val_ds,
                           test_ds,
                           data_props,
                           max_backlooking=None,
                           layer_type='dense',
                           activation_funcs=['sigmoid', 'relu', 'tanh'],
                           max_serach_iterations=200,
                           NN_max_depth=3,
                           MAX_EPOCHS=800,
                           patience=25,
                           model_name='linear',
                           examples=None,
                           return_permutation_importances=True,
                           redo_serach_best_model=False):
    mlflow.set_experiment(model_name)
    experiment_date_time = int(
        datetime.datetime.now().strftime("%Y%m%d%H%M%S"))

    flatten_input = (layer_type == 'dense')

    def _extract_just_important_data_props(data_props):
        kwargs = {}
        kwargs['dataset_cols_X_just_these'] = data_props['third_filter'][
            'cols_just_these']
        kwargs['dataset_cols_X_exclude'] = data_props['third_filter'][
            'cols_drop']
        kwargs['dataset_cols_y'] = data_props['third_filter'][
            'y_cols_just_these']
        kwargs['dataset_hash_input'] = int(data_props['first_step']['dataset'])
        kwargs['dataset_hash_first'] = data_props['first_step_data_hash']
        kwargs['dataset_hash_second'] = data_props['second_step_data_hash']
        kwargs['dataset_split_method'] = data_props['second_step'][
            'split_method']
        kwargs['dataset_split_steps_train'] = data_props['second_step'][
            'split_props']['train_time_steps']
        kwargs['dataset_split_steps_val'] = data_props['second_step'][
            'split_props']['val_time_steps']
        kwargs['dataset_split_steps_test'] = data_props['second_step'][
            'split_props']['test_time_steps']
        kwargs['dataset_iter_step'] = data_props['iter_step']
        kwargs['dataset_normalization'] = data_props['second_step'][
            'normalize_method']
        kwargs['dataset_window_backlooking'] = data_props['first_step'][
            'window_input_width']
        kwargs['dataset_window_prediction'] = data_props['first_step'][
            'window_pred_width']
        kwargs['dataset_window_shift'] = data_props['first_step'][
            'window_shift']
        return kwargs

    def _hp_tranform_param_dict(param_dict):
        new_param_dict = {}
        for key, value in param_dict.items():
            if type(value) == list:
                new_param_dict[key] = hp.choice(key, value)
            elif type(value) == set:
                new_param_dict[key] = hp.uniform(key, *value)
            else:
                new_param_dict[key] = value
        return new_param_dict

    max_backlooking = data_props['first_step'][
        'window_input_width'] if max_backlooking is None else max_backlooking

    param_grid = dict(
        n_layers=list(range(1, NN_max_depth + 1)),
        first_layer_nodes=[0] if NN_max_depth == 1 else [128, 64, 32, 16, 8],
        last_layer_nodes=[0] if NN_max_depth == 1 else [64, 32, 16, 8, 4],
        activation_func=activation_funcs,
        backlooking_window=list(range(1, max_backlooking + 1)))
    hp_param_dict = _hp_tranform_param_dict(param_dict=param_grid)
    hp_param_dict['model_name'] = model_name
    hp_param_dict['data_props'] = data_props
    hp_param_dict['layer_type'] = layer_type

    def _optimize_objective(*args, **kwargs):
        if args != ():
            # if called with positional arguments, expect the first one to be a dict with all kwargs
            kwargs = args[0]
        if type(kwargs) != dict:
            raise Exception(
                f'kwargs is not a dict - it is {type(kwargs)} with values: {kwargs}'
            )

        backlooking_window = kwargs.pop('backlooking_window')
        n_layers = kwargs.pop('n_layers')
        first_layer_nodes = kwargs.pop('first_layer_nodes')
        last_layer_nodes = kwargs.pop('last_layer_nodes')
        activation_func = kwargs.pop('activation_func')
        return_everything = kwargs.pop('return_everything', False)
        verbose = kwargs.pop('verbose', 0)
        model_name = kwargs.pop('model_name', 'linear')
        data_props = kwargs.pop('data_props')
        layer_type = kwargs.pop('layer_type', 'dense')

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=flatten_input,
                                 keep_last_n_periods=backlooking_window)

        now = datetime.datetime.now()
        date_time = str(now.strftime("%y%m%d%H%M%S"))
        model_name = f"{date_time}_{model_name}_w{backlooking_window}_l{n_layers}_a{activation_func}"

        kwargs = dict(
            model_name=model_name,
            n_layers=n_layers,
            first_layer_nodes=first_layer_nodes,
            last_layer_nodes=last_layer_nodes,
            activation_func=activation_func,
            input_size=dataset['input_shape'] if layer_type == 'dense' else
            tuple(list(train_ds.element_spec[0].shape)[1:]),
            output_size=dataset['output_shape'],
            backlooking_window=backlooking_window,
            layer_type=layer_type)

        model = createmodel(**kwargs)
        history, mlflow_additional_params = compile_and_fit(
            model=model,
            train=dataset['train_ds'],
            val=dataset['val_ds'],
            MAX_EPOCHS=MAX_EPOCHS,
            patience=patience,
            model_name=model_name,
            verbose=verbose)

        # Get all data props for documentation in MLflow
        kwargs.update(_extract_just_important_data_props(data_props))
        kwargs['run'] = experiment_date_time
        mlflow_additional_params['kwargs'] = kwargs

        train_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['train_ds'])))
        val_performance = dict(
            zip(model.metrics_names,
                evaluate_model(model=model, tf_data=dataset['val_ds'])))
        test_performance = dict(
            zip(
                model.metrics_names,
                evaluate_model(
                    model=model,
                    tf_data=dataset['test_ds'],
                    mlflow_additional_params=mlflow_additional_params)))
        mlflow_additional_params['data_props'] = data_props

        # Only save the model if its loss is within 15% of the best trial's loss
        try:
            best_loss = float(trials.best_trial['result']['loss'])
            current_loss = min(history.history['val_loss'])
            if current_loss <= best_loss * (1 + 0.15):
                save_model = True
            else:
                save_model = False
        except:
            save_model = True
        mlflow_saved = my_helpers.mlflow_last_run_add_param(
            param_dict=mlflow_additional_params, save_model=save_model)

        tf.keras.backend.clear_session()

        return_metrics = dict(loss=val_performance['loss'],
                              all_metrics={
                                  'train': train_performance,
                                  'val': val_performance,
                                  'test': test_performance
                              },
                              status=STATUS_OK,
                              mlflow=mlflow_saved,
                              model_name=model_name)

        if return_everything:
            return_metrics['model'] = model
            return_metrics['history'] = history

        return return_metrics

    ###### Get old best model records ######

    storage_file_path = os.path.join(
        my_helpers.get_project_directories(key='cache_dir'),
        'storage_best_model.json')
    if not os.path.exists(storage_file_path):
        best_model_storage = {}
    else:
        with open(storage_file_path) as json_file:
            best_model_storage = json.load(json_file)

    ######## Search for best model ########

    if redo_serach_best_model or model_name not in best_model_storage or data_props[
            'iter_step'] not in best_model_storage[model_name]:
        warnings.filterwarnings('ignore')
        trials = Trials()
        best = fmin(fn=_optimize_objective,
                    space=hp_param_dict,
                    algo=tpe.suggest,
                    max_evals=max_serach_iterations,
                    trials=trials,
                    early_stop_fn=no_progress_loss(iteration_stop_count=int(
                        max_serach_iterations / 4),
                                                   percent_increase=0.025))
        warnings.simplefilter('always')

        # getting all parameters for best model storage
        mlflow_best_model = trials.best_trial['result']['mlflow']
        best_params = {}
        for key, idx in best.items():
            best_params[key] = param_grid[key][idx]

        coef_names_ = list(
            data_props['look_ups']['out_lookup_col_name']['X'].keys())
        coef_names_ = coef_names_ + [
            col + f'_sft_{i}'
            for i in range(1, best_params['backlooking_window'])
            for col in coef_names_
        ]

        # Saving best model to storage
        if model_name not in best_model_storage:
            best_model_storage[model_name] = {}
        if data_props['iter_step'] not in best_model_storage[model_name]:
            best_model_storage[model_name][data_props['iter_step']] = {
                'best_model': {
                    'result': {
                        'loss': 10**10
                    }
                },
                'history': {}
            }

        best_model_param = dict(
            result={
                'loss': trials.best_trial['result']['loss'],
                'all_metrics': trials.best_trial['result']['all_metrics']
            },
            model_name=trials.best_trial['result']['model_name'],
            model_id=trials.best_trial['result']['mlflow']['model_id'],
            run_id=experiment_date_time,
            input_coefs=coef_names_,
            path_saved_model=trials.best_trial['result']['mlflow']
            ['saved_model_path'],
            status=trials.best_trial['result']['status'],
            params=best_params,
            data=_extract_just_important_data_props(data_props))

        best_model_storage[model_name][data_props['iter_step']]['history'][
            experiment_date_time] = best_model_param
        if trials.best_trial['result']['loss'] < best_model_storage[model_name][
                data_props['iter_step']]['best_model']['result']['loss']:
            best_model_storage[model_name][
                data_props['iter_step']]['best_model'] = best_model_param

        with open(storage_file_path, 'w') as outfile:
            json.dump(best_model_storage, outfile)

    else:
        # Get best model from storage
        best_model_param = best_model_storage[model_name][
            data_props['iter_step']]['best_model']

    ######## Get Best model again ########
    best_model = tf.keras.models.load_model(
        best_model_param['path_saved_model'])
    best_model.compile(loss=tf.losses.MeanAbsoluteError(),
                       optimizer=tf.optimizers.Adam(),
                       metrics=[
                           tf.metrics.MeanAbsoluteError(),
                           CustomMeanDirectionalAccuracy(),
                           tf.losses.Huber(),
                           tf.metrics.MeanAbsolutePercentageError(),
                           tf.metrics.MeanSquaredError(),
                           tf.metrics.MeanSquaredLogarithmicError()
                       ])
    print('Best model is:', best_model_param)

    out = dict(best_model_param)

    ####### Get examples for plotting #######
    if examples is not None:
        example_X = examples['X']
        periods = best_model_param['params']['backlooking_window']
        if layer_type == 'dense':
            example_X = tf.data.Dataset.from_tensors(
                np.reshape(example_X[:, -periods:, :],
                           (example_X.shape[0], -1)))
        else:
            example_X = tf.data.Dataset.from_tensors(example_X)
        out['examples_pred_y'] = best_model.predict(example_X)

    ###### For 1 layer dense/linear models get coef & p-values ######
    if NN_max_depth == 1 and isinstance(best_model.layers[0],
                                        tf.keras.layers.Dense):
        # Get coefs
        intercept_ = best_model.layers[0].bias.numpy()
        coef_ = best_model.layers[0].weights[0].numpy()
        out['coef_'] = pd.Series(
            dict(
                zip(['intercept_'] + best_model_param['input_coefs'],
                    intercept_.tolist() + coef_.squeeze().tolist())))

        dataset = _get_prep_data(train_ds,
                                 val_ds,
                                 test_ds,
                                 flatten=True,
                                 keep_last_n_periods=best_model_param['params']
                                 ['backlooking_window'])

        # get p-values
        import app.d_prediction.my_custom_pvalue_calc as my_p_lib

        out['p_values'] = {}
        for data_set in ['train', 'val', 'test']:
            y_pred = best_model.predict(dataset[f'{data_set}_X'])
            y_pred = np.reshape(y_pred, (-1, 1))
            try:
                p_values = my_p_lib.coef_pval(dataset[f'{data_set}_X'],
                                              dataset[f'{data_set}_y'], coef_,
                                              intercept_, y_pred)
                p_values = pd.Series(
                    dict(zip(best_model_param['input_coefs'], p_values)))
                out['p_values'][data_set] = p_values
            except:
                warnings.warn(
                    "P-Values: ValueError: Input contains infinity or nan.")
                out['p_values'][data_set] = pd.Series(
                    dict(
                        zip(best_model_param['input_coefs'],
                            ['error'] * len(best_model_param['input_coefs']))))
        out['p_values'] = pd.DataFrame(out['p_values'])

    ##### Get Column Feature Importance #####
    if return_permutation_importances:
        if 'feature_importance' in best_model_param:
            out['feature_importance'] = best_model_param['feature_importance']

        else:
            import eli5
            from eli5.sklearn import PermutationImportance

            sklearn_model = KerasRegressor(build_fn=best_model)
            sklearn_model.model = best_model

            dataset = _get_prep_data(
                train_ds,
                val_ds,
                test_ds,
                flatten=flatten_input,
                keep_last_n_periods=best_model_param['params']
                ['backlooking_window'])

            out['feature_importance'] = {}
            for data_set in ['train', 'val']:
                # Calculate actual feature importance
                try:
                    perm = PermutationImportance(
                        sklearn_model, cv='prefit').fit(
                            dataset[f'{data_set}_X'].numpy(),
                            np.reshape(dataset[f'{data_set}_y'].numpy(),
                                       (-1, 1)))
                    feature_importances = eli5.format_as_dataframe(
                        eli5.explain_weights(
                            perm,
                            feature_names=best_model_param['input_coefs'],
                            top=10**10))
                    out['feature_importance'][
                        data_set] = feature_importances.set_index(
                            'feature').to_dict()
                except:
                    warnings.warn(
                        "PermutationImportance: ValueError: Input contains infinity or a value too large for dtype('float16')."
                    )

            if out['feature_importance'] != {}:
                best_model_param['feature_importance'] = out[
                    'feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['best_model'][
                        'feature_importance'] = out['feature_importance']
                best_model_storage[model_name][
                    data_props['iter_step']]['history'][experiment_date_time][
                        'feature_importance'] = out['feature_importance']

                with open(storage_file_path, 'w') as outfile:
                    json.dump(best_model_storage, outfile)

    out['status'] = 'ok'
    return out
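The search loop above relies on hyperopt's fmin with a Trials object and the no_progress_loss early-stopping rule, and then maps the returned indices back through param_grid (fmin reports the chosen index for hp.choice parameters). A minimal, self-contained sketch of that pattern with a toy objective (the search space and loss below are hypothetical):

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.early_stop import no_progress_loss

# Toy search space standing in for the layer/window grid above
space = {'x': hp.uniform('x', -5.0, 5.0),
         'depth': hp.choice('depth', [1, 2, 3])}

def objective(params):
    # Loss is minimal around x == 2 regardless of 'depth'
    loss = (params['x'] - 2.0) ** 2 + 0.1 * params['depth']
    return {'loss': loss, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials,
            early_stop_fn=no_progress_loss(iteration_stop_count=12,
                                           percent_increase=0.025))

# 'best' holds raw values: the sampled value for hp.uniform, the index for hp.choice
print(best, trials.best_trial['result']['loss'])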
Example 4
    # y prediction column
    y_pred_col = ['y_eps pct']

    # window settings
    backlooking_yeras = 4

    # results location
    export_results = False
    export_results = '/Users/vanalmsick/Workspace/MasterThesis/results/'

    model_name = 'dense_lit_best'

    ###########################################################################

    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    tracking_address = my_helpers.get_project_directories(
        key='tensorboard_logs')
    try:
        shutil.rmtree(tracking_address)
        time.sleep(10)
    except:
        pass
    os.mkdir(tracking_address)

    from app.b_data_cleaning import get_dataset_registry

    dataset_props = get_dataset_registry()[dataset_name]
    comp_col = dataset_props['company_col']
    time_cols = dataset_props['iter_cols']
    industry_col = dataset_props['industry_col']

    from app.c_data_prep.i_feature_engineering import get_clean_data, feature_engerneeing
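The log-directory reset above wraps shutil.rmtree in a bare try/except before calling os.mkdir. A minimal alternative sketch that avoids the bare except (the directory path below is hypothetical):

import os
import shutil
import tempfile

# Hypothetical TensorBoard log directory for illustration
tracking_address = os.path.join(tempfile.gettempdir(), 'tensorboard_logs_demo')

# Remove any previous logs and recreate the directory;
# ignore_errors handles a missing directory, exist_ok avoids failing if it already exists
shutil.rmtree(tracking_address, ignore_errors=True)
os.makedirs(tracking_address, exist_ok=True)
print('Logs will be written to', tracking_address)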
Example 5
def compile_and_fit(model,
                    train,
                    val,
                    model_name='UNKNOWN',
                    patience=25,
                    MAX_EPOCHS=50,
                    verbose=1):
    tracking_address = my_helpers.get_project_directories(
        key='tensorboard_logs')
    TBLOGDIR = tracking_address + "/" + model_name

    # Log to MLflow
    mlflow.keras.autolog()  # This is all you need!
    MLFLOW_RUN_NAME = f'{model_name} - {datetime.datetime.now().strftime("%y%m%d_%H%M%S")}'

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=TBLOGDIR,
                                                          histogram_freq=1)
    model.compile(loss=tf.losses.MeanAbsoluteError(),
                  optimizer=tf.optimizers.Adam(),
                  metrics=TF_ERROR_METRICS)

    if val is not None:
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=patience,
            mode='min',
            restore_best_weights=True)
        training_history = model.fit(
            train,
            epochs=MAX_EPOCHS,
            validation_data=val,
            callbacks=[early_stopping, tensorboard_callback],
            verbose=verbose)
    else:
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='loss',
            patience=patience,
            mode='min',
            restore_best_weights=True)
        training_history = model.fit(
            train,
            epochs=MAX_EPOCHS,
            callbacks=[early_stopping, tensorboard_callback],
            verbose=verbose)

    # Build a per-layer summary table (collect rows first; DataFrame.append was removed in pandas 2.0)
    summary_rows = []
    for layer in model.layers:
        summary_rows.append({
            "Layer (Type)":
            layer.name + '(' + layer.__class__.__name__ + ')',
            "Input Shape":
            layer.input_shape,
            "Output Shape":
            layer.output_shape,
            "Param #":
            layer.count_params(),
            "Dropout":
            layer.dropout if hasattr(layer, 'dropout') else 'nan',
            "Bias initializer":
            layer.bias_initializer._tf_api_names
            if hasattr(layer, 'bias_initializer') and hasattr(
                layer.bias_initializer, '_tf_api_names') else 'nan',
            "Bias regularizer":
            layer.bias_regularizer
            if hasattr(layer, 'bias_regularizer') else 'nan'
        })
    summary_table = pd.DataFrame(summary_rows,
                                 columns=[
                                     "Layer (Type)", "Input Shape",
                                     "Output Shape", "Param #", "Dropout",
                                     "Bias initializer", "Bias regularizer"
                                 ])

    mlflow_additional_params = {
        'layer_df': summary_table,
        'model_type': 'TensorFlow',
        'history_obj': training_history,
        'model_name': model_name,
        'max_epochs': early_stopping.params['epochs'],
        'actual_epochs': early_stopping.stopped_epoch,
        'early_stopped': model.stop_training,
        'loss': model.loss.name
    }

    if verbose != 0:
        print(model_name)
        print(model.summary())

    return training_history, mlflow_additional_params
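The callback pattern in compile_and_fit (EarlyStopping on val_loss with restored best weights, plus a TensorBoard logger) can be exercised on a toy model. A minimal, self-contained sketch; the data, model architecture, and log directory below are hypothetical, and the mlflow autologging is omitted:

import numpy as np
import tensorflow as tf

# Toy regression data standing in for the train/val tf.data pipelines
X = np.random.rand(256, 8).astype('float32')
y = X.sum(axis=1, keepdims=True)
train = tf.data.Dataset.from_tensor_slices((X[:200], y[:200])).batch(32)
val = tf.data.Dataset.from_tensor_slices((X[200:], y[200:])).batch(32)

model = tf.keras.Sequential([tf.keras.layers.Dense(16, activation='relu'),
                             tf.keras.layers.Dense(1)])
model.compile(loss=tf.losses.MeanAbsoluteError(),
              optimizer=tf.optimizers.Adam(),
              metrics=[tf.metrics.MeanAbsoluteError()])

# Same callback pattern as above: stop on a stalled val_loss and log to TensorBoard
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=5,
                                                  mode='min',
                                                  restore_best_weights=True)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='/tmp/tb_demo',  # hypothetical log dir
                                                      histogram_freq=1)
history = model.fit(train,
                    epochs=50,
                    validation_data=val,
                    callbacks=[early_stopping, tensorboard_callback],
                    verbose=0)
print('best val_loss:', min(history.history['val_loss']))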