Example #1
0
def main():
    """Entry point: load configuration, read/impute/scale the aggregated
    time series, then build per-outcome train/valid/test splits.

    Reads ``Configuration.json`` from the working directory and the CSV at
    ``paths.data_path``. No value is returned; this variant stops after
    calling ``process_data`` for each outcome.
    """
    # Context manager closes the config file deterministically; the previous
    # `json.load(open(...))` form leaked the file handle.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()

        # NOTE(review): the unpacked split arrays are not used further in this
        # variant of main(); presumably downstream steps were removed.
        X_train_y0, y_train1, X_valid_y0, X_valid_y1, X_valid, y_val1, X_test, y_test1, timesteps, n_features=\
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, non_smotedtime_series[grouping], lookback)
Example #2
0
def main():
    """Entry point: train (and save) an LSTM autoencoder per outcome, then
    report reconstruction-error-based classification performance.

    Reads ``Configuration.json``; writes the trained model to
    ``paths.saved_models_path`` as ``<model name><outcome>.h5`` and emits
    diagnostic plots via the autoencoder helper methods.
    """
    # Context manager closes the config file deterministically; the previous
    # `json.load(open(...))` form leaked the file handle.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        # NOTE(review): fold_ind is unpacked but never used here.
        fold_ind, train_ind, test_ind = get_train_test_split(non_smotedtime_series[outcome].astype(int),
                                                             non_smotedtime_series[grouping])

        X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps,\
        n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, lookback,
                         train_ind, test_ind)

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome, outcome, timesteps, n_features)
        autoencoder.summary()

        # Autoencoder training: input == target (reconstruct X_train_y0).
        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback, X_valid_y0, X_valid_y0, 2)

        ###save model
        filename = saved_models_path + configs['model']['name'] + outcome + '.h5'
        autoencoder.save_model(filename)

        ####LSTM autoencoder
        autoencoder.plot_history()
        test_x_predictions = autoencoder.predict(X_test)
        # Per-window reconstruction MSE is the anomaly score.
        mse = np.mean(np.power(flatten(X_test) - flatten(test_x_predictions), 2), axis=1)

        test_error_df = pd.DataFrame({'Reconstruction_error': mse,
                                      'True_class': y_test.tolist()})

        pred_y, best_threshold, precision_rt, recall_rt = \
            autoencoder.predict_binary(test_error_df.True_class, test_error_df.Reconstruction_error)

        autoencoder.output_performance(test_error_df.True_class, test_error_df.Reconstruction_error, pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)
def main():
    """Entry point: load and normalize the aggregated time series, then plot
    risk scores for each configured classification target.

    Reads ``Configuration.json`` and the CSV at ``paths.data_path``; all
    output is produced by ``Visualiser.plot_risk_scores``.
    """
    # Context manager closes the config file deterministically; the previous
    # `json.load(open(...))` form leaked the file handle.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']

    targets = configs['data']['classification_target']
    timeseries_path = configs['paths']['data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    risk_score_visualiser = Visualiser(normalized_timeseries,
                                       non_smotedtime_series, dynamic_features,
                                       static_features)
    for target in targets:
        risk_score_visualiser.plot_risk_scores(target)
def main():
    """SMOTE pipeline: per outcome, load the SMOTEd stacked time series,
    impute/scale it, build lookback windows, train an LSTM autoencoder, and
    save loss / threshold / reconstruction-error / ROC / PR figures as PDFs.
    """
    # NOTE(review): the file handle from open() is never closed — a `with`
    # block would be safer.
    configs = json.load(open('Configuration.json', 'r'))

    grouping = configs['data']['grouping']
    static_features = configs['data']['static_columns']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']

    ##read, impute and scale dataset

    ##start working per outcome
    for outcome in outcomes:
        # Each outcome has its own pre-SMOTEd CSV.
        time_series = pd.read_csv(timeseries_path + "SMOTEDTimeSeries/" +
                                  outcome + "StackedTimeSeries1Day.csv")

        time_series[dynamic_features] = impute(time_series, dynamic_features)
        normalised_series = scale(time_series, dynamic_features)
        normalised_series.insert(0, grouping, time_series[grouping])
        normalised_series.insert(len(normalised_series.columns), outcome,
                                 time_series[outcome])

        # Shift the outcome label back by (lookback - 1) steps so a window's
        # label reflects the event at the end of the window.
        normalised_series = curve_shift(normalised_series,
                                        grouping,
                                        outcome,
                                        shift_by=lookback - 1)

        # NOTE(review): decision_maker is created but never used in this loop.
        decision_maker = DecisionMaker()

        #train/test and validation sets
        X_cols = (normalised_series.columns).tolist()
        X_cols.remove(outcome)
        X_cols.remove(grouping)

        input_X = normalised_series.loc[:,
                                        normalised_series.columns.isin(
                                            X_cols
                                        )].values  # converts the df to a numpy array
        input_y = normalised_series[outcome].values

        n_features = input_X.shape[1]  # number of features

        # Build overlapping lookback windows (one label per window).
        X, y = temporalize(X=input_X, y=input_y, lookback=lookback)

        X_train, X_test, y_train, y_test = train_test_split(np.array(X),
                                                            np.array(y),
                                                            test_size=0.33,
                                                            random_state=SEED,
                                                            stratify=y)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train,
            y_train,
            test_size=0.33,
            random_state=SEED,
            stratify=y_train)

        X_train = X_train.reshape(X_train.shape[0], lookback, n_features)
        X_valid = X_valid.reshape(X_valid.shape[0], lookback, n_features)
        X_test = X_test.reshape(X_test.shape[0], lookback, n_features)

        # NOTE(review): distrs_percents is only consumed by the commented-out
        # print below — effectively dead code.
        distrs_percents = [
            get_distribution_percentages(
                (normalised_series[outcome]).astype(int))
        ]
        scaler = StandardScaler().fit(flatten(X_train))

        a = flatten(X_train)
        print('colwise mean', np.mean(a, axis=0).round(6))
        print('colwise variance', np.var(a, axis=0))

        # NOTE(review): X_valid/X_test are scaled here, but X_train is fed to
        # fit() below UNSCALED — confirm whether training should also use the
        # scaler (looks inconsistent).
        X_valid_scaled = Models.LSTMAutoEncoder.Utils.scale(X_valid, scaler)
        X_test_scaled = Models.LSTMAutoEncoder.Utils.scale(X_test, scaler)

        timesteps = X_train.shape[1]  # equal to the lookback
        n_features = X_train.shape[2]  # 59

        epochs = 100
        lr = 0.0001

        # Symmetric encoder (32 -> 16) / decoder (16 -> 32) LSTM autoencoder.
        lstm_autoencoder = Sequential()
        # Encoder
        lstm_autoencoder.add(
            LSTM(32,
                 activation='relu',
                 input_shape=(timesteps, n_features),
                 return_sequences=True))
        lstm_autoencoder.add(
            LSTM(16, activation='relu', return_sequences=False))
        lstm_autoencoder.add(RepeatVector(timesteps))
        # Decoder
        lstm_autoencoder.add(LSTM(16, activation='relu',
                                  return_sequences=True))
        lstm_autoencoder.add(LSTM(32, activation='relu',
                                  return_sequences=True))
        lstm_autoencoder.add(TimeDistributed(Dense(n_features)))

        lstm_autoencoder.summary()

        adam = optimizers.Adam(lr)
        lstm_autoencoder.compile(loss='mse', optimizer=adam)

        # NOTE(review): cp and tb callbacks are created but never passed to
        # fit() — checkpointing/TensorBoard logging is not actually active.
        cp = ModelCheckpoint(filepath="lstm_autoencoder_classifier.h5",
                             save_best_only=True,
                             verbose=0)

        tb = TensorBoard(log_dir='./logs',
                         histogram_freq=0,
                         write_graph=True,
                         write_images=True)

        # NOTE(review): validation_data=(X_valid, X_train) pairs arrays with
        # different sample counts; an autoencoder presumably wants
        # (X_valid, X_valid) — confirm.
        lstm_autoencoder_history = lstm_autoencoder.fit(
            X_train,
            X_train,
            epochs=epochs,
            batch_size=lookback,
            validation_data=(X_valid, X_train),
            verbose=2).history

        #print(distrs_percents)
        ####LSTM autoencoder

        plt.figure(figsize=(10, 10))
        plt.plot(lstm_autoencoder_history['loss'], linewidth=2, label='Train')
        plt.plot(lstm_autoencoder_history['val_loss'],
                 linewidth=2,
                 label='Valid')
        plt.legend(loc='upper right')
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.savefig("LossOverEpochsSMOTE.pdf", bbox_inches='tight')

        plt.figure(figsize=(10, 10))

        # Validation-set reconstruction error, used to inspect thresholds.
        valid_x_predictions = lstm_autoencoder.predict(X_valid_scaled)
        mse = np.mean(np.power(
            flatten(X_valid_scaled) - flatten(valid_x_predictions), 2),
                      axis=1)

        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_valid.tolist()
        })

        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        plt.plot(threshold_rt,
                 precision_rt[1:],
                 label="Precision",
                 linewidth=5)
        plt.plot(threshold_rt, recall_rt[1:], label="Recall", linewidth=5)
        plt.title('Precision and recall for different threshold values')
        plt.xlabel('Threshold')
        plt.ylabel('Precision/Recall')
        plt.legend()
        plt.savefig(outcome + "ThresholdSMOTE.pdf", bbox_inches='tight')

        # Test-set reconstruction error drives all remaining diagnostics.
        test_x_predictions = lstm_autoencoder.predict(X_test_scaled)
        mse = np.mean(np.power(
            flatten(X_test_scaled) - flatten(test_x_predictions), 2),
                      axis=1)

        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_test.tolist()
        })

        plt.figure(figsize=(10, 10))

        # NOTE(review): hard-coded decision threshold — presumably tuned by
        # eye from the threshold plot above; verify.
        threshold_fixed = 0.3
        groups = error_df.groupby('True_class')
        fig, ax = plt.subplots()

        for name, group in groups:
            ax.plot(group.index,
                    group.Reconstruction_error,
                    marker='o',
                    ms=3.5,
                    linestyle='',
                    label="Break" if name == 1 else "Normal")
        ax.hlines(threshold_fixed,
                  ax.get_xlim()[0],
                  ax.get_xlim()[1],
                  colors="r",
                  zorder=100,
                  label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(outcome + "ReconstructionerrorSMOTE.pdf",
                    bbox_inches='tight')

        pred_y = [
            1 if e > threshold_fixed else 0
            for e in error_df.Reconstruction_error.values
        ]
        conf_matrix = confusion_matrix(error_df.True_class, pred_y)

        plt.figure(figsize=(6, 6))
        sns.heatmap(conf_matrix,
                    xticklabels=LABELS,
                    yticklabels=LABELS,
                    annot=True,
                    fmt="d")
        plt.title("Confusion matrix")
        plt.ylabel('True class')
        plt.xlabel('Predicted class')
        plt.savefig(outcome + "ConfusionMatrixSMOTE.pdf", bbox_inches='tight')

        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(
            false_pos_rate,
            true_pos_rate,
        )

        plt.figure(figsize=(10, 10))

        plt.plot(false_pos_rate,
                 true_pos_rate,
                 linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(outcome + "rocSMOTE.pdf", bbox_inches='tight')

        precision, recall, thresholds = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        pr_auc = auc(recall, precision)

        plt.figure(figsize=(10, 10))

        plt.plot(recall, precision, linewidth=5, label='AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        # NOTE(review): this figure plots precision-recall but the title below
        # says ROC — likely a copy-paste slip; confirm before changing.
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(outcome + "precision_recall_aucSMOTE.pdf",
                    bbox_inches='tight')
def main():
    """Full pipeline: per outcome, load-or-train an LSTM autoencoder, score
    reconstruction error, then train an XGBoost classifier on static +
    aggregate features (with the MSE appended), and collect all results in a
    ClassificationReport for cross-outcome comparison plots.
    """
    # NOTE(review): the file handle from open() is never closed — a `with`
    # block would be safer.
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    autoencoder_models_path = configs['paths']['autoencoder_models_path']
    test_data_path = configs['paths']['test_data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    #intialise classification report which will house results of all outcomes
    classification_report = ClassificationReport()

    #save lstm performance for comparison with final outcome
    lstm_praucs = []
    ##start working per outcome
    for outcome in outcomes:
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ##Load LSTM models if they exist, otherwise train new models and save them
        autoencoder_filename = autoencoder_models_path + configs['model'][
            'name'] + outcome + '.h5'
        X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, \
        n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, lookback,
                         train_ind, test_ind)
        # Only non-"3D" outcomes are processed; "3D" outcomes are skipped
        # entirely (no else branch).
        if ("3D" not in outcome):
            if os.path.isfile(autoencoder_filename):
                print(" Autoencoder trained model exists for oucome", outcome,
                      "file:", autoencoder_filename)
                autoencoder = LSTMAutoEncoder(configs['model']['name'] +
                                              outcome,
                                              outcome,
                                              timesteps,
                                              n_features,
                                              saved_model=autoencoder_filename)
                autoencoder.summary()

            else:
                print("Autencoder trained model does not exist for outcome",
                      outcome, "file:", autoencoder_filename)
                autoencoder = LSTMAutoEncoder(
                    configs['model']['name'] + outcome, outcome, timesteps,
                    n_features)
                autoencoder.summary()

                # Autoencoder training: input == target (reconstruction).
                autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                                X_valid_y0, X_valid_y0, 2)
                autoencoder.plot_history()

            # Per-window reconstruction MSE becomes a feature for XGBoost.
            train_x_predictions = autoencoder.predict(X_train)
            mse_train = np.mean(np.power(
                lstm_flatten(X_train) - lstm_flatten(train_x_predictions), 2),
                                axis=1)

            test_x_predictions = autoencoder.predict(X_test)

            mse_test = np.mean(np.power(
                lstm_flatten(X_test) - lstm_flatten(test_x_predictions), 2),
                               axis=1)

            test_error_df = pd.DataFrame({
                'Reconstruction_error': mse_test,
                'True_class': y_test.tolist()
            })

            pred_y, best_threshold, precision_rt, recall_rt = \
                  autoencoder.predict_binary(test_error_df.True_class, test_error_df.Reconstruction_error)

            autoencoder.output_performance(test_error_df.True_class, pred_y)
            autoencoder.plot_reconstruction_error(test_error_df,
                                                  best_threshold)
            autoencoder.plot_roc(test_error_df)
            autoencoder.plot_pr(precision_rt, recall_rt)
            # Keep the LSTM PR-AUC so it can be compared against XGBoost later.
            lstm_prauc = auc(recall_rt, precision_rt)
            lstm_praucs.append(lstm_prauc)
            #Feature Selector
            training_loc = train_ind[0]  #+train_ind[1]
            training_ids = non_smotedtime_series.iloc[training_loc]
            training_ids = training_ids[grouping]

            testing_ids = non_smotedtime_series.iloc[test_ind[1]]
            testing_ids = testing_ids[grouping]

            # Wide format: one row per group, one column per (feature, step).
            flat_df, timesteps = flatten(non_smotedtime_series,
                                         dynamic_features, grouping,
                                         static_features, outcome)
            temporal_features = set(flat_df.columns) - set(static_features)
            temporal_features = set(temporal_features) - set(
                [outcome, grouping])

            X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
            y_train = X_train[outcome].astype(int)
            training_groups = X_train[grouping]
            X_train_static = X_train[static_features]
            X_train_static[grouping] = training_groups
            X_train = X_train[temporal_features]

            X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
            y_test = X_test[outcome].astype(int)
            testing_groups = X_test[grouping]
            X_test_static = X_test[static_features]
            # NOTE(review): `.loc[grouping] = ...` assigns a ROW labelled by
            # the grouping name, unlike the column assignment used for
            # X_train_static above — looks like a bug; confirm.
            X_test_static.loc[grouping] = testing_groups
            X_test = X_test[temporal_features]

            ########
            aggregate_df = generate_aggregates(X_train, temporal_features,
                                               grouping, training_groups)

            static_aggregate_train_df = pd.concat(
                [aggregate_df, X_train_static], axis=1, join='inner')
            static_aggregate_train_df = static_aggregate_train_df.loc[:,
                                                                      ~static_aggregate_train_df
                                                                      .columns.
                                                                      duplicated(
                                                                      )]
            static_aggregate_train_df.drop(columns=[grouping],
                                           inplace=True,
                                           axis=1)
            # Append the autoencoder reconstruction error as a feature.
            static_aggregate_train_df['mse'] = mse_train

            aggregate_df_test = generate_aggregates(X_test, temporal_features,
                                                    grouping, testing_groups)
            static_aggregate_test_df = pd.concat(
                [aggregate_df_test, X_test_static], axis=1, join='inner')
            static_aggregate_test_df = static_aggregate_test_df.loc[:,
                                                                    ~static_aggregate_test_df
                                                                    .columns.
                                                                    duplicated(
                                                                    )]
            static_aggregate_test_df.drop(columns=[grouping],
                                          inplace=True,
                                          axis=1)
            static_aggregate_test_df['mse'] = mse_test

            static_aggregate_test_df.to_csv("static_aggretate.csv",
                                            index=False)
            static_baseline_classifier = XGBoostClassifier(
                static_aggregate_train_df, y_train, outcome, grouping)

            static_baseline_classifier.fit("aggregate_static", mse_train * 100)

            y_pred_binary, best_threshold, precision_rt, recall_rt, yhat = \
                static_baseline_classifier.predict(static_aggregate_test_df, y_test)

            print(" CLASS WEIGHTS FOR Y ACTUAL: ", class_counts(y_test))
            print(" CLASS WEIGHTS FOR Y PREDICTE: ",
                  class_counts(y_pred_binary))

            static_baseline_classifier.output_performance(
                y_test, y_pred_binary)
            static_baseline_classifier.plot_pr(precision_rt, recall_rt,
                                               "XGBoost Static")
            static_baseline_classifier.plot_feature_importance(
                static_aggregate_test_df.columns)

            to_write_for_plotting = static_aggregate_test_df
            to_write_for_plotting['outcome'] = y_test
            to_write_for_plotting.to_csv(test_data_path + outcome + ".csv",
                                         index=False)

            #add to classification report

            classification_report.add_model_result(outcome, y_test,
                                                   y_pred_binary,
                                                   best_threshold,
                                                   precision_rt, recall_rt,
                                                   yhat)

            #delete variables
            # Free the large per-outcome arrays before the next iteration to
            # keep peak memory down.
            del static_aggregate_train_df
            del static_aggregate_test_df
            del X_train
            del X_train_y0
            del X_valid_y0
            del X_valid
            del y_valid
            del X_test
            del y_test
            del timesteps
            del train_x_predictions
            del test_x_predictions
            del test_error_df
    #risk_score_visualiser = Visualiser(normalized_timeseries, non_smotedtime_series,
    #                                  dynamic_features, static_features
    #                                 )
    #After fitting model to all outcomes, plot and get summary statistics
    classification_report.plot_distributions_vs_aucs()
    classification_report.plot_pr_auc()
    classification_report.plot_auc()
    # NOTE(review): "lstim" looks like a typo in the API name, but it is
    # defined elsewhere — rename there first if it is to change.
    classification_report.compare_lstim_xgboost(lstm_praucs)
Example #6
0
def main():
    """Pipeline variant: load-or-train an LSTM autoencoder per outcome, score
    reconstruction error, run an XGBoost feature selector on flattened
    temporal features, then train an XGBoost baseline on slope + static
    features (with the MSE appended).
    """
    # NOTE(review): the file handle from open() is never closed — a `with`
    # block would be safer.
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        # NOTE(review): decision_maker and fold_ind are created/unpacked but
        # never used in this loop.
        decision_maker = DecisionMaker()
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ##Load LSTM models if they exist, otherwise train new models and save them
        filename = saved_models_path + configs['model'][
            'name'] + outcome + '.h5'

        X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, \
        n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, lookback,
                         train_ind, test_ind)

        if os.path.isfile(filename):
            # Reuse the previously-saved model for this outcome.
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome,
                                          timesteps,
                                          n_features,
                                          saved_model=filename)
            autoencoder.summary()

        else:
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome, timesteps, n_features)
            autoencoder.summary()

            # Autoencoder training: input == target (reconstruction).
            autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                            X_valid_y0, X_valid_y0, 2)
            autoencoder.plot_history()
            ###save model
            filename = saved_models_path + configs['model'][
                'name'] + outcome + '.h5'
            autoencoder.save_model(filename)

        ####Predicting using the fitted model (loaded or trained)

        # Per-window reconstruction MSE becomes a feature for XGBoost below.
        train_x_predictions = autoencoder.predict(X_train)
        mse_train = np.mean(np.power(
            lstm_flatten(X_train) - lstm_flatten(train_x_predictions), 2),
                            axis=1)

        test_x_predictions = autoencoder.predict(X_test)

        mse_test = np.mean(np.power(
            lstm_flatten(X_test) - lstm_flatten(test_x_predictions), 2),
                           axis=1)

        test_error_df = pd.DataFrame({
            'Reconstruction_error': mse_test,
            'True_class': y_test.tolist()
        })


        pred_y, best_threshold, precision_rt, recall_rt = \
              autoencoder.predict_binary(test_error_df.True_class, test_error_df.Reconstruction_error)

        autoencoder.output_performance(test_error_df.True_class,
                                       test_error_df.Reconstruction_error,
                                       pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)

        #Feature Selector
        training_loc = train_ind[0]  #+train_ind[1]
        training_ids = non_smotedtime_series.iloc[training_loc]
        training_ids = training_ids[grouping]

        testing_ids = non_smotedtime_series.iloc[test_ind[1]]
        testing_ids = testing_ids[grouping]

        # Wide format: one row per group, one column per (feature, step).
        flat_df, timesteps = flatten(non_smotedtime_series, dynamic_features,
                                     grouping, static_features, outcome)
        temporal_features = set(flat_df.columns) - set(static_features)
        temporal_features = set(temporal_features) - set([outcome, grouping])

        X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
        y_train = X_train[outcome].astype(int)
        training_groups = X_train[grouping]
        X_train_static = X_train[static_features]
        # NOTE(review): `.loc[grouping] = ...` assigns a ROW labelled by the
        # grouping name rather than a column — likely intended as
        # `X_train_static[grouping] = ...`; confirm.
        X_train_static.loc[grouping] = training_groups
        X_train = X_train[temporal_features]
        X_train = scale(X_train, temporal_features)
        X_train['mse'] = mse_train

        #X_train, y_train = smote(X_train, y_train)
        X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
        y_test = X_test[outcome].astype(int)
        testing_groups = X_test[grouping]
        X_test_static = X_test[static_features]
        # NOTE(review): same row-vs-column `.loc` concern as above.
        X_test_static.loc[grouping] = testing_groups
        X_test = X_test[temporal_features]
        X_test = scale(X_test, temporal_features)
        X_test['mse'] = mse_test

        feature_selector = XGBoostClassifier(X_train, y_train, outcome,
                                             grouping)  #
        feature_selector.fit("temporal", training_groups)

        y_pred_binary, best_threshold, precision_rt, recall_rt = feature_selector.predict(
            X_test, y_test)
        feature_selector.plot_pr(precision_rt, recall_rt, "XGBoost Temporal")

        featuredf = pd.DataFrame()

        temporal_features = set(temporal_features) - set([outcome])
        featuredf['features'] = list(temporal_features)
        # NOTE(review): the importance-based filtering below is commented out,
        # so featuredf currently passes ALL temporal features through.
        #featuredf['imp'] = fs_fi
        #featuredf = featuredf[featuredf['imp'] > 0]
        ########
        baseline_features = featuredf['features']

        # Reduce "<name>_<step>" columns to base names, then keep step 0 only.
        baseline_features = set(
            [x.partition('_')[0] for x in list(baseline_features)])

        baseline_features = [x + "_0" for x in list(baseline_features)]

        baseline_features.insert(0, grouping)
        # NOTE(review): baseline_static_features is built but never used.
        baseline_static_features = baseline_features + static_features

        slopes_df = generate_slopes(X_train, temporal_features,
                                    static_features, grouping, training_groups)

        # NOTE(review): aggregate_df is computed but never used in this block.
        aggregate_df = generate_aggregates(X_train, temporal_features,
                                           grouping, training_groups)

        slopes_static_baseline_train_df = pd.concat(
            [slopes_df, X_train_static], axis=1, join='inner')

        # Drop duplicated columns introduced by the concat.
        slopes_static_baseline_train_df = slopes_static_baseline_train_df.loc[:,
                                                                              ~slopes_static_baseline_train_df
                                                                              .
                                                                              columns
                                                                              .
                                                                              duplicated(
                                                                              )]
        slopes_static_baseline_train_groups = slopes_static_baseline_train_df[
            grouping]
        slopes_static_baseline_train_df.drop(columns=[grouping],
                                             inplace=True,
                                             axis=1)
        slopes_static_baseline_train_df['mse'] = mse_train

        slopes_df_test = generate_slopes(X_test, temporal_features,
                                         static_features, grouping,
                                         testing_groups)

        slopes_static_baseline_test_df = pd.concat(
            [slopes_df_test, X_test_static], axis=1, join='inner')
        slopes_static_baseline_test_df = slopes_static_baseline_test_df.loc[:,
                                                                            ~slopes_static_baseline_test_df
                                                                            .
                                                                            columns
                                                                            .
                                                                            duplicated(
                                                                            )]
        slopes_static_baseline_test_groups = slopes_static_baseline_test_df[
            grouping]
        slopes_static_baseline_test_df.drop(columns=[grouping],
                                            inplace=True,
                                            axis=1)
        slopes_static_baseline_test_df['mse'] = mse_test

        slopes_static_baseline_classifier = XGBoostClassifier(
            slopes_static_baseline_train_df, y_train, outcome, grouping)

        #bs_y, bs_ths, bs_id, bs_fi = slopes_static_baseline_classifier.fit("baseline_static_slope",
        #                                                                      slopes_static_baseline_train_groups)
        slopes_static_baseline_classifier.fit(
            "baseline_static_slope", slopes_static_baseline_train_groups)
        y_pred_binary, best_threshold, precision_rt, recall_rt = \
            slopes_static_baseline_classifier.predict( slopes_static_baseline_test_df, y_test)
        slopes_static_baseline_classifier.plot_pr(precision_rt, recall_rt,
                                                  "XGBoost Static")
Example #7
0
def main():
    """Train an LSTM autoencoder per classification outcome and evaluate it.

    For each outcome listed in Configuration.json this:
      1. trains the autoencoder on negative-class sequences,
      2. picks a reconstruction-error threshold maximizing F1 on validation,
      3. writes performance metrics to CSV and saves reconstruction-error,
         ROC and precision-recall plots under the autoencoder path.
    """
    # Close the config file deterministically instead of relying on GC.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    autoencoder_path = configs['paths']['autoencoder_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome,
                         grouping, non_smotedtime_series[grouping], lookback)

        # NOTE(review): epochs is hard-coded here although sibling scripts
        # read configs['training']['epochs'] — confirm intent.
        epochs = 100

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()

        # BUG FIX: removed dead locals from the original — a DecisionMaker
        # instance and ModelCheckpoint/TensorBoard callbacks that were built
        # but never passed to fit().

        ####LSTM autoencoder
        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback, X_valid_y0,
                        X_valid_y0, 2)
        autoencoder.plot_history()

        ## --- threshold selection on the validation split ------------------
        valid_x_predictions = autoencoder.predict(X_valid)
        mse = np.mean(np.power(
            flatten(X_valid) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_valid.tolist()
        })

        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)

        # F1 per threshold. BUG FIX: guard the 0/0 case (both precision and
        # recall zero) which produced NaN / runtime warnings in the original.
        denom = precision_rt + recall_rt
        fscore = np.divide(2 * precision_rt * recall_rt,
                           denom,
                           out=np.zeros_like(denom),
                           where=denom > 0)

        # BUG FIX: precision/recall arrays are one element longer than
        # threshold_rt; restrict argmax to indices that have a matching
        # threshold to avoid a potential IndexError.
        ix = int(np.argmax(fscore[:len(threshold_rt)]))
        best_threshold = threshold_rt[ix]
        pred_y = (error_df.Reconstruction_error >
                  best_threshold).astype('int32')

        perf_dict = performance_metrics(error_df.True_class, pred_y,
                                        error_df.Reconstruction_error)
        # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.0;
        # build the one-row frame directly.
        perf_df = pd.DataFrame([perf_dict])
        perf_df.to_csv(autoencoder_path + "performancemetrics" + outcome +
                       ".csv",
                       index=False)

        ## --- evaluation on the test split ---------------------------------
        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(np.power(
            flatten(X_test) - flatten(test_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_test.tolist()
        })

        # Scatter of per-point reconstruction error by class, with the chosen
        # threshold. BUG FIX: the original opened an extra plt.figure() right
        # before plt.subplots(), leaking an empty figure.
        fig, ax = plt.subplots(figsize=(10, 10))
        for name, group in error_df.groupby('True_class'):
            ax.plot(group.index,
                    group.Reconstruction_error,
                    marker='o',
                    ms=3.5,
                    linestyle='',
                    label="1" if name == 1 else "0")
        ax.hlines(best_threshold,
                  ax.get_xlim()[0],
                  ax.get_xlim()[1],
                  colors="r",
                  zorder=100,
                  label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(autoencoder_path + outcome + "Reconstructionerror.pdf",
                    bbox_inches='tight')

        ## ROC on the test split.
        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(false_pos_rate, true_pos_rate)

        plt.figure(figsize=(10, 10))
        plt.plot(false_pos_rate,
                 true_pos_rate,
                 linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(autoencoder_path + outcome + "roc.pdf",
                    bbox_inches='tight')

        # NOTE(review): precision_rt/recall_rt come from the VALIDATION split
        # computed above, so this PR curve is validation, not test — confirm
        # this is intended.
        pr_auc = auc(recall_rt, precision_rt)

        plt.figure(figsize=(10, 10))
        plt.plot(recall_rt,
                 precision_rt,
                 linewidth=5,
                 label='PR-AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [1, 0], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        # BUG FIX: corrected "Curive" typo in the user-facing plot title.
        plt.title('Precision Recall Curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(autoencoder_path + outcome + "precision_recall_auc.pdf",
                    bbox_inches='tight')

        # Release all figures before the next outcome so figures do not
        # accumulate across loop iterations.
        plt.close('all')