Example #1
def main():
    configs = json.load(open('Configuration.json', 'r'))

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()

        X_train_y0, y_train1, X_valid_y0, X_valid_y1, X_valid, y_val1, X_test, y_test1, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping,
                         non_smotedtime_series[grouping], lookback)
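
All of these examples assume imports and project-level helpers that the snippets omit (json, numpy as np, pandas as pd, plus Keras, matplotlib, seaborn and scikit-learn in the later examples, and project functions such as process_data, DecisionMaker and LSTMAutoEncoder). A minimal sketch of the assumed imports and of plausible impute/scale helpers, reconstructed from how they are called; these are hypothetical, not the project's actual code:

import json

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

def impute(df, columns):
    # Hypothetical: fill missing values in the given columns with column means.
    columns = list(columns)
    imputed = SimpleImputer(strategy='mean').fit_transform(df[columns])
    return pd.DataFrame(imputed, columns=columns, index=df.index)

def scale(df, columns):
    # Hypothetical: rescale the given columns to [0, 1].
    columns = list(columns)
    scaled = MinMaxScaler().fit_transform(df[columns])
    return pd.DataFrame(scaled, columns=columns, index=df.index)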
Example #2
def main():
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        fold_ind, train_ind, test_ind = get_train_test_split(non_smotedtime_series[outcome].astype(int),
                                                             non_smotedtime_series[grouping])

        X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, lookback,
                         train_ind, test_ind)

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome, outcome, timesteps, n_features)
        autoencoder.summary()

        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback, X_valid_y0, X_valid_y0, 2)

        ### save model
        filename = saved_models_path + configs['model']['name'] + outcome + '.h5'
        autoencoder.save_model(filename)

        ####LSTM autoencoder
        autoencoder.plot_history()
        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(np.power(flatten(X_test) - flatten(test_x_predictions), 2), axis=1)

        test_error_df = pd.DataFrame({'Reconstruction_error': mse,
                                      'True_class': y_test.tolist()})

        pred_y, best_threshold, precision_rt, recall_rt = \
            autoencoder.predict_binary(test_error_df.True_class, test_error_df.Reconstruction_error)

        autoencoder.output_performance(test_error_df.True_class, test_error_df.Reconstruction_error, pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)
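
The flatten used in the reconstruction-error computation converts the 3D LSTM input of shape (samples, timesteps, features) back to 2D so the MSE can be taken per sample. A minimal sketch of such a helper, assuming the common LSTM-autoencoder pattern (later examples call the same idea lstm_flatten, because flatten is reused there for a DataFrame reshaper):

import numpy as np

def flatten(X):
    # X is a 3D array of shape (samples, timesteps, features).
    # Keep each sample's last timestep, yielding a (samples, features) array.
    flattened_X = np.empty((X.shape[0], X.shape[2]))
    for i in range(X.shape[0]):
        flattened_X[i] = X[i, X.shape[1] - 1, :]
    return flattened_X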
Example #3
def main():
    configs = json.load(open('Configuration.json', 'r'))
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']

    targets = configs['data']['classification_target']
    timeseries_path = configs['paths']['data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    risk_score_visualiser = Visualiser(normalized_timeseries,
                                       non_smotedtime_series, dynamic_features,
                                       static_features)
    for target in targets:
        risk_score_visualiser.plot_risk_scores(target)
Example #4
def main():
    configs = json.load(open('Configuration.json', 'r'))

    grouping = configs['data']['grouping']
    static_features = configs['data']['static_columns']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']

    ##read, impute and scale dataset

    ##start working per outcome
    for outcome in outcomes:
        time_series = pd.read_csv(timeseries_path + "SMOTEDTimeSeries/" +
                                  outcome + "StackedTimeSeries1Day.csv")

        time_series[dynamic_features] = impute(time_series, dynamic_features)
        normalised_series = scale(time_series, dynamic_features)
        normalised_series.insert(0, grouping, time_series[grouping])
        normalised_series.insert(len(normalised_series.columns), outcome,
                                 time_series[outcome])

        normalised_series = curve_shift(normalised_series,
                                        grouping,
                                        outcome,
                                        shift_by=lookback - 1)

        decision_maker = DecisionMaker()

        #train/test and validation sets
        X_cols = (normalised_series.columns).tolist()
        X_cols.remove(outcome)
        X_cols.remove(grouping)

        input_X = normalised_series.loc[:, normalised_series.columns.isin(X_cols)].values  # convert the df to a numpy array
        input_y = normalised_series[outcome].values

        n_features = input_X.shape[1]  # number of features

        X, y = temporalize(X=input_X, y=input_y, lookback=lookback)

        X_train, X_test, y_train, y_test = train_test_split(np.array(X),
                                                            np.array(y),
                                                            test_size=0.33,
                                                            random_state=SEED,
                                                            stratify=y)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train,
            y_train,
            test_size=0.33,
            random_state=SEED,
            stratify=y_train)

        X_train = X_train.reshape(X_train.shape[0], lookback, n_features)
        X_valid = X_valid.reshape(X_valid.shape[0], lookback, n_features)
        X_test = X_test.reshape(X_test.shape[0], lookback, n_features)

        distrs_percents = [
            get_distribution_percentages(
                (normalised_series[outcome]).astype(int))
        ]
        scaler = StandardScaler().fit(flatten(X_train))

        a = flatten(X_train)
        print('colwise mean', np.mean(a, axis=0).round(6))
        print('colwise variance', np.var(a, axis=0))

        # scale the training set with the same scaler, so the model trains on
        # the same scale it is later evaluated on
        X_train = Models.LSTMAutoEncoder.Utils.scale(X_train, scaler)
        X_valid_scaled = Models.LSTMAutoEncoder.Utils.scale(X_valid, scaler)
        X_test_scaled = Models.LSTMAutoEncoder.Utils.scale(X_test, scaler)

        timesteps = X_train.shape[1]  # equal to the lookback
        n_features = X_train.shape[2]  # 59

        epochs = 100
        lr = 0.0001

        lstm_autoencoder = Sequential()
        # Encoder
        lstm_autoencoder.add(
            LSTM(32,
                 activation='relu',
                 input_shape=(timesteps, n_features),
                 return_sequences=True))
        lstm_autoencoder.add(
            LSTM(16, activation='relu', return_sequences=False))
        lstm_autoencoder.add(RepeatVector(timesteps))
        # Decoder
        lstm_autoencoder.add(LSTM(16, activation='relu',
                                  return_sequences=True))
        lstm_autoencoder.add(LSTM(32, activation='relu',
                                  return_sequences=True))
        lstm_autoencoder.add(TimeDistributed(Dense(n_features)))

        lstm_autoencoder.summary()

        adam = optimizers.Adam(lr)
        lstm_autoencoder.compile(loss='mse', optimizer=adam)

        cp = ModelCheckpoint(filepath="lstm_autoencoder_classifier.h5",
                             save_best_only=True,
                             verbose=0)

        tb = TensorBoard(log_dir='./logs',
                         histogram_freq=0,
                         write_graph=True,
                         write_images=True)

        lstm_autoencoder_history = lstm_autoencoder.fit(
            X_train,
            X_train,
            epochs=epochs,
            batch_size=lookback,
            validation_data=(X_valid_scaled, X_valid_scaled),  # targets must match the validation inputs
            callbacks=[cp, tb],  # wire up the checkpoint and TensorBoard callbacks defined above
            verbose=2).history

        #print(distrs_percents)
        ####LSTM autoencoder

        plt.figure(figsize=(10, 10))
        plt.plot(lstm_autoencoder_history['loss'], linewidth=2, label='Train')
        plt.plot(lstm_autoencoder_history['val_loss'],
                 linewidth=2,
                 label='Valid')
        plt.legend(loc='upper right')
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.savefig("LossOverEpochsSMOTE.pdf", bbox_inches='tight')

        plt.figure(figsize=(10, 10))

        valid_x_predictions = lstm_autoencoder.predict(X_valid_scaled)
        mse = np.mean(np.power(
            flatten(X_valid_scaled) - flatten(valid_x_predictions), 2),
                      axis=1)

        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_valid.tolist()
        })

        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        plt.plot(threshold_rt,
                 precision_rt[1:],
                 label="Precision",
                 linewidth=5)
        plt.plot(threshold_rt, recall_rt[1:], label="Recall", linewidth=5)
        plt.title('Precision and recall for different threshold values')
        plt.xlabel('Threshold')
        plt.ylabel('Precision/Recall')
        plt.legend()
        plt.savefig(outcome + "ThresholdSMOTE.pdf", bbox_inches='tight')

        test_x_predictions = lstm_autoencoder.predict(X_test_scaled)
        mse = np.mean(np.power(
            flatten(X_test_scaled) - flatten(test_x_predictions), 2),
                      axis=1)

        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_test.tolist()
        })

        plt.figure(figsize=(10, 10))

        threshold_fixed = 0.3
        groups = error_df.groupby('True_class')
        fig, ax = plt.subplots()

        for name, group in groups:
            ax.plot(group.index,
                    group.Reconstruction_error,
                    marker='o',
                    ms=3.5,
                    linestyle='',
                    label="Break" if name == 1 else "Normal")
        ax.hlines(threshold_fixed,
                  ax.get_xlim()[0],
                  ax.get_xlim()[1],
                  colors="r",
                  zorder=100,
                  label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(outcome + "ReconstructionerrorSMOTE.pdf",
                    bbox_inches='tight')

        pred_y = [
            1 if e > threshold_fixed else 0
            for e in error_df.Reconstruction_error.values
        ]
        conf_matrix = confusion_matrix(error_df.True_class, pred_y)

        plt.figure(figsize=(6, 6))
        sns.heatmap(conf_matrix,
                    xticklabels=LABELS,
                    yticklabels=LABELS,
                    annot=True,
                    fmt="d")
        plt.title("Confusion matrix")
        plt.ylabel('True class')
        plt.xlabel('Predicted class')
        plt.savefig(outcome + "ConfusionMatrixSMOTE.pdf", bbox_inches='tight')

        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(
            false_pos_rate,
            true_pos_rate,
        )

        plt.figure(figsize=(10, 10))

        plt.plot(false_pos_rate,
                 true_pos_rate,
                 linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(outcome + "rocSMOTE.pdf", bbox_inches='tight')

        precision, recall, thresholds = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        pr_auc = auc(recall, precision)

        plt.figure(figsize=(10, 10))

        plt.plot(recall, precision, linewidth=5, label='AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Precision-recall curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(outcome + "precision_recall_aucSMOTE.pdf",
                    bbox_inches='tight')
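
Example #4 leans on two helpers the snippet does not define: temporalize, which turns the flat feature matrix into overlapping lookback windows, and curve_shift, which moves positive labels earlier within each group so a window ending just before the event is already labelled positive. Minimal sketches under those assumptions; the project's actual implementations may differ:

import numpy as np

def temporalize(X, y, lookback):
    # Build overlapping windows of `lookback` consecutive rows; each window
    # takes the label of its last row.
    output_X, output_y = [], []
    for i in range(len(X) - lookback):
        output_X.append(X[i:i + lookback, :])
        output_y.append(y[i + lookback - 1])
    return output_X, output_y

def curve_shift(df, grouping, outcome, shift_by):
    # Within each group, pull the positive label `shift_by` rows earlier so
    # the model learns to flag a window before the event occurs.
    def shift_group(g):
        shifted = g[outcome].shift(-shift_by).fillna(0).astype(int)
        g[outcome] = shifted | g[outcome].astype(int)
        return g
    return df.groupby(grouping, group_keys=False).apply(shift_group)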
Example #5
def main():
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    autoencoder_models_path = configs['paths']['autoencoder_models_path']
    test_data_path = configs['paths']['test_data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    # initialise classification report which will hold results for all outcomes
    classification_report = ClassificationReport()

    #save lstm performance for comparison with final outcome
    lstm_praucs = []
    ##start working per outcome
    for outcome in outcomes:
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ##Load LSTM models if they exist, otherwise train new models and save them
        autoencoder_filename = autoencoder_models_path + configs['model']['name'] + outcome + '.h5'
        X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, \
        n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, lookback,
                         train_ind, test_ind)
        if ("3D" not in outcome):
            if os.path.isfile(autoencoder_filename):
                print(" Autoencoder trained model exists for oucome", outcome,
                      "file:", autoencoder_filename)
                autoencoder = LSTMAutoEncoder(configs['model']['name'] +
                                              outcome,
                                              outcome,
                                              timesteps,
                                              n_features,
                                              saved_model=autoencoder_filename)
                autoencoder.summary()

            else:
                print("Autencoder trained model does not exist for outcome",
                      outcome, "file:", autoencoder_filename)
                autoencoder = LSTMAutoEncoder(
                    configs['model']['name'] + outcome, outcome, timesteps,
                    n_features)
                autoencoder.summary()

                autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                                X_valid_y0, X_valid_y0, 2)
                autoencoder.plot_history()

            train_x_predictions = autoencoder.predict(X_train)
            mse_train = np.mean(np.power(
                lstm_flatten(X_train) - lstm_flatten(train_x_predictions), 2),
                                axis=1)

            test_x_predictions = autoencoder.predict(X_test)

            mse_test = np.mean(np.power(
                lstm_flatten(X_test) - lstm_flatten(test_x_predictions), 2),
                               axis=1)

            test_error_df = pd.DataFrame({
                'Reconstruction_error': mse_test,
                'True_class': y_test.tolist()
            })

            pred_y, best_threshold, precision_rt, recall_rt = \
                  autoencoder.predict_binary(test_error_df.True_class, test_error_df.Reconstruction_error)

            autoencoder.output_performance(test_error_df.True_class, pred_y)
            autoencoder.plot_reconstruction_error(test_error_df,
                                                  best_threshold)
            autoencoder.plot_roc(test_error_df)
            autoencoder.plot_pr(precision_rt, recall_rt)
            lstm_prauc = auc(recall_rt, precision_rt)
            lstm_praucs.append(lstm_prauc)
            #Feature Selector
            training_loc = train_ind[0]  #+train_ind[1]
            training_ids = non_smotedtime_series.iloc[training_loc]
            training_ids = training_ids[grouping]

            testing_ids = non_smotedtime_series.iloc[test_ind[1]]
            testing_ids = testing_ids[grouping]

            flat_df, timesteps = flatten(non_smotedtime_series,
                                         dynamic_features, grouping,
                                         static_features, outcome)
            temporal_features = set(flat_df.columns) - set(static_features)
            temporal_features = set(temporal_features) - set(
                [outcome, grouping])

            X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
            y_train = X_train[outcome].astype(int)
            training_groups = X_train[grouping]
            X_train_static = X_train[static_features]
            X_train_static[grouping] = training_groups
            X_train = X_train[temporal_features]

            X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
            y_test = X_test[outcome].astype(int)
            testing_groups = X_test[grouping]
            X_test_static = X_test[static_features]
            X_test_static[grouping] = testing_groups
            X_test = X_test[temporal_features]

            ########
            aggregate_df = generate_aggregates(X_train, temporal_features,
                                               grouping, training_groups)

            static_aggregate_train_df = pd.concat(
                [aggregate_df, X_train_static], axis=1, join='inner')
            static_aggregate_train_df = static_aggregate_train_df.loc[:, ~static_aggregate_train_df.columns.duplicated()]
            static_aggregate_train_df.drop(columns=[grouping],
                                           inplace=True,
                                           axis=1)
            static_aggregate_train_df['mse'] = mse_train

            aggregate_df_test = generate_aggregates(X_test, temporal_features,
                                                    grouping, testing_groups)
            static_aggregate_test_df = pd.concat(
                [aggregate_df_test, X_test_static], axis=1, join='inner')
            static_aggregate_test_df = static_aggregate_test_df.loc[:, ~static_aggregate_test_df.columns.duplicated()]
            static_aggregate_test_df.drop(columns=[grouping],
                                          inplace=True,
                                          axis=1)
            static_aggregate_test_df['mse'] = mse_test

            static_aggregate_test_df.to_csv("static_aggretate.csv",
                                            index=False)
            static_baseline_classifier = XGBoostClassifier(
                static_aggregate_train_df, y_train, outcome, grouping)

            static_baseline_classifier.fit("aggregate_static", mse_train * 100)

            y_pred_binary, best_threshold, precision_rt, recall_rt, yhat = \
                static_baseline_classifier.predict(static_aggregate_test_df, y_test)

            print(" CLASS WEIGHTS FOR Y ACTUAL: ", class_counts(y_test))
            print(" CLASS WEIGHTS FOR Y PREDICTE: ",
                  class_counts(y_pred_binary))

            static_baseline_classifier.output_performance(
                y_test, y_pred_binary)
            static_baseline_classifier.plot_pr(precision_rt, recall_rt,
                                               "XGBoost Static")
            static_baseline_classifier.plot_feature_importance(
                static_aggregate_test_df.columns)

            to_write_for_plotting = static_aggregate_test_df
            to_write_for_plotting['outcome'] = y_test
            to_write_for_plotting.to_csv(test_data_path + outcome + ".csv",
                                         index=False)

            #add to classification report

            classification_report.add_model_result(outcome, y_test,
                                                   y_pred_binary,
                                                   best_threshold,
                                                   precision_rt, recall_rt,
                                                   yhat)

            #delete variables
            del static_aggregate_train_df
            del static_aggregate_test_df
            del X_train
            del X_train_y0
            del X_valid_y0
            del X_valid
            del y_valid
            del X_test
            del y_test
            del timesteps
            del train_x_predictions
            del test_x_predictions
            del test_error_df
    #risk_score_visualiser = Visualiser(normalized_timeseries, non_smotedtime_series,
    #                                  dynamic_features, static_features
    #                                 )
    #After fitting model to all outcomes, plot and get summary statistics
    classification_report.plot_distributions_vs_aucs()
    classification_report.plot_pr_auc()
    classification_report.plot_auc()
    classification_report.compare_lstim_xgboost(lstm_praucs)
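
generate_aggregates is another undefined project helper; from its call sites it appears to collapse each group's time series into one row of summary statistics per temporal feature. A hypothetical sketch:

import pandas as pd

def generate_aggregates(X, temporal_features, grouping, groups):
    # Hypothetical: one row per group, with min/max/mean/std of every
    # temporal feature over that group's timesteps.
    df = X.copy()
    df[grouping] = groups.values
    agg = df.groupby(grouping)[list(temporal_features)].agg(['min', 'max', 'mean', 'std'])
    agg.columns = ['_'.join(col) for col in agg.columns]
    return agg.reset_index()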
Example #6
def main():
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ##Load LSTM models if they exist, otherwise train new models and save them
        filename = saved_models_path + configs['model']['name'] + outcome + '.h5'

        X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, \
        n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, lookback,
                         train_ind, test_ind)

        if os.path.isfile(filename):
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome,
                                          timesteps,
                                          n_features,
                                          saved_model=filename)
            autoencoder.summary()

        else:
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome, timesteps, n_features)
            autoencoder.summary()

            autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                            X_valid_y0, X_valid_y0, 2)
            autoencoder.plot_history()
            ### save model (filename was already computed above)
            autoencoder.save_model(filename)

        ####Predicting using the fitted model (loaded or trained)

        train_x_predictions = autoencoder.predict(X_train)
        mse_train = np.mean(np.power(
            lstm_flatten(X_train) - lstm_flatten(train_x_predictions), 2),
                            axis=1)

        test_x_predictions = autoencoder.predict(X_test)

        mse_test = np.mean(np.power(
            lstm_flatten(X_test) - lstm_flatten(test_x_predictions), 2),
                           axis=1)

        test_error_df = pd.DataFrame({
            'Reconstruction_error': mse_test,
            'True_class': y_test.tolist()
        })


        pred_y, best_threshold, precision_rt, recall_rt = \
              autoencoder.predict_binary(test_error_df.True_class, test_error_df.Reconstruction_error)

        autoencoder.output_performance(test_error_df.True_class,
                                       test_error_df.Reconstruction_error,
                                       pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)

        #Feature Selector
        training_loc = train_ind[0]  #+train_ind[1]
        training_ids = non_smotedtime_series.iloc[training_loc]
        training_ids = training_ids[grouping]

        testing_ids = non_smotedtime_series.iloc[test_ind[1]]
        testing_ids = testing_ids[grouping]

        flat_df, timesteps = flatten(non_smotedtime_series, dynamic_features,
                                     grouping, static_features, outcome)
        temporal_features = set(flat_df.columns) - set(static_features)
        temporal_features = set(temporal_features) - set([outcome, grouping])

        X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
        y_train = X_train[outcome].astype(int)
        training_groups = X_train[grouping]
        X_train_static = X_train[static_features]
        X_train_static[grouping] = training_groups
        X_train = X_train[temporal_features]
        X_train = scale(X_train, temporal_features)
        X_train['mse'] = mse_train

        #X_train, y_train = smote(X_train, y_train)
        X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
        y_test = X_test[outcome].astype(int)
        testing_groups = X_test[grouping]
        X_test_static = X_test[static_features]
        X_test_static[grouping] = testing_groups
        X_test = X_test[temporal_features]
        X_test = scale(X_test, temporal_features)
        X_test['mse'] = mse_test

        feature_selector = XGBoostClassifier(X_train, y_train, outcome, grouping)
        feature_selector.fit("temporal", training_groups)

        y_pred_binary, best_threshold, precision_rt, recall_rt = feature_selector.predict(
            X_test, y_test)
        feature_selector.plot_pr(precision_rt, recall_rt, "XGBoost Temporal")

        featuredf = pd.DataFrame()

        temporal_features = set(temporal_features) - set([outcome])
        featuredf['features'] = list(temporal_features)
        #featuredf['imp'] = fs_fi
        #featuredf = featuredf[featuredf['imp'] > 0]
        ########
        baseline_features = featuredf['features']

        baseline_features = set(
            [x.partition('_')[0] for x in list(baseline_features)])

        baseline_features = [x + "_0" for x in list(baseline_features)]

        baseline_features.insert(0, grouping)
        baseline_static_features = baseline_features + static_features

        slopes_df = generate_slopes(X_train, temporal_features,
                                    static_features, grouping, training_groups)

        aggregate_df = generate_aggregates(X_train, temporal_features,
                                           grouping, training_groups)

        slopes_static_baseline_train_df = pd.concat(
            [slopes_df, X_train_static], axis=1, join='inner')

        slopes_static_baseline_train_df = slopes_static_baseline_train_df.loc[:, ~slopes_static_baseline_train_df.columns.duplicated()]
        slopes_static_baseline_train_groups = slopes_static_baseline_train_df[
            grouping]
        slopes_static_baseline_train_df.drop(columns=[grouping],
                                             inplace=True,
                                             axis=1)
        slopes_static_baseline_train_df['mse'] = mse_train

        slopes_df_test = generate_slopes(X_test, temporal_features,
                                         static_features, grouping,
                                         testing_groups)

        slopes_static_baseline_test_df = pd.concat(
            [slopes_df_test, X_test_static], axis=1, join='inner')
        slopes_static_baseline_test_df = slopes_static_baseline_test_df.loc[:, ~slopes_static_baseline_test_df.columns.duplicated()]
        slopes_static_baseline_test_groups = slopes_static_baseline_test_df[
            grouping]
        slopes_static_baseline_test_df.drop(columns=[grouping],
                                            inplace=True,
                                            axis=1)
        slopes_static_baseline_test_df['mse'] = mse_test

        slopes_static_baseline_classifier = XGBoostClassifier(
            slopes_static_baseline_train_df, y_train, outcome, grouping)

        #bs_y, bs_ths, bs_id, bs_fi = slopes_static_baseline_classifier.fit("baseline_static_slope",
        #                                                                      slopes_static_baseline_train_groups)
        slopes_static_baseline_classifier.fit(
            "baseline_static_slope", slopes_static_baseline_train_groups)
        y_pred_binary, best_threshold, precision_rt, recall_rt = \
            slopes_static_baseline_classifier.predict(slopes_static_baseline_test_df, y_test)
        slopes_static_baseline_classifier.plot_pr(precision_rt, recall_rt,
                                                  "XGBoost Static")
Example #7
def main():
    configs = json.load(open('Configuration.json', 'r'))

    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    autoencoder_path = configs['paths']['autoencoder_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()

        X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping,
                         non_smotedtime_series[grouping], lookback)

        epochs = 100

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()

        # NOTE: these two callbacks are created but never passed to
        # autoencoder.fit below, so they have no effect as written.
        cp = ModelCheckpoint(filepath="lstm_autoencoder_classifier.h5",
                             save_best_only=True,
                             verbose=0)

        tb = TensorBoard(log_dir='./logs',
                         histogram_freq=0,
                         write_graph=True,
                         write_images=True)

        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback, X_valid_y0,
                        X_valid_y0, 2)
        ####LSTM autoencoder

        autoencoder.plot_history()
        valid_x_predictions = autoencoder.predict(X_valid)

        mse = np.mean(np.power(
            flatten(X_valid) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_valid.tolist()
        })

        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)

        # add a small epsilon so the division is safe when precision and recall are both zero
        fscore = (2 * precision_rt * recall_rt) / (precision_rt + recall_rt + 1e-12)

        ix = np.argmax(fscore)
        best_threshold = threshold_rt[ix]
        # print('Best Threshold=%f, F-Score=%.3f' % (best_threshold, fscore[ix]))
        pred_y = (error_df.Reconstruction_error >
                  best_threshold).astype('int32')

        perf_dict = performance_metrics(error_df.True_class, pred_y,
                                        error_df.Reconstruction_error)
        perf_df = pd.DataFrame([perf_dict])  # DataFrame.append was removed in pandas 2.0
        perf_df.to_csv(autoencoder_path + "performancemetrics" + outcome +
                       ".csv",
                       index=False)

        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(np.power(
            flatten(X_test) - flatten(test_x_predictions), 2),
                      axis=1)

        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_test.tolist()
        })

        plt.figure(figsize=(10, 10))

        groups = error_df.groupby('True_class')
        fig, ax = plt.subplots()

        for name, group in groups:
            ax.plot(group.index,
                    group.Reconstruction_error,
                    marker='o',
                    ms=3.5,
                    linestyle='',
                    label="1" if name == 1 else "0")
        ax.hlines(threshold_rt[ix],
                  ax.get_xlim()[0],
                  ax.get_xlim()[1],
                  colors="r",
                  zorder=100,
                  label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(autoencoder_path + outcome + "Reconstructionerror.pdf",
                    bbox_inches='tight')

        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(
            false_pos_rate,
            true_pos_rate,
        )

        plt.figure(figsize=(10, 10))

        plt.plot(false_pos_rate,
                 true_pos_rate,
                 linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(autoencoder_path + outcome + "roc.pdf",
                    bbox_inches='tight')

        pr_auc = auc(recall_rt, precision_rt)

        plt.figure(figsize=(10, 10))

        plt.plot(recall_rt,
                 precision_rt,
                 linewidth=5,
                 label='PR-AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [1, 0], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Precision-recall curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(autoencoder_path + outcome + "precision_recall_auc.pdf",
                    bbox_inches='tight')
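
performance_metrics, used in Example #7 to build perf_df, is assumed to return a dict of standard binary-classification scores. A minimal sketch with scikit-learn; the project's version may report different fields:

from sklearn import metrics

def performance_metrics(y_true, y_pred, y_score):
    # Hypothetical: summarise binary performance in a single dict.
    precision, recall, _ = metrics.precision_recall_curve(y_true, y_score)
    return {
        'accuracy': metrics.accuracy_score(y_true, y_pred),
        'precision': metrics.precision_score(y_true, y_pred),
        'recall': metrics.recall_score(y_true, y_pred),
        'f1': metrics.f1_score(y_true, y_pred),
        'roc_auc': metrics.roc_auc_score(y_true, y_score),
        'pr_auc': metrics.auc(recall, precision),
    }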