Example #1
0
def main():
    """Per-outcome pipeline setup: load config, read/impute/scale the
    non-SMOTEd aggregated time series, then split it via process_data.

    NOTE(review): this snippet appears truncated — the split results are
    never used, and `epochs` is read but unused in the visible code.
    Relies on project helpers (impute, scale, process_data, DecisionMaker).
    """
    configs = json.load(open('Configuration.json', 'r'))

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']  # column identifying each series/group
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']  # window length, also used as batch size
    timeseries_path = configs['paths']['data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the (unscaled) grouping column so downstream splits can group rows.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()

        # Split into train/valid/test; *_y0 presumably holds class-0-only windows
        # for autoencoder training — TODO confirm against process_data.
        X_train_y0, y_train1, X_valid_y0, X_valid_y1, X_valid, y_val1, X_test, y_test1, timesteps, n_features=\
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, non_smotedtime_series[grouping], lookback)
Example #2
0
def main():
    """Train one LSTM autoencoder per outcome on the non-SMOTEd series,
    save the model, and report reconstruction-error-based classification
    performance (threshold search, ROC, PR plots).

    Relies on project helpers: impute, scale, get_train_test_split,
    process_data, DecisionMaker, LSTMAutoEncoder, flatten.
    """
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']  # window length, also batch size
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the unscaled grouping column for the group-aware split below.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        # Group-aware split indices (stratified by outcome) — TODO confirm
        # get_train_test_split's return contract (fold vs train vs test indices).
        fold_ind, train_ind, test_ind = get_train_test_split(non_smotedtime_series[outcome].astype(int),
                                                             non_smotedtime_series[grouping])

        X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps,\
        n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, lookback,
                         train_ind, test_ind)

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome, outcome, timesteps, n_features)
        autoencoder.summary()

        # Autoencoder: inputs serve as their own targets (trained on class-0 windows only).
        autoencoder.fit(X_train_y0, X_train_y0, epochs,lookback,X_valid_y0,X_valid_y0,2)

        ###save model
        filename = saved_models_path+ configs['model']['name'] + outcome+ '.h5'
        autoencoder.save_model(filename)

        ####LSTM autoencoder
        autoencoder.plot_history()
        test_x_predictions = autoencoder.predict(X_test)
        # Per-window reconstruction error (MSE over flattened features).
        mse = np.mean(np.power(flatten(X_test) - flatten(test_x_predictions), 2), axis=1)

        test_error_df = pd.DataFrame({'Reconstruction_error' : mse,
                                 'True_class' : y_test.tolist()})

        # Threshold the reconstruction error to obtain binary predictions.
        pred_y, best_threshold, precision_rt, recall_rt= \
            autoencoder.predict_binary(test_error_df.True_class, test_error_df.Reconstruction_error)

        autoencoder.output_performance(test_error_df.True_class, test_error_df.Reconstruction_error,pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)
def main():
    """Train an inline Keras LSTM autoencoder per outcome on SMOTE-balanced
    stacked time series, then plot loss, precision/recall-vs-threshold,
    reconstruction error, confusion matrix, ROC, and PR curves as PDFs.

    Fixes vs. original:
    - `validation_data` paired validation inputs with *training* targets
      ((X_valid, X_train)); for an autoencoder the validation targets must be
      the validation inputs themselves — the original would crash on the
      sample-count mismatch.
    - The PR-AUC figure was mistitled as a ROC curve.

    Relies on project helpers: impute, scale, curve_shift, temporalize,
    flatten, get_distribution_percentages, DecisionMaker.
    """
    configs = json.load(open('Configuration.json', 'r'))

    grouping = configs['data']['grouping']
    static_features = configs['data']['static_columns']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']  # window length, also batch size
    timeseries_path = configs['paths']['data_path']

    ##start working per outcome
    for outcome in outcomes:
        ##read, impute and scale the SMOTE-balanced stacked series for this outcome
        time_series = pd.read_csv(timeseries_path + "SMOTEDTimeSeries/" +
                                  outcome + "StackedTimeSeries1Day.csv")

        time_series[dynamic_features] = impute(time_series, dynamic_features)
        normalised_series = scale(time_series, dynamic_features)
        normalised_series.insert(0, grouping, time_series[grouping])
        normalised_series.insert(len(normalised_series.columns), outcome,
                                 time_series[outcome])

        # Shift labels so each window is labelled by the upcoming event.
        normalised_series = curve_shift(normalised_series,
                                        grouping,
                                        outcome,
                                        shift_by=lookback - 1)

        decision_maker = DecisionMaker()

        #train/test and validation sets
        X_cols = (normalised_series.columns).tolist()
        X_cols.remove(outcome)
        X_cols.remove(grouping)

        input_X = normalised_series.loc[:,
                                        normalised_series.columns.isin(
                                            X_cols
                                        )].values  # converts the df to a numpy array
        input_y = normalised_series[outcome].values

        n_features = input_X.shape[1]  # number of features

        # Turn the flat rows into overlapping (lookback, n_features) windows.
        X, y = temporalize(X=input_X, y=input_y, lookback=lookback)

        X_train, X_test, y_train, y_test = train_test_split(np.array(X),
                                                            np.array(y),
                                                            test_size=0.33,
                                                            random_state=SEED,
                                                            stratify=y)
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train,
            y_train,
            test_size=0.33,
            random_state=SEED,
            stratify=y_train)

        X_train = X_train.reshape(X_train.shape[0], lookback, n_features)
        X_valid = X_valid.reshape(X_valid.shape[0], lookback, n_features)
        X_test = X_test.reshape(X_test.shape[0], lookback, n_features)

        distrs_percents = [
            get_distribution_percentages(
                (normalised_series[outcome]).astype(int))
        ]
        # NOTE(review): the scaler is fitted on X_train but X_train itself is
        # fed to fit() unscaled, while predictions below use the *scaled*
        # valid/test sets — confirm this asymmetry is intentional.
        scaler = StandardScaler().fit(flatten(X_train))

        a = flatten(X_train)
        print('colwise mean', np.mean(a, axis=0).round(6))
        print('colwise variance', np.var(a, axis=0))

        X_valid_scaled = Models.LSTMAutoEncoder.Utils.scale(X_valid, scaler)
        X_test_scaled = Models.LSTMAutoEncoder.Utils.scale(X_test, scaler)

        timesteps = X_train.shape[1]  # equal to the lookback
        n_features = X_train.shape[2]  # 59

        epochs = 100
        lr = 0.0001

        lstm_autoencoder = Sequential()
        # Encoder
        lstm_autoencoder.add(
            LSTM(32,
                 activation='relu',
                 input_shape=(timesteps, n_features),
                 return_sequences=True))
        lstm_autoencoder.add(
            LSTM(16, activation='relu', return_sequences=False))
        lstm_autoencoder.add(RepeatVector(timesteps))
        # Decoder
        lstm_autoencoder.add(LSTM(16, activation='relu',
                                  return_sequences=True))
        lstm_autoencoder.add(LSTM(32, activation='relu',
                                  return_sequences=True))
        lstm_autoencoder.add(TimeDistributed(Dense(n_features)))

        lstm_autoencoder.summary()

        adam = optimizers.Adam(lr)
        lstm_autoencoder.compile(loss='mse', optimizer=adam)

        # NOTE(review): cp and tb are created but never passed to fit() —
        # add callbacks=[cp, tb] if checkpointing/TensorBoard are wanted.
        cp = ModelCheckpoint(filepath="lstm_autoencoder_classifier.h5",
                             save_best_only=True,
                             verbose=0)

        tb = TensorBoard(log_dir='./logs',
                         histogram_freq=0,
                         write_graph=True,
                         write_images=True)

        # FIX: validation targets must be the validation inputs (autoencoder),
        # not X_train — the original (X_valid, X_train) pair is shape-mismatched.
        lstm_autoencoder_history = lstm_autoencoder.fit(
            X_train,
            X_train,
            epochs=epochs,
            batch_size=lookback,
            validation_data=(X_valid, X_valid),
            verbose=2).history

        #print(distrs_percents)
        ####LSTM autoencoder

        plt.figure(figsize=(10, 10))
        plt.plot(lstm_autoencoder_history['loss'], linewidth=2, label='Train')
        plt.plot(lstm_autoencoder_history['val_loss'],
                 linewidth=2,
                 label='Valid')
        plt.legend(loc='upper right')
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.savefig("LossOverEpochsSMOTE.pdf", bbox_inches='tight')

        plt.figure(figsize=(10, 10))

        # Per-window reconstruction error on the validation set, used to
        # inspect precision/recall across candidate thresholds.
        valid_x_predictions = lstm_autoencoder.predict(X_valid_scaled)
        mse = np.mean(np.power(
            flatten(X_valid_scaled) - flatten(valid_x_predictions), 2),
                      axis=1)

        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_valid.tolist()
        })

        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        plt.plot(threshold_rt,
                 precision_rt[1:],
                 label="Precision",
                 linewidth=5)
        plt.plot(threshold_rt, recall_rt[1:], label="Recall", linewidth=5)
        plt.title('Precision and recall for different threshold values')
        plt.xlabel('Threshold')
        plt.ylabel('Precision/Recall')
        plt.legend()
        plt.savefig(outcome + "ThresholdSMOTE.pdf", bbox_inches='tight')

        test_x_predictions = lstm_autoencoder.predict(X_test_scaled)
        mse = np.mean(np.power(
            flatten(X_test_scaled) - flatten(test_x_predictions), 2),
                      axis=1)

        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_test.tolist()
        })

        plt.figure(figsize=(10, 10))

        threshold_fixed = 0.3  # hand-picked decision threshold for the test set
        groups = error_df.groupby('True_class')
        fig, ax = plt.subplots()

        for name, group in groups:
            ax.plot(group.index,
                    group.Reconstruction_error,
                    marker='o',
                    ms=3.5,
                    linestyle='',
                    label="Break" if name == 1 else "Normal")
        ax.hlines(threshold_fixed,
                  ax.get_xlim()[0],
                  ax.get_xlim()[1],
                  colors="r",
                  zorder=100,
                  label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(outcome + "ReconstructionerrorSMOTE.pdf",
                    bbox_inches='tight')

        pred_y = [
            1 if e > threshold_fixed else 0
            for e in error_df.Reconstruction_error.values
        ]
        conf_matrix = confusion_matrix(error_df.True_class, pred_y)

        plt.figure(figsize=(6, 6))
        sns.heatmap(conf_matrix,
                    xticklabels=LABELS,
                    yticklabels=LABELS,
                    annot=True,
                    fmt="d")
        plt.title("Confusion matrix")
        plt.ylabel('True class')
        plt.xlabel('Predicted class')
        plt.savefig(outcome + "ConfusionMatrixSMOTE.pdf", bbox_inches='tight')

        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(
            false_pos_rate,
            true_pos_rate,
        )

        plt.figure(figsize=(10, 10))

        plt.plot(false_pos_rate,
                 true_pos_rate,
                 linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(outcome + "rocSMOTE.pdf", bbox_inches='tight')

        precision, recall, thresholds = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        pr_auc = auc(recall, precision)

        plt.figure(figsize=(10, 10))

        plt.plot(recall, precision, linewidth=5, label='AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        # FIX: this figure is the precision-recall curve, not a ROC curve.
        plt.title('Precision Recall Curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(outcome + "precision_recall_aucSMOTE.pdf",
                    bbox_inches='tight')
Example #4
0
def main():
    """Per-outcome XGBoost feature selection on flat (wide) time series:
    a temporal-feature classifier, then a baseline+static+slope classifier,
    both registered with a DecisionMaker.

    NOTE(review): this snippet appears truncated — the stacked series read
    at the end is never used in the visible code.
    Relies on project helpers: get_distribution_percentages, generate_slopes,
    XGBoostClassifier, DecisionMaker.
    """
    configs = json.load(open('Configuration.json', 'r'))

    grouping = configs['data']['grouping']
    static_features = configs['data']['static_columns']

    outcomes = (configs['data']['classification_outcome'])
    timeseries_path = configs['paths']['data_path']

    for outcome in outcomes:
        decision_maker = DecisionMaker()

        # Non-SMOTEd flat series: used as a held-out test pool; SMOTEd one
        # is used for training/validation.
        time_series_nosmote = pd.read_csv(timeseries_path +
                                          "NonSMOTEDTimeSeries/" + outcome +
                                          "FlatTimeSeries1Day.csv")
        time_series = pd.read_csv(timeseries_path + "SMOTEDTimeSeries/" +
                                  outcome + "FlatTimeSeries1Day.csv")

        X_cols = (time_series.columns).tolist()
        X_cols.remove(outcome)
        X_train, X_valid, y_train, y_valid = train_test_split(
            time_series[X_cols],
            time_series[outcome],
            test_size=0.33,
            stratify=time_series[outcome],
            random_state=42)

        # SMOTE ids look like "<origid>.<suffix>"; strip the suffix to map
        # validation rows back to original (non-SMOTEd) ids — TODO confirm.
        test_ids = set([x.partition('.')[0] for x in X_valid[grouping]])

        #print(test_ids.intersection(time_series_nosmote[grouping] ))
        X_test1 = time_series_nosmote.loc[time_series_nosmote[grouping].isin(
            test_ids)]
        distrs_percents = [
            get_distribution_percentages(
                (time_series_nosmote[outcome]).astype(int))
        ]
        print(distrs_percents)

        #####feature selector
        temporal_features = set(X_train.columns) - set(static_features)

        feature_selector = XGBoostClassifier(X_train[temporal_features],
                                             y_train, outcome, grouping)
        fs_y, fs_ths, fs_id, fs_fi = feature_selector.run_xgb("temporal")

        feature_selector.predict(X_valid[temporal_features], y_valid)

        decision_maker.add_classifier(outcome + "Tmp", fs_y, fs_ths, fs_id,
                                      fs_fi)

        featuredf = pd.DataFrame()

        temporal_features.remove(grouping)
        featuredf['features'] = list(temporal_features)
        featuredf['imp'] = fs_fi
        # Keep only features with non-zero XGBoost importance.
        featuredf = featuredf[featuredf['imp'] > 0]

        ########################################
        #baseline and static
        # Temporal columns are named "<feature>_<t>"; keep only the baseline
        # (t=0) measurement of each selected feature.
        baseline_features = featuredf['features']

        baseline_features = set(
            [x.partition('_')[0] for x in list(baseline_features)])

        baseline_features = [x + "_0" for x in list(baseline_features)]

        baseline_features.insert(0, grouping)
        baseline_static_features = baseline_features + static_features

        slopes_df = generate_slopes(X_train, static_features, grouping)
        slopes_static_baseline_df = pd.concat(
            [slopes_df, X_train[baseline_static_features]],
            axis=1,
            join='inner')

        # Drop duplicated columns introduced by the concat.
        slopes_static_baseline_df = slopes_static_baseline_df.loc[:,
                                                                  ~slopes_static_baseline_df
                                                                  .columns.
                                                                  duplicated()]

        slopes_df_test = generate_slopes(X_valid, static_features, grouping)
        slopes_static_baseline_test_df = pd.concat(
            [slopes_df_test, X_valid[baseline_static_features]],
            axis=1,
            join='inner')
        slopes_static_baseline_test_df = slopes_static_baseline_test_df.loc[:,
                                                                            ~slopes_static_baseline_test_df
                                                                            .
                                                                            columns
                                                                            .
                                                                            duplicated(
                                                                            )]

        slopes_static_baseline_classifier = XGBoostClassifier(
            slopes_static_baseline_df, y_train, outcome, grouping)

        bs_y, bs_ths, bs_id, bs_fi = slopes_static_baseline_classifier.run_xgb(
            "baseline_static_slope")
        slopes_static_baseline_classifier.predict(
            slopes_static_baseline_test_df, y_valid)
        # Free Keras/TF graph state between outcomes.
        tf.keras.backend.clear_session()

        decision_maker.add_classifier(outcome + "bss", bs_y, bs_ths, bs_id,
                                      bs_fi)

        ####LSTM autoencoder
        smoted_stacked_series = pd.read_csv(timeseries_path +
                                            "SMOTEDTimeSeries/" + outcome +
                                            "StackedTimeSeries1Day.csv")
Example #5
0
def main():
    """Train an LSTM autoencoder per outcome, pick the reconstruction-error
    threshold maximising F1 on the validation set, write performance metrics
    to CSV, and plot reconstruction error, ROC and PR curves for the test set.

    Fixes vs. original:
    - The F-score computation divided by zero where precision+recall == 0,
      producing NaNs that corrupt np.argmax's threshold choice; NaNs are now
      mapped to 0 so argmax selects a real maximum.
    - DataFrame.append (removed in pandas 2.x) replaced by pd.concat.
    - Typo in the PR plot title ("Curive").

    Relies on project helpers: impute, scale, process_data, DecisionMaker,
    LSTMAutoEncoder, flatten, performance_metrics.
    """
    configs = json.load(open('Configuration.json', 'r'))

    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']  # window length, also batch size
    timeseries_path = configs['paths']['data_path']
    autoencoder_path = configs['paths']['autoencoder_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the unscaled grouping column for the group-aware split.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()

        X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test,  timesteps, n_features =\
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, non_smotedtime_series[grouping], lookback)

        epochs = 100

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()

        # NOTE(review): cp and tb are created but never passed to fit() —
        # confirm whether LSTMAutoEncoder.fit wires up its own callbacks.
        cp = ModelCheckpoint(filepath="lstm_autoencoder_classifier.h5",
                             save_best_only=True,
                             verbose=0)

        tb = TensorBoard(log_dir='./logs',
                         histogram_freq=0,
                         write_graph=True,
                         write_images=True)

        # Autoencoder: class-0 windows serve as their own targets.
        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback, X_valid_y0,
                        X_valid_y0, 2)
        ####LSTM autoencoder

        autoencoder.plot_history()
        valid_x_predictions = autoencoder.predict(X_valid)

        # Per-window reconstruction error on the validation set.
        mse = np.mean(np.power(
            flatten(X_valid) - flatten(valid_x_predictions), 2),
                      axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_valid.tolist()
        })

        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)

        # FIX: guard the F1 computation against precision+recall == 0 —
        # the plain division yields NaN, and np.argmax over NaNs picks the
        # first NaN index rather than the best threshold.
        with np.errstate(divide='ignore', invalid='ignore'):
            fscore = np.nan_to_num(
                (2 * precision_rt * recall_rt) / (precision_rt + recall_rt))

        ix = np.argmax(fscore)
        best_threshold = threshold_rt[ix]
        # print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], fscore[ix]))
        pred_y = (error_df.Reconstruction_error >
                  best_threshold).astype('int32')

        # FIX: DataFrame.append was deprecated and removed in pandas 2.x.
        perf_dict = performance_metrics(error_df.True_class, pred_y,
                                        error_df.Reconstruction_error)
        perf_df = pd.concat([pd.DataFrame(), pd.DataFrame([perf_dict])],
                            ignore_index=True)
        perf_df.to_csv(autoencoder_path + "performancemetrics" + outcome +
                       ".csv",
                       index=False)

        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(np.power(
            flatten(X_test) - flatten(test_x_predictions), 2),
                      axis=1)

        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_test.tolist()
        })

        plt.figure(figsize=(10, 10))

        groups = error_df.groupby('True_class')
        fig, ax = plt.subplots()

        for name, group in groups:
            ax.plot(group.index,
                    group.Reconstruction_error,
                    marker='o',
                    ms=3.5,
                    linestyle='',
                    label="1" if name == 1 else "0")
        ax.hlines(threshold_rt[ix],
                  ax.get_xlim()[0],
                  ax.get_xlim()[1],
                  colors="r",
                  zorder=100,
                  label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(autoencoder_path + outcome + "Reconstructionerror.pdf",
                    bbox_inches='tight')

        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(
            false_pos_rate,
            true_pos_rate,
        )

        plt.figure(figsize=(10, 10))

        plt.plot(false_pos_rate,
                 true_pos_rate,
                 linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(autoencoder_path + outcome + "roc.pdf",
                    bbox_inches='tight')

        # NOTE(review): PR curve below comes from the *validation* errors
        # (precision_rt/recall_rt) although saved alongside test plots —
        # confirm that is intentional.
        pr_auc = auc(recall_rt, precision_rt)

        plt.figure(figsize=(10, 10))

        plt.plot(recall_rt,
                 precision_rt,
                 linewidth=5,
                 label='PR-AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [1, 0], linewidth=5)

        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        # FIX: typo "Curive" -> "Curve".
        plt.title('Precision Recall Curve')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(autoencoder_path + outcome + "precision_recall_auc.pdf",
                    bbox_inches='tight')
Example #6
0
def main():
    """Per-outcome pipeline: load or train an LSTM autoencoder, evaluate its
    reconstruction-error classifier, then feed the per-sample MSE as an extra
    feature into XGBoost classifiers on temporal and baseline+static+slope
    feature sets.

    Fixes vs. original:
    - `df.loc[grouping] = values` assigned a *row* labelled by the grouping
      string instead of adding a column; replaced with `df.loc[:, grouping]`.
    - The static-feature slices are copied before mutation to avoid
      chained-assignment on a view of the flat frame.

    Relies on project helpers: impute, scale, get_train_test_split,
    process_data, LSTMAutoEncoder, lstm_flatten, flatten (project version:
    returns (flat_df, timesteps)), generate_slopes, generate_aggregates,
    XGBoostClassifier, DecisionMaker.
    """
    configs = json.load(open('Configuration.json', 'r'))
    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']

    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']  # window length, also batch size
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(timeseries_path +
                                        "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(non_smotedtime_series,
                                                     dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the unscaled grouping column for the group-aware split.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        decision_maker = DecisionMaker()
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ##Load LSTM models if they exist, otherwise train new models and save them
        filename = saved_models_path + configs['model'][
            'name'] + outcome + '.h5'

        X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test, timesteps, \
        n_features = \
            process_data(normalized_timeseries, non_smotedtime_series, outcome, grouping, lookback,
                         train_ind, test_ind)

        if os.path.isfile(filename):
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome,
                                          timesteps,
                                          n_features,
                                          saved_model=filename)
            autoencoder.summary()

        else:
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome, timesteps, n_features)
            autoencoder.summary()

            # Autoencoder: class-0 windows serve as their own targets.
            autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                            X_valid_y0, X_valid_y0, 2)
            autoencoder.plot_history()
            ###save model
            filename = saved_models_path + configs['model'][
                'name'] + outcome + '.h5'
            autoencoder.save_model(filename)

        ####Predicting using the fitted model (loaded or trained)

        train_x_predictions = autoencoder.predict(X_train)
        mse_train = np.mean(np.power(
            lstm_flatten(X_train) - lstm_flatten(train_x_predictions), 2),
                            axis=1)

        test_x_predictions = autoencoder.predict(X_test)

        mse_test = np.mean(np.power(
            lstm_flatten(X_test) - lstm_flatten(test_x_predictions), 2),
                           axis=1)

        test_error_df = pd.DataFrame({
            'Reconstruction_error': mse_test,
            'True_class': y_test.tolist()
        })

        pred_y, best_threshold, precision_rt, recall_rt = \
              autoencoder.predict_binary(test_error_df.True_class, test_error_df.Reconstruction_error)

        autoencoder.output_performance(test_error_df.True_class,
                                       test_error_df.Reconstruction_error,
                                       pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)

        #Feature Selector
        training_loc = train_ind[0]  #+train_ind[1]
        training_ids = non_smotedtime_series.iloc[training_loc]
        training_ids = training_ids[grouping]

        testing_ids = non_smotedtime_series.iloc[test_ind[1]]
        testing_ids = testing_ids[grouping]

        # Project `flatten` helper (not the mse one): wide frame + timesteps.
        flat_df, timesteps = flatten(non_smotedtime_series, dynamic_features,
                                     grouping, static_features, outcome)
        temporal_features = set(flat_df.columns) - set(static_features)
        temporal_features = set(temporal_features) - set([outcome, grouping])

        X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
        y_train = X_train[outcome].astype(int)
        training_groups = X_train[grouping]
        # FIX: copy the slice, then assign the grouping *column*;
        # `.loc[grouping] = ...` added a row labelled by the grouping string.
        X_train_static = X_train[static_features].copy()
        X_train_static.loc[:, grouping] = training_groups
        X_train = X_train[temporal_features]
        X_train = scale(X_train, temporal_features)
        # NOTE(review): assumes mse_train aligns 1:1 with these rows — confirm.
        X_train['mse'] = mse_train

        #X_train, y_train = smote(X_train, y_train)
        X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
        y_test = X_test[outcome].astype(int)
        testing_groups = X_test[grouping]
        # FIX: same row-vs-column assignment fix as for the training split.
        X_test_static = X_test[static_features].copy()
        X_test_static.loc[:, grouping] = testing_groups
        X_test = X_test[temporal_features]
        X_test = scale(X_test, temporal_features)
        X_test['mse'] = mse_test

        feature_selector = XGBoostClassifier(X_train, y_train, outcome,
                                             grouping)
        feature_selector.fit("temporal", training_groups)

        y_pred_binary, best_threshold, precision_rt, recall_rt = feature_selector.predict(
            X_test, y_test)
        feature_selector.plot_pr(precision_rt, recall_rt, "XGBoost Temporal")

        featuredf = pd.DataFrame()

        temporal_features = set(temporal_features) - set([outcome])
        featuredf['features'] = list(temporal_features)
        #featuredf['imp'] = fs_fi
        #featuredf = featuredf[featuredf['imp'] > 0]
        ########
        # Temporal columns are named "<feature>_<t>"; keep the t=0 baseline.
        baseline_features = featuredf['features']

        baseline_features = set(
            [x.partition('_')[0] for x in list(baseline_features)])

        baseline_features = [x + "_0" for x in list(baseline_features)]

        baseline_features.insert(0, grouping)
        baseline_static_features = baseline_features + static_features

        slopes_df = generate_slopes(X_train, temporal_features,
                                    static_features, grouping, training_groups)

        aggregate_df = generate_aggregates(X_train, temporal_features,
                                           grouping, training_groups)

        slopes_static_baseline_train_df = pd.concat(
            [slopes_df, X_train_static], axis=1, join='inner')

        # Drop duplicated columns introduced by the concat.
        slopes_static_baseline_train_df = slopes_static_baseline_train_df.loc[
            :, ~slopes_static_baseline_train_df.columns.duplicated()]
        slopes_static_baseline_train_groups = slopes_static_baseline_train_df[
            grouping]
        slopes_static_baseline_train_df.drop(columns=[grouping],
                                             inplace=True,
                                             axis=1)
        slopes_static_baseline_train_df['mse'] = mse_train

        slopes_df_test = generate_slopes(X_test, temporal_features,
                                         static_features, grouping,
                                         testing_groups)

        slopes_static_baseline_test_df = pd.concat(
            [slopes_df_test, X_test_static], axis=1, join='inner')
        slopes_static_baseline_test_df = slopes_static_baseline_test_df.loc[
            :, ~slopes_static_baseline_test_df.columns.duplicated()]
        slopes_static_baseline_test_groups = slopes_static_baseline_test_df[
            grouping]
        slopes_static_baseline_test_df.drop(columns=[grouping],
                                            inplace=True,
                                            axis=1)
        slopes_static_baseline_test_df['mse'] = mse_test

        slopes_static_baseline_classifier = XGBoostClassifier(
            slopes_static_baseline_train_df, y_train, outcome, grouping)

        slopes_static_baseline_classifier.fit(
            "baseline_static_slope", slopes_static_baseline_train_groups)
        y_pred_binary, best_threshold, precision_rt, recall_rt = \
            slopes_static_baseline_classifier.predict( slopes_static_baseline_test_df, y_test)
        slopes_static_baseline_classifier.plot_pr(precision_rt, recall_rt,
                                                  "XGBoost Static")