def main():
    """Load the configuration and dataset, then window the data per outcome.

    Reads ``Configuration.json`` from the current working directory, loads
    ``TimeSeriesAggregatedUpto0.csv`` from the configured data path, imputes
    and scales the dynamic columns, and calls ``process_data`` once for each
    configured classification outcome.

    Nothing is returned; the arrays produced by ``process_data`` are not
    consumed any further in this version of the function.
    """
    # Use a context manager so the configuration file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(
        timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(
        non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the grouping column as the first column of the scaled frame.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        # NOTE(review): decision_maker is never used below; the construction
        # is kept in case DecisionMaker() has side effects -- confirm and drop.
        decision_maker = DecisionMaker()

        (X_train_y0, y_train1, X_valid_y0, X_valid_y1, X_valid, y_val1,
         X_test, y_test1, timesteps, n_features) = process_data(
            normalized_timeseries, non_smotedtime_series, outcome, grouping,
            non_smotedtime_series[grouping], lookback)
def main():
    """Train, save and evaluate an LSTM autoencoder for each outcome.

    Reads ``Configuration.json`` from the current working directory, loads
    ``TimeSeriesAggregatedUpto0.csv`` from the configured data path, imputes
    and scales the dynamic columns, and then for every configured
    classification outcome:

    * splits the data with ``get_train_test_split`` / ``process_data``;
    * fits an ``LSTMAutoEncoder`` on ``X_train_y0`` (validated on
      ``X_valid_y0``) and saves it under the configured models path;
    * computes the per-sample mean squared reconstruction error on the test
      set and passes it to the autoencoder's thresholding, performance and
      plotting methods.

    Nothing is returned.
    """
    # Use a context manager so the configuration file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(
        timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(
        non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the grouping column as the first column of the scaled frame.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        # NOTE(review): decision_maker is never used below; the construction
        # is kept in case DecisionMaker() has side effects -- confirm and drop.
        decision_maker = DecisionMaker()

        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])
        (X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test,
         timesteps, n_features) = process_data(
            normalized_timeseries, non_smotedtime_series, outcome, grouping,
            lookback, train_ind, test_ind)

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()
        # The autoencoder is trained to reproduce its own input.
        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                        X_valid_y0, X_valid_y0, 2)

        ###save model
        filename = (saved_models_path + configs['model']['name'] + outcome
                    + '.h5')
        autoencoder.save_model(filename)

        ####LSTM autoencoder
        autoencoder.plot_history()
        test_x_predictions = autoencoder.predict(X_test)
        # Per-sample mean squared reconstruction error on the test set.
        mse = np.mean(
            np.power(flatten(X_test) - flatten(test_x_predictions), 2),
            axis=1)
        test_error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_test.tolist(),
        })

        pred_y, best_threshold, precision_rt, recall_rt = \
            autoencoder.predict_binary(test_error_df.True_class,
                                       test_error_df.Reconstruction_error)
        autoencoder.output_performance(test_error_df.True_class,
                                       test_error_df.Reconstruction_error,
                                       pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)
def main():
    """Run the LSTM autoencoder and the XGBoost stages for each outcome.

    Reads ``Configuration.json`` from the current working directory, loads
    ``TimeSeriesAggregatedUpto0.csv`` from the configured data path, imputes
    and scales the dynamic columns, and then for every configured
    classification outcome:

    1. loads a saved ``LSTMAutoEncoder`` if its ``.h5`` file exists,
       otherwise trains one and saves it;
    2. computes per-sample mean squared reconstruction errors for the train
       and test sets and reports the autoencoder's test performance;
    3. flattens the time series, adds the reconstruction error as an ``mse``
       column and runs an ``XGBoostClassifier`` on the temporal features;
    4. builds slope + static feature frames (again with ``mse``) and fits and
       evaluates a second ``XGBoostClassifier`` on them.

    Nothing is returned.
    """
    # Use a context manager so the configuration file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    saved_models_path = configs['paths']['saved_models_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(
        timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(
        non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the grouping column as the first column of the scaled frame.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        # NOTE(review): decision_maker is never used below; the construction
        # is kept in case DecisionMaker() has side effects -- confirm and drop.
        decision_maker = DecisionMaker()

        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ##Load LSTM models if they exist, otherwise train new models and save them
        filename = (saved_models_path + configs['model']['name'] + outcome
                    + '.h5')
        (X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test,
         timesteps, n_features) = process_data(
            normalized_timeseries, non_smotedtime_series, outcome, grouping,
            lookback, train_ind, test_ind)

        if os.path.isfile(filename):
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome, timesteps, n_features,
                                          saved_model=filename)
            autoencoder.summary()
        else:
            autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                          outcome, timesteps, n_features)
            autoencoder.summary()
            # The autoencoder is trained to reproduce its own input.
            autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                            X_valid_y0, X_valid_y0, 2)
            autoencoder.plot_history()
            ###save model
            autoencoder.save_model(filename)

        ####Predicting using the fitted model (loaded or trained)
        train_x_predictions = autoencoder.predict(X_train)
        mse_train = np.mean(
            np.power(lstm_flatten(X_train) - lstm_flatten(train_x_predictions),
                     2),
            axis=1)
        test_x_predictions = autoencoder.predict(X_test)
        mse_test = np.mean(
            np.power(lstm_flatten(X_test) - lstm_flatten(test_x_predictions),
                     2),
            axis=1)
        test_error_df = pd.DataFrame({
            'Reconstruction_error': mse_test,
            'True_class': y_test.tolist(),
        })

        pred_y, best_threshold, precision_rt, recall_rt = \
            autoencoder.predict_binary(test_error_df.True_class,
                                       test_error_df.Reconstruction_error)
        autoencoder.output_performance(test_error_df.True_class,
                                       test_error_df.Reconstruction_error,
                                       pred_y)
        autoencoder.plot_reconstruction_error(test_error_df, best_threshold)
        autoencoder.plot_roc(test_error_df)
        autoencoder.plot_pr(precision_rt, recall_rt)

        #Feature Selector
        training_loc = train_ind[0]  #+train_ind[1]
        training_ids = non_smotedtime_series.iloc[training_loc][grouping]
        testing_ids = non_smotedtime_series.iloc[test_ind[1]][grouping]

        flat_df, timesteps = flatten(non_smotedtime_series, dynamic_features,
                                     grouping, static_features, outcome)
        temporal_features = (set(flat_df.columns) - set(static_features)
                             - {outcome, grouping})
        # pandas no longer accepts a set as a column indexer, so index with a
        # list. The same list is used for train and test so that their column
        # order matches. The helpers still receive the set, as before.
        temporal_columns = list(temporal_features)

        X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
        y_train = X_train[outcome].astype(int)
        training_groups = X_train[grouping]
        # Add the ids as a *column*. The previous `.loc[grouping] = ...`
        # appended a row labelled with the grouping name instead.
        X_train_static = X_train[static_features].copy()
        X_train_static[grouping] = training_groups
        X_train = X_train[temporal_columns]
        X_train = scale(X_train, temporal_features)
        X_train['mse'] = mse_train
        #X_train, y_train = smote(X_train, y_train)

        X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
        y_test = X_test[outcome].astype(int)
        testing_groups = X_test[grouping]
        X_test_static = X_test[static_features].copy()
        X_test_static[grouping] = testing_groups
        X_test = X_test[temporal_columns]
        X_test = scale(X_test, temporal_features)
        X_test['mse'] = mse_test

        feature_selector = XGBoostClassifier(X_train, y_train, outcome,
                                             grouping)
        # NOTE(review): the fit call below is disabled in the original, so
        # predict() runs without an explicit fit here -- confirm whether
        # XGBoostClassifier fits on construction.
        # feature_selector.fit("temporal", training_groups)
        y_pred_binary, best_threshold, precision_rt, recall_rt = \
            feature_selector.predict(X_test, y_test)
        feature_selector.plot_pr(precision_rt, recall_rt, "XGBoost Temporal")

        slopes_df = generate_slopes(X_train, temporal_features,
                                    static_features, grouping,
                                    training_groups)
        # NOTE(review): aggregate_df is not used further in this function;
        # the call is kept in case generate_aggregates has side effects.
        aggregate_df = generate_aggregates(X_train, temporal_features,
                                           grouping, training_groups)

        slopes_static_baseline_train_df = pd.concat(
            [slopes_df, X_train_static], axis=1, join='inner')
        # Keep only the first occurrence of any duplicated column name.
        slopes_static_baseline_train_df = slopes_static_baseline_train_df.loc[
            :, ~slopes_static_baseline_train_df.columns.duplicated()]
        slopes_static_baseline_train_groups = \
            slopes_static_baseline_train_df[grouping]
        slopes_static_baseline_train_df = \
            slopes_static_baseline_train_df.drop(columns=[grouping])
        slopes_static_baseline_train_df['mse'] = mse_train

        slopes_df_test = generate_slopes(X_test, temporal_features,
                                         static_features, grouping,
                                         testing_groups)
        slopes_static_baseline_test_df = pd.concat(
            [slopes_df_test, X_test_static], axis=1, join='inner')
        slopes_static_baseline_test_df = slopes_static_baseline_test_df.loc[
            :, ~slopes_static_baseline_test_df.columns.duplicated()]
        slopes_static_baseline_test_groups = \
            slopes_static_baseline_test_df[grouping]
        slopes_static_baseline_test_df = \
            slopes_static_baseline_test_df.drop(columns=[grouping])
        slopes_static_baseline_test_df['mse'] = mse_test

        slopes_static_baseline_classifier = XGBoostClassifier(
            slopes_static_baseline_train_df, y_train, outcome, grouping)
        slopes_static_baseline_classifier.fit(
            "baseline_static_slope", slopes_static_baseline_train_groups)
        y_pred_binary, best_threshold, precision_rt, recall_rt = \
            slopes_static_baseline_classifier.predict(
                slopes_static_baseline_test_df, y_test)
        slopes_static_baseline_classifier.plot_pr(precision_rt, recall_rt,
                                                  "XGBoost Static")
def main():
    """Run the autoencoder + XGBoost pipeline per outcome and summarise it.

    Reads ``Configuration.json`` from the current working directory, loads
    ``TimeSeriesAggregatedUpto0.csv`` from the configured data path, imputes
    and scales the dynamic columns, and then for every configured
    classification outcome whose name does not contain ``"3D"``:

    1. loads a saved ``LSTMAutoEncoder`` if its ``.h5`` file exists,
       otherwise trains a new one;
    2. computes per-sample mean squared reconstruction errors for the train
       and test sets, reports the autoencoder's test performance and records
       its PR-AUC in ``lstm_praucs``;
    3. builds aggregate + static feature frames with the reconstruction
       error as an ``mse`` column, fits and evaluates an
       ``XGBoostClassifier`` on them, writes the test frame to CSV and adds
       the result to a ``ClassificationReport``.

    After the loop the report's summary plots are produced. Nothing is
    returned.
    """
    # Use a context manager so the configuration file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    epochs = configs['training']['epochs']
    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    static_features = configs['data']['static_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    autoencoder_models_path = configs['paths']['autoencoder_models_path']
    test_data_path = configs['paths']['test_data_path']

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(
        timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(
        non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the grouping column as the first column of the scaled frame.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    #initialise classification report which will house results of all outcomes
    classification_report = ClassificationReport()

    #save lstm performance for comparison with final outcome
    lstm_praucs = []

    ##start working per outcome
    for outcome in outcomes:
        fold_ind, train_ind, test_ind = get_train_test_split(
            non_smotedtime_series[outcome].astype(int),
            non_smotedtime_series[grouping])

        ##Load LSTM models if they exist, otherwise train new models
        # NOTE(review): the original comment also said "and save them", but
        # this function never calls save_model; unless fit() persists the
        # model itself, every run retrains -- confirm.
        autoencoder_filename = (autoencoder_models_path
                                + configs['model']['name'] + outcome + '.h5')
        (X_train, X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test,
         timesteps, n_features) = process_data(
            normalized_timeseries, non_smotedtime_series, outcome, grouping,
            lookback, train_ind, test_ind)

        # The rest of the loop body is assumed to sit under this condition:
        # the trailing `del` statements would fail for a skipped outcome
        # otherwise. The source formatting was lost, so confirm.
        if "3D" not in outcome:
            if os.path.isfile(autoencoder_filename):
                print(" Autoencoder trained model exists for oucome", outcome,
                      "file:", autoencoder_filename)
                autoencoder = LSTMAutoEncoder(
                    configs['model']['name'] + outcome, outcome, timesteps,
                    n_features, saved_model=autoencoder_filename)
                autoencoder.summary()
            else:
                print("Autencoder trained model does not exist for outcome",
                      outcome, "file:", autoencoder_filename)
                autoencoder = LSTMAutoEncoder(
                    configs['model']['name'] + outcome, outcome, timesteps,
                    n_features)
                autoencoder.summary()
                # The autoencoder is trained to reproduce its own input.
                autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                                X_valid_y0, X_valid_y0, 2)
                autoencoder.plot_history()

            train_x_predictions = autoencoder.predict(X_train)
            mse_train = np.mean(
                np.power(
                    lstm_flatten(X_train) - lstm_flatten(train_x_predictions),
                    2),
                axis=1)
            test_x_predictions = autoencoder.predict(X_test)
            mse_test = np.mean(
                np.power(
                    lstm_flatten(X_test) - lstm_flatten(test_x_predictions),
                    2),
                axis=1)
            test_error_df = pd.DataFrame({
                'Reconstruction_error': mse_test,
                'True_class': y_test.tolist(),
            })

            pred_y, best_threshold, precision_rt, recall_rt = \
                autoencoder.predict_binary(test_error_df.True_class,
                                           test_error_df.Reconstruction_error)
            autoencoder.output_performance(test_error_df.True_class, pred_y)
            autoencoder.plot_reconstruction_error(test_error_df,
                                                  best_threshold)
            autoencoder.plot_roc(test_error_df)
            autoencoder.plot_pr(precision_rt, recall_rt)
            lstm_prauc = auc(recall_rt, precision_rt)
            lstm_praucs.append(lstm_prauc)

            #Feature Selector
            training_loc = train_ind[0]  #+train_ind[1]
            training_ids = non_smotedtime_series.iloc[training_loc][grouping]
            testing_ids = non_smotedtime_series.iloc[test_ind[1]][grouping]

            flat_df, timesteps = flatten(non_smotedtime_series,
                                         dynamic_features, grouping,
                                         static_features, outcome)
            temporal_features = (set(flat_df.columns) - set(static_features)
                                 - {outcome, grouping})
            # pandas no longer accepts a set as a column indexer, so index
            # with a list. The same list is used for train and test so that
            # their column order matches. Helpers still receive the set.
            temporal_columns = list(temporal_features)

            X_train = flat_df.loc[flat_df[grouping].isin(training_ids)]
            y_train = X_train[outcome].astype(int)
            training_groups = X_train[grouping]
            X_train_static = X_train[static_features].copy()
            X_train_static[grouping] = training_groups
            X_train = X_train[temporal_columns]

            X_test = flat_df.loc[flat_df[grouping].isin(testing_ids)]
            y_test = X_test[outcome].astype(int)
            testing_groups = X_test[grouping]
            # Add the ids as a *column*, matching the training split. The
            # previous `.loc[grouping] = ...` appended a row labelled with
            # the grouping name instead.
            X_test_static = X_test[static_features].copy()
            X_test_static[grouping] = testing_groups
            X_test = X_test[temporal_columns]

            ########
            aggregate_df = generate_aggregates(X_train, temporal_features,
                                               grouping, training_groups)
            static_aggregate_train_df = pd.concat(
                [aggregate_df, X_train_static], axis=1, join='inner')
            # Keep only the first occurrence of any duplicated column name.
            static_aggregate_train_df = static_aggregate_train_df.loc[
                :, ~static_aggregate_train_df.columns.duplicated()]
            static_aggregate_train_df = static_aggregate_train_df.drop(
                columns=[grouping])
            static_aggregate_train_df['mse'] = mse_train

            aggregate_df_test = generate_aggregates(X_test, temporal_features,
                                                    grouping, testing_groups)
            static_aggregate_test_df = pd.concat(
                [aggregate_df_test, X_test_static], axis=1, join='inner')
            static_aggregate_test_df = static_aggregate_test_df.loc[
                :, ~static_aggregate_test_df.columns.duplicated()]
            static_aggregate_test_df = static_aggregate_test_df.drop(
                columns=[grouping])
            static_aggregate_test_df['mse'] = mse_test
            # Written to the working directory and overwritten per outcome.
            static_aggregate_test_df.to_csv("static_aggretate.csv",
                                            index=False)

            static_baseline_classifier = XGBoostClassifier(
                static_aggregate_train_df, y_train, outcome, grouping)
            static_baseline_classifier.fit("aggregate_static",
                                           mse_train * 100)
            y_pred_binary, best_threshold, precision_rt, recall_rt, yhat = \
                static_baseline_classifier.predict(static_aggregate_test_df,
                                                   y_test)

            print(" CLASS WEIGHTS FOR Y ACTUAL: ", class_counts(y_test))
            print(" CLASS WEIGHTS FOR Y PREDICTE: ",
                  class_counts(y_pred_binary))

            static_baseline_classifier.output_performance(y_test,
                                                          y_pred_binary)
            static_baseline_classifier.plot_pr(precision_rt, recall_rt,
                                               "XGBoost Static")
            static_baseline_classifier.plot_feature_importance(
                static_aggregate_test_df.columns)

            # Same object as static_aggregate_test_df: the 'outcome' column
            # is added to it in place before writing.
            to_write_for_plotting = static_aggregate_test_df
            to_write_for_plotting['outcome'] = y_test
            to_write_for_plotting.to_csv(test_data_path + outcome + ".csv",
                                         index=False)

            #add to classification report
            classification_report.add_model_result(outcome, y_test,
                                                   y_pred_binary,
                                                   best_threshold,
                                                   precision_rt, recall_rt,
                                                   yhat)

            #delete variables
            del static_aggregate_train_df
            del static_aggregate_test_df
            del X_train
            del X_train_y0
            del X_valid_y0
            del X_valid
            del y_valid
            del X_test
            del y_test
            del timesteps
            del train_x_predictions
            del test_x_predictions
            del test_error_df

    #After fitting model to all outcomes, plot and get summary statistics
    classification_report.plot_distributions_vs_aucs()
    classification_report.plot_pr_auc()
    classification_report.plot_auc()
    classification_report.compare_lstim_xgboost(lstm_praucs)
def main():
    """Train an LSTM autoencoder per outcome and save metrics and plots.

    Reads ``Configuration.json`` from the current working directory, loads
    ``TimeSeriesAggregatedUpto0.csv`` from the configured data path, imputes
    and scales the dynamic columns, and then for every configured
    classification outcome:

    1. fits an ``LSTMAutoEncoder`` on ``X_train_y0`` (validated on
       ``X_valid_y0``);
    2. computes the per-sample mean squared reconstruction error on the
       validation set, picks the threshold with the highest F-score along
       the precision-recall curve, and writes the resulting performance
       metrics to ``performancemetrics<outcome>.csv``;
    3. computes the reconstruction error on the test set and saves three
       PDFs under the configured autoencoder path: the reconstruction error
       scatter with the threshold, the test ROC curve, and the
       precision-recall curve (built from the validation-set curve).

    Nothing is returned.
    """
    # Use a context manager so the configuration file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('Configuration.json', 'r') as config_file:
        configs = json.load(config_file)

    grouping = configs['data']['grouping']
    dynamic_features = configs['data']['dynamic_columns']
    outcomes = configs['data']['classification_outcome']
    lookback = configs['data']['batch_size']
    timeseries_path = configs['paths']['data_path']
    autoencoder_path = configs['paths']['autoencoder_path']

    # Hard-coded in this version; hoisted out of the loop as it is invariant.
    epochs = 100

    ##read, impute and scale dataset
    non_smotedtime_series = pd.read_csv(
        timeseries_path + "TimeSeriesAggregatedUpto0.csv")
    non_smotedtime_series[dynamic_features] = impute(
        non_smotedtime_series, dynamic_features)
    normalized_timeseries = scale(non_smotedtime_series, dynamic_features)
    # Re-attach the grouping column as the first column of the scaled frame.
    normalized_timeseries.insert(0, grouping, non_smotedtime_series[grouping])

    ##start working per outcome
    for outcome in outcomes:
        # NOTE(review): decision_maker is never used below; the construction
        # is kept in case DecisionMaker() has side effects -- confirm and drop.
        decision_maker = DecisionMaker()

        (X_train_y0, X_valid_y0, X_valid, y_valid, X_test, y_test,
         timesteps, n_features) = process_data(
            normalized_timeseries, non_smotedtime_series, outcome, grouping,
            non_smotedtime_series[grouping], lookback)

        autoencoder = LSTMAutoEncoder(configs['model']['name'] + outcome,
                                      outcome, timesteps, n_features)
        autoencoder.summary()
        # The autoencoder is trained to reproduce its own input.
        autoencoder.fit(X_train_y0, X_train_y0, epochs, lookback,
                        X_valid_y0, X_valid_y0, 2)

        ####LSTM autoencoder
        autoencoder.plot_history()

        # --- threshold selection on the validation set ---
        valid_x_predictions = autoencoder.predict(X_valid)
        mse = np.mean(
            np.power(flatten(X_valid) - flatten(valid_x_predictions), 2),
            axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_valid.tolist(),
        })

        precision_rt, recall_rt, threshold_rt = precision_recall_curve(
            error_df.True_class, error_df.Reconstruction_error)
        # Where precision + recall == 0 the F-score is 0/0 = NaN, and
        # np.argmax would pick that NaN entry. Treat those points as 0.
        pr_sum = precision_rt + recall_rt
        fscore = np.divide(2 * precision_rt * recall_rt, pr_sum,
                           out=np.zeros_like(pr_sum, dtype=float),
                           where=pr_sum > 0)
        # precision/recall have one more entry than threshold_rt; the final
        # point has no threshold, so leave it out of the search.
        ix = int(np.argmax(fscore[:-1]))
        best_threshold = threshold_rt[ix]

        pred_y = (error_df.Reconstruction_error
                  > best_threshold).astype('int32')
        perf_dict = performance_metrics(error_df.True_class, pred_y,
                                        error_df.Reconstruction_error)
        # DataFrame.append was removed in pandas 2.0; build the single-row
        # frame directly (assumes perf_dict is a flat mapping -- confirm).
        perf_df = pd.DataFrame([perf_dict])
        perf_df.to_csv(autoencoder_path + "performancemetrics" + outcome
                       + ".csv",
                       index=False)

        # --- evaluation on the test set ---
        test_x_predictions = autoencoder.predict(X_test)
        mse = np.mean(
            np.power(flatten(X_test) - flatten(test_x_predictions), 2),
            axis=1)
        error_df = pd.DataFrame({
            'Reconstruction_error': mse,
            'True_class': y_test.tolist(),
        })

        # Reconstruction error per class. Passing figsize to subplots()
        # replaces a separate plt.figure(...) call, which created an empty
        # extra figure and left the plotted one at the default size.
        fig, ax = plt.subplots(figsize=(10, 10))
        for name, group in error_df.groupby('True_class'):
            ax.plot(group.index, group.Reconstruction_error, marker='o',
                    ms=3.5, linestyle='',
                    label="1" if name == 1 else "0")
        ax.hlines(best_threshold, ax.get_xlim()[0], ax.get_xlim()[1],
                  colors="r", zorder=100, label='Threshold')
        ax.legend()
        plt.title("Reconstruction error for different classes")
        plt.ylabel("Reconstruction error")
        plt.xlabel("Data point index")
        plt.savefig(autoencoder_path + outcome + "Reconstructionerror.pdf",
                    bbox_inches='tight')
        # Close each figure once saved so they do not accumulate in memory
        # across outcomes.
        plt.close(fig)

        # ROC curve on the test set.
        false_pos_rate, true_pos_rate, thresholds = roc_curve(
            error_df.True_class, error_df.Reconstruction_error)
        roc_auc = auc(false_pos_rate, true_pos_rate)

        plt.figure(figsize=(10, 10))
        plt.plot(false_pos_rate, true_pos_rate, linewidth=5,
                 label='AUC = %0.3f' % roc_auc)
        plt.plot([0, 1], [0, 1], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Receiver operating characteristic curve (ROC)')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.savefig(autoencoder_path + outcome + "roc.pdf",
                    bbox_inches='tight')
        plt.close()

        # Precision-recall curve; precision_rt / recall_rt are the
        # validation-set values computed above.
        pr_auc = auc(recall_rt, precision_rt)

        plt.figure(figsize=(10, 10))
        plt.plot(recall_rt, precision_rt, linewidth=5,
                 label='PR-AUC = %0.3f' % pr_auc)
        plt.plot([0, 1], [1, 0], linewidth=5)
        plt.xlim([-0.01, 1])
        plt.ylim([0, 1.01])
        plt.legend(loc='lower right')
        plt.title('Precision Recall Curive')
        plt.ylabel('Precision')
        plt.xlabel('Recall')
        plt.savefig(autoencoder_path + outcome + "precision_recall_auc.pdf",
                    bbox_inches='tight')
        plt.close()