def plot_tuning_results(mses_train, ml_name, input_path, scaler, factor):
    """
    Draw a box plot of the window size tuning MSEs and save it as a PNG file

    :param mses_train: numpy array of train MSEs
    :param ml_name: machine learning model name
    :param input_path: tuning directory
    :param scaler: scaler string
    :param factor: stable number to factor the matrix
    :return: None, the figure is saved to
             <input_path>/<ml_name>/<scaler>/<ml_name>_<scaler>_<time>.png
             and the current figure is cleared afterwards
    """
    outline_color = "black"
    fill_color = "lightblue"
    flier_edge_color = "blue"

    # One box per column of the transposed (and scaled) MSE matrix
    boxes = plt.boxplot(np.transpose(mses_train) * factor, patch_artist=True)
    for part in ('boxes', 'whiskers', 'fliers', 'medians', 'caps'):
        plt.setp(boxes[part], color=outline_color)
    plt.setp(boxes["boxes"], facecolor=fill_color)
    plt.setp(boxes["fliers"], markeredgecolor=flier_edge_color)

    plt.title(
        "Anomaly prediction over the simulator data set - {0} Model".format(
            ml_name))
    plt.ylabel(
        "Testing log(MSE)s of the records = Actual MSE * {0}".format(factor))
    plt.xlabel("Setting of n_prev")

    # Leave a 30% margin below the smallest and above the largest value
    scaled_mses = mses_train * factor
    lower_limit = 0.7 * np.amin(scaled_mses)
    upper_limit = 1.3 * np.amax(scaled_mses)
    plt.gcf().set_size_inches(12, 9)
    plt.gca().set_ylim([lower_limit, upper_limit])

    # Output layout: <input_path>/<ml_name>/<scaler>/
    model_directory = os.path.join(input_path, ml_name)
    create_directories(model_directory)
    scaler_directory = os.path.join(model_directory, scaler)
    create_directories(scaler_directory)

    timestamp = get_current_time()
    image_name = "_".join(
        str(part) for part in (ml_name, scaler, timestamp)) + '.png'
    plt.savefig(os.path.join(str(scaler_directory), image_name))
    plt.clf()
def model_tuning(file_path, input_features, target_features, window_size,
                 scaler, results_path, model_name):
    """
    Model's tuning process by using GridSearchCV

    The train data set is read from file_path, cleaned and normalized, a train
    split is taken with train_test_split and a grid search is fitted over a
    TimeSeriesRegressor that wraps the requested model. The best parameters
    and the best score are written to
    "<current time>-<model_name>-model_data.json" inside results_path.

    :param file_path: data file path
    :param input_features: the list of features which the user chose for the train
    :param target_features: the list of features which the user chose for the test
    :param window_size: window size variable, passed to the regressor as n_prev
    :param scaler: scaler name
    :param results_path: results path
    :param model_name: model name
    :return: tuple of (best model params, best grid search score)
    """
    df_train = pd.read_csv(str(file_path))
    input_df_train = df_train[input_features]
    target_df_train = df_train[target_features]

    # Step 1 : Clean train data set
    input_df_train = clean_data(input_df_train)
    target_df_train = clean_data(target_df_train)

    # Step 2: Normalize the data
    X = normalize_data(data=input_df_train, scaler=scaler)[0]
    Y = normalize_data(data=target_df_train, scaler=scaler)[0]

    # Only the train part of the split feeds the grid search; the held-out
    # part was previously predicted on but the prediction was never used.
    # NOTE(review): if this is scikit-learn's train_test_split it shuffles the
    # rows by default, which would break the temporal order a windowed
    # (n_prev) regressor presumably relies on - confirm whether shuffle=False
    # is intended.
    X_train, _, Y_train, _ = train_test_split(X, Y)

    # Step 3: Grid search over the model's parameter grid
    model = get_model(model_name)
    model_grid_params = get_model_params(model_name)
    tsr = TimeSeriesRegressor(model, n_prev=window_size)
    grid_search = GridSearchCV(tsr, model_grid_params)
    grid_search.fit(X_train, Y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print(str(model_name) + " " + str(best_params))

    # Step 4: Persist the tuning outcome as json
    current_time = get_current_time()
    file_name = str(current_time) + "-" + str(model_name) + "-model_data.json"
    data = {
        'model': model_name,
        "input_features": input_features,
        "target_features": target_features,
        'params': best_params,
        'score': best_score,
    }

    # Separate name so the input file_path parameter is not overwritten
    output_path = os.path.join(str(results_path), str(file_name))
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile)

    return best_params, best_score
def run_model(training_data_path, test_data_path, results_path,
              similarity_score, save_model, new_model_running, algorithm_path,
              threshold, features_list, target_features_list,
              train_scaler_path, target_scaler_path, event):
    """
    Run SVR model process

    For every flight route found under test_data_path the model is trained
    (new model flow only) and then evaluated once per similarity function.
    The TPR, FPR, accuracy and delay scores of each evaluation are written as
    csv files under <results_path>/svr/<current time>/<similarity>/<route>/,
    and finally report_results is called once per similarity function.

    :param training_data_path: train data set directory path
    :param test_data_path: test data set directory path
    :param results_path: results directory path
    :param similarity_score: chosen similarity functions
    :param save_model: indicator whether the user want to save the model or not
    :param new_model_running: indicator whether we are in new model creation flow or not
    :param algorithm_path: path of existing algorithm
    :param threshold: saved threshold for load model flow
    :param features_list: saved chosen features for load model flow
    :param target_features_list: all the features in the test data set for the target
    :param train_scaler_path: path of existing input train scaler directory
    :param target_scaler_path: path of existing input target scaler directory
    :param event: running state flag
    :return: None, the results are written to csv files and reported
             through report_results
    """
    # Choose between new model creation flow and load existing model flow
    if new_model_running:
        kernel, gamma, epsilon, threshold, window_size = \
            get_svr_new_model_parameters()
    else:
        # NOTE: pickle.load can execute arbitrary code stored in the file, so
        # the model and scaler files must come from a trusted location.
        # Context managers make sure the file handles are closed.
        with open(algorithm_path, 'rb') as model_file:
            svr_model = pickle.load(model_file)
        with open(train_scaler_path, 'rb') as train_scaler_file:
            X_train_scaler = pickle.load(train_scaler_file)
        with open(target_scaler_path, 'rb') as target_scaler_file:
            Y_train_scaler = pickle.load(target_scaler_file)
        window_size = svr_model.n_prev

    # No train data is available unless execute_train provides it below
    X_train = None
    Y_train = None

    flight_routes = get_subdirectories(test_data_path)

    current_time = get_current_time()
    current_time_path = os.path.join(
        str(results_path), 'svr', str(current_time))
    create_directories(current_time_path)

    # Create sub directories for each similarity function
    for similarity in similarity_score:
        create_directories(os.path.join(current_time_path, str(similarity)))

    # Train the model for each flight route
    for flight_route in flight_routes:

        # Execute training for new model flow
        if new_model_running:
            svr_model, X_train_scaler, Y_train_scaler, X_train, Y_train = \
                execute_train(flight_route,
                              training_data_path=training_data_path,
                              kernel=kernel,
                              gamma=gamma,
                              epsilon=epsilon,
                              features_list=features_list,
                              window_size=window_size,
                              target_features_list=target_features_list,
                              event=event)

        # Get results for each similarity function
        for similarity in similarity_score:
            current_results_path = os.path.join(
                current_time_path, str(similarity), str(flight_route))
            create_directories(current_results_path)

            (tpr_scores, fpr_scores, acc_scores, delay_scores,
             routes_duration, attacks_duration) = execute_predict(
                flight_route,
                test_data_path=test_data_path,
                similarity_score=similarity,
                threshold=threshold,
                svr_model=svr_model,
                X_train_scaler=X_train_scaler,
                results_path=current_results_path,
                add_plots=True,
                run_new_model=new_model_running,
                X_train=X_train,
                features_list=features_list,
                target_features_list=target_features_list,
                save_model=save_model,
                Y_train_scaler=Y_train_scaler,
                Y_train=Y_train,
                window_size=window_size,
                event=event)

            # One csv file per metric: <flight_route>_<metric>.csv
            scores_by_suffix = (
                ('_tpr.csv', tpr_scores),
                ('_fpr.csv', fpr_scores),
                ('_acc.csv', acc_scores),
                ('_delay.csv', delay_scores),
            )
            for suffix, scores in scores_by_suffix:
                scores_path = os.path.join(
                    current_results_path, str(flight_route) + suffix)
                pd.DataFrame(scores).to_csv(scores_path, index=False)

    algorithm_name = "SVR"

    # Report results for training data to csv files
    # NOTE: routes_duration and attacks_duration hold the values of the last
    # execute_predict call; they are unbound (NameError) when similarity
    # functions were given but no flight route was processed.
    for similarity in similarity_score:
        report_similarity_path = os.path.join(
            current_time_path, str(similarity))
        report_results(report_similarity_path,
                       test_data_path,
                       flight_routes,
                       algorithm_name,
                       similarity,
                       routes_duration,
                       attacks_duration)
def model_tuning(file_path, input_features, target_features, window_size,
                 scaler, results_path):
    """
    LSTM model's tuning process over a list of parameter configurations

    Every configuration returned by get_lstm_params_configurations is used to
    build and fit an LSTM auto encoder on the train split; the configuration
    is scored by the mean of anomaly_score_multi(..., 'MSE') over the test
    split predictions and the one with the lowest mean wins. The outcome is
    written to "<current time>-LSTM-model_data.json" inside results_path.

    :param file_path: data file path
    :param input_features: the list of features which the user chose for the train
    :param target_features: the list of features which the user chose for the test
    :param window_size: window size variable
    :param scaler: scaler name
    :param results_path: results path
    :return: tuple of (best configuration as a string, its mean score)
    :raises ValueError: when there is no parameter configuration to evaluate
    """
    df_train = pd.read_csv(str(file_path))
    input_df_train = df_train[input_features]
    target_df_train = df_train[target_features]

    X = normalize_data(data=input_df_train, scaler=scaler)[0]
    Y = normalize_data(data=target_df_train, scaler=scaler)[0]

    # NOTE(review): if this is scikit-learn's train_test_split it shuffles the
    # rows by default, which would break the temporal order the windowing
    # below presumably relies on - confirm whether shuffle=False is intended.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

    # Sanity checks on the split, inputs and targets must stay aligned
    assert len(X_train) == len(Y_train)
    assert len(X_test) == len(Y_test)

    X_train_preprocessed = get_training_data_lstm(X_train, window_size)
    X_test_preprocessed = get_training_data_lstm(X_test, window_size)
    Y_train_preprocessed = get_training_data_lstm(Y_train, window_size)
    Y_test_preprocessed = get_training_data_lstm(Y_test, window_size)

    params_configurations = get_lstm_params_configurations()

    # Mean test score of every configuration, keyed by str(config)
    total_scores = {}

    for config in params_configurations:
        encoding_dimension, activation, loss, optimizer, epochs = config

        lstm_model = get_lstm_autoencoder_model(
            timesteps=window_size,
            input_features=input_df_train.shape[1],
            target_features=target_df_train.shape[1],
            encoding_dimension=encoding_dimension,
            activation=activation,
            loss=loss,
            optimizer=optimizer)
        lstm_model.fit(X_train_preprocessed,
                       Y_train_preprocessed,
                       epochs=epochs,
                       verbose=0)

        X_test_pred = lstm_model.predict(X_test_preprocessed)
        scores = [
            anomaly_score_multi(Y_test_preprocessed[i], pred, 'MSE')
            for i, pred in enumerate(X_test_pred)
        ]
        total_scores[str(config)] = mean(scores)

    if not total_scores:
        raise ValueError("No LSTM parameter configurations to evaluate")

    # Lowest mean score wins; ties keep the configuration evaluated first
    best_config, best_score = min(total_scores.items(),
                                  key=lambda item: item[1])

    print(best_config)
    print(best_score)

    current_time = get_current_time()
    file_name = str(current_time) + "-LSTM-model_data.json"
    data = {
        'model': 'LSTM',
        "input_features": input_features,
        "target_features": target_features,
        "window_size": window_size,
        'params': best_config,
        'score': best_score,
    }

    # Separate name so the input file_path parameter is not overwritten
    output_path = os.path.join(str(results_path), str(file_name))
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile)

    return best_config, best_score