import numpy as np
import matplotlib.pyplot as plt

# X, Y, valid_X, valid_Y are assumed to be defined upstream of this snippet.

'''
# Alternative 1: scikit-learn MLP classifier with a 2-unit bottleneck.
from sklearn.neural_network import MLPClassifier, MLPRegressor

clf = MLPClassifier(hidden_layer_sizes=(64, 16, 2, 16), activation='tanh')
Y_one_hot = np.array([np.eye(11)[label] for label in Y])
clf.fit(X, Y_one_hot)
Predicted_Y = clf.predict_proba(valid_X).argmax(axis=1)
print(np.mean(np.asarray(valid_Y) == Predicted_Y))
'''

'''
# Alternative 2: the custom MLP classifier from this repo.
from mlp import MLP

clf = MLP()
clf.fit(X, Y)
Predicted_Y = clf.predict(valid_X)
print(np.mean(np.asarray(valid_Y) == Predicted_Y))
'''

# Active experiment: fit the custom MLPRegressor as an embedder (no targets)
# and plot the learned 2-D representation, one colour per class.
from mlp import MLPRegressor

clf = MLPRegressor()
clf.fit(X)
X_transformed = clf.transform(X)
print(X_transformed.shape)

for i in range(11):
    indices = Y == i
    plt.scatter(X_transformed[indices][:, 0], X_transformed[indices][:, 1],
                color=np.random.rand(3), label=str(i))
plt.legend()
plt.show()
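# A minimal sketch (not from the original) of how the 2-unit bottleneck in the
# commented-out sklearn alternative could be read out, since sklearn MLPs have
# no transform(): forward-propagate through the first three weight matrices of
# the fitted model, assuming activation='tanh' as configured above.
def bottleneck_embedding(clf, X, n_layers=3):
    """Return hidden activations after the first n_layers layers of a fitted
    sklearn MLP; with hidden_layer_sizes=(64, 16, 2, 16), n_layers=3 yields
    the 2-unit bottleneck."""
    h = np.asarray(X, dtype=float)
    for W, b in zip(clf.coefs_[:n_layers], clf.intercepts_[:n_layers]):
        h = np.tanh(h @ W + b)
    return h

# Usage (with the sklearn alternative enabled):
# X_2d = bottleneck_embedding(clf, X)   # shape (n_samples, 2)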
def train(method, target, tss, hpo_evals, kfolds, scaler, use_null, root):
    """Train single-target [method] regressor for [target].

    Args:
        method: Regression method: GP, MLP, or XGB.
        target: Post-impact property to regress.
        tss: Training set size.
        hpo_evals: Number of optimization steps.
        kfolds: Number of folds in cross-validation.
        scaler: Standardization method.
        use_null: Train with zero values?
        root: Output root directory.

    Returns:
        r2_score. Coefficient of determination.
    """
    # Pre-impact features to use in training and prediction
    raw_features = [
        'mtotal', 'gamma', 'b_inf', 'v_inf', 'targ_core', 'targ_omega_norm',
        'targ_theta', 'targ_phi', 'proj_core', 'proj_omega_norm',
        'proj_theta', 'proj_phi'
    ]

    # Validation set size
    vss = 500

    welcome(use_null)
    output_folder, metrics_file = make_dir_struct(root, method, target, tss)

    print('\nLoading training dataset (LHS10K)')
    ds = Dataset("12D_LHS10K", tss, raw_features, target,
                 scaling_method=scaler, use_null=use_null)

    # Save data scalers
    joblib.dump(ds.x_scaler, "{}/x_scaler.save".format(output_folder))
    joblib.dump(ds.y_scaler, "{}/y_scaler.save".format(output_folder))

    # Initialize untrained emulator
    if method == 'gp':
        from gp import GPRegressor
        emu = GPRegressor(ds, output_folder)
    elif method == 'mlp':
        from mlp import MLPRegressor
        emu = MLPRegressor(ds, output_folder)
    elif method == 'xgb':
        from xgb import XGBoostRegressor
        emu = XGBoostRegressor(ds, output_folder)
    else:
        exit('Method not available!')

    # For GP with TSS > 2000, reuse an existing model trained with TSS=2000
    # if one is available; otherwise run hyperparameter optimization.
    gp_pkl = "{}/{}/gp/2000/best/best.pkl".format(root, target)
    if (tss > 2000) and (method == "gp") and isfile(gp_pkl):
        emu.best_model = gp_pkl
    else:
        # Regressor hyperparameter optimization
        print('\n\nSearching for optimal regressor architecture')
        print('\n\tUsing 5-fold cross-validation with an 80/20 split\n')
        print('\tTraining set (N={})'.format(int(len(ds.collision_ids) * 0.8)))
        print('\tValidation set (N={})'.format(int(len(ds.collision_ids) * 0.2)))
        hpo_metrics = emu.hpo(hpo_evals)
        print('\n\tBest mean regressor performance during HPO:')
        report_metrics(hpo_metrics)
        append_metrics('hpo', hpo_metrics, metrics_file)

    # Load test data
    print('\n\n\nLoading test data')
    dt = Dataset("12D_LHS500", vss, raw_features, target,
                 scaling_method=scaler, use_null=False,
                 external_x_scaler=ds.x_scaler, external_y_scaler=ds.y_scaler)

    print('\n\nEvaluating assuming perfect nan classification:')
    print('\n\tx: {}'.format(len(dt.scaled_x)))
    print('\ty: {}'.format(len(dt.scaled_y)))
    _, y_true_nonan, y_pred_nonan = emu.evaluate_best(dt)
    metrics_nonan = emu._regression_metrics(y_true_nonan, y_pred_nonan)
    report_metrics(metrics_nonan)
    append_metrics('nonan', metrics_nonan, metrics_file)

    results_file = '{}/results_nonan.csv'.format(output_folder)
    save_results(dt.x, y_true_nonan, y_pred_nonan, results_file,
                 dt.collision_ids, target, method, tss, scaler, hpo_evals,
                 use_null)
    figure_name = '{}/correlation_nonan.png'.format(output_folder)
    correlations(dt.x, y_true_nonan, y_pred_nonan, figure_name, metrics_nonan)

    ###########################################################################
    print('\n\nEvaluating model assuming perfect null classification:')
    (ids_pzc, x_input, m_tot, j_tot, y_true_nonan_nonull,
     y_pred_nonan_nonull) = perfect_null_classifier(
         dt.collision_ids, dt.x.values, dt.x.mtotal, dt.J_tot,
         y_true_nonan, y_pred_nonan, target)
    x_input = pd.DataFrame(x_input, columns=dt.x.columns)
    dx_pzc = x_input.copy()
    dx_pzc['collision_id'] = ids_pzc

    # Evaluate regressor with pre-classification
    metrics_nonan_nonull = emu._regression_metrics(y_true_nonan_nonull,
                                                   y_pred_nonan_nonull)
    print('\n\tx: {}'.format(len(x_input)))
    print('\ty: {}'.format(len(y_true_nonan_nonull)))
    report_metrics(metrics_nonan_nonull)
    append_metrics('nonan_nonull', metrics_nonan_nonull, metrics_file)

    results_file = '{}/results_pnc_pzc.csv'.format(output_folder)
    save_results(dx_pzc, y_true_nonan_nonull, y_pred_nonan_nonull,
                 results_file, ids_pzc, target, method, tss, scaler,
                 hpo_evals, use_null)
    figure_name = '{}/correlation_nonan_nonull.png'.format(output_folder)
    correlations(x_input, y_true_nonan_nonull, y_pred_nonan_nonull,
                 figure_name, metrics_nonan_nonull)

    ###########################################################################
    print('\n\nEvaluating model with physics enforcement:')
    y_pred_phys = enforce_physics(y_true_nonan_nonull, y_pred_nonan_nonull,
                                  m_tot, j_tot, target)
    print('\n\tx: {}'.format(len(m_tot)))
    print('\ty_true: {}'.format(len(y_true_nonan_nonull)))
    print('\ty_pred: {}'.format(len(y_pred_nonan_nonull)))
    print('\ty_phys: {}'.format(len(y_pred_phys)))

    # Evaluate regressor with physics enforcement
    metrics_phys = emu._regression_metrics(y_true_nonan_nonull, y_pred_phys)
    report_metrics(metrics_phys)
    append_metrics('physics', metrics_phys, metrics_file)

    # Save and plot the physics-enforced results
    results_file = '{}/results_nonan_nonull_phys.csv'.format(output_folder)
    save_results(dx_pzc, y_true_nonan_nonull, y_pred_phys, results_file,
                 ids_pzc, target, method, tss, scaler, hpo_evals, use_null)
    figure_name = '{}/correlation_nonan_nonull_phys.png'.format(output_folder)
    correlations(x_input, y_true_nonan_nonull, y_pred_phys, figure_name,
                 metrics_phys)
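# A minimal sketch of the physics-enforcement step used above, under the
# assumption that a mass-like target cannot exceed the total colliding mass
# and an angular-momentum-like target cannot exceed the total available J.
# The real enforce_physics() lives elsewhere in this codebase; the target
# groupings below (MASS_TARGETS, SPIN_TARGETS) are illustrative only.
import numpy as np

MASS_TARGETS = {'mlr', 'mslr'}   # hypothetical mass-like targets
SPIN_TARGETS = {'j_lr'}          # hypothetical angular-momentum-like targets

def enforce_physics_sketch(y_true, y_pred, m_tot, j_tot, target):
    """Clip predictions into their physically allowed range.

    y_true is kept only for call-signature parity with the real helper.
    """
    y_phys = np.asarray(y_pred, dtype=float).copy()
    if target in MASS_TARGETS:
        y_phys = np.clip(y_phys, 0.0, np.asarray(m_tot, dtype=float))
    elif target in SPIN_TARGETS:
        y_phys = np.clip(y_phys, 0.0, np.asarray(j_tot, dtype=float))
    else:
        y_phys = np.clip(y_phys, 0.0, None)  # non-negativity only
    return y_phys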
# %%
import pandas as pd
import numpy as np

train = pd.read_csv("data/regression/data.activation.train.100.csv")
test = pd.read_csv("data/regression/data.activation.test.100.csv")
X_train, y_train = train.iloc[:, :-1], train.y
X_test, y_test = test.iloc[:, :-1], test.y

# %%
from mlp import MLPRegressor

estimator = MLPRegressor(random_seed=12369666, num_iterations=10000)
print('R^2 score:', estimator.fit(X_train, y_train).score(X_test, y_test))

# %%
from mlp import Visualizer

vis = Visualizer()
vis.plot_train_test_error(estimator, X_train, y_train, X_test, y_test,
                          log_scale=True, show=True)

# %%
from mlp import Visualizer

vis = Visualizer()
vis.plot_regression_result(estimator, X_test.x, y_test, show=True)

# %%
from mlp import Visualizer

vis = Visualizer()
vis.plot_regression_dataset(X_train, y_train, show=True)
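# %%
# Optional sanity check (not part of the original run): compare the custom
# estimator against scikit-learn's MLPRegressor on the same split. This is
# only a rough baseline; the hyperparameters below are illustrative guesses.
from sklearn.neural_network import MLPRegressor as SklearnMLPRegressor

baseline = SklearnMLPRegressor(hidden_layer_sizes=(100,), max_iter=10000,
                               random_state=12369666)
baseline.fit(X_train, y_train)
print('sklearn baseline R^2:', baseline.score(X_test, y_test))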
def run_cross_validation(model_path: str):
    full_results = {}
    current_location = os.getcwd()

    df = pd.read_csv(os.path.join("data", "screening_study_with_article_info.csv"),
                     encoding="utf-8")
    text_df = pd.read_csv(os.path.join("data", "text_data.csv"),
                          encoding="utf-8", index_col=0)

    y = df["Reading_Time"]
    df.drop("Reading_Time", axis=1, inplace=True)
    df.drop("Duration_Entire_Survey", axis=1, inplace=True)
    df = pd.get_dummies(df)

    full_train = df.copy(deep=True)
    full_train["Reading_Time"] = y
    full_train = full_train.merge(text_df, on="Survey_Num", how="outer")
    full_train = full_train[["text", "Reading_Time"]]
    text_only = full_train[["text"]]

    ### Gather surprisal baselines ###
    surprisal_only = pd.read_csv(
        os.path.join("data", "article_num_to_surprisal_only.csv"),
        header=0, index_col=0)
    surprisal_only_dict = dict(zip(surprisal_only.article_num,
                                   surprisal_only.surprisal))

    # a function to map the article num to the surprisal sum
    def data_to_surprisal_only(train_data):
        return train_data["Survey_Num"].apply(
            lambda x: surprisal_only_dict[x]).to_numpy().reshape(-1, 1)

    # gather the summed RTs from the LMM
    with open(os.path.join("data", "article_num_to_predictions_lmm.json"),
              "r") as fin:
        lmm_surprisal = json.load(fin)

    # shuffle must be True when a random_state is given
    cv = KFold(n_splits=10, random_state=SEED, shuffle=True)
    data_only_cv = {
        "rf": {"rmse": [], "mae": []},
        "knn": {"rmse": [], "mae": []},
        "mlp": {"rmse": [], "mae": []},
        "lr": {"rmse": [], "mae": []},
        "lr-basic": {"rmse": [], "mae": []},
        "std": {"rmse": [], "mae": []},
        "lmm": {"rmse": [], "mae": []},
        "lr-surp": {"rmse": [], "mae": []},
    }

    """
    Data Only Models:
        includes:
            MLP Regressor
            Random Forest
            Linear Regression (word-only, regular, surprisal-only)
            Standard 240 WPM Model
            KNN
            RT LMM model (used by dict params from previous train)
    """
    print("Training data only models ...")
    for index, (train_index, test_index) in enumerate(cv.split(df)):
        print("On training split {}".format(index + 1))

        # instantiate models
        mlp = MLPRegressor(df.shape[1], 100, 1)
        rf = RandomForestRegressor(random_state=SEED)
        knn = KNeighborsRegressor()
        lr = LinearRegression()
        lr_surp = LinearRegression()  # truly awful, not enough info to do well
        basic_lr = LinearRegression()
        std = StandardModel()
        surprisal_lmm = SurprisalModelLMM(lmm_surprisal)

        # set up datasets
        curr_xtrain = df.iloc[train_index.tolist(), :]
        curr_xtest = df.iloc[test_index.tolist(), :]
        curr_ytrain = y.iloc[train_index.tolist()]
        curr_ytest = y.iloc[test_index.tolist()]

        # fit
        mlp = fit_nn(mlp, curr_xtrain, curr_ytrain, n_epochs=100)
        rf.fit(curr_xtrain, curr_ytrain)
        knn.fit(curr_xtrain, curr_ytrain)
        lr.fit(curr_xtrain, curr_ytrain)
        basic_lr.fit(curr_xtrain[["Num_Words"]], curr_ytrain)
        lr_surp.fit(data_to_surprisal_only(curr_xtrain), curr_ytrain)

        # calculate scores
        pred_mlp = predict_nn(mlp, curr_xtest)
        scores = get_scores(curr_ytest, pred_mlp, verbose=False)
        add_to_cv_dict(data_only_cv, "mlp", scores)

        pred_rf = rf.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_rf, verbose=False)
        add_to_cv_dict(data_only_cv, "rf", scores)

        pred_knn = knn.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_knn, verbose=False)
        add_to_cv_dict(data_only_cv, "knn", scores)

        pred_lr = lr.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_lr, verbose=False)
        add_to_cv_dict(data_only_cv, "lr", scores)

        pred_lr_basic = basic_lr.predict(curr_xtest[["Num_Words"]])
        scores = get_scores(curr_ytest, pred_lr_basic, verbose=False)
        add_to_cv_dict(data_only_cv, "lr-basic", scores)

        # predict with lr_surp (the original mistakenly reused basic_lr here)
        pred_lr_surp = lr_surp.predict(data_to_surprisal_only(curr_xtest))
        scores = get_scores(curr_ytest, pred_lr_surp, verbose=False)
        add_to_cv_dict(data_only_cv, "lr-surp", scores)

        pred_std = std.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_std, verbose=False)
        add_to_cv_dict(data_only_cv, "std", scores)

        pred_lmm = surprisal_lmm.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_lmm, verbose=False)
        add_to_cv_dict(data_only_cv, "lmm", scores)

    # print out extracted data only reports
    for key, value in data_only_cv.items():
        for metric, scores in value.items():
            print('The model {} got an average of {} for {}'.format(
                key, np.mean(scores), metric))

    for model_dir in glob.glob(os.path.join(model_path, "*")):
        model_name = os.path.basename(model_dir)
        print("Evaluating model", model_name)
        text_only_cv = {
            f"{model_name}": {"rmse": [], "mae": []},
        }
        stacked_cv = {
            f"{model_name}/MLP": {"rmse": [], "mae": []},
        }

        trained_model = TextRegressor.load(os.path.join(model_dir,
                                                        "best-model.pt"))
        doc_embedder = trained_model.document_embeddings

        # probe the embedder once to learn the embedding size
        sentence = Sentence('The grass is green . And the sky is blue .')
        doc_embedder.embed(sentence)
        test_embed = sentence.get_embedding()
        assert test_embed is not None, "embedded a None object"
        EMBEDDING_SIZE = test_embed.shape[0]

        # build combined text and embedding vector
        embedding_df = build_embedding_df(
            text_only, doc_embedder, embed_size=EMBEDDING_SIZE,
            shorten=(model_name == "roBERTa"))
        combined_df = pd.concat([df, embedding_df], axis=1)
        assert (combined_df.shape[0] == df.shape[0]
                and combined_df.shape[1] == df.shape[1] + embedding_df.shape[1]), \
            "shapes were not aligned: df {} combined {}, embed {}".format(
                df.shape, combined_df.shape, embedding_df.shape)

        """
        Text Only (Embedding) Models:
            includes:
                LSTM
                Transformers
        """
        print("Training text only models ...")
        for train_index, test_index in cv.split(df):
            # instantiate models
            mlp = MLPRegressor(test_embed.shape[0], 100, 1)

            # set up datasets and combine with text
            curr_xtrain, curr_xtest, curr_ytrain, curr_ytest = get_splits(
                train_index, test_index, embedding_df, y)

            # fit
            mlp = fit_nn(mlp, curr_xtrain, curr_ytrain, n_epochs=100)

            # calculate scores
            pred_mlp = predict_nn(mlp, curr_xtest)
            scores = get_scores(curr_ytest, pred_mlp, verbose=False)
            add_to_cv_dict(text_only_cv, model_name, scores)

        # print out extracted text only reports
        for key, value in text_only_cv.items():
            for metric, scores in value.items():
                print('The model {} got an average of {} for {}'.format(
                    key, np.mean(scores), metric))

        """
        Stacked Models:
            includes:
                LSTM / MLP Regressor
                Transformers / MLP Regressor
        """
        print("Training stacked models ...")
        for train_index, test_index in cv.split(df):
            # instantiate models
            mlp = MLPRegressor(df.shape[1] + test_embed.shape[0], 100, 1)

            # set up datasets and combine with text
            curr_xtrain, curr_xtest, curr_ytrain, curr_ytest = get_splits(
                train_index, test_index, combined_df, y)

            # fit
            mlp = fit_nn(mlp, curr_xtrain, curr_ytrain, n_epochs=100)

            # calculate scores
            pred_mlp = predict_nn(mlp, curr_xtest)
            scores = get_scores(curr_ytest, pred_mlp, verbose=False)
            add_to_cv_dict(stacked_cv, model_name + "/MLP", scores)

        # print out extracted stacked-model reports
        for key, value in stacked_cv.items():
            for metric, scores in value.items():
                print('The model {} got an average of {} for {}'.format(
                    key, np.mean(scores), metric))

        full_results = {**full_results, **stacked_cv, **text_only_cv}

    #### create overall dataframe from results ####
    rows = []
    for group_name, dict_results in [("Data-Only", data_only_cv),
                                     ("Full_Results", full_results)]:
        for key, value in dict_results.items():
            rows.append({"Group": group_name, "Name": key,
                         "MAE": np.mean(value["mae"]),
                         "MAE-STD": np.std(value["mae"]),
                         "RMSE": np.mean(value["rmse"]),
                         "RMSE-STD": np.std(value["rmse"])})
    results = pd.DataFrame(rows, columns=["Group", "Name", "MAE", "MAE-STD",
                                          "RMSE", "RMSE-STD"])
    assert results.shape[0] != 0, "no data added to the dataframe"

    if not os.path.isdir("results"):
        os.makedirs("results")
    results.to_csv("results/all_results-{}-{}.csv".format(
        SEED, model_path.split("/")[0]))
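# A minimal sketch of the two scoring helpers used throughout
# run_cross_validation(), assuming they compute RMSE/MAE and append them to
# the per-model dictionaries; the real definitions live elsewhere in this repo.
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

def get_scores_sketch(y_true, y_pred, verbose=True):
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    mae = float(mean_absolute_error(y_true, y_pred))
    if verbose:
        print("RMSE: {:.3f}, MAE: {:.3f}".format(rmse, mae))
    return {"rmse": rmse, "mae": mae}

def add_to_cv_dict_sketch(cv_dict, model_key, scores):
    # append each metric from one CV fold to the running lists
    for metric, score in scores.items():
        cv_dict[model_key][metric].append(score)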
import numpy as np
# The MLPRegressor used below is a custom class (list-of-layer-sizes
# constructor with fit(steps=...) and eval()); its import is assumed to
# happen above this excerpt.

# Illustrative grid setup (values assumed; the excerpt begins after x, y, N
# and num_train were defined):
N = 50
num_train = int(0.8 * N * N)
x = np.linspace(-1.0, 1.0, N)
y = np.linspace(-1.0, 1.0, N)

# Build a 2-D product dataset z = x * y on an N x N grid, then shuffle and
# split it into train and test sets.
xc, yc = np.meshgrid(x, y)
zc = xc * yc
z = np.stack([xc, yc], 0)
z = np.transpose(np.reshape(z, (2, N * N)))
zc = np.reshape(zc, (N * N, ))
zzc = np.concatenate([z, zc[:, np.newaxis]], axis=1)
np.random.shuffle(zzc)

zzc_train = zzc[:num_train, :]
zzc_test = zzc[num_train:, :]
x_train = zzc_train[:, :2]
z_train = zzc_train[:, 2:]
x_test = zzc_test[:, :2]
z_test = zzc_test[:, 2:]

# Create regressor
regressor = MLPRegressor([2, 10, 10, 1], 'relu', learning_rate=0.001,
                         optimizer='Adam')

# Fitting
regressor.fit(x=x_train, y=z_train, steps=100000)

# Evaluation
l2_error = regressor.eval(x=x_test, y=z_test)
print('l2 error: {}'.format(l2_error))
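# Optional cross-check (not in the original): fit scikit-learn's MLPRegressor
# on the same product-function data to get an independent error estimate with
# a comparable architecture; the hyperparameters here are illustrative.
from sklearn.neural_network import MLPRegressor as SklearnMLP

sk = SklearnMLP(hidden_layer_sizes=(10, 10), activation='relu',
                learning_rate_init=0.001, max_iter=5000, random_state=0)
sk.fit(x_train, z_train.ravel())
pred = sk.predict(x_test)
print('sklearn RMSE: {:.4f}'.format(
    float(np.sqrt(np.mean((pred - z_test.ravel()) ** 2)))))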
# %%
from mlp import MLPRegressor, activation_functions, \
    error_functions, GridSearch
import pandas as pd

param_grid = {
    'hidden_layers': [[], [5, 5, 5, 5]],
    'num_iterations': [10, 100, 1000],
    'eta': [0.005, 0.1],
    'batch_portion': [0.1, 0.5, 1],
    'bias': [True, False],
    'activation_function': [activation_functions.sigmoid,
                            activation_functions.tanh],
    'error_function': [error_functions.mean_squared,
                       error_functions.mean],
    'moment': [0, 0.2, 0.5]
}

gs = GridSearch(MLPRegressor(random_seed=12369666), param_grid)

train = pd.read_csv("data/regression/data.cube.train.100.csv")
test = pd.read_csv("data/regression/data.cube.test.100.csv")
X_train, y_train = train.iloc[:, :-1], train.y
X_test, y_test = test.iloc[:, :-1], test.y

gs.fit(X_train, y_train, X_test, y_test)

# %%
with open("grid_search_scores.txt", "w+") as f:
    f.write(str(gs.param_scores_))

# %%
print(gs.param_scores_)
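# %%
# A minimal sketch of what a grid search over param_grid could look like
# without the project's GridSearch class; it assumes the estimator can be
# rebuilt from keyword arguments and exposes fit() and score(). This is an
# illustration, not the repo's implementation.
from itertools import product

def grid_search_sketch(estimator_cls, param_grid, X_train, y_train,
                       X_test, y_test, **fixed_kwargs):
    keys = list(param_grid)
    scores = []
    for values in product(*(param_grid[k] for k in keys)):
        params = dict(zip(keys, values))
        est = estimator_cls(**params, **fixed_kwargs)
        est.fit(X_train, y_train)
        scores.append((params, est.score(X_test, y_test)))
    return scores

# e.g. grid_search_sketch(MLPRegressor, param_grid, X_train, y_train,
#                         X_test, y_test, random_seed=12369666)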
    vis.plot_regression_result(
        estimator, X_train.x, y_train,
        show=True,
        save_path=f'results/regression/{name}-{size}-result-train.png')
    vis.plot_train_test_error(
        estimator, X_train, y_train, X_test, y_test,
        show=True,
        save_path=f'results/regression/{name}-{size}-error.png')
    return score

# %% activation
scores = []
for i in range(10):
    estimator = MLPRegressor(hidden_layers=[5],
                             num_iterations=1000, eta=0.005,
                             batch_portion=0.7,
                             bias=False, moment=0.5,
                             activation_function=activation_functions.sigmoid,
                             error_function=error_functions.mean_squared,
                             random_seed=12369666 + i)
    score = process_regression_dataset(
        'activation', estimator,
        datasets_path_format='data/regression/data.{}.{}.{}.csv',
        draw=(i == 0))
    scores.append(score)
print(f'activation: mean - {round(np.mean(scores), 4)}, ' +
      f'std - {round(np.std(scores), 4)}')

# %% cube
scores = []
for i in range(10):
    estimator = MLPRegressor(activation_function = \
                                        y_test, show=True)
        print('Computing and plotting errors on train and test datasets '
              'for each iteration... (might take a while)')
        vis.plot_train_test_error(clf, X_train, y_train, X_test, y_test,
                                  show=True)
        print('Plotting edge weights during training...')
        vis.plot_training_history('training_data.joblib')
        print('Finished')

    if config['problem_type'] == 'regression':
        estimator = MLPRegressor(num_iterations=config['iterations'],
                                 bias=config['bias'],
                                 hidden_layers=config['hidden_layers'],
                                 eta=config['learning_rate'],
                                 moment=config['moment'],
                                 batch_portion=config['batch_portion'],
                                 random_seed=config['random_seed'],
                                 activation_function=activation_function,
                                 error_function=error_function)
        estimator = estimator.fit(X_train, y_train,
                                  serialize_path='training_data.joblib')
        print('Regression R^2 score:', estimator.score(X_test, y_test))
        print('Plotting training dataset...')
        vis.plot_regression_dataset(X_train.iloc[:, 0], y_train, show=True)
        print('Plotting test dataset...')
        vis.plot_regression_dataset(X_test.iloc[:, 0], y_test, show=True)
        print('Plotting regression result for train data...')
        vis.plot_regression_result(estimator, X_train.iloc[:, 0], y_train,
                                   show=True)
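# A minimal example of the config dict this driver expects, inferred from the
# keys read above; the concrete values are illustrative, and the activation/
# error entries assume the driver maps string names to the corresponding
# functions (activation_function, error_function) before this point.
config = {
    'problem_type': 'regression',      # or 'classification'
    'iterations': 1000,
    'bias': True,
    'hidden_layers': [5, 5],
    'learning_rate': 0.005,
    'moment': 0.5,
    'batch_portion': 0.7,
    'random_seed': 12369666,
    'activation_function': 'sigmoid',  # resolved to activation_functions.sigmoid
    'error_function': 'mean_squared',  # resolved to error_functions.mean_squared
}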