Example #1
File: main.py Project: apwan/timbre
'''
# Alternative: scikit-learn's MLPClassifier on one-hot labels.
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(64, 16, 2, 16), activation='tanh')
Y_one_hot = np.array([np.eye(11)[label] for label in Y])
clf.fit(X, Y_one_hot)
Predicted_Y = clf.predict_proba(valid_X).argmax(axis=1)
print(np.mean(np.asarray(valid_Y) == Predicted_Y))
'''
'''
# Alternative: the project's own MLP classifier.
from mlp import MLP

clf = MLP()
clf.fit(X, Y)
Predicted_Y = clf.predict(valid_X)
print(np.mean(np.asarray(valid_Y) == Predicted_Y))
'''

import numpy as np
import matplotlib.pyplot as plt

from mlp import MLPRegressor

clf = MLPRegressor()
clf.fit(X)
X_transformed = clf.transform(X)
print(X_transformed.shape)

for i in range(11):
    indices = Y == i
    plt.scatter(X_transformed[indices][:, 0],
                X_transformed[indices][:, 1],
                c=[np.random.rand(3)],  # wrap in a list: one RGB color per class
                label=str(i))
plt.legend()
plt.show()
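
# Aside: the one-hot trick in the first commented block works because row i of
# np.eye(11) is the one-hot vector for class i. A minimal self-contained check:
import numpy as np

labels = np.array([0, 3, 10])
one_hot = np.eye(11)[labels]
assert one_hot.shape == (3, 11)
assert (one_hot.argmax(axis=1) == labels).all()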
Example #2
def train(method, target, tss, hpo_evals, kfolds, scaler, use_null, root):
    """Train single-target [method] regressor for [target].

    Args:
        method:    Regression method: GP, MLP, or XGB.
        target:    Post-impact property to regress.
        tss:       Training set size.
        hpo_evals: Number of optimization steps.
        kfolds:    Number of folds in cross-validation.
        scaler:    Standardization method.
        use_null:  Train with zero values?


    Returns:
        r2_score.  Coefficient of determination.
    """

    # Pre-impact features to use in training and prediction
    raw_features = [
        'mtotal', 'gamma', 'b_inf', 'v_inf', 'targ_core', 'targ_omega_norm',
        'targ_theta', 'targ_phi', 'proj_core', 'proj_omega_norm', 'proj_theta',
        'proj_phi'
    ]

    # Validation set size
    vss = 500

    welcome(use_null)

    output_folder, metrics_file = make_dir_struct(root, method, target, tss)

    print('\nLoading training dataset (LHS10K)')

    ds = Dataset("12D_LHS10K",
                 tss,
                 raw_features,
                 target,
                 scaling_method=scaler,
                 use_null=use_null)

    # Save data scalers
    joblib.dump(ds.x_scaler, "{}/x_scaler.save".format(output_folder))
    joblib.dump(ds.y_scaler, "{}/y_scaler.save".format(output_folder))

    # Initialize untrained emulator
    if method == 'gp':
        from gp import GPRegressor
        emu = GPRegressor(ds, output_folder)
    elif method == 'mlp':
        from mlp import MLPRegressor
        emu = MLPRegressor(ds, output_folder)
    elif method == 'xgb':
        from xgb import XGBoostRegressor
        emu = XGBoostRegressor(ds, output_folder)
    else:
        exit('Method not available!')

    if (tss > 2000) and (method == "gp"):

        # Check if model exists for GP with TSS=2000
        gp_pkl = "{}/{}/gp/2000/best/best.pkl".format(output_root, target)

        if isfile(gp_pkl):

            emu.best_model = gp_pkl

    else:

        # Regressor hyperparameter optimization
        print('\n\nSearching for optimal regressor architecture')
        print('\n\tUsing 5-fold cross-validation with an 80/20 split\n')
        print('\tTraining set   (N={})'.format(int(
            len(ds.collision_ids) * 0.8)))
        print('\tValidation set (N={})'.format(int(
            len(ds.collision_ids) * 0.2)))

        hpo_metrics = emu.hpo(hpo_evals)

        print('\n\tBest mean regressor performance during HPO:')
        report_metrics(hpo_metrics)

        append_metrics('hpo', hpo_metrics, metrics_file)

    # Load test data
    print('\n\n\nLoading test data')

    dt = Dataset("12D_LHS500",
                 vss,
                 raw_features,
                 target,
                 scaling_method=scaler,
                 use_null=False,
                 external_x_scaler=ds.x_scaler,
                 external_y_scaler=ds.y_scaler)

    print('\n\nEvaluating assuming perfect nan classification:')

    print('\n\tx: {}'.format(len(dt.scaled_x)))
    print('\ty: {}'.format(len(dt.scaled_y)))

    _, y_true_nonan, y_pred_nonan = emu.evaluate_best(dt)

    metrics_nonan = emu._regression_metrics(y_true_nonan, y_pred_nonan)

    report_metrics(metrics_nonan)
    append_metrics('nonan', metrics_nonan, metrics_file)

    results_file = '{}/results_nonan.csv'.format(output_folder)

    save_results(dt.x, y_true_nonan, y_pred_nonan, results_file,
                 dt.collision_ids, target, method, tss, scaler, hpo_evals,
                 use_null)

    figure_name = '{}/correlation_nonan.png'.format(output_folder)

    correlations(dt.x, y_true_nonan, y_pred_nonan, figure_name, metrics_nonan)

    ############################################################################

    print('\n\nEvaluating model assuming perfect null classification:')

    ids_pzc, x_input, m_tot, j_tot, y_true_nonan_nonull, y_pred_nonan_nonull = perfect_null_classifier(
        dt.collision_ids, dt.x.values, dt.x.mtotal, dt.J_tot, y_true_nonan,
        y_pred_nonan, target)

    x_input = pd.DataFrame(x_input, columns=dt.x.columns)

    dx_pzc = x_input.copy()

    dx_pzc['collision_id'] = ids_pzc

    # Evaluate regressor with pre-classification
    metrics_nonan_nonull = emu._regression_metrics(y_true_nonan_nonull,
                                                   y_pred_nonan_nonull)

    print('\n\tx: {}'.format(len(x_input)))
    print('\ty: {}'.format(len(y_true_nonan_nonull)))

    report_metrics(metrics_nonan_nonull)
    append_metrics('nonan_nonull', metrics_nonan_nonull, metrics_file)

    results_file = '{}/results_pnc_pzc.csv'.format(output_folder)

    save_results(dx_pzc, y_true_nonan_nonull, y_pred_nonan_nonull,
                 results_file, ids_pzc, target, method, tss, scaler, hpo_evals,
                 use_null)

    figure_name = '{}/correlation_nonan_nonull.png'.format(output_folder)

    correlations(x_input, y_true_nonan_nonull, y_pred_nonan_nonull,
                 figure_name, metrics_nonan_nonull)

    ############################################################################

    print('\n\nEvaluating model with physics enforcement:')

    y_pred_phys = enforce_physics(y_true_nonan_nonull, y_pred_nonan_nonull,
                                  m_tot, j_tot, target)

    print('\n\tx:      {}'.format(len(m_tot)))
    print('\ty_true: {}'.format(len(y_true_nonan_nonull)))
    print('\ty_pred: {}'.format(len(y_pred_nonan_nonull)))
    print('\ty_phys: {}'.format(len(y_pred_phys)))

    # Evaluate regressor with physics enforcement
    metrics_phys = emu._regression_metrics(y_true_nonan_nonull, y_pred_phys)

    report_metrics(metrics_phys)
    append_metrics('physics', metrics_phys, metrics_file)

    results_file = '{}/results_nonan_nonull_phys.csv'.format(output_folder)

    save_results(dx_pzc, y_true_nonan_nonull, y_pred_phys, results_file,
                 ids_pzc, target, method, tss, scaler, hpo_evals,
                 use_null)

    figure_name = '{}/correlation_nonan_nonull_phys.png'.format(output_folder)

    correlations(x_input, y_true_nonan_nonull, y_pred_phys, figure_name,
                 metrics_phys)
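
# A hypothetical invocation of train(); every value below is an illustrative
# assumption (the target name, scaler id, and root path are not from the source).
train(method='xgb',
      target='mass_largest',   # hypothetical post-impact property
      tss=2000,
      hpo_evals=100,
      kfolds=5,
      scaler='standard',       # hypothetical scaler identifier
      use_null=False,
      root='output')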
Example #3
# %%
import pandas as pd
import numpy as np
train = pd.read_csv("data/regression/data.activation.train.100.csv")
test = pd.read_csv("data/regression/data.activation.test.100.csv")

X_train, y_train = train.iloc[:, :-1], train.y
X_test, y_test = test.iloc[:, :-1], test.y

# %%
from mlp import MLPRegressor

estimator = MLPRegressor(random_seed = 12369666, \
    num_iterations = 10000)
print('R^2 score:', estimator.fit(X_train, y_train).score(X_test, y_test))

# %%
from mlp import Visualizer
vis = Visualizer()
vis.plot_train_test_error(estimator, X_train, y_train, \
    X_test, y_test, log_scale=True, show = True)

# %%
from mlp import Visualizer
vis = Visualizer()
vis.plot_regression_result(estimator, X_test.x, y_test, show=True)

# %%
from mlp import Visualizer
vis = Visualizer()
vis.plot_regression_dataset(X_train, y_train, show=True)
Example #4
def run_cross_validation(model_path: str):
    full_results = {}
    current_location = os.getcwd()
    df = pd.read_csv(os.path.join("data", "screening_study_with_article_info.csv"), encoding="utf-8")
    text_df = pd.read_csv(os.path.join("data", "text_data.csv"), encoding="utf-8", index_col=0)
    y = df["Reading_Time"]
    df.drop("Reading_Time", axis=1, inplace=True)
    df.drop("Duration_Entire_Survey", axis=1, inplace=True)
    df = pd.get_dummies(df)

    full_train = df.copy(deep=True)
    full_train["Reading_Time"] = y
    full_train = full_train.merge(text_df, on="Survey_Num", how="outer")
    full_train = full_train[["text", "Reading_Time"]]
    text_only = full_train[["text"]]

    ### Gather surprisal baselines ###
    surprisal_only = pd.read_csv(os.path.join("data", "article_num_to_surprisal_only.csv"), header=0, index_col=0)
    surprisal_only_dict = dict(zip(surprisal_only.article_num, surprisal_only.surprisal))
    # a function to map the article num to the surprisal sum
    def data_to_surprisal_only(train_data):
        return train_data["Survey_Num"].apply(lambda x: surprisal_only_dict[x]).to_numpy().reshape(-1, 1)
    # gather the summed RTs from the LMM
    with open(os.path.join("data", "article_num_to_predictions_lmm.json"), "r") as fin:
        lmm_surprisal = json.load(fin)

    cv = KFold(n_splits=10, shuffle=True, random_state=SEED)  # shuffle must be True when random_state is set

    data_only_cv = {
        "rf": {"rmse": [], "mae": []},
        "knn": {"rmse": [], "mae": []},
        "mlp": {"rmse": [], "mae": []},
        "lr": {"rmse": [], "mae": []},
        "lr-basic": {"rmse": [], "mae": []},
        "std": {"rmse": [], "mae": []},
        "lmm": {"rmse": [], "mae": []},
        "lr-surp": {"rmse": [], "mae": []},
    }

    """
    Data Only Models:
    includes:
        MLP Regressor
        Random Forest
        Linear Regression (word-only, regular, surprisal-only)
        Standard 240 WPM Model
        KNN
        RT LMM model (used by dict params from previous train)
    """
    print("Training data only models ...")
    for index, (train_index, test_index) in enumerate(cv.split(df)):
        print("On training split {}".format(index + 1))
        # instantiate models
        mlp = MLPRegressor(df.shape[1], 100, 1)
        rf = RandomForestRegressor(random_state=SEED)
        knn = KNeighborsRegressor()
        lr = LinearRegression()
        lr_surp = LinearRegression()  # surprisal-only baseline; too little information to predict well
        basic_lr = LinearRegression()
        std = StandardModel()
        surprisal_lmm = SurprisalModelLMM(lmm_surprisal)

        # set up datasets
        curr_xtrain = df.iloc[train_index.tolist(), :]
        curr_xtest = df.iloc[test_index.tolist(), :]
        curr_ytrain = y.iloc[train_index.tolist()]
        curr_ytest = y.iloc[test_index.tolist()]

        # fit
        mlp = fit_nn(mlp, curr_xtrain, curr_ytrain, n_epochs=100)
        rf.fit(curr_xtrain, curr_ytrain)
        knn.fit(curr_xtrain, curr_ytrain)
        lr.fit(curr_xtrain, curr_ytrain)
        basic_lr.fit(curr_xtrain[["Num_Words"]], curr_ytrain)
        lr_surp.fit(data_to_surprisal_only(curr_xtrain), curr_ytrain)

        # calculate scores
        pred_mlp = predict_nn(mlp, curr_xtest)
        scores = get_scores(curr_ytest, pred_mlp, verbose=False)
        add_to_cv_dict(data_only_cv, "mlp", scores)

        pred_rf = rf.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_rf, verbose=False)
        add_to_cv_dict(data_only_cv, "rf", scores)

        pred_knn = knn.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_knn, verbose=False)
        add_to_cv_dict(data_only_cv, "knn", scores)

        pred_lr = lr.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_lr, verbose=False)
        add_to_cv_dict(data_only_cv, "lr", scores)

        pred_lr_basic = basic_lr.predict(curr_xtest[["Num_Words"]])
        scores = get_scores(curr_ytest, pred_lr_basic, verbose=False)
        add_to_cv_dict(data_only_cv, "lr-basic", scores)

        pred_lr_surp = lr_surp.predict(data_to_surprisal_only(curr_xtest))
        scores = get_scores(curr_ytest, pred_lr_surp, verbose=False)
        add_to_cv_dict(data_only_cv, "lr-surp", scores)

        pred_std = std.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_std, verbose=False)
        add_to_cv_dict(data_only_cv, "std", scores)

        pred_lmm = surprisal_lmm.predict(curr_xtest)
        scores = get_scores(curr_ytest, pred_lmm, verbose=False)
        add_to_cv_dict(data_only_cv, "lmm", scores)

    # print out data-only model reports
    for key, value in data_only_cv.items():
        for metric, scores in value.items():
            print('The model {} got an average of {} for {}'.format(key, np.mean(scores), metric))


    for model_dir in glob.glob(os.path.join(model_path, "*")):
        model_name = os.path.basename(model_dir)
        print("Evaluating model", model_name)
        text_only_cv = {
            f"{model_name}": {"rmse": [], "mae": []},
        }

        stacked_cv = {
            f"{model_name}/MLP": {"rmse": [], "mae": []},
        }

        trained_model = TextRegressor.load(os.path.join(model_dir, "best-model.pt"))
        doc_embedder = trained_model.document_embeddings
        sentence = Sentence('The grass is green . And the sky is blue .')
        doc_embedder.embed(sentence)
        test_embed = sentence.get_embedding()
        assert test_embed is not None, "embedded a None object"
        EMBEDDING_SIZE = test_embed.shape[0]

        # build combined text and embedding vector
        embedding_df = build_embedding_df(text_only, doc_embedder, embed_size=EMBEDDING_SIZE, shorten=(model_name == "roBERTa"))
        combined_df = pd.concat([df, embedding_df], axis=1)
        assert combined_df.shape[0] == df.shape[0] and combined_df.shape[1] == df.shape[1] + embedding_df.shape[1], \
                "shapes were not aligned: df {} combined {}, embed {}".format(df.shape, combined_df.shape, embedding_df.shape)

        """
        Text Only (Embedding) Models:
        includes:
            LSTM
            Transformers
        # """
        print("Training text only models ...")
        for train_index, test_index in cv.split(df):
            # instantiate models
            mlp = MLPRegressor(test_embed.shape[0], 100, 1)
            # set up datasets and combine with text
            curr_xtrain, curr_xtest, curr_ytrain, curr_ytest = get_splits(train_index, test_index, embedding_df, y)
            # fit
            mlp = fit_nn(mlp, curr_xtrain, curr_ytrain, n_epochs=100)

            # calculate scores
            pred_mlp = predict_nn(mlp, curr_xtest)
            scores = get_scores(curr_ytest, pred_mlp, verbose=False)
            add_to_cv_dict(text_only_cv, model_name, scores)

        # print out text-only model reports
        for key, value in text_only_cv.items():
            for metric, scores in value.items():
                print('The model {} got an average of {} for {}'.format(key, np.mean(scores), metric))


        """
        Stacked Models:
        includes:
            LSTM / MLP Regressor
            Transformers / MLP Regressor
        """
        print("Training stacked models ...")
        for train_index, test_index in cv.split(df):
            # instantiate models
            mlp = MLPRegressor(df.shape[1] + test_embed.shape[0], 100, 1)
            # set up datasets and combine with text
            curr_xtrain, curr_xtest, curr_ytrain, curr_ytest = get_splits(train_index, test_index, combined_df, y)

            # fit
            mlp = fit_nn(mlp, curr_xtrain, curr_ytrain, n_epochs=100)

            # calculate scores
            pred_mlp = predict_nn(mlp, curr_xtest)
            scores = get_scores(curr_ytest, pred_mlp, verbose=False)
            add_to_cv_dict(stacked_cv, model_name + "/MLP", scores)

        # print out stacked model reports
        for key, value in stacked_cv.items():
            for metric, scores in value.items():
                print('The model {} got an average of {} for {}'.format(key, np.mean(scores), metric))

        full_results = {**full_results, **stacked_cv, **text_only_cv}


    #### create overall dataframe from results ####
    results = pd.DataFrame(columns=["Group", "Name", "MAE", "MAE-STD", "RMSE", "RMSE-STD"])
    for (group_name, dict_results) in [("Data-Only", data_only_cv), ("Full_Results", full_results)]:
        for key, value in dict_results.items():
            row = pd.DataFrame([{"Group": group_name, "Name": key, "MAE": np.mean(value["mae"]),
                                 "RMSE": np.mean(value["rmse"]), "RMSE-STD": np.std(value["rmse"]),
                                 "MAE-STD": np.std(value["mae"])}])
            results = pd.concat([results, row], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    assert results.shape[0] != 0, "no data added to the dataframe"
    if not os.path.isdir("results"):
        os.makedirs("results")
    results.to_csv("results/all_results-{}-{}.csv".format(SEED, model_path.split("/")[0]))
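
# Hypothetical usage: "models" is an assumed directory containing one
# sub-directory per trained TextRegressor run (each holding a best-model.pt).
run_cross_validation("models")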
Example #5
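# The snippet below is a fragment; it assumes earlier code defined the grid
# axes and split sizes. A minimal sketch of those assumed definitions
# (all values hypothetical):
import numpy as np
from mlp import MLPRegressor  # assumed import, matching the other examples

N = 100                        # grid resolution
num_train = 8000               # number of training samples (out of N * N = 10000)
x = np.linspace(-1.0, 1.0, N)
y = np.linspace(-1.0, 1.0, N)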
xc, yc = np.meshgrid(x, y)
zc = xc * yc

z = np.stack([xc, yc], 0)
z = np.transpose(np.reshape(z, (2, N * N)))
zc = np.reshape(zc, (N * N, ))

zzc = np.concatenate([z, zc[:, np.newaxis]], axis=1)
np.random.shuffle(zzc)
zzc_train = zzc[:num_train, :]
zzc_test = zzc[num_train:, :]

x_train = zzc_train[:, :2]
z_train = zzc_train[:, 2:]

x_test = zzc_test[:, :2]
z_test = zzc_test[:, 2:]

# Create regressor
regressor = MLPRegressor([2, 10, 10, 1],
                         'relu',
                         learning_rate=0.001,
                         optimizer='Adam')

# Fitting
regressor.fit(x=x_train, y=z_train, steps=100000)

# Evaluation
l2_error = regressor.eval(x=x_test, y=z_test)
print('l2 error: {}'.format(l2_error))
Example #6
    error_functions, GridSearch

param_grid = {
    'hidden_layers': [[], [5, 5, 5, 5]],
    'num_iterations': [10, 100, 1000],
    'eta': [0.005, 0.1],
    'batch_portion': [0.1, 0.5, 1],
    'bias': [True, False],
    'activation_function': [activation_functions.sigmoid, \
        activation_functions.tanh],
    'error_function': [error_functions.mean_squared, \
        error_functions.mean],
    'moment': [0, 0.2, 0.5]
}

gs = GridSearch(MLPRegressor(random_seed = 12369666), param_grid)

train = pd.read_csv("data/regression/data.cube.train.100.csv")
test = pd.read_csv("data/regression/data.cube.test.100.csv")
X_train, y_train = train.iloc[:, :-1], train.y
X_test, y_test = test.iloc[:, :-1], test.y
gs.fit(X_train, y_train, X_test, y_test)

#%%
f = open("grid_search_scores.txt","w+")
f.write(str(gs.param_scores_))
f.close()

# %%
print(gs.param_scores_)
Example #7
            show = True, \
            save_path = f'results/regression/{name}-{size}-result-train.png')
        vis.plot_train_test_error(estimator, \
            X_train, y_train, X_test, y_test, \
            show = True, \
            save_path = f'results/regression/{name}-{size}-error.png')

    return score


# %% activation
scores = []
for i in range(10):
    estimator = MLPRegressor(hidden_layers=[5],\
        num_iterations=1000, eta=0.005, batch_portion=0.7, \
        bias=False, moment=0.5, \
        activation_function=activation_functions.sigmoid, \
        error_function=error_functions.mean_squared,
        random_seed = 12369666 + i)

    score = process_regression_dataset('activation', estimator, \
        datasets_path_format='data/regression/data.{}.{}.{}.csv', \
        draw = (i == 0))

    scores.append(score)
print(f'activation: mean - {round(np.mean(scores), 4)}, ' + \
    f'std - {round(np.std(scores), 4)}')

# %% cube
scores = []
for i in range(10):
    estimator = MLPRegressor(activation_function = \
Example #8
                                   y_test,
                                   show=True)
    print(
        'Computing and plotting errors on train and test datasets for each iteration... (might take a while)'
    )
    vis.plot_train_test_error(clf, X_train, y_train, X_test, y_test, show=True)
    print('Plotting edge weights during training...')
    vis.plot_training_history('training_data.joblib')
    print('Finished')

if config['problem_type'] == 'regression':
    estimator = MLPRegressor(num_iterations = config['iterations'], \
        bias = config['bias'], \
        hidden_layers = config['hidden_layers'], \
        eta = config['learning_rate'], \
        moment = config['moment'], \
        batch_portion = config['batch_portion'], \
        random_seed = config['random_seed'], \
        activation_function = activation_function, \
        error_function = error_function)
    estimator = estimator.fit(X_train, y_train, \
        serialize_path='training_data.joblib')

    print('Regression R^2 score:', estimator.score(X_test, y_test))

    print('Plotting training dataset...')
    vis.plot_regression_dataset(X_train.iloc[:, 0], y_train, show=True)
    print('Plotting test dataset...')
    vis.plot_regression_dataset(X_test.iloc[:, 0], y_test, show=True)
    print('Plotting regression result for test data...')
    vis.plot_regression_result(estimator,