Example #1
def train_simple(data: DataFrame, esti: Estimator, eid: str) -> PipResult:
    """
    Train without cross validation
    """
    try:
        print(f"--- train_simple {eid}")
        # Prepare training and test data.
        df_train, df_test = data.randomSplit([0.9, 0.1], seed=12345)

        # Fit the estimator on the training data.
        trained_model: Transformer = esti.fit(df_train)

        # Make predictions on the test data.
        predictions = trained_model.transform(df_test) \
            .select("features", "label", "prediction")

        # Select (prediction, true label) and compute test error
        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)

        print(f"-- Root Mean Squared Error (RMSE) on test data = {rmse}")
        fnam = cm.fnam(eid)
        hlp.save_model(trained_model, hlp.get_datadir(), fnam)
        print(f"-- saved model to {fnam}")
        return PipResult(rmse, trained_model, "OK")
    except Exception:
        print(tb.format_exc())
        return PipResult(0.0, None, "ERROR")
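A minimal usage sketch for train_simple, assuming an active SparkSession, a DataFrame `df` whose rows are already assembled into "features"/"label" columns, and an illustrative estimator id (none of these names come from the snippet above):

from pyspark.ml.regression import LinearRegression

# Hypothetical call; `df` must already contain "features" and "label" columns.
lr = LinearRegression(maxIter=10, regParam=0.3)
result = train_simple(df, lr, "lr_baseline")
print(result)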
Example #2
 def fit_classifier(self) -> None:
     """
     Fit the classifier and save the model using Pickle.
     :return:
     """
     self.clf.fit(self.X, self.y)
     save_model(self.clf, config.dataset, config.model)
Example #3
def linear_lasso_regression(X, y):
    """
    Fit lasso linear regression model
    :param X: train input
    :param y: train output
    :return: None
    """
    # Train a linear regression model with Lasso regression and assess its performance.
    lasso_regression = Lasso(  # Optimal parameters from a grid search run.
        alpha=0,  # Note: scikit-learn recommends LinearRegression over Lasso with alpha=0.
        tol=0.01,
        selection="random",
        positive=False,
        max_iter=1000,
        normalize=False)
    lasso_regression.fit(X, y)
    quick_prediction_test(lasso_regression, X, y)
    y_prediction = lasso_regression.predict(X)
    evaluate_model_error(lasso_regression, y, y_prediction)
    plot_best_fit(y, y_prediction, "lasso_reg")
    save_model(lasso_regression, "lasso_reg")

    # Using the same model with 5-fold cross validation.
    k_fold_cross_validation(lasso_regression, X, y, folds=5)

    if config.is_grid_search:
        print("\nPerforming grid search to find optimal hyperparameters...")

        # Applying grid search algorithm to the Lasso Regression model
        parameters = {
            "alpha": [0.0001, 0.001, 0.01, 0.1, 0, 1, 10],
            "tol": [0.01, 0.1, 1],
            "positive": [True, False],
            "selection": ["cyclic", "random"]
        }
        lasso_reg_gs_results = grid_search_algorithm(Lasso(),
                                                     X,
                                                     y,
                                                     parameters,
                                                     folds=5)
        lasso_reg_gs_results_df = pd.DataFrame(
            lasso_reg_gs_results.cv_results_)
        lasso_reg_gs_results_df.to_csv(
            "grid_search_results/lasso_reg_grid_search_results.csv")

        # Top 5 hyperparameters found for Lasso Regression.
        print("Top 5 hyperparameters combinations found for Lasso Regression:")
        print(lasso_reg_gs_results_df.sort_values(by=['rank_test_score']).head(5))

        # Best model found by grid search algorithm for Lasso Regression.
        final_lasso_reg_model = lasso_reg_gs_results.best_estimator_
        print(
            "\nBest model hyperparameters found by grid search algorithm for Lasso Regression:"
        )
        print(final_lasso_reg_model)
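grid_search_algorithm is a project helper that is not shown on this page. A plausible minimal sketch, assuming it is a thin wrapper around scikit-learn's GridSearchCV that returns the fitted search object (the scoring metric here is an assumption):

from sklearn.model_selection import GridSearchCV

def grid_search_algorithm(estimator, X, y, parameters, folds=5):
    # Exhaustively search `parameters` with k-fold CV; the returned object
    # exposes cv_results_ and best_estimator_, as used above.
    search = GridSearchCV(estimator, param_grid=parameters, cv=folds,
                          scoring="neg_mean_squared_error", n_jobs=-1)
    search.fit(X, y)
    return search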
Example #4
def linear_ridge_regression(X, y) -> None:
    """
    Fit ridge linear regression model
    :param X: train input
    :param y: train output
    :return: None
    """
    # Train a linear regression model with Ridge regression and assess its performance.
    ridge_regression = Ridge(
        alpha=0.1, solver='svd',
        normalize=False)  # Optimal parameters from a grid search run.
    ridge_regression.fit(X, y)
    quick_prediction_test(ridge_regression, X, y)
    y_prediction = ridge_regression.predict(X)
    evaluate_model_error(ridge_regression, y, y_prediction)
    plot_best_fit(y, y_prediction, "ridge_reg")
    save_model(ridge_regression, "ridge_reg")

    # Using the same model with 10-fold cross validation.
    k_fold_cross_validation(ridge_regression, X, y)

    if config.is_grid_search:
        print("\nPerforming grid search to find optimal hyperparameters...")

        # Applying grid search algorithm to the Ridge Regression model to explore combinations of hyperparameters to
        # find the best hyperparameters for a model, since doing so manually is a very time consuming task.
        parameters = {
            "alpha": [0.1, 1, 10],
            "normalize": [True, False],
            "solver":
            ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        }
        ridge_reg_gs_results = grid_search_algorithm(Ridge(),
                                                     X,
                                                     y,
                                                     parameters,
                                                     folds=5)
        ridge_reg_gs_results_df = pd.DataFrame(
            ridge_reg_gs_results.cv_results_)
        ridge_reg_gs_results_df.to_csv(
            "grid_search_results/ridge_reg_grid_search_results.csv")

        # Top 5 hyperparameters found for Ridge Regression.
        print("Top 5 hyperparameters combinations found for Ridge Regression:")
        print(ridge_reg_gs_results_df.sort_values(by=['rank_test_score']).head(5))

        # Best model found by grid search algorithm for Ridge Regression.
        final_ridge_reg_model = ridge_reg_gs_results.best_estimator_
        print(
            "\nBest model hyperparameters found by grid search algorithm for Ridge Regression:"
        )
        print(final_ridge_reg_model)
Example #5
def decision_tree_regression(X, y):
    """
    Briefly explore decision tree regression model.
    :param X:
    :param y:
    :return:
    """
    tree_regressor = DecisionTreeRegressor()
    tree_regressor.fit(X, y)
    quick_prediction_test(tree_regressor, X, y)
    evaluate_model_error(tree_regressor, y, tree_regressor.predict(X))
    save_model(tree_regressor, "tree_reg")
    k_fold_cross_validation(tree_regressor, X, y)
Example #6
def svm_regression(X, y):
    """
    Briefly explore SVM regression model.
    :param X:
    :param y:
    :return:
    """
    svr = SVR()
    svr.fit(X, y)
    quick_prediction_test(svr, X, y)
    evaluate_model_error(svr, y, svr.predict(X))
    save_model(svr, "svr")
    k_fold_cross_validation(svr, X, y)
Example #7
def mlp_regression(X, y):
    """
    Briefly explore MLP regression model.
    :param X:
    :param y:
    :return:
    """
    mlp_regressor = MLPRegressor()
    mlp_regressor.fit(X, y)
    quick_prediction_test(mlp_regressor, X, y)
    evaluate_model_error(mlp_regressor, y, mlp_regressor.predict(X))
    save_model(mlp_regressor, "mlp_reg")
    k_fold_cross_validation(mlp_regressor, X, y)
Example #8
    def save_load_hlp():
        # Create (or reuse) a SparkSession; required before using Spark ML persistence.
        SparkSession.builder \
            .appName("tryout") \
            .getOrCreate()
        m = LinearRegression(regParam=0.5, maxIter=10)
        pm = m.extractParamMap()
        pprint(pm)
        print()
        # `path` and `nam` are assumed to be defined elsewhere in the module.
        pprint(str(path))
        hlp.save_model(m, path, nam)

        m2 = hlp.load_model(path, nam)
        pm2 = m2.extractParamMap()
        pprint(pm2)
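hlp.save_model and hlp.load_model are project helpers. A minimal sketch of what they might look like, assuming Spark ML's native persistence and that `path` and `nam` name a directory and a file stem:

import os
from pyspark.ml.regression import LinearRegression

def save_model(model, path, nam):
    # Works for anything MLWritable (unfitted estimators and fitted models alike).
    model.write().overwrite().save(os.path.join(str(path), nam))

def load_model(path, nam):
    # Loading is class-specific in Spark ML; LinearRegression is assumed here
    # to match the snippet above.
    return LinearRegression.load(os.path.join(str(path), nam))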
Example #9
def elastic_net_regression(X, y):
    """
    Briefly explore elastic net linear regression model.
    :param X:
    :param y:
    :return:
    """
    elastic_net_regression = ElasticNet()
    elastic_net_regression.fit(X, y)
    quick_prediction_test(elastic_net_regression, X, y)
    evaluate_model_error(elastic_net_regression, y,
                         elastic_net_regression.predict(X))
    save_model(elastic_net_regression, "elasticnet_reg")
    k_fold_cross_validation(elastic_net_regression, X, y)
Example #10
def random_forest_generator_regression(X, y):
    """
    Briefly explore random forest generator regression model.
    :param X:
    :param y:
    :return:
    """
    random_forest_generator_reg = RandomForestRegressor()
    random_forest_generator_reg.fit(X, y)
    quick_prediction_test(random_forest_generator_reg, X, y)
    evaluate_model_error(random_forest_generator_reg, y,
                         random_forest_generator_reg.predict(X))
    save_model(random_forest_generator_reg, "random_forest_generator_reg")
    k_fold_cross_validation(random_forest_generator_reg, X, y)
Example #11
def train(data: DataFrame, esti: Estimator, eid: str,
          param_grid_builder: Callable[[Estimator], list]) -> PipResult:
    try:
        print(f"--- train {eid}")
        # Prepare training and test data.
        df_train, df_test = data.randomSplit([0.9, 0.1], seed=12345)

        # We use a ParamGridBuilder to construct a grid of parameters to search over.
        # TrainValidationSplit will try all combinations of values and determine best model using
        # the evaluator.
        params = param_grid_builder(esti)
        print(f"--- params")
        pprint(params)

        # The estimator is whatever was passed in (e.g. a linear regression).
        # A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
        tvs = TrainValidationSplit(
            estimator=esti,
            estimatorParamMaps=params,
            evaluator=RegressionEvaluator(),
            # 80% of the data will be used for training, 20% for validation.
            trainRatio=0.8)

        # Run TrainValidationSplit, and choose the best set of parameters.
        trained_models: TrainValidationSplitModel = tvs.fit(df_train)

        # Make predictions on test data. model is the model with combination of parameters
        # that performed best.
        predictions = trained_models.transform(df_test) \
            .select("features", "label", "prediction")

        # Select (prediction, true label) and compute test error
        evaluator = RegressionEvaluator(labelCol="label",
                                        predictionCol="prediction",
                                        metricName="rmse")
        rmse = evaluator.evaluate(predictions)

        print(f"-- Root Mean Squared Error (RMSE) on test data = {rmse}")
        fnam = cm.fnam(eid)
        hlp.save_model(trained_models.bestModel, hlp.get_datadir(), fnam)
        print(f"-- saved model to {fnam}")
        return PipResult(rmse, trained_models.bestModel, "OK")
    except Exception:
        print(tb.format_exc())
        return PipResult(0.0, None, "ERROR")
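The param_grid_builder argument receives the estimator and must return a list of ParamMaps. A minimal sketch for a linear regression, using pyspark's ParamGridBuilder (the grid values are illustrative, not taken from the snippet above):

from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder

def lr_param_grid(esti: LinearRegression) -> list:
    return ParamGridBuilder() \
        .addGrid(esti.regParam, [0.01, 0.1, 0.5]) \
        .addGrid(esti.elasticNetParam, [0.0, 0.5, 1.0]) \
        .build()

# result = train(data, LinearRegression(maxIter=10), "lr_grid", lr_param_grid)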
Example #12
def general_linear_regression(X, y) -> None:
    """
    Fit general linear regression model
    :param X: train input
    :param y: train output
    :return: None
    """
    # Train a linear regression model and assess its performance.
    linear_regression = LinearRegression(fit_intercept=True, normalize=True)
    linear_regression.fit(X, y)
    quick_prediction_test(linear_regression, X, y)
    y_prediction = linear_regression.predict(X)
    evaluate_model_error(linear_regression, y, y_prediction)
    plot_best_fit(y, y_prediction, "linear_reg")
    save_model(linear_regression, "linear_reg")

    # Using the same model with 10-fold cross validation.
    k_fold_cross_validation(linear_regression, X, y)
Example #13
from skorch import NeuralNet
from skorch.callbacks import EarlyStopping
from torch.optim import SGD
import numpy as np

import sys

sys.path.append('..')
from helpers import load_data_in_chunks, save_model
from model import Net
from CustomLoss import CustomLoss

(Xs, Ys) = load_data_in_chunks('basic', 'train', chunk_size=5)
Xs = Xs.astype(np.float32)
Ys = Ys.astype(np.float32)

regr = NeuralNet(Net,
                 max_epochs=10000000000,
                 batch_size=100,
                 iterator_train__shuffle=True,
                 criterion=CustomLoss,
                 optimizer=SGD,
                 optimizer__lr=1e-5,
                 optimizer__momentum=0.9,
                 optimizer__nesterov=True,
                 optimizer__dampening=0,
                 verbose=5,
                 callbacks=[('early_stop', EarlyStopping())])
regr.fit(Xs, Ys / 5000)

save_model(regr, 'conv-mse')
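Note that the network is fit on Ys / 5000, so predictions must be scaled back up at inference time; a one-line sketch, assuming the regr fitted above:

y_pred = regr.predict(Xs) * 5000  # undo the / 5000 target scaling applied in fit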
Example #14
def run_nlm(model,
            corpus,
            optimizer=None,
            epochs=1,
            eval_corpus=None,
            status_interval=25,
            str_pattern='{}_{}_epoch_{}.pkl',
            rz_amplifier=5):
    entity_offset = 1

    for epoch in range(1, epochs + 1):
        X_epoch_loss, E_epoch_loss, R_epoch_loss, L_epoch_loss = 0, 0, 0, 0
        epoch_tokens, epoch_r_div, epoch_l_div, epoch_e_div = 0, 0, 0, 0
        epoch_start = time.time()
        count_E = 0
        count_E_correct = 0
        count_R = 0
        r_true_positive = 0
        r_false_positive = 0
        for i_doc, doc in enumerate(corpus.gen()):
            model.reset_state()
            # initialize e_current
            e_current = model.get_new_entity()

            # forward first token through Embedding and RNN
            # initialize states
            # lstm initializes states with zeros when given None
            h_t, states = model.forward_rnn(doc.X[0], states=None)
            h_t = h_t.squeeze(0)

            # initialize loss tensors
            X_loss = torch.tensor(0, dtype=torch.float, device=device)
            E_loss = torch.tensor(0, dtype=torch.float, device=device)
            R_loss = torch.tensor(0, dtype=torch.float, device=device)
            L_loss = torch.tensor(0, dtype=torch.float, device=device)

            # counters to properly divide losses
            r_div = 0
            l_div = 0
            e_div = 0

            # counter for stats
            doc_r_true_positive = 0
            doc_r_false_positive = 0
            doc_count_R = 0

            # iterate over document
            for t in range(doc.X.size(0) - 1):
                # define target values
                next_X = doc.X[t + 1]  # next Token
                next_E = doc.E[t + 1] - entity_offset  # next Entity, offset to match indices with self.entities
                next_R = doc.R[t + 1]  # next R type
                next_L = doc.L[t + 1]  # next Length

                # ***START PAPER ALGORITHM***

                # Define current value for L
                current_L = doc.L[t]
                if current_L == 1:
                    # 1.
                    # last L equals 1: not continuing entity mention

                    # predict next R
                    R_dist = model.get_next_R(h_t)
                    # create loss for R
                    r_current_loss = torch.nn.functional.cross_entropy(
                        R_dist, next_R.view(-1)) * rz_amplifier
                    # r_current_loss is used to make amplification of loss possible
                    R_loss += r_current_loss

                    # add division counter for R loss
                    r_div += 1

                    if next_R == 1:
                        # next token is within an entity mention
                        doc_count_R += 1
                        if R_dist.argmax():
                            # both True - correct pred
                            doc_r_true_positive += 1
                            #R_loss += r_current_loss
                        else:
                            # false negative prediction
                            # extra loss to increase recall
                            R_loss += r_current_loss * rz_amplifier
                            pass

                        # select the entity
                        E_dist = model.get_next_E(h_t, t)
                        # count for stats
                        count_E += 1
                        count_E_correct += int(E_dist.argmax() == next_E)
                        # calculate entity loss
                        E_loss += torch.nn.functional.cross_entropy(
                            E_dist, next_E.view(-1))
                        e_div += 1

                        # register entity
                        model.register_predicted_entity(next_E)

                        # set e_current to entity embedding e_t-1
                        e_current = model.get_entity_embedding(next_E)

                        # predict length of entity and calculate loss
                        L_dist = model.get_next_L(h_t, e_current)
                        L_loss += torch.nn.functional.cross_entropy(
                            L_dist, next_L.view(-1))
                        l_div += 1

                    else:
                        # only for stats and possibility to amplify loss
                        if R_dist.argmax():
                            # wrong True pred
                            doc_r_false_positive += 1
                            # extra loss
                            # R_loss += r_current_loss * rz_amplifier
                        else:
                            # correct False pred
                            # R_loss += r_current_loss
                            pass
                else:
                    # 2. Otherwise
                    # last L unequal 1, continuing entity mention
                    # set last new_L = last_L - 1
                    # new_R = last_R
                    # new_E = last_E

                    # additional prediction for E to get more training cases
                    # (it also makes stats more comparable to deep-mind paper)
                    E_dist = model.get_next_E(h_t, t)
                    count_E += 1
                    count_E_correct += int(E_dist.argmax() == next_E)
                    E_loss += torch.nn.functional.cross_entropy(
                        E_dist, next_E.view(-1))
                    e_div += 1
                    pass

                # 3. Sample X, get distribution for next Token
                X_dist = model.get_next_X(h_t, e_current)
                X_loss += torch.nn.functional.cross_entropy(
                    X_dist, next_X.view(-1))

                # 4. Advance the RNN on predicted token, here in training next token
                h_t, states = model.forward_rnn(doc.X[t + 1], states)
                h_t = h_t.squeeze(0)
                # new hidden state of next token from here (h_t, previous was h_t-1)

                # 5. Update entity state
                if next_R == 1:
                    model.update_entity_embedding(next_E, h_t, t)
                    # set e_current to embedding e_t
                    e_current = model.get_entity_embedding(next_E)

                # 6. Nothing to do?

            # ***END PAPER ALGORITHM***

            # calculate stats and divide loss values
            r_true_positive += doc_r_true_positive
            r_false_positive += doc_r_false_positive
            count_R += doc_count_R
            doc_r_prec = doc_r_true_positive / max(
                (doc_r_true_positive + doc_r_false_positive), 1)
            doc_r_recall = doc_r_true_positive / max(doc_count_R, 1)
            doc_rf_score = 2 * ((doc_r_prec * doc_r_recall) /
                                max(doc_r_prec + doc_r_recall, 1))

            R_loss = R_loss / max(doc_rf_score, 0.35)

            X_epoch_loss += X_loss.item()
            R_epoch_loss += R_loss.item()
            E_epoch_loss += E_loss.item()
            L_epoch_loss += L_loss.item()
            X_loss /= len(doc)
            R_loss /= max(r_div, 1)
            E_loss /= max(e_div, 1)
            L_loss /= max(l_div, 1)

            epoch_tokens += len(doc)
            epoch_r_div += r_div
            epoch_l_div += l_div
            epoch_e_div += e_div

            if optimizer:
                # optimization step
                optimizer.zero_grad()
                loss = X_loss + R_loss + E_loss + L_loss
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
            if status_interval and i_doc % status_interval == 0:
                # status output
                r_prec = r_true_positive / max(
                    (r_true_positive + r_false_positive), 1)
                r_recall = r_true_positive / max(count_R, 1)
                rf_score = 2 * (
                    (r_prec * r_recall) / max(r_prec + r_recall, 1))
                print(
                    f'Doc {i_doc}/{len(corpus)-1}: X_loss {X_epoch_loss / epoch_tokens:0.3}, R_loss {R_epoch_loss / epoch_r_div:0.3}, E_loss {E_epoch_loss / epoch_e_div:0.3}, L_loss {L_epoch_loss / epoch_l_div:0.3}, E_acc {count_E_correct/count_E:0.3}, R_prec {r_prec:0.3}, R_recall {r_recall:0.3}'
                )
                sys.stdout.flush()

        # calculate readable time format
        seconds = round(time.time() - epoch_start)
        m, s = divmod(seconds, 60)
        h, m = divmod(m, 60)
        x_hour_and_ = f'{h} hours and ' * bool(h)
        if optimizer:
            print(f'Epoch {epoch} finished after {x_hour_and_}{m} minutes.')
        else:
            print(
                f'Evaluation on "{corpus.partition}" partition finished after {x_hour_and_}{m} minutes.'
            )

        # calculate epoch stats: precision, recall and F-Score
        r_prec = r_true_positive / max((r_true_positive + r_false_positive), 1)
        r_recall = r_true_positive / max(count_R, 1)
        rf_score = 2 * ((r_prec * r_recall) / max(r_prec + r_recall, 1))

        print(
            f'Loss: X_loss {X_epoch_loss / epoch_tokens:0.3}, R_loss {R_epoch_loss / epoch_r_div:0.3}, E_loss {E_epoch_loss / epoch_e_div:0.3}, L_loss {L_epoch_loss / epoch_l_div:0.3}, E_acc {count_E_correct/count_E:0.3}, R_prec {r_prec:0.3}, R_recall {r_recall:0.3}, R_Fscore {rf_score:0.3}'
        )
        print()

        # if in train mode
        if optimizer:
            # save model
            file_name = str_pattern.format(model.__class__.__name__,
                                           model.lstm.hidden_size, epoch)
            save_model(model, file_name)
            if eval_corpus:
                # evaluate on evaluation corpus
                with torch.no_grad():
                    model.eval()
                    run_nlm(model,
                            eval_corpus,
                            status_interval=None,
                            rz_amplifier=rz_amplifier)
                    model.train()
Example #15
 def save_model(self, save_location, name):
     save_model(save_location, name, self.model)
Example #16
from sklearn.dummy import DummyRegressor

import sys
sys.path.append('..')
from helpers import load_data, save_model

(Xs, Ys) = load_data('train')
regr = DummyRegressor(strategy='median')
regr.fit(Xs, Ys)
save_model(regr, 'dummy')
Example #17
from skorch import NeuralNet
from skorch.callbacks import EarlyStopping
from torch.optim import SGD
import numpy as np

import sys
sys.path.append('..')
from helpers import load_data_in_chunks, save_model
from model import Net
from RelativeEntropyLoss import RelativeEntropyLoss

(Xs, Ys) = load_data_in_chunks('survival', 'train', chunk_size=5)
Xs = Xs.astype(np.float32)
Ys = Ys.astype(np.float32)

regr = NeuralNet(Net,
                 max_epochs=10000000000,
                 batch_size=100,
                 iterator_train__shuffle=True,
                 criterion=RelativeEntropyLoss,
                 optimizer=SGD,
                 optimizer__lr=1e-5,
                 optimizer__momentum=0.9,
                 optimizer__nesterov=True,
                 optimizer__dampening=0,
                 verbose=5,
                 callbacks=[('early_stop', EarlyStopping())])

regr.fit(Xs, Ys)

save_model(regr, 'conv-survival')
Example #18
from sklearn.linear_model import LinearRegression

import sys

sys.path.append('..')
from helpers import load_data_in_chunks, save_model

(Xs, Ys) = load_data_in_chunks('train', chunk_size=5)
regr = LinearRegression()
regr.fit(Xs.reshape(Xs.shape[0], -1), Ys)
save_model(regr, 'linear-multiple')
Example #19
def pipeline(directory: str,
             frames: List[Frame],
             parser_name: str,
             extract_features: set,
             filter_: dict,
             prune_test_data=True,
             log_data=False):
    filter = copy.deepcopy(filter_)
    # Prune sentences without an LU from the frames
    # TODO: log no. sentences pruned
    filter_faulty_sentences(frames)

    # Filter frames
    (frames, no_filtered_frames_sentences) = filter_data(frames, filter)
    print(f"Filtered data")
    # Add feature representation of each word to each word node
    (no_data_points_features) = create_feature_representation(
        frames, extract_features)
    print(f"Feature representation created")

    no_frames = f"Number of frames: {len(frames)}"
    print(no_frames)

    # Split data into training and test sets
    (train_sentences, test_sentences) = split_data_train_test(frames)

    # # For testing purpose adding all sentences to training and testing
    # sentences = []
    # for frame in frames:
    #     sentences.extend(frame.getSentences())
    # (train_sentences, test_sentences) = (sentences, sentences)

    no_sentences = f"Number of sentences: {len(train_sentences) + len(test_sentences)}"
    print(no_sentences)
    print(no_data_points_features)

    filter = copy.deepcopy(filter_)
    # Train models
    id_clf, label_clf, report_training = train_models_2(
        train_sentences, filter)

    print(f"{report_training}")
    if log_data:
        model_path = f"{directory}/models"
        if not os.path.isdir(model_path):
            # Create model folder
            try:
                os.mkdir(f"{model_path}")
            except OSError:
                raise OSError(f"Unable to create directory {model_path}")
        # Save models
        save_model(id_clf, f"{parser_name}_identification_model",
                   f"{model_path}")
        save_model(label_clf, f"{parser_name}_labeling_model", f"{model_path}")

    filter = copy.deepcopy(filter_)
    # Test models
    (id_evaluation, label_evaluation,
     evaluation) = test_models_2(id_clf,
                                 label_clf,
                                 test_sentences,
                                 filter,
                                 prune_test_data=prune_test_data)
    print(f"Models tested")
    if log_data:
        # Save evaluation
        save_to_file(id_evaluation,
                     f"{directory}/{parser_name}_id_evaluation.txt")
        save_to_file(label_evaluation,
                     f"{directory}/{parser_name}_label_evaluation.txt")
        save_to_file(f"{evaluation}",
                     f"{directory}/{parser_name}_evaluation.txt")
        save_to_file(
            f"{no_frames}\n{no_sentences}\n{no_data_points_features}\n{report_training}",
            f"{directory}/run_description.txt")

    # Release memory using garbage collection
    del id_clf
    del label_clf
    gc.collect()
    return evaluation
Example #20
    # Backprop and update if any instances were actually provided
    if len(cls_targets) > 0:
        loss.backward()
        nn.utils.clip_grad_norm_(rfcn.parameters(), args.grad_clip)
        opt.step()

    return loss


# Training loop
for epoch in range(args.epochs):
    # Get next batch of training data
    examples, classes, boxes = helpers.next_batch()

    # Run the train step
    total_loss = update(examples, classes, boxes)

    # Prevent premature logging
    if epoch == 0:
        continue

    # Print training status
    if epoch % args.print_every == 0:
        print("")

    # Save models
    if epoch % args.save_every == 0:
        helpers.save_model(rfcn, 'rfcn')

helpers.save_model(rfcn, 'rfcn')
Example #21
from sklearn.linear_model import LinearRegression

import sys
sys.path.append('..')
from helpers import load_data, save_model

(Xs, Ys) = load_data('train')
regr = LinearRegression()
regr.fit(Xs, Ys)
save_model(regr, 'linear-basic')
Example #22
    'losses': losses,
    'scores': scores,
    'ave_scores': ave_scores,
    'actor_losses': actor_losses,
    'critic_losses': critic_losses
}

model = ActorCritic(model_params)
optimizer = torch.optim.Adam(lr=params['lr'], params=model.parameters())

start = perf_counter()

if __name__ == '__main__':
    try:
        worker(model, optimizer, params)
        save_model(model, optimizer, 'actor_critic.pt')
        plot_losses(params['losses'], 'loss.png')
        plot_losses(params['actor_losses'], filename='actor_loss.png', plotName="Actor Losses")
        plot_losses(params['critic_losses'], filename='critic_loss.png', plotName="Critic Losses")
        plot_scores(params['scores'], params['ave_scores'], filename='scores.png')
        end = perf_counter()
        print((end - start))
    except KeyboardInterrupt:
        pass
    finally:
        save_model(model, optimizer, 'actor_critic.pt')
        plot_losses(params['losses'], 'loss.png')
        plot_losses(params['actor_losses'], filename='actor_loss.png', plotName="Actor Losses")
        plot_losses(params['critic_losses'], filename='critic_loss.png', plotName="Critic Losses")
        plot_scores(params['scores'], params['ave_scores'], filename='scores.png')
        end = perf_counter()
Example #23
                ((clouds, locations, _), (alt_clouds, alt_locs, _), labels) = data
                clouds = clouds.cuda()
                alt_clouds = alt_clouds.cuda()
                labels = labels.cuda()
                conv = scan_conv(clouds, alt_clouds)

                #Compute match prediction
                scores = scan_match(conv)
                predictions = torch.argmax(F.softmax(scores, dim=1), dim=1)
                metrics[0] += torch.sum((predictions + labels == 2)) # both prediction and label are 1
                metrics[1] += torch.sum((predictions - labels == 1)) # prediction is 1 but label is 0
                metrics[2] += torch.sum((predictions + labels == 0)) # both prediction and label are 0
                metrics[3] += torch.sum((predictions - labels == -1)) # prediction is 0, label is 1
        
            acc = (metrics[0] + metrics[2]) / sum(metrics)
            prec = (metrics[0]) / (metrics[0] + metrics[1])
            rec = (metrics[0]) / (metrics[0] + metrics[3])
            f1 = 2 * prec * rec / (prec + rec)
            print_output('Metrics: (TP %d, FP %d, TN %d, FN %d)' % (metrics[0], metrics[1], metrics[2], metrics[3]))
            print_output('(Acc: %f, Precision: %f, Recall: %f, F1: %f)' % (acc, prec, rec, f1))
        
    if not config.lock_conv:
        helpers.save_model(scan_conv, out_dir, epoch, 'conv')
    if config.train_match:
        helpers.save_model(scan_match, out_dir, epoch, 'match')
    if config.train_transform:
        helpers.save_model(scan_transform, out_dir, epoch, 'transform')
    if (len(select.select([sys.stdin], [], [], 0)[0])):
        break

print_output("Completed training for {0} epochs".format(epoch + 1))
Example #24
from skorch import NeuralNet
from skorch.callbacks import EarlyStopping
from torch.nn import MSELoss
from torch.optim import SGD
import numpy as np

import sys
sys.path.append('..')
from helpers import load_data_in_chunks, save_model
from model import Net
from CustomLoss import CustomLoss

(Xs, Ys) = load_data_in_chunks('train', chunk_size=5)
Xs = Xs.astype(np.float32)
Ys = Ys.astype(np.float32)

regr = NeuralNet(Net,
                 max_epochs=10000000000,
                 batch_size=100,
                 iterator_train__shuffle=True,
                 criterion=MSELoss,
                 optimizer=SGD,
                 optimizer__lr=1e-5,
                 optimizer__momentum=0.95,
                 verbose=5,
                 callbacks=[('early_stop', EarlyStopping())])
regr.fit(Xs, Ys / 5000)

save_model(regr, 'lstm-mse')
Example #25
def main():
    start = time.time()
    ##### Run variables #####
    # If the data should be pruned as a part of the evaluation
    pruning_test_data = True
    # Filter the data used in both training and testing
    filter = {"min_sentences": 0, "min_role_occurrence": 6, "prune": 1}
    # Features of data to use
    features_ = {
        "",
        # "frame",
        # "core_elements",
        "word",
        "lemma",
        "pos",
        # "deprel",
        "ref",
        # "lu_words",
        # "lu_lemmas",
        # "lu_deprels",
        # "lu_pos",
        # "head_word",
        # "head_lemma",
        # "head_deprel",
        # "head_pos",
        # "child_words",
        # "child_lemmas",
        # "child_deprels",
        # "child_pos",
    }

    if not os.path.isfile('spacy_parse.pkl'):
        try:
            frames = parse_spacy()
            save_model(frames, "spacy_parse", ".")
            send_email("Parsing spaCy",
                       f"Finished parsing spaCy and saved to model",
                       email_address, send_mail)
        except Exception as err:
            send_email("Parsing spaCy",
                       f"Error when parsing spaCy\n{str(err)}", email_address,
                       send_mail)
            quit()

    ######## RUNS ########
    # for feature in features_:
    features = features_.copy()
    # features.remove(feature)
    # Change this string to represent the data manipulation made
    now = datetime.now()
    dt_string = now.strftime("_%Y-%m-%d_%H-%M-%S")
    directory = f"runs/run{dt_string}"
    readable_time = now.strftime("%H:%M:%S %Y-%m-%d")

    # Description of run
    data_description = (
        f"Testing all features not generated by parser. Features: {feature}. \nlinearSVC. \n{features=}. \n{filter=}. \n{pruning_test_data=}. \nTime: {readable_time}\n"
    )

    if log_data:
        # Create new run folder
        try:
            os.mkdir(directory)
            f = open(directory + "/run_description.txt", "a")
            f.write(data_description)
            f.close()
        except OSError:
            raise OSError(f"Unable to create directory {directory}")

    send_email(
        directory,
        f"New run started: \n{data_description}\n",
        email_address,
        send_mail,
    )

    run_malt(data_description,
             directory,
             features,
             filter,
             prune_test_data=pruning_test_data)
    run_spacy(data_description,
              directory,
              features,
              filter,
              prune_test_data=pruning_test_data)

    send_email("Finished runs", "Tests compleate :)", email_address, send_mail)
    timestamp(start, "Total time: ")
    quit()
Example #26
    'brain_name': brain_name,
    'start_epsilon': start_epsilon,
    'end_epsilon': end_epsilon,
    'epochs': epochs,
    'lr': lr,
    'gamma': gamma,
    'clc': clc,
    'reward_leadup': reward_leadup,
    'entropy_bonus': entropy_bonus,
    'batch_size': batch_size,
    'losses': losses,
    'scores': scores,
    'ave_scores': ave_scores,
    'actor_losses': actor_losses,
    'critic_losses': critic_losses
}

start = perf_counter()
worker(model, params)
save_model(model, 'actor_critic.pt')
end = perf_counter()
print((end - start))

plot_losses(params['losses'], 'loss.png')
plot_losses(params['actor_losses'],
            filename='actor_loss.png',
            plotName="Actor Losses")
plot_losses(params['critic_losses'],
            filename='critic_loss.png',
            plotName="Critic Losses")
plot_scores(params['scores'], params['ave_scores'], filename='scores.png')
Example #27
def main_trained_models():
    # If the data should be pruned as a part of the evaluation
    prune_test_data = True
    # Filter the data used in both training and testing
    filter = {"min_sentences": 0, "min_role_occurrence": 6, "prune": 1}
    if not os.path.isfile('spacy_parse.pkl'):
        try:
            print(f"Parsing spaCy frames")
            frames = parse_spacy()
            save_model(frames, "spacy_parse", ".")
            send_email("Parsing spaCy",
                       f"Finished parsing spaCy and saved to model",
                       email_address, send_mail)
        except Exception as err:
            send_email("Parsing spaCy",
                       f"Error when parsing spaCy\n{str(err)}", email_address,
                       send_mail)
            quit()

    # Parse data
    spacy_frames: List[Frame] = open_model("spacy_parse", ".")
    malt_frames: List[Frame] = parse_malt()

    #### runs ####
    for x in os.walk("runs/tmp/"):
        for folder in x[1]:
            directory = x[0] + folder
            assert os.path.isdir(directory)
            features = {folder[4:]}
            now = datetime.now()
            readable_time = now.strftime("%H:%M:%S %Y-%m-%d")

            # Description of run
            data_description = (
                f"Testing good guess, one feature at a time. \nlinearSVC. \n{features=}. \n{filter=}. \n{prune_test_data=}. \nTime: {readable_time}\n"
            )

            f = open(directory + "/run_description.txt", "w")
            f.write(data_description)
            f.close()

            send_email(
                directory,
                f"New run started: \n{data_description}\n",
                email_address,
                send_mail,
            )

            pipeline_2(directory, malt_frames, "malt", features, filter,
                       prune_test_data, log_data)
            send_email(
                directory,
                f"Pipeline for data parsed with Maltparser compleate.",
                email_address,
                send_mail,
            )
            pipeline_2(directory, spacy_frames, "spacy", features, filter,
                       prune_test_data, log_data)
            send_email(
                directory,
                f"Pipeline for data parsed with spaCy compleate.",
                email_address,
                send_mail,
            )
Example #28
from skorch import NeuralNet
from skorch.callbacks import EarlyStopping
from torch.optim import SGD
import numpy as np

import sys
sys.path.append('..')
from helpers import load_data_in_chunks, save_model
from model import Net
from CustomLoss import CustomLoss

(Xs, Ys) = load_data_in_chunks('basic', 'train', chunk_size=5)
Xs = Xs.astype(np.float32)
Ys = Ys.astype(np.float32)

regr = NeuralNet(Net,
                 max_epochs=10000000000,
                 batch_size=100,
                 iterator_train__shuffle=True,
                 criterion=CustomLoss,
                 optimizer=SGD,
                 optimizer__lr=1e-5,
                 optimizer__momentum=0.9,
                 optimizer__nesterov=True,
                 optimizer__dampening=0,
                 verbose=5,
                 callbacks=[('early_stop', EarlyStopping())])
regr.fit(Xs, Ys / 5000)

save_model(regr, 'conv-custom')
Example #29
from sklearn.neural_network import MLPRegressor
import numpy as np

import sys
sys.path.append('..')
from helpers import load_data, save_model

(Xs, Ys) = load_data('train')
regr = MLPRegressor(hidden_layer_sizes=(60, 10), verbose=True, max_iter=1000)
regr.fit(Xs, Ys)
save_model(regr, 'mlp')
Example #30
    def hyperparameter_tuning(self) -> None:
        """
        Performs a hyperparameter tuning search (either grid search or randomised search) on the defined parameters and
        saves the results in a CSV file for further analysis.
        Note: only designed to work with MLP (determined based on initial evaluations).
        :return: None.
        """
        # Determine scoring metric to use based on dataset.
        scoring = str()
        if config.dataset == "binary":
            scoring = "f1"
        elif config.dataset == "multi":
            scoring = "f1_weighted"

        parameters = dict()
        search_alg_str = str()
        # Initialise Grid Search.
        if config.is_grid_search:
            print("Hyperparameter tuning technique chosen: GRID SEARCH")
            if config.dataset == "binary":
                parameters = {
                    "hidden_layer_sizes": [(98,), (98, 98), (114,), (114, 114)],
                    "learning_rate_init": [0.001, 0.03, 0.04, 0.1],
                    "alpha": [0.0001, 0.26, 0.96]
                }
                print(parameters)
            elif config.dataset == "multi":
                parameters = {
                    "hidden_layer_sizes": [(68,), (68, 68), (100,), (100, 100)],
                    "learning_rate_init": [0.001, 0.01, 0.1],
                    "momentum": [0.1, 0.9],
                    "alpha": [0.0001, 0.1, 0.9]
                }
            searchCV = GridSearchCV(
                param_grid=parameters,
                estimator=self.clf,
                cv=self.folds,
                scoring=scoring
            )
            search_alg_str = "gs"
        # Initialise Randomised Search.
        elif config.is_randomised_search:
            print("Hyperparameter tuning technique chosen: RANDOMISED SEARCH")
            parameters = {
                'hidden_layer_sizes': (sp_randint(1, 150)),
                'learning_rate_init': sp_uniform(0.001, 1),
                'momentum': sp_uniform(0.1, 0.9),
                'alpha': sp_uniform(0.0001, 1)
            }
            searchCV = RandomizedSearchCV(
                param_distributions=parameters,
                estimator=self.clf,
                n_iter=100,
                cv=self.folds,
                scoring=scoring
            )
            search_alg_str = "rs"

        # Run the search and save results in a CSV file.
        gs_results = searchCV.fit(self.X, self.y)
        gs_results_df = pd.DataFrame(gs_results.cv_results_)
        gs_results_df.to_csv("../results/grid_search/{}_{}_{}.csv".format(config.dataset, config.model, search_alg_str))

        # Print the best model found by hyperparameter tuning algorithm for the MLP and save the model in a Pickle file.
        final_model = gs_results.best_estimator_
        print("\nBest model hyperparameters found by randomised search algorithm:")
        print(final_model)
        print("Score: {}".format(gs_results.best_score_))
        save_model(final_model, config.dataset,
                   "{}_{}_{}_best_estimator".format(config.dataset, config.model, search_alg_str))