Exemple #1
0
 def __init__(self, mlflow_record):
     """Set up evaluation state: model name, pipeline objects and data.

     Args:
         mlflow_record: record object used for experiment tracking
             (presumably an MLflow run handle — confirm with caller).
     """
     self.mlflow_record = mlflow_record
     # Fraction of the training data reserved for validation.
     self.validation_pc = 0.2
     self.model_name = 'model_eval'
     self.transform = Transformation()
     self.learner = Learner()
     # Labeled training set: features X and labels y.
     self.X, self.y = read_data('data/train.csv', label_bool=True)
Exemple #2
0
def test_random_spliter():
    """Random split of the combined ("all") UD data yields one split covering every row."""
    logger.info("*" * 20)
    ud_data = read_data(task="ud",
                        folder=data_folder,
                        shuffle=True,
                        selected_feats=None,
                        combine_models=True)
    splits = Random_Spliter(ud_data).split()
    train_feats = splits["all"]["train_feats"]
    test_feats = splits["all"]["test_feats"]
    # Exactly one random split is produced.
    assert len(train_feats) == 1
    # Train + test together cover the full 72 * 25 rows.
    assert len(train_feats[0]) + len(test_feats[0]) == 72 * 25
Exemple #3
0
def test_k_spliter():
    """K-fold split of the wiki task: 5 folds, each covering all 995 rows."""
    logger.info("*" * 20)
    wiki_data = read_data(task="wiki",
                          folder=data_folder,
                          shuffle=True,
                          selected_feats=None,
                          combine_models=False)
    folds = K_Fold_Spliter(wiki_data).split()
    bleu = folds["BLEU"]
    assert len(bleu["train_feats"]) == 5
    # Every fold's train + test partitions the same 995 rows.
    fold0_total = len(bleu["train_feats"][0]) + len(bleu["test_feats"][0])
    fold1_total = len(bleu["train_feats"][1]) + len(bleu["test_feats"][1])
    assert fold0_total == fold1_total == 995
Exemple #4
0
def test_load_data():
    """Check read_data: basic loading, feature selection, and multi-model mode."""
    logger.info("*" * 20)
    data = read_data(task="monomt",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=False)
    assert len(data["BLEU"]["feats"]) == 54
    assert len(data["BLEU"]["labels"]) == 54
    assert len(data["BLEU"]["langs"]) == 54
    assert list(data["BLEU"]["langs"].columns.values) == [
        "Source Language", "Target Language"
    ]

    # test_feature_selection
    logger.info("*" * 20)
    data = read_data(task="monomt",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=["dataset size (sent)"],
                     combine_models=False)
    # BUG FIX: the original compared [ndarray] == ["..."] — a numpy array
    # wrapped in a list can never equal a list of strings. Convert the
    # column values to a list, matching the langs assertion above.
    assert list(data["BLEU"]["feats"].columns.values) == ["dataset size (sent)"]

    # test multi_model
    logger.info("*" * 20)
    data = read_data(task="bli",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=False)
    assert len(data) == 3
    data = read_data(task="bli",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=True)
    assert len(data) == 1
Exemple #5
0
def Main():
    """Build a Q_RNN network in TF1 graph mode and train it indefinitely.

    Samples one random training example per iteration, optimizes the
    proposal and generative variable sets with separate Adam optimizers,
    and checkpoints the session every 100 iterations.
    """
    X_train, y_train, X_valid, y_valid = read_data()

    # Placeholder for one example: (time_step, num_inputs) — no batch dim.
    X = tf.placeholder(dtype=tf.float32,
                       shape=[X_train.shape[1], X_train.shape[2]])

    with tf.name_scope("network"):
        # BUG FIX: the original passed num_inputs= twice, a duplicate
        # keyword argument which is a SyntaxError in Python. The second
        # occurrence was dropped. TODO(review): confirm the duplicate was
        # not meant to be a different parameter (e.g. num_units).
        network = Q_RNN(num_inputs=X_train.shape[2],
                        num_layers=3,
                        time_step=X_train.shape[1],
                        size=1,
                        scope="generative")
        proposal = network.proposal
        param_list = network.build_network(status=X)

    with tf.name_scope("loss"):
        ops = []
        loss = network.compute_loss(param_list=param_list)

        # Separate optimizers for the proposal (r) and generative (g) vars.
        r_optimizer = tf.train.AdamOptimizer()
        g_optimizer = tf.train.AdamOptimizer()
        r_vars = proposal.get_trainable()
        g_vars = network.get_trainable()

        r_grad = r_optimizer.compute_gradients(loss=loss, var_list=r_vars)
        g_grad = g_optimizer.compute_gradients(loss=loss, var_list=g_vars)
        ops.append(r_optimizer.apply_gradients(grads_and_vars=r_grad))
        ops.append(g_optimizer.apply_gradients(grads_and_vars=g_grad))

    with tf.name_scope("miscellaneous"):
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(init)
        index = 0
        while True:
            index = index + 1
            # Pick one random training example per step.
            e_idx = np.random.randint(low=0, high=X_train.shape[0] - 1)
            status = np.reshape(a=X_train[e_idx],
                                newshape=[X_train.shape[1], X_train.shape[2]])
            l, _ = sess.run([loss, ops], feed_dict={X: status})
            print("At iteration {}, loss: {}".format(index, l))

            # Periodic checkpoint; `save_path` comes from module scope.
            if index % 100 == 0:
                saver.save(sess=sess, save_path=save_path)
Exemple #6
0
def test_specific_spliter():
    """A user-specified split partitions feats/labels/langs without loss."""
    logger.info("*" * 20)
    ma_data = read_data(task="ma",
                        folder=data_folder,
                        shuffle=True,
                        selected_feats=None,
                        combine_models=True)
    feats = ma_data["all"]["feats"]
    total_rows = len(feats)
    # Rows with "data size" > 200 train; the complement tests.
    train_idxs = list(feats[feats["data size"] > 200].index)
    test_idxs = list(set(feats.index) - set(train_idxs))
    splited = Specific_Spliter(ma_data, train_idxs, test_idxs).split()["all"]
    # Train + test must jointly cover every row, for each array kind.
    assert len(splited["train_feats"][0]) + len(splited["test_feats"][0]) == total_rows
    assert len(splited["train_labels"][0]) + len(splited["test_labels"][0]) == total_rows
    assert len(splited["train_langs"][0]) + len(splited["test_langs"][0]) == total_rows
Exemple #7
0
def run_ex(task, n=3, regressor="xgboost", portion=0.5):
    """Run few-shot prediction experiments for each model of a task.

    For every model, repeatedly holds out `portion` of that model's rows
    as a test set, samples `n` of the remaining rows as the new model's
    training examples (plus all other models' rows), trains `regressor`,
    and aggregates test RMSEs against the baselines.

    Args:
        task: task identifier passed to read_data.
        n: number of new-model examples sampled into each training set.
        regressor: regressor name forwarded to run_once.
        portion: fraction of the new model's rows held out for testing.
    """
    org_data = read_data(task, True, combine_models=True)
    feats = org_data["all"]["feats"]
    ids = feats.index

    test_rmses = {}
    baseline_rmses = {}
    models = task_eval_columns(task)

    for model in models:
        logger.info(
            "Running experiments with {} examples for a new model {}...".
            format(n, model))

        test_rmses[model] = []
        baseline_rmses[model] = defaultdict(list)
        # One-hot indicator columns mark which rows belong to this model.
        model_ids = list(feats[feats[f"model_{model}"] == 1].index)
        other_model_ids = list(feats[feats[f"model_{model}"] == 0].index)

        test_lens = int(len(model_ids) * portion)

        # BUG FIX: the original adjacent f-strings joined as
        # "...for trainingin the remaining..." — a space was missing.
        logger.info(
            f"We use {portion} of the new model data as the test set. And we sample data points for training "
            f"in the remaining {1-portion} of data. We sample the split for {params.test_id_options_num} times. "
            f"There are {len(model_ids)} for model {model} and {len(other_model_ids)} for other models."
        )

        total_exs = params.test_id_options_num * params.sample_options_num
        finished_exs = 0

        for i in range(params.test_id_options_num):
            # Hold out a fresh random test set for the new model.
            test_id_option = sample(model_ids, test_lens)

            sample_ids = list(set(model_ids) - set(test_id_option))

            # Upper bound on distinct n-subsets we could draw.
            total_sample_options = int(comb(len(sample_ids), n))

            logger.info(
                "There are {} experiments running for model {}.. and we sample {} experiments"
                .format(total_sample_options, model,
                        params.sample_options_num))

            finished_exs_for_one_test_set = 0
            for j in range(params.sample_options_num):
                sample_option = sample(sample_ids, n)

                # Train on the n sampled rows plus every other model's rows.
                train_ids = list(
                    set(sample_option).union(set(other_model_ids)))

                splitter = Specific_Spliter(org_data, [train_ids],
                                            [test_id_option])
                split_data = splitter.split()["all"]

                train_rmse, train_preds, test_rmse, test_preds, train_labels, test_labels, \
                    test_upper_preds, test_lower_preds, reg = \
                    run_once(split_data["train_feats"][0],
                             split_data["train_labels"][0],
                             split_data["test_feats"][0],
                             split_data["test_labels"][0],
                             split_data["train_labels_mns"][0],
                             split_data["train_labels_sstd"][0],
                             regressor,
                             get_ci=False)

                test_rmses[model].append(test_rmse)

                these_baselines = get_baselines(org_data, other_model_ids,
                                                sample_ids, test_id_option)
                for baseline in these_baselines:
                    baseline_rmses[model][baseline].append(
                        these_baselines[baseline])

                finished_exs_for_one_test_set += 1
                finished_exs += 1
                # Periodic progress report with running mean RMSEs.
                if finished_exs % 100 == 0:
                    logger.info(
                        "Progress: {}/{}, {:.2f}%, RMSE@{}: {:.2f}".format(
                            finished_exs, total_exs,
                            finished_exs / total_exs * 100, n,
                            np.mean(test_rmses[model])))
                    for baseline in baseline_rmses[model]:
                        logger.info(
                            f"Baseline {baseline}: {np.mean(baseline_rmses[model][baseline])}"
                        )

                if finished_exs_for_one_test_set == params.sample_options_num:
                    break

            if finished_exs == total_exs:
                logger.info("{} done! RMSE@{}: {:.2f}".format(
                    model, n, np.mean(test_rmses[model])))
                break

    logger.info("All experiments done!")

    for model in models:
        logger.info("Model: {}, ex: {}  RMSE@{}: {:.2f}".format(
            model, len(test_rmses[model]), n, np.mean(test_rmses[model])))

    logger.info("All models, RMSE@{}: {:.2f}".format(
        n, np.mean([np.mean(test_rmses[model]) for model in models])))
    for baseline in baseline_rmses[models[0]]:
        logger.info(
            f"Baseline {baseline} across all models: {np.mean([np.mean(baseline_rmses[model][baseline]) for model in models])}"
        )
Exemple #8
0
from src.read_data import read_data
from src.gradient_descent import gradient_descent
from src.data_visualisation import data_visualisation

# import libraries
import random
import numpy as np

if __name__ == '__main__':
    # initializing variables
    theta_0 = 0
    theta_1 = 0
    alpha = 1

    # Read the data from "data.csv", remove first row and calculate m
    data = read_data("data.csv")
    data.pop(0)  # drop the header row
    data = [[int(x), int(y)] for [x, y] in data]

    # Data normalisation using the maximum absolute scaling
    x_min = np.min([x for x, y in data])
    y_min = np.min([y for x, y in data])
    x_max = np.max([x for x, y in data])
    y_max = np.max([y for x, y in data])
    data_normalised = [[x / x_max, y / y_max] for x, y in data]

    # Define the maximum number of iterations (default 1000)
    max_iterations = input(
        "Enter the maximum number of iterations (1000 if not specified): ")
    if max_iterations == '' or not max_iterations.isnumeric():
        max_iterations = 1000
    else:
        # BUG FIX: a valid numeric reply was previously left as a string,
        # which would break any later arithmetic or range() on it.
        max_iterations = int(max_iterations)
Exemple #9
0
 def __init__(self):
     """Load the train/test data and set up the full-model pipeline."""
     self.model_name = 'model_full'
     self.transform = Transformation()
     self.learner = Learner()
     # Labeled training set and unlabeled test set.
     self.X_train, self.y_train = read_data('data/train.csv', label_bool=True)
     self.X_test = read_data('data/test.csv', label_bool=False)