import pandas as pd
from sklearn.model_selection import train_test_split


def main():
    X = pd.read_csv('data/X.csv')
    y = pd.read_csv('data/y.csv')
    # sklearn's train_test_split returns (X_train, X_test, y_train, y_test)
    X_train, _, y_train, _ = train_test_split(X, y)
    X_train, y_train = balance_data_by_label(X_train, y_train, 0.4)

    param_grid = {'hidden_size': [512, 1024, 2048, 4096], 'num_core_h_layers': [2], 
                  'num_mu_h_layers': [3], 'num_log_s_h_layers': [3],
                  'lr': [9e-2, 7e-2, 5e-2, 3e-2, 1e-2], 'n_epochs': [600], 
                  'homoscedastic_vars': [None]}
    grid_search(X_train, y_train, param_grid, 'gs_results/gs_focus_lr3_hs2.csv', cv=3)

def main():
    X = pd.read_csv('data/X.csv').to_numpy()
    # flatten the one-column label frame into a 1-D array
    y = pd.read_csv('data/y.csv').to_numpy().reshape(-1)
    param_grid = {
        'hidden_size': [512, 1024, 2048, 4096],
        'num_core_h_layers': [2],
        'num_mu_h_layers': [3],
        'num_log_s_h_layers': [3],
        'lr': [9e-3, 7e-3, 5e-3, 3e-3, 1e-3],
        'n_epochs': [1000],
        'homoscedastic_vars': [None]
    }
    grid_search(X, y, param_grid, 'gs_results/focus_gs_lr2_hs2.csv', cv=3)

def main():
    X = pd.read_csv('data/X.csv')
    y = pd.read_csv('data/y.csv')
    # sklearn's train_test_split returns (X_train, X_test, y_train, y_test)
    X_train, _, y_train, _ = train_test_split(X, y)
    X_train, y_train = balance_data_by_label(X_train, y_train, 0.4)

    param_grid = {
        'hidden_size': [16],
        'num_core_h_layers': [2],
        'num_mu_h_layers': [3],
        'num_log_s_h_layers': [3],
        'lr': [5e-4],
        'n_epochs': [3],
        'homoscedastic_vars': [None]
    }
    grid_search(X_train,
                y_train,
                param_grid,
                'gs_results/gs_focus_TEST.csv',
                cv=2)
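
# The grid_search helpers above and ms.grid_search below are project-local;
# assuming they expand a parameter grid into every combination of its values,
# a minimal sketch of such a generator with itertools.product:
from itertools import product


def expand_grid(param_grid):
    keys = list(param_grid)
    for values in product(*(param_grid[k] for k in keys)):
        yield dict(zip(keys, values))


# e.g. a grid with 2 and 3 values per key yields 6 parameter dicts
for params in expand_grid({'lr': [1e-2, 1e-3], 'hidden_size': [16, 32, 64]}):
    print(params)
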
def monks(task_type, param_grid, model_assessment=False):
    # the .test file contains the whole dataset; we use it instead of the
    # provided split so that we can simulate our own hold-out split
    dataset = ds.load('datasets/' + task_type + '.test', 'monks')
    dataset.shuffle()  # because the monks-*.train patterns were sampled randomly
    # simple hold-out strategy:
    # ~123 elements for the training set, as in the original split (monks-1, monks-3)
    splitting = 43/100
    if task_type == 'monks-2':
        # monks-2 uses ~169 elements in its training set
        splitting = 59/100

    trainvalset, testset = dataset.split(splitting)
    # the validation set is half the size of the training set (2/3 train, 1/3 val)
    trainset, validationset = trainvalset.split(66.6/100)
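    # e.g. the monks-1 .test file holds 432 patterns, so 432 * 0.43 ~= 186 go
    # to trainvalset, of which 186 * 0.666 ~= 124 end up in trainset, matching
    # the size of the original monks-1 training split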
    

    for params in ms.grid_search(param_grid):
        
        # a non-positive batch size means full-batch training:
        # use the entire training set as a single batch
        params['batch_size'] = params['batch_size'] if params['batch_size'] > 0 else trainset.size()
        print(params)
        
        epochs = params['epochs'] # value taken from the monks problem paper
        batch_size = params['batch_size']

        # run several trainings to reduce the dependence on the random weight
        # initialization and to get a bias-variance estimate (ensemble learning)
        # when running inference on the test set
        runs_number = 3  # 5 can be used as well
        for r in range(runs_number):
            # init several instances of the model to compute the metrics
            # more reliably
            nn.from_parameters(params, 'sigmoid', 'sigmoid')
            model = nn.build()

            ms.add_model(model) 
        
        ms.set_datasets(trainset, validationset)

        start_time = time.time()
        for e in range(epochs):
            printProgressBar(e + 1, epochs, prefix='Training:', suffix='Complete')

            # for each model we initialized above
            for model_id, model in ms.models():
                # doing one step of training
                model.fit(trainset, batch_size, e)
                    
                # computing the output values for this training step
                train_outputs = model.forward_dataset(trainset)
                val_outputs = model.forward_dataset(validationset)

                # compute the metrics
                ms.compute_error(model_id, train_outputs, val_outputs)
                ms.compute_other(model_id, train_outputs, val_outputs, metrics=['acc'], threshold=0.5)

        training_time = time.time() - start_time
        print("TRAINING TIME " + str(training_time) + " seconds") 

        # getting the average of errors and accuracy         
        avg_tr_error, avg_val_error = ms.avg_mse()
        avg_tr_acc, avg_val_acc = ms.avg_acc()
        # precision and recall will be used during model assessment (see below)
        final_accuracy = avg_val_acc[-1]

        res.set_task(task_type)

        plt = res.plot_mse(epochs, avg_tr_error, avg_val_error, params, final_accuracy)
        msepath = res.save_plot(plt, 'mse')

        plt = res.plot_acc(epochs, avg_tr_acc, avg_val_acc, params)
        res.save_plot(plt, 'acc')
        
        # adding the result
        res.add_result(avg_tr_error[-1], avg_val_error[-1],
                       params['batch_size'], params['weights_bound'],
                       params['learning_rate'], params['momentum_alpha'],
                       final_accuracy, msepath)
        
        if not model_assessment:
            # cleaning model selection for next run
            ms.clean()

    res.add_result_header('mse_tr', 'mse_val', 'batch_s', 'weights', 'lr',
                          'm_alpha', 'acc', 'path')
    res.save_results()
    
    # WARNING: run this block only once, and only after model selection is
    # complete; otherwise the test set is invalidated
    if model_assessment:
        # here we use the test set to assess the models' performance
        trained_models = [m for _, m in ms.models()]
        voted_outputs = []
        avg_outputs = []
        for batch in testset.batch(1):
            for pattern in batch:
                tmp_voted_outputs = []
                tmp_real_outputs = []
                for m in trained_models:
                    class_out, real_out = m.classify(pattern[1], threshold=0.5)
                    tmp_voted_outputs.append(class_out)
                    tmp_real_outputs.append(real_out)
                
                # we take the most frequent prediction (majority vote)
                voted_outputs.append(mode(tmp_voted_outputs))
                # we get the average output to compute the error
                avg_outputs.append([mean(tmp_real_outputs)])

        metrics = ms.get_metrics()
        target_outputs = [x[0] for x in testset.data_set[:, 2]]
        # computing accuracy, recall and precision on the test set
        acc = metrics.accuracy(voted_outputs, target_outputs)
        recall = metrics.recall(voted_outputs, target_outputs)
        precision = metrics.precision(voted_outputs, target_outputs)

        mse = metrics.mean_square_error(avg_outputs, testset.data_set[:, 2])
        
        print("ACCURACY " + str(acc))
        print("PRECISION " + str(precision))
        print("RECALL " + str(recall))
        print("MSE " + str(mse))
def cup(param_grid):
    dataset = ds.load('datasets/ML-CUP19-TR.csv', 'CUP')
    # train on the training set and validation set from model selection
    # combined, so that more data is available
    trainset, testset = dataset.split(75 / 100)
    # data normalization could be applied here

    params = next(ms.grid_search(param_grid))
    print(params)
    # a non-positive batch size means full-batch training
    params['batch_size'] = params['batch_size'] if params['batch_size'] > 0 else trainset.size()

    epochs = params['epochs']
    batch_size = params['batch_size']

    runs_number = 1
    for run in range(runs_number):
        nn.from_parameters(params, 'sigmoid', 'linear')
        model = nn.build()
        ms.add_model(model)

    ms.set_datasets(trainset, testset)

    start = time.time()
    for e in range(epochs):
        ppb(e + 1, epochs, prefix='Training', suffix='Completed')
        for model_id, model in ms.models():
            model.fit(trainset, batch_size, e)

            train_outputs = model.forward_dataset(trainset)
            test_outputs = model.forward_dataset(testset)

            # compute the metrics for each model
            ms.compute_error(model_id,
                             train_outputs,
                             test_outputs,
                             metrics=['mse', 'mee'])

    training_time = time.time() - start
    print('TRAINING TIME: ' + str(training_time) + ' seconds')

    avg_tr_mse, avg_ts_mse = ms.avg_mse()
    avg_tr_mee, avg_ts_mee = ms.avg_mee()

    res.set_task('CUP')
    plt = res.plot_mse(epochs, avg_tr_mse, avg_ts_mse, params, label2='test')
    msepath = res.save_plot(plt, 'mse')

    plt = res.plot_mee(epochs, avg_tr_mee, avg_ts_mee, params, label2='test')
    res.save_plot(plt, 'mee')

    print("TRAINING MSE " + str(avg_tr_mse[-1]))
    print("TRAINING MEE " + str(avg_tr_mee[-1]))

    # here we use the test set to assess the models' performance
    trained_models = [m for _, m in ms.models()]
    avg_outputs = []
    for batch in testset.batch(1):
        for pattern in batch:
            tmp_real_outputs_x = []
            tmp_real_outputs_y = []
            for m in trained_models:
                real_out = m.feed_forward(pattern[1])
                tmp_real_outputs_x.append(real_out[0])
                tmp_real_outputs_y.append(real_out[1])

            # average the models' outputs (one entry per pattern) to compute the error
            avg_outputs.append(
                [mean(tmp_real_outputs_x),
                 mean(tmp_real_outputs_y)])

    metrics = ms.get_metrics()
    mse = metrics.mean_square_error(avg_outputs, testset.data_set[:, 2])
    mee = metrics.mean_euclidian_error(avg_outputs, testset.data_set[:, 2])

    print("MSE " + str(mse))
    print("MEE " + str(mee))

    blindds = ds.load_blind('datasets/ML-CUP19-TS.csv', 'CUP')

    avg_outputs = []
    for batch in blindds.batch(1):
        for pattern in batch:
            tmp_real_outputs_x = []
            tmp_real_outputs_y = []
            for m in trained_models:
                real_out = m.feed_forward(pattern[1])
                tmp_real_outputs_x.append(real_out[0])
                tmp_real_outputs_y.append(real_out[1])

            # average the models' outputs (one entry per pattern)
            avg_outputs.append(
                [mean(tmp_real_outputs_x),
                 mean(tmp_real_outputs_y)])

    with open("report/poxebur_wikilele_ML-CUP-TS.csv", "a+") as cupfile:
        # cleaning the file
        cupfile.seek(0)
        cupfile.truncate()

        cupfile.write("# Leonardo Frioli Luigi Quarantiello \n")
        cupfile.write("# poxebur_wikilele \n")
        cupfile.write("# ML-CUP19 \n")
        cupfile.write("# 10/01/2020 \n")

        for i in range(len(avg_outputs)):
            cupfile.write(
                str(i + 1) + ", " + str(avg_outputs[i][0]) + ", " +
                str(avg_outputs[i][1]) + "\n")
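
# metrics.mean_euclidian_error is a project-local helper; for the two-output
# CUP task the Mean Euclidean Error is conventionally
#     MEE = (1/N) * sum_i ||output_i - target_i||_2
# a minimal sketch, assuming outputs and targets are sequences of (x, y) pairs:
import math


def mean_euclidean_error(outputs, targets):
    return sum(math.dist(o, t) for o, t in zip(outputs, targets)) / len(outputs)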
def cup(param_grid):
    dataset = ds.load('datasets/ML-CUP19-TR.csv', 'CUP')
    # 25% test set, 75% training set + validation set
    trainvalset, testset = dataset.split(75 / 100)
    # with hold-out, the validation set is half the size of the training set
    trainset, validationset = trainvalset.split(66.6 / 100)

    for params in ms.grid_search(param_grid):
        # a non-positive batch size means full-batch training
        params['batch_size'] = params['batch_size'] if params['batch_size'] > 0 else trainset.size()
        print(params)

        epochs = params['epochs']
        batch_size = params['batch_size']

        runs_number = 1
        for run in range(runs_number):
            nn.from_parameters(params, 'sigmoid', 'linear')
            model = nn.build()
            ms.add_model(model)

        ms.set_datasets(trainset, validationset)

        start = time.time()
        for e in range(epochs):
            ppb(e + 1, epochs, prefix='Training', suffix='Completed')

            for model_id, model in ms.models():
                model.fit(trainset, batch_size, e)

                train_outputs = model.forward_dataset(trainset)
                val_outputs = model.forward_dataset(validationset)

                ms.compute_error(model_id,
                                 train_outputs,
                                 val_outputs,
                                 metrics=['mse', 'mee'])

        training_time = time.time() - start
        print('TRAINING TIME: ' + str(training_time) + ' seconds')

        avg_tr_mse, avg_val_mse = ms.avg_mse()
        avg_tr_mee, avg_val_mee = ms.avg_mee()

        res.set_task('CUP')
        plt = res.plot_mse(epochs, avg_tr_mse, avg_val_mse, params)
        msepath = res.save_plot(plt, 'mse')

        plt = res.plot_mee(epochs, avg_tr_mee, avg_val_mee, params)
        res.save_plot(plt, 'mee')

        res.add_result(avg_tr_mse[-1], avg_val_mse[-1], avg_tr_mee[-1],
                       avg_val_mee[-1], params['epochs'], params['batch_size'],
                       params['weights_bound'], params['learning_rate'],
                       params['momentum_alpha'], params['use_nesterov'],
                       params['regularization_lambda'], msepath)
        ms.clean()

    res.add_result_header('mse_tr', 'mse_val', 'mee_tr', 'mee_val', 'epochs',
                          'batch_s', 'weights', 'lr', 'm_alpha', 'nesterov',
                          'r_lambda', 'path')
    res.save_results()
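
# A hypothetical driver for the routines above; the grid keys mirror the
# parameters they read, the values are placeholders, and any extra keys
# required by nn.from_parameters are omitted:
if __name__ == '__main__':
    param_grid = {
        'epochs': [500],
        'batch_size': [-1],  # -1 -> full-batch training
        'weights_bound': [0.7],
        'learning_rate': [0.1],
        'momentum_alpha': [0.8],
        'use_nesterov': [False],
        'regularization_lambda': [0.0],
    }
    monks('monks-1', param_grid)
    # cup(param_grid)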