def main_hyperparameter_search():

    output_label = "median_house_value"

    data = pd.read_csv("housing.csv")
    data = data.sample(frac=1).reset_index(drop=True)
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]

    x_train = x[2000:].reset_index(drop=True)
    y_train = y[2000:].reset_index(drop=True)
    x_test = x[:2000].reset_index(drop=True)
    y_test = y[:2000].reset_index(drop=True)

    parameters = RegressorHyperParameterSearch(x_train, y_train)

    lr, epoch = parameters

    regressor = Regressor(x_train, nb_epoch=epoch, lr=lr)
    regressor.fit(x_train, y_train)

    pred = regressor.predict(x_test)

    # Error
    error = regressor.score(x_test, y_test)
    print('--------------------------------------')
    print('Test scores: ')
    print('\nMSE: {} '.format(error[0]))
    print('\nExplained Variance: {}'.format(error[1]))
    print('\nR^2 score: {}'.format(error[2]))
    print('\nRMSE: {}'.format(error[3]))
    print('\n--------------------------------------')
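The search routine called above is not shown in this snippet. As a rough illustration only, a grid search compatible with the (lr, epoch) tuple it returns might look like the sketch below; the candidate values and the simple hold-out split are assumptions, and it relies on the Regressor class and the four-element score() tuple used in the surrounding code.

def RegressorHyperParameterSearch(x_train, y_train):
    # Hypothetical sketch: hold out the last 20% of the training rows for validation.
    split = int(len(x_train) * 0.8)
    x_tr, y_tr = x_train[:split], y_train[:split]
    x_val, y_val = x_train[split:], y_train[split:]

    best_params, best_mse = (0.01, 100), float("inf")
    for lr in (0.001, 0.01, 0.1):          # assumed candidate learning rates
        for nb_epoch in (100, 500, 1000):  # assumed candidate epoch counts
            regressor = Regressor(x_tr, nb_epoch=nb_epoch, lr=lr)
            regressor.fit(x_tr, y_tr)
            mse = regressor.score(x_val, y_val)[0]  # score() assumed to return (MSE, EV, R^2, RMSE)
            if mse < best_mse:
                best_mse, best_params = mse, (lr, nb_epoch)
    return best_params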
Code example #2
    def __init__(
            self,
            train=False,
            test=False,
            limit_rows=False,
            transform=None,
            target_transform=None,
            download=False
    ):
        self.path = r"/Users/anders/Code/migration-analysis/data/processed/migrations_metadata.csv"
        data = pd.read_csv(self.path)

        # pre split limit
        if limit_rows:
            data = data.sample(limit_rows, random_state=1337)

        split_point1 = int(np.floor(len(data)*0.9))
        data_train = data[0:split_point1]
        data_test = data[split_point1:]

        self.bow_column_name = BagOfWords(data_train.column_name)
        self.bow_column_data_type = BagOfWords(data_train.column_data_type)

        if train:
            self.x = Variable(torch.tensor(self.bow_column_name.tensors, dtype=torch.float))
            self.y = Variable(torch.tensor(self.bow_column_data_type.tensors, dtype=torch.float))
        elif test:
            self.x = Variable(torch.tensor(self.bow_column_name.tensors_for(data_test.column_name), dtype=torch.float))
            self.y = Variable(torch.tensor(self.bow_column_data_type.tensors_for(data_test.column_data_type), dtype=torch.float))
        else:
            data = []  
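The BagOfWords helper used above is not included in the snippet. A minimal sketch of the interface it appears to assume (a vocabulary built from a pandas Series of strings, a .tensors matrix for the fitting data, and .tensors_for() to encode new rows) could look like this; whitespace tokenisation and dense count vectors are illustrative choices, not necessarily the original implementation.

import numpy as np

class BagOfWords:
    def __init__(self, series):
        # Build a vocabulary from every whitespace-separated token in the fitting column.
        tokens = {tok for text in series.astype(str) for tok in text.split()}
        self.vocab = {tok: i for i, tok in enumerate(sorted(tokens))}
        # Encode the fitting column itself.
        self.tensors = self.tensors_for(series)

    def tensors_for(self, series):
        # Dense (rows x vocabulary) count matrix; unknown tokens are ignored.
        out = np.zeros((len(series), len(self.vocab)), dtype=np.float32)
        for row, text in enumerate(series.astype(str)):
            for tok in text.split():
                idx = self.vocab.get(tok)
                if idx is not None:
                    out[row, idx] += 1.0
        return out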
def main_hyperparameter_search():
    
    output_label = "median_house_value"

    data = pd.read_csv("housing.csv") 
    data = data.sample(frac=1).reset_index(drop=True)
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]
    
    
    x_train = x[2000:].reset_index(drop=True)
    y_train = y[2000:].reset_index(drop=True)
    x_test = x[:2000].reset_index(drop=True)
    y_test = y[:2000].reset_index(drop=True)
    
    parameters = RegressorHyperParameterSearch(x_train, y_train)

    lr, epoch = parameters

    regressor = Regressor(x_train, nb_epoch=epoch, lr=lr)
    regressor.fit(x_train, y_train)
    
    pred = regressor.predict(x_test)

    # Error
    error = regressor.score(x_test, y_test)
    print("\nRegressor error: {}\n".format(error))
Code example #4
def main():
    data = pd.read_csv('../all_data.csv')
    data_train = data.sample(frac=1)  # shuffled copy; use data itself if the original order is needed
    files_train = list(data_train['filename'])
    ids_train = [i for i in range(len(files_train))]
    data = None
    model = UNet()
    model = torch.nn.DataParallel(model).cuda()
    criterion = nn.MSELoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
    train_dataset = Dataset(ids_train, files_train,
                            transforms.Compose([
                                transforms.Resize((256, 256)),
                                transforms.ToTensor(),
                                transforms.Normalize(
                                    mean=[0.5231, 0.5180, 0.5115],
                                    std=[0.2014, 0.2018, 0.2100]),
                            ]),
                            transforms.Compose([
                                transforms.Resize((256, 256)),
                                transforms.ToTensor(),
                            ]))  # normalize
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=64,
                                               shuffle=True,
                                               num_workers=12)

    best_loss = 1e5
    for epoch in range(20):
        loss = train(train_loader, model, criterion, optimizer, epoch)
        print('Epoch: %d, MSE: %.8f' % (epoch + 1, loss))
        if loss < best_loss:
            torch.save(model.state_dict(), r'model_haze_all.pth')
            best_loss = loss
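The train() helper called in the loop above is not part of the snippet. A minimal sketch of a compatible helper (one pass over the loader, returning the epoch's average MSE) is shown below; the (input, target) batch layout and the use of .cuda() follow the rest of the example, but the original implementation may differ.

def train(train_loader, model, criterion, optimizer, epoch):
    # One training epoch; `epoch` is kept only for parity with the call above.
    model.train()
    running_loss, n_batches = 0.0, 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        n_batches += 1
    return running_loss / max(n_batches, 1)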
Code example #5
def main_hyperparameter_search():

    # --------------------------------------------------------------------
    # LOAD THE DATA
    # define the output label
    output_label = "median_house_value"
    # get the data
    data = pd.read_csv("housing.csv")
    #Randomly shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)

    # Splitting input and output
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]

    # keep a held out dataset for testing overfitting
    x_train = x[2000:].reset_index(drop=True)
    y_train = y[2000:].reset_index(drop=True)
    x_test = x[:2000].reset_index(drop=True)
    y_test = y[:2000].reset_index(drop=True)

    # --------------------------------------------------------------------
    # BEGIN HYPERPARAMETER SEARCH
    parameters = RegressorHyperParameterSearch(x_train, y_train)
    # return the best parameters
    best_lr, best_batch, best_epoch = parameters

    print('--------------------------------------')
    print('BEST RESULTS FROM HYPERPARAMETER SEARCH')
    print('Best learning Rate: {}'.format(best_lr))
    print('Best Batch Size: {}'.format(best_batch))
    print('Best Epochs: {}'.format(best_epoch))

    # use the parameters to train
    regressor = Regressor(x_train,
                          nb_epoch=best_epoch,
                          lr=best_lr,
                          batch_size=best_batch)
    # --------------------------------------------------------------------
    # TRAIN
    regressor.fit(x_train, y_train)

    # --------------------------------------------------------------------
    # TEST & PREDICT
    x_test.to_csv('x_test.csv')
    pred = regressor.predict(x_test)
    pred = pd.DataFrame(pred)
    pred.to_csv('prediction.csv')

    # Error
    error = regressor.score(x_test, y_test)
    print('--------------------------------------')
    print('Test scores: ')
    print('\nMSE: {} '.format(error[0]))
    print('\nExplained Variance: {}'.format(error[1]))
    print('\nR^2 score: {}'.format(error[2]))
    print('\nRMSE: {}'.format(error[3]))
    print('\n--------------------------------------')
def example_main():

    output_label = "median_house_value"

    # Use pandas to read CSV data as it contains various object types
    # Feel free to use another CSV reader tool
    # But remember that LabTS tests take Pandas Dataframe as inputs
    data = pd.read_csv("housing.csv")

    #Randomly shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)

    # Splitting input and output
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]

    # Training
    # This example trains on the whole available dataset.
    # You probably want to separate some held-out data
    # to make sure the model isn't overfitting

    #Todo: Adjust with shuffling
    x_train = x[2000:]
    y_train = y[2000:]
    x_test = x[:2000]
    y_test = y[:2000]

    regressor = Regressor(x_train, nb_epoch=100)
    regressor.fit(x_train, y_train)
    save_regressor(regressor)

    plot_validation_loss(training_loss=regressor.loss_rel[0, :],
                         validation_loss=regressor.loss_rel[1, :])

    pred = regressor.predict(x_test)

    # plot prediction for 100 samples
    plot_prediction(pred[:100], y_test[:100])

    #scaler = load(open('y_transformer.pkl', 'rb'))
    #print(scaler.inverse_transform(pred))
    #print(pred)
    #print(y_test)

    # Error
    error = regressor.score(x_test, y_test)
    print('--------------------------------------')
    print('Test scores: ')
    print('\nMSE: {} '.format(error[0]))
    print('\nExplained Variance: {}'.format(error[1]))
    print('\nR^2 score: {}'.format(error[2]))
    print('\nRMSE: {}'.format(error[3]))
    print('--------------------------------------')
def example_main():

    output_label = "median_house_value"

    # Use pandas to read CSV data as it contains various object types
    # Feel free to use another CSV reader tool
    # But remember that LabTS tests take Pandas Dataframe as inputs
    data = pd.read_csv("housing.csv") 
    
    #Randomly shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)
    
    # Splitting input and output
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]

    # Training
    # This example trains on the whole available dataset. 
    # You probably want to separate some held-out data 
    # to make sure the model isn't overfitting
    
    #Todo: Adjust with shuffling
    x_train = x[2000:]
    y_train = y[2000:]
    x_test = x[:2000]
    y_test = y[:2000]
    
    
    regressor = Regressor(x_train, nb_epoch=1000)
    regressor.fit(x_train, y_train)
    save_regressor(regressor)

    # Plot the training and validation loss
    plt.plot(np.arange(regressor.loss_rel.shape[1]), regressor.loss_rel[0, :], label='training_loss')
    plt.plot(np.arange(regressor.loss_rel.shape[1]), regressor.loss_rel[1, :], label='validation_loss')
    plt.yscale("log")
    plt.legend()
    plt.savefig("loss.png")  # save before show(), since show() clears the current figure
    plt.show()
    
    pred = regressor.predict(x_test)
    
    #scaler = load(open('y_transformer.pkl', 'rb'))
    #print(scaler.inverse_transform(pred))
    print(pred)
    print(y_test)
    

    # Error
    error = regressor.score(x_test, y_test)
    print("\nRegressor error: {}\n".format(error))
Code example #8
def example_main():

    # --------------------------------------------------------------------
    # LOAD THE DATA
    #define the output label
    output_label = "median_house_value"
    # get the data
    data = pd.read_csv("housing.csv")

    #Randomly shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)

    # Splitting input and output
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]

    # keep a held out dataset for testing overfitting
    x_train = x[2000:]
    y_train = y[2000:]
    x_test = x[:2000]
    y_test = y[:2000]

    # --------------------------------------------------------------------
    # TRAIN
    regressor = Regressor(x_train, nb_epoch=100)
    regressor.fit(x_train, y_train)
    save_regressor(regressor)

    # --------------------------------------------------------------------
    # PLOT LOSS
    plot_validation_loss(training_loss=regressor.loss_rel[0, :],
                         validation_loss=regressor.loss_rel[1, :])

    # --------------------------------------------------------------------

    pred = regressor.predict(x_test)

    scaler = load(open('y_transformer.pkl', 'rb'))

    # --------------------------------------------------------------------
    # EVALUATE
    error = regressor.score(x_test, y_test)
    print('--------------------------------------')
    print('Test scores: ')
    print('\nMSE: {} '.format(error[0]))
    print('\nExplained Variance: {}'.format(error[1]))
    print('\nR^2 score: {}'.format(error[2]))
    print('\nRMSE: {}'.format(error[3]))
    print('--------------------------------------')
Code example #9
    def __init__(self, data=None, sample_percent=1):
        """

        """
        super().__init__()

        assert (data is not None), "No data passed"

        try:
            self.RANDOM_SEED = RANDOM_SEED
        except NameError:  # no module-level RANDOM_SEED defined
            self.RANDOM_SEED = 1234

        assert (sample_percent > 0 and sample_percent <= 1)

        self._data = data

        if sample_percent < 1:
            sample_size = math.ceil(len(data) * sample_percent)
            self._data = data.sample(sample_size, random_state=self.RANDOM_SEED)
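The core of the constructor above is sampling a fixed fraction of rows. A standalone sketch of the same step with DataFrame.sample, assuming a plain pandas DataFrame and a fixed seed:

import math
import pandas as pd

def subsample(data: pd.DataFrame, sample_percent: float, seed: int = 1234) -> pd.DataFrame:
    # Keep ceil(len(data) * sample_percent) randomly chosen rows, reproducibly.
    assert 0 < sample_percent <= 1
    if sample_percent == 1:
        return data
    return data.sample(math.ceil(len(data) * sample_percent), random_state=seed)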
Code example #10
    def __init__(self,
                 train=False,
                 test=False,
                 limit_rows=False,
                 transform=None,
                 target_transform=None,
                 download=False):
        data = pd.read_csv(path)

        # pre split limit
        if limit_rows:
            data = data.sample(limit_rows, random_state=1337)

        split_point1 = int(np.floor(len(data) * 0.9))
        data_train = data[0:split_point1]
        data_test = data[split_point1:]

        print(len(data_train))
        print(len(data_test))

        print(data_test)
Code example #11
def main():
    data = pd.read_csv('../all_data.csv')
    data_train = data.sample(frac=0.8,random_state=17)
    data_val = data.loc[~data.index.isin(data_train.index)]
    files_train = list(data_train['filename'])
    files_val = list(data_val['filename'])
    ppm_train = list(data_train['ppm'])
    ppm_val = list(data_val['ppm'])
    ids_train = [i for i in range(len(files_train))]
    ids_val = [i for i in range(len(files_val))]
    data = None
    data_train = None
    data_val = None
    # model = LeUNet()
    model = ResNetUNet()
    model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(torch.load("model_haze_all.pth"),strict=False) # on GPU
    # model = StandardNet('resnet50').cuda()
    # model = StandardNet('vgg16').cuda()
    # model = EPAPLN().cuda()
    # model = EnsembleNet().cuda()    
    criterion = nn.MSELoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)
    train_dataset = Dataset(ids_train, files_train, ppm_train, transforms.Compose([transforms.Resize((256,256)),transforms.ToTensor(),transforms.Normalize(mean=[0.5231, 0.5180, 0.5115],std=[0.2014, 0.2018, 0.2100]),])) # normalize
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=12)
    val_dataset = Dataset(ids_val, files_val, ppm_val, transforms.Compose([transforms.Resize((256,256)),transforms.ToTensor(),transforms.Normalize(mean=[0.5231, 0.5180, 0.5115],std=[0.2014, 0.2018, 0.2100]),]))
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=12)

    best_loss = 1e5
    for epoch in range(500):
        train_loss = train(train_loader,model,criterion,optimizer)
        val_loss = val(val_loader,model,criterion)
        print('Epoch: %d, MSE train set: %.8f' % (epoch+1, train_loss))
        print('Epoch: %d, MSE val set: %.8f\n' % (epoch+1, val_loss))
        if val_loss < best_loss:
            torch.save(model.state_dict(),'resnetunet_pm_all.pth')
            best_loss = val_loss
Code example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="input dataset")
    parser.add_argument("directory",
                        type=str,
                        help="directory to store data files")
    parser.add_argument("-i",
                        "--iterations",
                        type=int,
                        help="iterations to do",
                        default=1000)
    parser.add_argument("-ti",
                        "--train_iterations",
                        type=int,
                        help="iterations to train NN",
                        default=10)
    parser.add_argument("-l",
                        "--learning_rate",
                        type=float,
                        help="learning rate",
                        default=0.01)
    parser.add_argument(
        "-s",
        "--sample",
        type=int,
        help=
        "number of samples to use from dataset. If not passed - whole dataset is used",
        default=None)
    parser.add_argument("-mb",
                        "--mini_batch",
                        type=int,
                        help="minibatch size, 1000 is default",
                        default=1000)
    parser.add_argument("-tvs",
                        "--train_validation_split",
                        type=float,
                        help="train - validation split fraction",
                        default=0.8)
    parser.add_argument("-ml",
                        "--middle_layers",
                        type=int,
                        help="number of middle layers",
                        default=20)
    parser.add_argument("-mln",
                        "--middle_layer_neurons",
                        type=int,
                        help="middle layers neuron count",
                        default=2)
    parser.add_argument("-ha",
                        "--hidden_activation",
                        help="activation to use on hidden layers",
                        type=str)
    parser.add_argument("-oa",
                        "--out_activation",
                        help="activation to use on out layer",
                        type=str)
    parser.add_argument(
        "-ihl",
        "--input_has_labels",
        help=
        "pass this is input has class label. Needed for optimal predictor evaluation",
        action="store_true")
    parser.add_argument("-fc",
                        "--force_cpu",
                        help="force cpu execution for PyTorch",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.exists(args.directory):
        os.makedirs(args.directory)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if args.force_cpu:
        device = "cpu"
    print("Running on: {0}".format(device))

    data_full = pd.read_csv(args.input, header=None)

    error_file = os.path.join(args.directory, "error.txt")
    with open(error_file, "w") as f:
        for seed in tqdm(range(args.iterations), desc="Running iterations"):
            torch.manual_seed(seed)
            np.random.seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed(seed)

            data = data_full
            if args.sample:
                data = data.sample(n=args.sample)

            n = len(data)
            train_size = n * args.train_validation_split

            train_data = data.sample(n=int(train_size))
            valid_data = data.drop(train_data.index)

            layers = [data.shape[1] - 2] + ([args.middle_layer_neurons] *
                                            args.middle_layers) + [1]
            nn = TorchFeedforwardNN(layers,
                                    hidden_activation=args.hidden_activation,
                                    out_activation=args.out_activation)
            if torch.cuda.is_available():
                nn.to(device)

            inp_train = np.matrix(train_data.iloc[:,
                                                  1:train_data.shape[1] - 1])
            outp_train = np.matrix(train_data.iloc[:, train_data.shape[1] -
                                                   1:train_data.shape[1]])
            inp_valid = np.matrix(valid_data.iloc[:,
                                                  1:valid_data.shape[1] - 1])
            outp_valid = np.matrix(valid_data.iloc[:, valid_data.shape[1] -
                                                   1:valid_data.shape[1]])

            optim_err = calc_aver_error(inp_valid, outp_valid,
                                        args.input_has_labels)
            optim_err_train = calc_aver_error(inp_train, outp_train,
                                              args.input_has_labels)

            inp_train = torch.from_numpy(inp_train)
            outp_train = torch.from_numpy(outp_train)
            inp_valid = torch.from_numpy(inp_valid)
            outp_valid = torch.from_numpy(outp_valid)

            if torch.cuda.is_available():
                inp_train = inp_train.to(device)
                outp_train = outp_train.to(device)
                inp_valid = inp_valid.to(device)
                outp_valid = outp_valid.to(device)

            for _ in tqdm(range(args.train_iterations), desc="Training NN"):
                train_loader = torch.utils.data.DataLoader(
                    torch.utils.data.TensorDataset(inp_train, outp_train),
                    batch_size=args.mini_batch,
                    shuffle=True)
                for inp, target in tqdm(train_loader,
                                        desc="Running minibatches"):
                    nn.backpropagation_learn(inp,
                                             target,
                                             args.learning_rate,
                                             show_progress=True,
                                             stochastic=False)

            train_err = nn.evaluate(inp_train, outp_train)
            valid_err = nn.evaluate(inp_valid, outp_valid)

            f.write("{} {} {} {}\n".format(optim_err_train, optim_err,
                                           train_err, valid_err))
            f.flush()
Code example #13
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from PIL import Image
from train_pm import Dataset, double_conv, LeUNet, val

if __name__ == '__main__':

    data = pd.read_csv('../final_data.csv')
    data_train = data.sample(frac=0.8, random_state=17)
    data_val = data.loc[~data.index.isin(data_train.index)]
    files_val = list(data_val['filename'])
    ppm_val = list(data_val['ppm'])
    ids_val = [i for i in range(len(files_val))]

    model = LeUNet()
    model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(torch.load("model_hazy_best.pth"),
                          strict=False)  # on GPU
    criterion = nn.MSELoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    val_dataset = Dataset(
        ids_val, files_val, ppm_val,
        transforms.Compose([
            transforms.Resize((256, 256)),
Code example #14
def downsample(data, num):
    # Draw num rows without replacement.
    return data.sample(num)
Code example #15
def upsample(data, num):
    # Draw num rows with replacement, so rows may repeat.
    return data.sample(num, replace=True)
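One common use of these two helpers is class balancing. A hypothetical example (the DataFrame and the label column name are made up for illustration):

import pandas as pd

df = pd.DataFrame({"x": range(10), "label": [0] * 8 + [1] * 2})
majority = df[df["label"] == 0]
minority = df[df["label"] == 1]

# Either shrink the majority class...
balanced_down = pd.concat([downsample(majority, len(minority)), minority])
# ...or grow the minority class with replacement.
balanced_up = pd.concat([majority, upsample(minority, len(majority))])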
Code example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="input dataset")
    parser.add_argument("directory",
                        type=str,
                        help="directory to store data files")
    parser.add_argument("-i",
                        "--iterations",
                        type=int,
                        help="iterations to do",
                        default=1000)
    parser.add_argument("-l",
                        "--learning_rate",
                        type=float,
                        help="learning rate",
                        default=0.01)
    parser.add_argument(
        "-s",
        "--sample",
        type=int,
        help=
        "number of samples to use from dataset. If not passed - whole dataset is used",
        default=None)
    parser.add_argument("-mb",
                        "--mini_batch",
                        type=int,
                        help="minibatch size, 1000 is default",
                        default=1000)
    parser.add_argument("-tvs",
                        "--train_validation_split",
                        type=float,
                        help="train - validation split fraction",
                        default=0.8)
    parser.add_argument(
        "-pf",
        "--pickle_file",
        type=int,
        help="pickle file index to dump neural network state after learning",
        default=None)
    parser.add_argument(
        "-uf",
        "--unpickle_file",
        type=int,
        help=
        "pickle file index to restore neural network state from at the beginning",
        default=None)
    parser.add_argument("-ml",
                        "--middle_layers",
                        type=int,
                        help="number of middle layers",
                        default=20)
    parser.add_argument("-mln",
                        "--middle_layer_neurons",
                        type=int,
                        help="middle layers neuron count",
                        default=2)
    parser.add_argument("--case",
                        type=int,
                        help="case of data popularity distribution",
                        default=1)
    parser.add_argument("-ha",
                        "--hidden_activation",
                        help="activation to use on hidden layers",
                        type=str)
    parser.add_argument("-oa",
                        "--out_activation",
                        help="activation to use on out layer",
                        type=str)
    parser.add_argument(
        "-ihl",
        "--input_has_labels",
        help=
        "pass this is input has class label. Needed for optimal predictor evaluation",
        action="store_true")
    parser.add_argument("--seed", help="seed for item sampling", type=int)
    parser.add_argument("-fc",
                        "--force_cpu",
                        help="force cpu execution for PyTorch",
                        action="store_true")
    # parser.add_argument("-aef",
    #                     "--alternative_error_function",
    #                     help="use alternative error function - error for Poisson distribution",
    #                     action="store_true")
    args = parser.parse_args()

    # In the next section, define a mapping of the item popularity distribution

    # Case 1
    if args.case == 1:
        generator = PoissonZipfGenerator(10_000, 20.0, 0.8, 0)
        dist_mapping = generator.get_distribution_map()

    # Case 2
    elif args.case == 2:
        generator = PoissonZipfGenerator(5_000, 40.0, 0.8, 0)
        dist_mapping = generator.get_distribution_map()

        generator2 = PoissonShuffleZipfGenerator(5_000, 40.0, 0.8, 5_000,
                                                 10_000_000)
        dist_mapping2 = generator2.get_distribution_map()
        for k, v in dist_mapping2.items():
            dist_mapping[k] = v

        for k, v in dist_mapping.items():
            dist_mapping[k] = v / 2.0

    else:
        raise AttributeError("Unknown case passed")

    # End of section

    if args.seed:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(args.seed)

    data = pd.read_csv(args.input, header=None)

    if args.sample:
        data = data.sample(n=args.sample)

    n = len(data)
    train_size = n * args.train_validation_split

    train_data = data.sample(n=int(train_size))
    valid_data = data.drop(train_data.index)

    if not os.path.exists(args.directory):
        os.makedirs(args.directory)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if args.force_cpu:
        device = "cpu"
    print("Running on: {0}".format(device))

    if args.unpickle_file is not None:
        filename = "nn_{0}.p".format(args.unpickle_file)
        filename = os.path.join(args.directory, filename)
        with open(filename, "rb") as unpickle_file:
            nn = pickle.load(unpickle_file)
    else:
        layers = [data.shape[1] - 2
                  ] + ([args.middle_layer_neurons] * args.middle_layers) + [1]
        nn = TorchFeedforwardNN(layers,
                                hidden_activation=args.hidden_activation,
                                out_activation=args.out_activation)
        if torch.cuda.is_available():
            nn.to(device)

    sample_map = {}
    for k, v in tqdm(dist_mapping.items(), desc="Preprocessing dataset"):
        sample_map[k] = data[data.iloc[:, 0] == k]

    learning_rate = args.learning_rate
    prev_dist = 10**10

    inp_train = np.matrix(train_data.iloc[:, 1:train_data.shape[1] - 1])
    outp_train = np.matrix(train_data.iloc[:, train_data.shape[1] -
                                           1:train_data.shape[1]])

    inp_valid = np.matrix(valid_data.iloc[:, 1:valid_data.shape[1] - 1])
    outp_valid = np.matrix(valid_data.iloc[:, valid_data.shape[1] -
                                           1:valid_data.shape[1]])

    if args.case == 1:
        optim_err = calc_aver_error(inp_valid, outp_valid,
                                    args.input_has_labels)
        optim_err_train = calc_aver_error(inp_train, outp_train,
                                          args.input_has_labels)
    elif args.case == 2:
        optim_err = calc_case_2_optim_err(valid_data, args.input_has_labels)
        optim_err_train = calc_case_2_optim_err(train_data,
                                                args.input_has_labels)
    else:
        raise AttributeError("Unknown case passed")

    inp_train = torch.from_numpy(inp_train)
    outp_train = torch.from_numpy(outp_train)
    inp_valid = torch.from_numpy(inp_valid)
    outp_valid = torch.from_numpy(outp_valid)

    if torch.cuda.is_available():
        inp_train = inp_train.to(device)
        outp_train = outp_train.to(device)
        inp_valid = inp_valid.to(device)
        outp_valid = outp_valid.to(device)

    dist_file = os.path.join(args.directory, "distance.txt")
    error_file = os.path.join(args.directory, "error.txt")
    with open(error_file, "w") as err_f:
        with open(dist_file, "w") as f:

            # dist = 0.0
            # for k, v in tqdm(dist_mapping.items(), desc="Evaluating distance"):
            #     item = sample_map[k].sample(n=1)
            #     pop = nn.evaluate(np.matrix(item.iloc[:, 1:item.shape[1] - 1]),
            #                       np.matrix(item.iloc[:, item.shape[1] - 1:item.shape[1]]))[0]
            #
            #     dist += abs(v - pop)
            #
            # dist /= 2.0
            # f.write(f"{dist}\n")
            # f.flush()
            err_f.write("{} {}\n".format(optim_err_train, optim_err))
            for _ in tqdm(range(args.iterations), desc="Running iterations"):
                train_loader = torch.utils.data.DataLoader(
                    torch.utils.data.TensorDataset(inp_train, outp_train),
                    batch_size=args.mini_batch,
                    shuffle=True)
                for inp, target in tqdm(train_loader,
                                        desc="Running minibatches"):
                    nn.backpropagation_learn(inp,
                                             target,
                                             args.learning_rate,
                                             show_progress=True,
                                             stochastic=False)

                dist = 0.0
                err = 0.0
                for k, v in tqdm(dist_mapping.items(),
                                 desc="Evaluating distance"):
                    item = sample_map[k].sample(n=1)
                    inp = torch.from_numpy(
                        np.matrix(item.iloc[:, 1:item.shape[1] - 1]))
                    outp = torch.from_numpy(
                        np.matrix(item.iloc[:,
                                            item.shape[1] - 1:item.shape[1]]))

                    err += nn.evaluate(inp, outp)

                    pop = float(
                        nn(
                            torch.Tensor(
                                np.matrix(item.iloc[:, 1:item.shape[1] -
                                                    1])).double()))
                    pop = np.exp(-pop) - 10**-15

                    dist += abs(v - pop)

                err /= len(dist_mapping)

                dist /= 2.0
                prev_dist = dist

                f.write(f"{dist} {err}\n")
                f.flush()

                train_err = nn.evaluate(inp_train, outp_train)
                valid_err = nn.evaluate(inp_valid, outp_valid)

                err_f.write("{} {}\n".format(train_err, valid_err))
                err_f.flush()

    if args.pickle_file is not None:
        filename = "nn_{0}.p".format(args.pickle_file)
        filename = os.path.join(args.directory, filename)
        with open(filename, "wb") as pickle_file:
            pickle.dump(nn, pickle_file)

    cache_file = os.path.join(args.directory, "cache_hit.txt")
    with open(cache_file, "w") as f:
        popularities = []
        for k, v in tqdm(dist_mapping.items(), desc="Evaluating distance"):
            item = sample_map[k].sample(n=1)
            pop = float(
                nn(
                    torch.Tensor(np.matrix(item.iloc[:, 1:item.shape[1] -
                                                     1])).double()))
            pop = np.exp(-pop) - 10**-15

            # tmp = np.matrix(item.iloc[:, 1:item.shape[1] - 1])
            # tmp = np.exp(-tmp) - 10 ** -15  # transform from log
            # pop = float(np.mean(tmp, axis=1))

            # tmp = np.exp(-np.matrix(item.iloc[:, -1:])) - 10 ** -15  # transform from log
            # pop = float(tmp)
            popularities.append((k, pop))

        mean_val = np.mean([x[1] for x in popularities])
        median_val = np.median([x[1] for x in popularities])

        print("Popularity mean: {}".format(mean_val))
        print("Popularity median: {}".format(median_val))

        stat_file = os.path.join(args.directory, "stat.txt")
        with open(stat_file, "w") as f_stat:
            f_stat.write("Popularity mean: {}".format(mean_val))
            f_stat.write("Popularity median: {}".format(median_val))

        pops_sorted = list(
            sorted(popularities, key=lambda x: x[1], reverse=True))
        pop_order_predicted = [x[0] for x in pops_sorted]

        order_file = os.path.join(args.directory, "order.txt")
        with open(order_file, "w") as f1:
            for item in pops_sorted:
                f1.write("{0} {1} {2}\n".format(item[0], item[1],
                                                dist_mapping[item[0]]))

        pred_items_real_pops = [dist_mapping[i] for i in pop_order_predicted]

        distrib_pop_ordered = sorted(dist_mapping.values(), reverse=True)

        theory_hit = 0.0
        practice_hit = 0.0
        for distrib_pop, pred_item_pop in zip(distrib_pop_ordered,
                                              pred_items_real_pops):
            theory_hit += distrib_pop
            practice_hit += pred_item_pop
            f.write(f"{theory_hit} {practice_hit}\n")
Code example #17
#         df['Count'][i] = randint(0,df['Count'].max())
#     df['Lat'][i] = int(df["Lat"][i])
#     df['Lng'][i] = int(df["Lng"][i])


# Pickle the DataFrame from above and reload it
with open('dictionary.pickle', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('dictionary.pickle', 'rb') as handle:
    data = pickle.load(handle)
np.random.seed(1234)

# Converting the data to binary
data.loc[data['Count'] >= 1.0, 'Count'] = 1.0

train, validate, test = np.split(data.sample(frac=1, random_state=134),
                                 [int(.6*len(data)), int(.8*len(data))])

# We need to drop the count values for the training, validation, and test
x_train = train.drop(['Count'], axis=1).values
y_train = train['Count'].values

x_val = validate.drop(['Count'], axis=1).values
y_val = validate['Count'].values

x_test = test.drop(['Count'], axis=1).values
y_test = test['Count'].values


# Compute the mean and standard deviation of each feature column, then
# standardize the tensors as (data - mean) / std before training.
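A minimal sketch of that standardization step, assuming the NumPy arrays produced by the split above; the statistics are taken from the training set only and reused for validation and test:

import torch

x_train_t = torch.tensor(x_train, dtype=torch.float32)
x_val_t = torch.tensor(x_val, dtype=torch.float32)
x_test_t = torch.tensor(x_test, dtype=torch.float32)

mean = x_train_t.mean(dim=0)   # per-column mean of the training data
std = x_train_t.std(dim=0)
std[std == 0] = 1.0            # guard against constant columns

x_train_t = (x_train_t - mean) / std
x_val_t = (x_val_t - mean) / std
x_test_t = (x_test_t - mean) / std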
Code example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input",
                        type=str,
                        help="input dataset")
    parser.add_argument("directory",
                        type=str,
                        help="directory to store data files")
    parser.add_argument("-i",
                        "--iterations",
                        type=int,
                        help="iterations to do",
                        default=1000)
    parser.add_argument("-l",
                        "--learning_rate",
                        type=float,
                        help="learning rate",
                        default=0.01)
    parser.add_argument("-s",
                        "--sample",
                        type=int,
                        help="number of samples to use from dataset. If not passed - whole dataset is used",
                        default=None)
    parser.add_argument("-es",
                        "--eval_sample",
                        type=int,
                        help="number of samples to use from for evaluation",
                        default=None)
    parser.add_argument("-mb",
                        "--mini_batch",
                        type=int,
                        help="minibatch size, 1000 is default",
                        default=1000)
    parser.add_argument("-mbl",
                        "--mini_batch_log",
                        type=int,
                        help="after how many batches evaluate the error",
                        default=100)
    parser.add_argument("-tvs",
                        "--train_validation_split",
                        type=float,
                        help="train - validation split fraction",
                        default=0.8)
    parser.add_argument("-pf",
                        "--pickle_file",
                        type=int,
                        help="pickle file index to dump neural network state after learning",
                        default=None)
    parser.add_argument("-uf",
                        "--unpickle_file",
                        type=int,
                        help="pickle file index to restore neural network state from at the beginning",
                        default=None)
    parser.add_argument("--seed",
                        help="seed for item sampling",
                        type=int)
    parser.add_argument("-fc",
                        "--force_cpu",
                        help="force cpu execution for PyTorch",
                        action="store_true")
    args = parser.parse_args()

    if not os.path.exists(args.directory):
        os.makedirs(args.directory)

    if args.seed:
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(args.seed)

    data = pd.read_csv(args.input, header=None, index_col=None, names=None)

    if args.sample:
        data = data.sample(n=args.sample)

    n = len(data)
    train_size = n * args.train_validation_split

    train_data = data.sample(n=int(train_size))
    valid_data = data.drop(train_data.index)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if args.force_cpu:
        device = "cpu"
    print("Running on: {0}".format(device))

    if args.unpickle_file is not None:
        filename = "dlstm_{0}.p".format(args.unpickle_file)
        filename = os.path.join(args.directory, filename)
        with open(filename, "rb") as unpickle_file:
            nn = pickle.load(unpickle_file)
    else:
        layers = [inputs_num, 16, 16, outputs_num]
        nn = LSTMSoftmax(layers)
        if torch.cuda.is_available():
            nn.to(device)

    inp_train = np.matrix(train_data.iloc[:, :inputs_num]).astype(float)
    outp_train = np.matrix(train_data.iloc[:, inputs_num:])

    inp_valid = np.matrix(valid_data.iloc[:, :inputs_num]).astype(float)
    outp_valid = np.matrix(valid_data.iloc[:, inputs_num:])

    inp_train = torch.from_numpy(inp_train).type(torch.FloatTensor)
    outp_train = torch.from_numpy(outp_train).type(torch.FloatTensor)
    inp_valid = torch.from_numpy(inp_valid).type(torch.FloatTensor)
    outp_valid = torch.from_numpy(outp_valid).type(torch.FloatTensor)

    if torch.cuda.is_available():
        inp_train = inp_train.to(device)
        outp_train = outp_train.to(device)
        inp_valid = inp_valid.to(device)
        outp_valid = outp_valid.to(device)

    log_counter = args.mini_batch_log
    error_file = os.path.join(args.directory, "error.txt")
    with open(error_file, "w") as f:
        for _ in tqdm(range(args.iterations), desc="Running iterations"):
            train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(inp_train, outp_train),
                                                       batch_size=args.mini_batch,
                                                       shuffle=True)
            for inp, target in tqdm(train_loader, desc="Running minibatches"):
                nn.backpropagation_learn(inp, target, args.learning_rate, show_progress=True, stochastic=False)
                log_counter -= 1

                if log_counter == 0:
                    log_counter = args.mini_batch_log
                    if args.eval_sample is None:
                        train_err = nn.evaluate(inp_train, outp_train)
                        valid_err = nn.evaluate(inp_valid, outp_valid)
                    else:
                        train_tmp = train_data.sample(n=args.eval_sample)
                        valid_tmp = valid_data.sample(n=args.eval_sample)

                        inp_train_tmp = np.matrix(train_tmp.iloc[:, :inputs_num]).astype(float)
                        outp_train_tmp = np.matrix(train_tmp.iloc[:, inputs_num:])
                        inp_valid_tmp = np.matrix(valid_tmp.iloc[:, :inputs_num]).astype(float)
                        outp_valid_tmp = np.matrix(valid_tmp.iloc[:, inputs_num:])

                        inp_train_tmp = torch.from_numpy(inp_train_tmp).type(torch.FloatTensor)
                        outp_train_tmp = torch.from_numpy(outp_train_tmp).type(torch.FloatTensor)
                        inp_valid_tmp = torch.from_numpy(inp_valid_tmp).type(torch.FloatTensor)
                        outp_valid_tmp = torch.from_numpy(outp_valid_tmp).type(torch.FloatTensor)

                        train_err = nn.evaluate(inp_train_tmp, outp_train_tmp)
                        valid_err = nn.evaluate(inp_valid_tmp, outp_valid_tmp)

                    f.write("{} {}\n".format(train_err, valid_err))
                    f.flush()

    if args.pickle_file is not None:
        filename = "dlstm_{0}.p".format(args.pickle_file)
        filename = os.path.join(args.directory, filename)
        with open(filename, "wb") as pickle_file:
            pickle.dump(nn, pickle_file)
Code example #19
    def __init__(self, config):
        """
        :param config:
        """
        self.config = config

        if config.data_type == "SENTEMO":
            #Init
            self.word2idx = {}
            self.idx2word = {}
            self.vocab = set()
            #Read Data
            if self.config.mode == 'test':
                self.word2idx = pickle.load(
                    open(self.config.out_dir + 'word2idx.pkl', "rb"))
                self.idx2word = pickle.load(
                    open(self.config.out_dir + 'idx2word.pkl', "rb"))
                self.vocab = pickle.load(
                    open(self.config.out_dir + 'vocab.pkl', "rb"))
                vocab_size = pickle.load(
                    open(self.config.out_dir + 'vocab_size.pkl', "rb"))
                self.config.vocab_size = vocab_size['embedded_dim']

                test_data = np.load(self.config.out_dir + 'test_data.npy')
                test_labels = np.load(self.config.out_dir + 'test_labels.npy')
                test = SENTEMO_Data(test_data, test_labels)
                self.test_loader = DataLoader(test,
                                              batch_size=config.batch_size,
                                              shuffle=True,
                                              drop_last=True)
                self.test_iterations = (len(test) + self.config.batch_size
                                        ) // self.config.batch_size

            else:
                data = self.load_from_pickle(
                    directory=self.config.SENT_EMO_Path)
                data["token_size"] = data["text"].apply(
                    lambda x: len(x.split(' ')))
                data = data.loc[data['token_size'] < 70].copy()
                # sampling
                data = data.sample(n=50000)
                # construct vocab and indexing
                self.create_index(data["text"].values.tolist())
                # vectorize to tensor
                input_tensor = [[self.word2idx[s] for s in es.split(' ')]
                                for es in data["text"].values.tolist()]
                max_length_inp = self.max_length(input_tensor)
                # inplace padding
                input_tensor = [
                    self.pad_sequences(x, max_length_inp) for x in input_tensor
                ]
                ### convert targets to one-hot encoding vectors
                emotions = list(set(data.emotions.unique()))
                # binarizer
                mlb = preprocessing.MultiLabelBinarizer()
                data_labels = [
                    set(emos) & set(emotions)
                    for emos in data[['emotions']].values
                ]
                bin_emotions = mlb.fit_transform(data_labels)
                target_tensor = np.array(bin_emotions.tolist())
                # Creating training and validation sets using an 80-20 split
                input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
                    input_tensor, target_tensor, test_size=0.2)

                # Split the validation set further to obtain a holdout dataset (for testing) -- split 50:50
                input_tensor_val, input_tensor_test, target_tensor_val, target_tensor_test = train_test_split(
                    input_tensor_val, target_tensor_val, test_size=0.5)

                # for inference
                self.test_data = input_tensor_test
                self.test_labels = target_tensor_test

                #Init Transforms
                self.input_transform = standard_transforms.Compose([
                    standard_transforms.ToTensor(),
                ])

                self.target_transform = standard_transforms.Compose([
                    standard_transforms.ToTensor(),
                ])
                # Create datasets
                train = SENTEMO_Data(
                    input_tensor_train, target_tensor_train
                )  #, input_transform=self.input_transform, target_transform=self.target_transform)
                valid = SENTEMO_Data(
                    input_tensor_val, target_tensor_val
                )  #, input_transform=self.input_transform, target_transform=self.target_transform)
                test = SENTEMO_Data(
                    input_tensor_test, target_tensor_test
                )  #, input_transform=self.input_transform, target_transform=self.target_transform)

                self.train_loader = DataLoader(
                    train,
                    batch_size=config.batch_size,
                    shuffle=True,
                    drop_last=True,
                )
                self.valid_loader = DataLoader(
                    valid,
                    batch_size=config.batch_size,
                    shuffle=True,
                    drop_last=True,
                )
                self.test_loader = DataLoader(
                    test,
                    batch_size=config.batch_size,
                    shuffle=True,
                    drop_last=True,
                )

                self.train_iterations = (len(train) + self.config.batch_size
                                         ) // self.config.batch_size
                self.valid_iterations = (len(valid) + self.config.batch_size
                                         ) // self.config.batch_size
                self.test_iterations = (len(test) + self.config.batch_size
                                        ) // self.config.batch_size

                self.config.vocab_size = len(self.word2idx)

        elif config.data_type == "SEM_EVAL_OC" or config.data_type == "SEM_EVAL_OC_Translated" or config.data_type == "SEM_EVAL_OC_Translated_TestOnly":
            #Init
            self.word2idx = {}
            self.idx2word = {}
            self.vocab = set()

            if self.config.mode == 'test' and not config.data_type == "SEM_EVAL_OC_Translated":
                self.word2idx = pickle.load(
                    open(self.config.out_dir + 'word2idx.pkl', "rb"))
                self.idx2word = pickle.load(
                    open(self.config.out_dir + 'idx2word.pkl', "rb"))
                self.vocab = pickle.load(
                    open(self.config.out_dir + 'vocab.pkl', "rb"))
                vocab_size = pickle.load(
                    open(self.config.out_dir + 'vocab_size.pkl', "rb"))
                self.config.vocab_size = vocab_size['embedded_dim']

                test_data = np.load(self.config.out_dir + 'test_data.npy')
                test_labels = np.load(self.config.out_dir + 'test_labels.npy')
                test = SENTEMO_Data(test_data, test_labels)
                self.test_loader = DataLoader(test,
                                              batch_size=config.batch_size,
                                              shuffle=True,
                                              drop_last=True)
                self.test_iterations = (len(test) + self.config.batch_size
                                        ) // self.config.batch_size
            elif self.config.mode == 'test' and config.data_type == "SEM_EVAL_OC_Translated_TestOnly":
                self.word2idx = pickle.load(
                    open(self.config.out_dir + 'word2idx.pkl', "rb"))
                self.idx2word = pickle.load(
                    open(self.config.out_dir + 'idx2word.pkl', "rb"))
                self.vocab = pickle.load(
                    open(self.config.out_dir + 'vocab.pkl', "rb"))
                vocab_size = pickle.load(
                    open(self.config.out_dir + 'vocab_size.pkl', "rb"))
                self.config.vocab_size = vocab_size['embedded_dim']

                test_data = np.load(self.config.out_dir + 'test_data_es.npy')
                test_labels = np.load(self.config.out_dir +
                                      'test_labels_es.npy')
                test = SENTEMO_Data(test_data, test_labels)
                self.test_loader = DataLoader(test,
                                              batch_size=config.batch_size,
                                              shuffle=True,
                                              drop_last=True)
                self.test_iterations = (len(test) + self.config.batch_size
                                        ) // self.config.batch_size

            elif self.config.mode == 'test' and config.data_type == "SEM_EVAL_OC_Translated":
                data = pd.read_csv(self.config.translated_data)
                if self.config.remove_emoji == 'remove':
                    data['text'] = data['text'].apply(
                        lambda x: emoji_pattern.sub(r'', x))
                elif self.config.remove_emoji == 'replace':
                    data['text'] = data['text'].apply(
                        lambda x: emoji.demojize(x))

                if self.config.spacy_token_preprocess:
                    if self.config.lang == 'en':
                        nlp = spacy.load('en_core_web_sm')
                    elif self.config.lang == 'es':
                        nlp = spacy.load('es_core_news_md')
                    tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)
                    data['text'] = data['text'].apply(lambda x: ' '.join(
                        [token.text_with_ws for token in nlp(x)]))

                if self.config.remove_capital:
                    data['text'] = data['text'].apply(lambda x: ' '.join(
                        [word.lower() for word in x.split()]))

                if self.config.remove_stopwords:
                    if self.config.lang == 'en':
                        nlp = spacy.load('en_core_web_sm')
                        spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
                    elif self.config.lang == 'es':
                        nlp = spacy.load('es_core_news_md')
                        spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS

                    data['text'] = data['text'].apply(lambda x: ' '.join([
                        word for word in x.split()
                        if word not in (spacy_stopwords)
                    ]))

                data["token_size"] = data["text"].apply(
                    lambda x: len(x.split(' ')))

                data = data.loc[data['token_size'] < 80].copy()
                self.word2idx = pickle.load(
                    open(self.config.out_dir + 'word2idx.pkl', "rb"))
                self.idx2word = pickle.load(
                    open(self.config.out_dir + 'idx2word.pkl', "rb"))
                self.vocab = pickle.load(
                    open(self.config.out_dir + 'vocab.pkl', "rb"))
                vocab_size = pickle.load(
                    open(self.config.out_dir + 'vocab_size.pkl', "rb"))
                self.config.vocab_size = vocab_size['embedded_dim']
                #self.create_index(data["text"].values.tolist())
                input_tensor = [[
                    self.word2idx[s] for s in es.split(' ')
                    if s in self.word2idx.keys()
                ] for es in data["text"].values.tolist()]
                max_length_inp = self.max_length(input_tensor)
                input_tensor = [
                    self.pad_sequences(x, max_length_inp) for x in input_tensor
                ]
                emotions = list(set(data.emotions.unique()))
                # binarizer
                mlb = preprocessing.MultiLabelBinarizer()
                data_labels = [
                    set(emos) & set(emotions)
                    for emos in data[['emotions']].values
                ]
                bin_emotions = mlb.fit_transform(data_labels)
                target_tensor = np.array(bin_emotions.tolist())
                test = SENTEMO_Data(input_tensor, target_tensor)
                self.test_loader = DataLoader(
                    test,
                    batch_size=config.batch_size,
                    shuffle=True,
                    drop_last=True,
                )
                self.test_iterations = (len(test) + self.config.batch_size
                                        ) // self.config.batch_size
            else:

                if self.config.load_stored == 'LOAD_npy':
                    train_tensor = np.load(self.config.out_dir +
                                           'train_data.npy',
                                           allow_pickle=True)
                    target_tensor_train = np.load(self.config.out_dir +
                                                  'train_labels.npy',
                                                  allow_pickle=True)
                    train_SEMEVAL_tensor = np.load(self.config.out_dir +
                                                   'SE_train_data.npy',
                                                   allow_pickle=True)
                    target_SEMEVAL_tensor_train = np.load(
                        self.config.out_dir + 'SE_train_labels.npy',
                        allow_pickle=True)
                    valid_tensor = np.load(self.config.out_dir +
                                           'val_data.npy',
                                           allow_pickle=True)
                    target_tensor_val = np.load(self.config.out_dir +
                                                'val_labels.npy',
                                                allow_pickle=True)

                    my_list = ['anger', 'joy', 'fear', 'sadness']
                    SENTEMO_DataFrame = self.load_from_pickle(
                        directory=self.config.SENT_EMO_Path)
                    SENTEMO_DataFrame['emotions'] = SENTEMO_DataFrame[
                        'emotions'].apply(lambda x: x
                                          if x in my_list else np.NaN)
                    SENTEMO_DataFrame = SENTEMO_DataFrame.dropna()
                    SENTEMO_DataFrame = pd.DataFrame({
                        "emotions":
                        SENTEMO_DataFrame["emotions"],
                        "text":
                        SENTEMO_DataFrame["text"]
                    })
                    SENTEMO_DataFrame['emotions'] = SENTEMO_DataFrame[
                        'emotions'].apply(lambda x: my_list.index(x))

                    self.word2idx = pickle.load(
                        open(self.config.out_dir + 'word2idx.pkl', "rb"))
                    self.idx2word = pickle.load(
                        open(self.config.out_dir + 'idx2word.pkl', "rb"))
                    self.vocab = pickle.load(
                        open(self.config.out_dir + 'vocab.pkl', "rb"))
                    vocab_size = len(self.word2idx)
                    self.config.vocab_size = vocab_size

                    train = SENTEMO_Data(train_tensor, target_tensor_train)
                    train_SE = SENTEMO_Data(train_SEMEVAL_tensor,
                                            target_SEMEVAL_tensor_train)
                    valid = SENTEMO_Data(valid_tensor, target_tensor_val)

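                    # The combined training set uses an enlarged batch size
                    # (batch_size * 128); the SemEval-only loader keeps the
                    # configured batch size, and validation runs one example
                    # at a time.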
                    self.train_loader = DataLoader(
                        train,
                        batch_size=config.batch_size * 128,
                        shuffle=True,
                        drop_last=True)
                    self.train_SE_loader = DataLoader(
                        train_SE,
                        batch_size=config.batch_size,
                        shuffle=True,
                        drop_last=True)
                    self.valid_loader = DataLoader(valid,
                                                   batch_size=1,
                                                   shuffle=True,
                                                   drop_last=False)

                    self.train_iterations = (
                        len(train) + (self.config.batch_size * 128)) // (
                            self.config.batch_size * 128)
                    self.train_SE_iterations = (
                        len(train_SE) +
                        self.config.batch_size) // self.config.batch_size
                    self.valid_iterations = len(valid)
                else:
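                    # Parse the raw SemEval OC files for each emotion, for
                    # both the training and validation splits.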
                    anger0_x, anger0_y = self.parse_oc(
                        self.config.Train_OC_Anger)
                    fear0_x, fear0_y = self.parse_oc(self.config.Train_OC_Fear)
                    joy0_x, joy0_y = self.parse_oc(self.config.Train_OC_Joy)
                    sadness0_x, sadness0_y = self.parse_oc(
                        self.config.Train_OC_Sadness)

                    anger1_x, anger1_y = self.parse_oc(
                        self.config.Valid_OC_Anger)
                    fear1_x, fear1_y = self.parse_oc(self.config.Valid_OC_Fear)
                    joy1_x, joy1_y = self.parse_oc(self.config.Valid_OC_Joy)
                    sadness1_x, sadness1_y = self.parse_oc(
                        self.config.Valid_OC_Sadness)

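                    # Optionally mix in the SENTEMO corpus as extra training
                    # data, keeping only the four target emotions and mapping
                    # them to integer class ids.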
                    if self.config.add_extra_data == 'SENTEMO':
                        my_list = ['anger', 'joy', 'fear', 'sadness']
                        SENTEMO_DataFrame = self.load_from_pickle(
                            directory=self.config.SENT_EMO_Path)
                        SENTEMO_DataFrame['emotions'] = SENTEMO_DataFrame[
                            'emotions'].apply(lambda x: x
                                              if x in my_list else np.NaN)
                        SENTEMO_DataFrame = SENTEMO_DataFrame.dropna()
                        SENTEMO_DataFrame = pd.DataFrame({
                            "emotions":
                            SENTEMO_DataFrame["emotions"],
                            "text":
                            SENTEMO_DataFrame["text"]
                        })
                        SENTEMO_DataFrame['emotions'] = SENTEMO_DataFrame[
                            'emotions'].apply(lambda x: my_list.index(x))
                    else:
                        # No extra data requested: use an empty frame so the
                        # concatenation below still works.
                        SENTEMO_DataFrame = pd.DataFrame(
                            columns=["emotions", "text"])

                    # Build one DataFrame per emotion from the parsed OC
                    # training data.
                    pd_anger = pd.DataFrame({"emotions": anger0_y})
                    pd_anger["text"] = anger0_x
                    pd_joy = pd.DataFrame({"emotions": joy0_y})
                    pd_joy["text"] = joy0_x
                    pd_fear = pd.DataFrame({"emotions": fear0_y})
                    pd_fear["text"] = fear0_x
                    pd_sad = pd.DataFrame({"emotions": sadness0_y})
                    pd_sad["text"] = sadness0_x

                    pd_anger["emotions"] = pd_anger["emotions"].apply(
                        lambda x: x[1])
                    pd_anger["emotions"] = pd_anger["emotions"][
                        pd_anger["emotions"] > self.config.emo_threshold]
                    pd_anger = pd_anger.dropna()
                    pd_anger["emotions"] = pd_anger["emotions"].apply(
                        lambda x: 0)

                    pd_joy["emotions"] = pd_joy["emotions"].apply(
                        lambda x: x[1])
                    pd_joy["emotions"] = pd_joy["emotions"][
                        pd_joy["emotions"] > self.config.emo_threshold]
                    pd_joy = pd_joy.dropna()
                    pd_joy["emotions"] = pd_joy["emotions"].apply(lambda x: 1)

                    pd_fear["emotions"] = pd_fear["emotions"].apply(
                        lambda x: x[1])
                    pd_fear["emotions"] = pd_fear["emotions"][
                        pd_fear["emotions"] > self.config.emo_threshold]
                    pd_fear = pd_fear.dropna()
                    pd_fear["emotions"] = pd_fear["emotions"].apply(
                        lambda x: 2)

                    pd_sad["emotions"] = pd_sad["emotions"].apply(
                        lambda x: x[1])
                    pd_sad["emotions"] = pd_sad["emotions"][
                        pd_sad["emotions"] > self.config.emo_threshold]
                    pd_sad = pd_sad.dropna()
                    pd_sad["emotions"] = pd_sad["emotions"].apply(lambda x: 3)

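                    # train_data mixes the SemEval frames with the optional
                    # SENTEMO frame; train_SEMEVAL_data keeps the SemEval
                    # portion on its own.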
                    train_data = pd.concat(
                        [pd_anger, pd_joy, pd_fear, pd_sad, SENTEMO_DataFrame],
                        ignore_index=True)
                    train_SEMEVAL_data = pd.concat(
                        [pd_anger, pd_joy, pd_fear, pd_sad], ignore_index=True)

                    pd_anger = pd.DataFrame({"emotions": anger1_y})
                    pd_anger["text"] = anger1_x
                    pd_joy = pd.DataFrame({"emotions": joy1_y})
                    pd_joy["text"] = joy1_x
                    pd_fear = pd.DataFrame({"emotions": fear1_y})
                    pd_fear["text"] = fear1_x
                    pd_sad = pd.DataFrame({"emotions": sadness1_y})
                    pd_sad["text"] = sadness1_x

                    pd_anger["emotions"] = pd_anger["emotions"].apply(
                        lambda x: x[1])
                    pd_anger["emotions"] = pd_anger["emotions"][
                        pd_anger["emotions"] > self.config.emo_threshold]
                    pd_anger = pd_anger.dropna()
                    pd_anger["emotions"] = pd_anger["emotions"].apply(
                        lambda x: 0)

                    pd_joy["emotions"] = pd_joy["emotions"].apply(
                        lambda x: x[1])
                    pd_joy["emotions"] = pd_joy["emotions"][
                        pd_joy["emotions"] > self.config.emo_threshold]
                    pd_joy = pd_joy.dropna()
                    pd_joy["emotions"] = pd_joy["emotions"].apply(lambda x: 1)

                    pd_fear["emotions"] = pd_fear["emotions"].apply(
                        lambda x: x[1])
                    pd_fear["emotions"] = pd_fear["emotions"][
                        pd_fear["emotions"] > self.config.emo_threshold]
                    pd_fear = pd_fear.dropna()
                    pd_fear["emotions"] = pd_fear["emotions"].apply(
                        lambda x: 2)

                    pd_sad["emotions"] = pd_sad["emotions"].apply(
                        lambda x: x[1])
                    pd_sad["emotions"] = pd_sad["emotions"][
                        pd_sad["emotions"] > self.config.emo_threshold]
                    pd_sad = pd_sad.dropna()
                    pd_sad["emotions"] = pd_sad["emotions"].apply(lambda x: 3)

                    valid_data = pd.concat([pd_anger, pd_joy, pd_fear, pd_sad],
                                           ignore_index=True)

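                    # 'STRONG' trains on the SemEval data only; in either case
                    # every split is shuffled row-wise.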
                    if self.config.TRAINING_DATA == 'STRONG':
                        train_data = train_SEMEVAL_data.sample(
                            frac=1).reset_index(drop=True)
                    else:
                        train_data = train_data.sample(frac=1).reset_index(
                            drop=True)
                    train_SEMEVAL_data = train_SEMEVAL_data.sample(
                        frac=1).reset_index(drop=True)
                    valid_data = valid_data.sample(frac=1).reset_index(
                        drop=True)

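                    # Emoji handling: either strip emoji entirely with the
                    # precompiled emoji_pattern regex or replace them with
                    # their :name: aliases via emoji.demojize.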
                    if self.config.remove_emoji == 'remove':
                        train_data['text'] = train_data['text'].apply(
                            lambda x: emoji_pattern.sub(r'', x))
                        train_SEMEVAL_data['text'] = train_SEMEVAL_data[
                            'text'].apply(lambda x: emoji_pattern.sub(r'', x))
                        valid_data['text'] = valid_data['text'].apply(
                            lambda x: emoji_pattern.sub(r'', x))
                    elif self.config.remove_emoji == 'replace':
                        train_data['text'] = train_data['text'].apply(
                            lambda x: emoji.demojize(x))
                        train_SEMEVAL_data['text'] = train_SEMEVAL_data[
                            'text'].apply(lambda x: emoji.demojize(x))
                        valid_data['text'] = valid_data['text'].apply(
                            lambda x: emoji.demojize(x))

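                    # Optional text normalisation: spaCy re-tokenisation,
                    # lowercasing, and stop-word removal, each controlled by
                    # its own config flag.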
                    if self.config.spacy_token_preprocess:
                        if self.config.lang == 'en':
                            nlp = spacy.load('en_core_web_sm')
                        elif self.config.lang == 'es':
                            nlp = spacy.load('es_core_news_md')
                        tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab)
                        train_data['text'] = train_data['text'].apply(
                            lambda x: ' '.join(
                                [token.text_with_ws for token in nlp(x)]))
                        train_SEMEVAL_data['text'] = train_SEMEVAL_data[
                            'text'].apply(lambda x: ' '.join(
                                [token.text_with_ws for token in nlp(x)]))
                        valid_data['text'] = valid_data['text'].apply(
                            lambda x: ' '.join(
                                [token.text_with_ws for token in nlp(x)]))

                    if self.config.remove_capital:
                        train_data['text'] = train_data['text'].apply(
                            lambda x: ' '.join(
                                [word.lower() for word in x.split()]))
                        train_SEMEVAL_data['text'] = train_SEMEVAL_data[
                            'text'].apply(lambda x: ' '.join(
                                [word.lower() for word in x.split()]))
                        valid_data['text'] = valid_data['text'].apply(
                            lambda x: ' '.join(
                                [word.lower() for word in x.split()]))

                    if self.config.remove_stopwords:
                        if self.config.lang == 'en':
                            nlp = spacy.load('en_core_web_sm')
                            spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
                        elif self.config.lang == 'es':
                            nlp = spacy.load('es_core_news_md')
                            spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS

                        train_data['text'] = train_data['text'].apply(
                            lambda x: ' '.join([
                                word for word in x.split()
                                if word not in (spacy_stopwords)
                            ]))
                        train_SEMEVAL_data['text'] = train_SEMEVAL_data[
                            'text'].apply(lambda x: ' '.join([
                                word for word in x.split()
                                if word not in (spacy_stopwords)
                            ]))
                        valid_data['text'] = valid_data['text'].apply(
                            lambda x: ' '.join([
                                word for word in x.split()
                                if word not in (spacy_stopwords)
                            ]))

                    train_data["token_size"] = train_data["text"].apply(
                        lambda x: len(x.split(' ')))
                    train_SEMEVAL_data["token_size"] = train_SEMEVAL_data[
                        "text"].apply(lambda x: len(x.split(' ')))
                    valid_data["token_size"] = valid_data["text"].apply(
                        lambda x: len(x.split(' ')))

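                    # Drop very long training examples (100 tokens or more);
                    # the validation split is left untouched.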
                    train_data = train_data.loc[
                        train_data['token_size'] < 100].copy()

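                    # Build the vocabulary from the training text only, then
                    # turn every sentence into a padded index sequence;
                    # validation tokens missing from the vocabulary are
                    # dropped.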
                    self.create_index(train_data["text"].values.tolist())
                    print("Vocab Size: '{}'".format(len(self.word2idx)))
                    train_tensor = [[
                        self.word2idx[s] for s in es.split(' ')
                    ] for es in train_data["text"].values.tolist()]
                    max_length_inp = self.max_length(train_tensor)
                    train_tensor = [
                        self.pad_sequences(x, max_length_inp)
                        for x in train_tensor
                    ]
                    emotions = list(set(train_data.emotions.unique()))

                    train_SEMEVAL_tensor = [[
                        self.word2idx[s] for s in es.split(' ')
                    ] for es in train_SEMEVAL_data["text"].values.tolist()]
                    max_length_inp = self.max_length(train_SEMEVAL_tensor)
                    train_SEMEVAL_tensor = [
                        self.pad_sequences(x, max_length_inp)
                        for x in train_SEMEVAL_tensor
                    ]

                    valid_tensor = [[
                        self.word2idx[s] for s in es.split(' ')
                        if s in self.word2idx.keys()
                    ] for es in valid_data["text"].values.tolist()]
                    max_length_inp = self.max_length(valid_tensor)
                    valid_tensor = [
                        self.pad_sequences(x, max_length_inp)
                        for x in valid_tensor
                    ]

                    # One-hot encode the integer class ids. The binarizer is
                    # fitted on the training labels and reused for the
                    # SemEval-only and validation splits so the column order
                    # stays consistent across splits.
                    mlb = preprocessing.MultiLabelBinarizer()

                    train_labels = [
                        set(emos) & set(emotions)
                        for emos in train_data[['emotions']].values
                    ]
                    bin_emotions = mlb.fit_transform(train_labels)
                    target_tensor_train = np.array(bin_emotions.tolist())

                    train_SEMEVAL_labels = [
                        set(emos) & set(emotions)
                        for emos in train_SEMEVAL_data[['emotions']].values
                    ]
                    bin_emotions = mlb.transform(train_SEMEVAL_labels)
                    target_SEMEVAL_tensor_train = np.array(
                        bin_emotions.tolist())

                    valid_labels = [
                        set(emos) & set(emotions)
                        for emos in valid_data[['emotions']].values
                    ]
                    bin_emotions = mlb.transform(valid_labels)
                    target_tensor_val = np.array(bin_emotions.tolist())

                    # Cache the preprocessed tensors and vocabulary so later
                    # runs can reload them with load_stored == 'LOAD_npy'.
                    np.save(self.config.out_dir + 'train_data.npy',
                            train_tensor,
                            allow_pickle=True)
                    np.save(self.config.out_dir + 'train_labels.npy',
                            target_tensor_train,
                            allow_pickle=True)
                    np.save(self.config.out_dir + 'SE_train_data.npy',
                            train_SEMEVAL_tensor,
                            allow_pickle=True)
                    np.save(self.config.out_dir + 'SE_train_labels.npy',
                            target_SEMEVAL_tensor_train,
                            allow_pickle=True)
                    np.save(self.config.out_dir + 'val_data.npy',
                            valid_tensor,
                            allow_pickle=True)
                    np.save(self.config.out_dir + 'val_labels.npy',
                            target_tensor_val,
                            allow_pickle=True)
                    self.convert_to_pickle(
                        self.word2idx, self.config.out_dir + 'word2idx.pkl')
                    self.convert_to_pickle(
                        self.idx2word, self.config.out_dir + 'idx2word.pkl')
                    self.convert_to_pickle(self.vocab,
                                           self.config.out_dir + 'vocab.pkl')
                    self.config.vocab_size = len(self.word2idx)
                    vocab_size = {'embedded_dim': self.config.vocab_size}
                    # Persist the vocab size so the test loader can read
                    # vocab_size.pkl.
                    self.convert_to_pickle(
                        vocab_size, self.config.out_dir + 'vocab_size.pkl')

                    train = SENTEMO_Data(train_tensor, target_tensor_train)
                    train_SE = SENTEMO_Data(train_SEMEVAL_tensor,
                                            target_SEMEVAL_tensor_train)
                    valid = SENTEMO_Data(valid_tensor, target_tensor_val)

                    self.train_loader = DataLoader(
                        train,
                        batch_size=config.batch_size,
                        shuffle=True,
                        drop_last=True)
                    self.train_SE_loader = DataLoader(
                        train_SE,
                        batch_size=config.batch_size,
                        shuffle=True,
                        drop_last=True)
                    self.valid_loader = DataLoader(valid,
                                                   batch_size=1,
                                                   shuffle=True,
                                                   drop_last=False)

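                    # Rough per-epoch iteration counts derived from the
                    # dataset sizes and batch sizes.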
                    self.train_iterations = (
                        len(train) +
                        self.config.batch_size) // self.config.batch_size
                    self.train_SE_iterations = (
                        len(train_SE) +
                        self.config.batch_size) // self.config.batch_size
                    self.valid_iterations = len(valid)

                    self.config.vocab_size = len(self.word2idx)
        elif self.config.data_type == 'IEMOCAP':
            raise NotImplementedError("This data mode is not implemented yet")
            #utterances, videoSpeakers, videoLabels, videoText, videoAudio, videoVisual, transcripts, scripts, testVid = self.load_from_pickle(directory=self.config.pickle_path, encoding=self.config.pickle_encoding)
            #Create Tokenizer
            #self.tokenizer = spacy.load('en_core_web_sm')
            #Loop through all data and do tokenization
            #self.data_seq_len = []
            #self.data_text = []
            #for vid in scripts:
            #    self.data_seq_len.append(len(utterances[vid]))
            #    self.data_text.append(transcripts[vid])
            #Create Vocab

            #Padding
        else:
            raise Exception(
                "Please specify a valid data_mode in the config JSON")