def main_hyperparameter_search():
    output_label = "median_house_value"
    data = pd.read_csv("housing.csv")
    data = data.sample(frac=1).reset_index(drop=True)
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]
    x_train = x[2000:].reset_index(drop=True)
    y_train = y[2000:].reset_index(drop=True)
    # held-out rows, disjoint from the training split
    x_test = x[:2000].reset_index(drop=True)
    y_test = y[:2000].reset_index(drop=True)

    parameters = RegressorHyperParameterSearch(x_train, y_train)
    lr, epoch = parameters

    regressor = Regressor(x_train, nb_epoch=epoch, lr=lr)
    regressor.fit(x_train, y_train)
    pred = regressor.predict(x_test)

    # Error
    error = regressor.score(x_test, y_test)
    print('--------------------------------------')
    print('Test scores: ')
    print('\nMSE: {} '.format(error[0]))
    print('\nExplained Variance: {}'.format(error[1]))
    print('\nR^2 score: {}'.format(error[2]))
    print('\nRMSE: {}'.format(error[3]))
    print('\n--------------------------------------')
def __init__(self, train=False, test=False, limit_rows=False,
             transform=None, target_transform=None, download=False):
    self.path = r"/Users/anders/Code/migration-analysis/data/processed/migrations_metadata.csv"
    data = pd.read_csv(self.path)

    # pre split limit
    if limit_rows:
        data = data.sample(limit_rows, random_state=1337)

    split_point1 = int(np.floor(len(data) * 0.9))
    data_train = data[0:split_point1]
    data_test = data[split_point1:]

    self.bow_column_name = BagOfWords(data_train.column_name)
    self.bow_column_data_type = BagOfWords(data_train.column_data_type)

    if train:
        self.x = Variable(torch.tensor(self.bow_column_name.tensors, dtype=torch.float))
        self.y = Variable(torch.tensor(self.bow_column_data_type.tensors, dtype=torch.float))
    elif test:
        self.x = Variable(torch.tensor(self.bow_column_name.tensors_for(data_test.column_name), dtype=torch.float))
        self.y = Variable(torch.tensor(self.bow_column_data_type.tensors_for(data_test.column_data_type), dtype=torch.float))
    else:
        data = []
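# The BagOfWords helper used above is not defined in this snippet. Below is a
# minimal sketch of what such a class could look like; the attribute and method
# names (.tensors, .tensors_for) are taken from the call sites above, everything
# else (tokenisation, count encoding) is an assumption for illustration.
import numpy as np
import pandas as pd

class BagOfWords:
    """Build a vocabulary from a text column and count-encode values against it."""

    def __init__(self, column: pd.Series):
        # vocabulary of whitespace-separated tokens seen in the fitting column
        tokens = sorted({tok for value in column.astype(str) for tok in value.split()})
        self.index = {tok: i for i, tok in enumerate(tokens)}
        # encoded representation of the column the vocabulary was built from
        self.tensors = self.tensors_for(column)

    def tensors_for(self, column: pd.Series) -> np.ndarray:
        """Encode a column as token-count vectors over the fitted vocabulary."""
        out = np.zeros((len(column), len(self.index)), dtype=np.float32)
        for row, value in enumerate(column.astype(str)):
            for tok in value.split():
                if tok in self.index:
                    out[row, self.index[tok]] += 1.0
        return out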
def main_hyperparameter_search():
    output_label = "median_house_value"
    data = pd.read_csv("housing.csv")
    data = data.sample(frac=1).reset_index(drop=True)
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]
    x_train = x[2000:].reset_index(drop=True)
    y_train = y[2000:].reset_index(drop=True)
    # held-out rows, disjoint from the training split
    x_test = x[:2000].reset_index(drop=True)
    y_test = y[:2000].reset_index(drop=True)

    parameters = RegressorHyperParameterSearch(x_train, y_train)
    lr, epoch = parameters

    regressor = Regressor(x_train, nb_epoch=epoch, lr=lr)
    regressor.fit(x_train, y_train)
    pred = regressor.predict(x_test)

    # Error
    error = regressor.score(x_test, y_test)
    print("\nRegressor error: {}\n".format(error))
def main():
    data = pd.read_csv('../all_data.csv')
    data_train = data.sample(frac=1)  # rows are shuffled; use `data` if the original order is wanted
    files_train = list(data_train['filename'])
    ids_train = [i for i in range(len(files_train))]
    data = None

    model = UNet()
    model = torch.nn.DataParallel(model).cuda()
    criterion = nn.MSELoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)

    train_dataset = Dataset(
        ids_train,
        files_train,
        transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5231, 0.5180, 0.5115],
                                 std=[0.2014, 0.2018, 0.2100]),
        ]),
        transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.ToTensor(),
        ]))  # normalize
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64,
                                               shuffle=True, num_workers=12)

    best_loss = 1e5
    for epoch in range(20):
        loss = train(train_loader, model, criterion, optimizer, epoch)
        print('Epoch: %d, MSE: %.8f' % (epoch + 1, loss))
        if loss < best_loss:
            torch.save(model.state_dict(), r'model_haze_all.pth')
            best_loss = loss
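# The train() helper called in the epoch loop above is not part of this snippet.
# Below is a minimal sketch of a one-epoch training pass of that shape; the
# (input, target) batch layout returned by Dataset is an assumption.
def train(loader, model, criterion, optimizer, epoch):
    # `epoch` is accepted to match the call site above but is not used here
    model.train()
    running_loss = 0.0
    for inputs, targets in loader:
        inputs, targets = inputs.cuda(), targets.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
    # mean loss over the whole epoch
    return running_loss / len(loader.dataset)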
def main_hyperparameter_search():
    # --------------------------------------------------------------------
    # LOAD THE DATA
    # define the output label
    output_label = "median_house_value"
    # get the data
    data = pd.read_csv("housing.csv")
    # Randomly shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)
    # Splitting input and output
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]
    # keep a held-out dataset for testing overfitting
    x_train = x[2000:].reset_index(drop=True)
    y_train = y[2000:].reset_index(drop=True)
    x_test = x[:2000].reset_index(drop=True)
    y_test = y[:2000].reset_index(drop=True)

    # --------------------------------------------------------------------
    # BEGIN HYPERPARAMETER SEARCH
    parameters = RegressorHyperParameterSearch(x_train, y_train)
    # return the best parameters
    best_lr, best_batch, best_epoch = parameters
    print('--------------------------------------')
    print('BEST RESULTS FROM HYPERPARAMETER SEARCH')
    print('Best Learning Rate: {}'.format(best_lr))
    print('Best Batch Size: {}'.format(best_batch))
    print('Best Epochs: {}'.format(best_epoch))

    # use the parameters to train
    regressor = Regressor(x_train, nb_epoch=best_epoch, lr=best_lr, batch_size=best_batch)

    # --------------------------------------------------------------------
    # TRAIN
    regressor.fit(x_train, y_train)

    # --------------------------------------------------------------------
    # TEST & PREDICT
    x_test.to_csv('x_test.csv')
    pred = regressor.predict(x_test)
    pred = pd.DataFrame(pred)
    pred.to_csv('prediction.csv')

    # Error
    error = regressor.score(x_test, y_test)
    print('--------------------------------------')
    print('Test scores: ')
    print('\nMSE: {} '.format(error[0]))
    print('\nExplained Variance: {}'.format(error[1]))
    print('\nR^2 score: {}'.format(error[2]))
    print('\nRMSE: {}'.format(error[3]))
    print('\n--------------------------------------')
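# RegressorHyperParameterSearch is called above but not defined in these
# snippets. A minimal grid-search sketch matching the call site in the function
# above (returning a (learning rate, batch size, epochs) triple) is given below;
# the candidate grids, the internal validation split, and the use of score()[0]
# as the MSE to minimise are assumptions for illustration.
def RegressorHyperParameterSearch(x_train, y_train):
    # hold out the last 20% of the training rows as a validation fold
    split = int(len(x_train) * 0.8)
    x_tr, y_tr = x_train[:split], y_train[:split]
    x_val, y_val = x_train[split:], y_train[split:]

    best_params, best_mse = None, float('inf')
    for lr in (1e-3, 1e-2, 1e-1):
        for batch_size in (32, 64, 128):
            for nb_epoch in (100, 500, 1000):
                regressor = Regressor(x_tr, nb_epoch=nb_epoch, lr=lr,
                                      batch_size=batch_size)
                regressor.fit(x_tr, y_tr)
                mse = regressor.score(x_val, y_val)[0]
                if mse < best_mse:
                    best_mse = mse
                    best_params = (lr, batch_size, nb_epoch)
    return best_params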
def example_main():
    output_label = "median_house_value"

    # Use pandas to read CSV data as it contains various object types
    # Feel free to use another CSV reader tool
    # But remember that LabTS tests take Pandas Dataframe as inputs
    data = pd.read_csv("housing.csv")

    # Randomly shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)

    # Splitting input and output
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]

    # Training
    # This example trains on the whole available dataset.
    # You probably want to separate some held-out data
    # to make sure the model isn't overfitting
    # Todo: Adjust with shuffling
    x_train = x[2000:]
    y_train = y[2000:]
    # held-out rows, disjoint from the training split
    x_test = x[:2000]
    y_test = y[:2000]

    regressor = Regressor(x_train, nb_epoch=100)
    regressor.fit(x_train, y_train)
    save_regressor(regressor)

    plot_validation_loss(training_loss=regressor.loss_rel[0, :],
                         validation_loss=regressor.loss_rel[1, :])

    pred = regressor.predict(x_test)
    # plot prediction for 100 samples
    plot_prediction(pred[:100], y_test[:100])

    # scaler = load(open('y_transformer.pkl', 'rb'))
    # print(scaler.inverse_transform(pred))
    # print(pred)
    # print(y_test)

    # Error
    error = regressor.score(x_test, y_test)
    print('--------------------------------------')
    print('Test scores: ')
    print('\nMSE: {} '.format(error[0]))
    print('\nExplained Variance: {}'.format(error[1]))
    print('\nR^2 score: {}'.format(error[2]))
    print('\nRMSE: {}'.format(error[3]))
    print('--------------------------------------')
def example_main():
    output_label = "median_house_value"

    # Use pandas to read CSV data as it contains various object types
    # Feel free to use another CSV reader tool
    # But remember that LabTS tests take Pandas Dataframe as inputs
    data = pd.read_csv("housing.csv")

    # Randomly shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)

    # Splitting input and output
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]

    # Training
    # This example trains on the whole available dataset.
    # You probably want to separate some held-out data
    # to make sure the model isn't overfitting
    # Todo: Adjust with shuffling
    x_train = x[2000:]
    y_train = y[2000:]
    # held-out rows, disjoint from the training split
    x_test = x[:2000]
    y_test = y[:2000]

    regressor = Regressor(x_train, nb_epoch=1000)
    regressor.fit(x_train, y_train)
    save_regressor(regressor)

    # Plot our training and validation loss
    plt.plot(np.arange(regressor.loss_rel.shape[1]), regressor.loss_rel[0, :], label='training_loss')
    plt.plot(np.arange(regressor.loss_rel.shape[1]), regressor.loss_rel[1, :], label='validation_loss')
    plt.yscale("log")
    plt.legend()
    # save before show(), otherwise an empty figure is written to disk
    plt.savefig("loss.png")
    plt.show()

    pred = regressor.predict(x_test)
    # scaler = load(open('y_transformer.pkl', 'rb'))
    # print(scaler.inverse_transform(pred))
    print(pred)
    print(y_test)

    # Error
    error = regressor.score(x_test, y_test)
    print("\nRegressor error: {}\n".format(error))
def example_main():
    # --------------------------------------------------------------------
    # LOAD THE DATA
    # define the output label
    output_label = "median_house_value"
    # get the data
    data = pd.read_csv("housing.csv")
    # Randomly shuffle the data
    data = data.sample(frac=1).reset_index(drop=True)
    # Splitting input and output
    x = data.loc[:, data.columns != output_label]
    y = data.loc[:, [output_label]]
    # keep a held-out dataset for testing overfitting
    x_train = x[2000:]
    y_train = y[2000:]
    x_test = x[:2000]
    y_test = y[:2000]

    # --------------------------------------------------------------------
    # TRAIN
    regressor = Regressor(x_train, nb_epoch=100)
    regressor.fit(x_train, y_train)
    save_regressor(regressor)

    # --------------------------------------------------------------------
    # PLOT LOSS
    plot_validation_loss(training_loss=regressor.loss_rel[0, :],
                         validation_loss=regressor.loss_rel[1, :])

    # --------------------------------------------------------------------
    pred = regressor.predict(x_test)
    scaler = load(open('y_transformer.pkl', 'rb'))

    # --------------------------------------------------------------------
    # EVALUATE
    error = regressor.score(x_test, y_test)
    print('--------------------------------------')
    print('Test scores: ')
    print('\nMSE: {} '.format(error[0]))
    print('\nExplained Variance: {}'.format(error[1]))
    print('\nR^2 score: {}'.format(error[2]))
    print('\nRMSE: {}'.format(error[3]))
    print('--------------------------------------')
def __init__(self, data=None, sample_percent=1):
    """Store the given data, optionally down-sampled to `sample_percent` of its rows."""
    super().__init__()
    assert data is not None, "No data passed"
    try:
        self.RANDOM_SEED = RANDOM_SEED
    except NameError:
        self.RANDOM_SEED = 1234
    assert 0 < sample_percent <= 1
    self._data = data
    if sample_percent < 1:
        sample_size = math.ceil(len(data) * sample_percent)
        self._data = data.sample(sample_size)
def __init__(self, train=False, test=False, limit_rows=False,
             transform=None, target_transform=None, download=False):
    data = pd.read_csv(path)  # `path` is expected to be defined elsewhere in the module

    # pre split limit
    if limit_rows:
        data = data.sample(limit_rows, random_state=1337)

    split_point1 = int(np.floor(len(data) * 0.9))
    data_train = data[0:split_point1]
    data_test = data[split_point1:]

    print(len(data_train))
    print(len(data_test))
    print(data_test)
def main():
    data = pd.read_csv('../all_data.csv')
    data_train = data.sample(frac=0.8, random_state=17)
    data_val = data.loc[~data.index.isin(data_train.index)]
    files_train = list(data_train['filename'])
    files_val = list(data_val['filename'])
    ppm_train = list(data_train['ppm'])
    ppm_val = list(data_val['ppm'])
    ids_train = [i for i in range(len(files_train))]
    ids_val = [i for i in range(len(files_val))]
    data = None
    data_train = None
    data_val = None

    # model = LeUNet()
    model = ResNetUNet()
    model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(torch.load("model_haze_all.pth"), strict=False)  # on GPU
    # model = StandardNet('resnet50').cuda()
    # model = StandardNet('vgg16').cuda()
    # model = EPAPLN().cuda()
    # model = EnsembleNet().cuda()

    criterion = nn.MSELoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # normalize
    normalize_transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5231, 0.5180, 0.5115],
                             std=[0.2014, 0.2018, 0.2100]),
    ])
    train_dataset = Dataset(ids_train, files_train, ppm_train, normalize_transform)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32,
                                               shuffle=True, num_workers=12)
    val_dataset = Dataset(ids_val, files_val, ppm_val, normalize_transform)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32,
                                             shuffle=False, num_workers=12)

    best_loss = 1e5
    for epoch in range(500):
        train_loss = train(train_loader, model, criterion, optimizer)
        val_loss = val(val_loader, model, criterion)
        print('Epoch: %d, MSE train set: %.8f' % (epoch + 1, train_loss))
        print('Epoch: %d, MSE val set: %.8f\n' % (epoch + 1, val_loss))
        if val_loss < best_loss:
            torch.save(model.state_dict(), 'resnetunet_pm_all.pth')
            best_loss = val_loss
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=str, help="input dataset")
    parser.add_argument("directory", type=str, help="directory to store data files")
    parser.add_argument("-i", "--iterations", type=int, help="iterations to do", default=1000)
    parser.add_argument("-ti", "--train_iterations", type=int, help="iterations to train NN", default=10)
    parser.add_argument("-l", "--learning_rate", type=float, help="learning rate", default=0.01)
    parser.add_argument("-s", "--sample", type=int,
                        help="number of samples to use from dataset. If not passed - whole dataset is used",
                        default=None)
    parser.add_argument("-mb", "--mini_batch", type=int, help="minibatch size, 1000 is default", default=1000)
    parser.add_argument("-tvs", "--train_validation_split", type=float,
                        help="train - validation split fraction", default=0.8)
    parser.add_argument("-ml", "--middle_layers", type=int, help="number of middle layers", default=20)
    parser.add_argument("-mln", "--middle_layer_neurons", type=int, help="middle layers neuron count", default=2)
    parser.add_argument("-ha", "--hidden_activation", help="activation to use on hidden layers", type=str)
    parser.add_argument("-oa", "--out_activation", help="activation to use on out layer", type=str)
    parser.add_argument("-ihl", "--input_has_labels",
                        help="pass this if input has class label. Needed for optimal predictor evaluation",
                        action="store_true")
    parser.add_argument("-fc", "--force_cpu", help="force cpu execution for PyTorch", action="store_true")
    args = parser.parse_args()

    if not os.path.exists(args.directory):
        os.makedirs(args.directory)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if args.force_cpu:
        device = "cpu"
    print("Running on: {0}".format(device))

    data_full = pd.read_csv(args.input, header=None)

    error_file = os.path.join(args.directory, "error.txt")
    with open(error_file, "w") as f:
        for seed in tqdm(range(args.iterations), desc="Running iterations"):
            torch.manual_seed(seed)
            np.random.seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed(seed)

            data = data_full
            if args.sample:
                data = data.sample(n=args.sample)

            n = len(data)
            train_size = n * args.train_validation_split
            train_data = data.sample(n=int(train_size))
            valid_data = data.drop(train_data.index)

            layers = [data.shape[1] - 2] + ([args.middle_layer_neurons] * args.middle_layers) + [1]
            nn = TorchFeedforwardNN(layers,
                                    hidden_activation=args.hidden_activation,
                                    out_activation=args.out_activation)
            if torch.cuda.is_available():
                nn.to(device)

            inp_train = np.matrix(train_data.iloc[:, 1:train_data.shape[1] - 1])
            outp_train = np.matrix(train_data.iloc[:, train_data.shape[1] - 1:train_data.shape[1]])
            inp_valid = np.matrix(valid_data.iloc[:, 1:valid_data.shape[1] - 1])
            outp_valid = np.matrix(valid_data.iloc[:, valid_data.shape[1] - 1:valid_data.shape[1]])

            optim_err = calc_aver_error(inp_valid, outp_valid, args.input_has_labels)
            optim_err_train = calc_aver_error(inp_train, outp_train, args.input_has_labels)

            inp_train = torch.from_numpy(inp_train)
            outp_train = torch.from_numpy(outp_train)
            inp_valid = torch.from_numpy(inp_valid)
            outp_valid = torch.from_numpy(outp_valid)
            if torch.cuda.is_available():
                inp_train = inp_train.to(device)
                outp_train = outp_train.to(device)
                inp_valid = inp_valid.to(device)
                outp_valid = outp_valid.to(device)

            for _ in tqdm(range(args.train_iterations), desc="Training NN"):
                train_loader = torch.utils.data.DataLoader(
                    torch.utils.data.TensorDataset(inp_train, outp_train),
                    batch_size=args.mini_batch, shuffle=True)
                for inp, target in tqdm(train_loader, desc="Running minibatches"):
                    nn.backpropagation_learn(inp, target, args.learning_rate,
                                             show_progress=True, stochastic=False)

            train_err = nn.evaluate(inp_train, outp_train)
            valid_err = nn.evaluate(inp_valid, outp_valid)
            f.write("{} {} {} {}\n".format(optim_err_train, optim_err, train_err, valid_err))
            f.flush()
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from PIL import Image

from train_pm import Dataset, double_conv, LeUNet, val

if __name__ == '__main__':
    data = pd.read_csv('../final_data.csv')
    data_train = data.sample(frac=0.8, random_state=17)
    data_val = data.loc[~data.index.isin(data_train.index)]
    files_val = list(data_val['filename'])
    ppm_val = list(data_val['ppm'])
    ids_val = [i for i in range(len(files_val))]

    model = LeUNet()
    model = torch.nn.DataParallel(model).cuda()
    model.load_state_dict(torch.load("model_hazy_best.pth"), strict=False)  # on GPU

    criterion = nn.MSELoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    val_dataset = Dataset(
        ids_val, files_val, ppm_val,
        transforms.Compose([
            transforms.Resize((256, 256)),
def downsample(data, num):
    # sample `num` rows without replacement
    return data.sample(num)
def upsample(data, num):
    # sample `num` rows with replacement, so rows may repeat
    return data.sample(num, replace=True)
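# A small usage sketch of the two helpers above, e.g. for balancing a binary
# label column before training; the column name 'label' and the toy frame are
# assumptions for illustration.
import pandas as pd

df = pd.DataFrame({'label': [0] * 900 + [1] * 100, 'x': range(1000)})
majority = df[df['label'] == 0]
minority = df[df['label'] == 1]

# either shrink the majority class...
balanced_down = pd.concat([downsample(majority, len(minority)), minority])
# ...or repeat rows of the minority class
balanced_up = pd.concat([majority, upsample(minority, len(majority))])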
def main(): parser = argparse.ArgumentParser() parser.add_argument("input", type=str, help="input dataset") parser.add_argument("directory", type=str, help="directory to store data files") parser.add_argument("-i", "--iterations", type=int, help="iterations to do", default=1000) parser.add_argument("-l", "--learning_rate", type=float, help="learning rate", default=0.01) parser.add_argument( "-s", "--sample", type=int, help= "number of samples to use from dataset. If not passed - whole dataset is used", default=None) parser.add_argument("-mb", "--mini_batch", type=int, help="minibatch size, 1000 is default", default=1000) parser.add_argument("-tvs", "--train_validation_split", type=float, help="train - validation split fraction", default=0.8) parser.add_argument( "-pf", "--pickle_file", type=int, help="pickle file index to dump neural network state after learning", default=None) parser.add_argument( "-uf", "--unpickle_file", type=int, help= "pickle file index to restore neural network state from at the beginning", default=None) parser.add_argument("-ml", "--middle_layers", type=int, help="number of middle layers", default=20) parser.add_argument("-mln", "--middle_layer_neurons", type=int, help="middle layers neuron count", default=2) parser.add_argument("--case", type=int, help="case of data popularity distribution", default=1) parser.add_argument("-ha", "--hidden_activation", help="activation to use on hidden layers", type=str) parser.add_argument("-oa", "--out_activation", help="activation to use on out layer", type=str) parser.add_argument( "-ihl", "--input_has_labels", help= "pass this is input has class label. Needed for optimal predictor evaluation", action="store_true") parser.add_argument("--seed", help="seed for item sampling", type=int) parser.add_argument("-fc", "--force_cpu", help="force cpu execution for PyTorch", action="store_true") # parser.add_argument("-aef", # "--alternative_error_function", # help="use alternative error function - error for Poisson distribution", # action="store_true") args = parser.parse_args() # In the next section you should define a mapping of items distribution # Case 1 if args.case == 1: generator = PoissonZipfGenerator(10_000, 20.0, 0.8, 0) dist_mapping = generator.get_distribution_map() # Case 2 elif args.case == 2: generator = PoissonZipfGenerator(5_000, 40.0, 0.8, 0) dist_mapping = generator.get_distribution_map() generator2 = PoissonShuffleZipfGenerator(5_000, 40.0, 0.8, 5_000, 10_000_000) dist_mapping2 = generator2.get_distribution_map() for k, v in dist_mapping2.items(): dist_mapping[k] = v for k, v in dist_mapping.items(): dist_mapping[k] = v / 2.0 else: raise AttributeError("Unknown case passed") # End of section if args.seed: torch.manual_seed(args.seed) np.random.seed(args.seed) if torch.cuda.is_available(): torch.cuda.manual_seed(args.seed) data = pd.read_csv(args.input, header=None) if args.sample: data = data.sample(n=args.sample) n = len(data) train_size = n * args.train_validation_split train_data = data.sample(n=int(train_size)) valid_data = data.drop(train_data.index) if not os.path.exists(args.directory): os.makedirs(args.directory) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if args.force_cpu: device = "cpu" print("Running on: {0}".format(device)) if args.unpickle_file is not None: filename = "nn_{0}.p".format(args.unpickle_file) filename = os.path.join(args.directory, filename) with open(filename, "rb") as unpickle_file: nn = pickle.load(unpickle_file) else: layers = [data.shape[1] - 2 ] + 
([args.middle_layer_neurons] * args.middle_layers) + [1] nn = TorchFeedforwardNN(layers, hidden_activation=args.hidden_activation, out_activation=args.out_activation) if torch.cuda.is_available(): nn.to(device) sample_map = {} for k, v in tqdm(dist_mapping.items(), desc="Preprocessing dataset"): sample_map[k] = data[data.ix[:, 0] == k] learning_rate = args.learning_rate prev_dist = 10**10 inp_train = np.matrix(train_data.iloc[:, 1:train_data.shape[1] - 1]) outp_train = np.matrix(train_data.iloc[:, train_data.shape[1] - 1:train_data.shape[1]]) inp_valid = np.matrix(valid_data.iloc[:, 1:valid_data.shape[1] - 1]) outp_valid = np.matrix(valid_data.iloc[:, valid_data.shape[1] - 1:valid_data.shape[1]]) if args.case == 1: optim_err = calc_aver_error(inp_valid, outp_valid, args.input_has_labels) optim_err_train = calc_aver_error(inp_train, outp_train, args.input_has_labels) elif args.case == 2: optim_err = calc_case_2_optim_err(valid_data, args.input_has_labels) optim_err_train = calc_case_2_optim_err(train_data, args.input_has_labels) else: raise AttributeError("Unknown case passed") inp_train = torch.from_numpy(inp_train) outp_train = torch.from_numpy(outp_train) inp_valid = torch.from_numpy(inp_valid) outp_valid = torch.from_numpy(outp_valid) if torch.cuda.is_available(): inp_train = inp_train.to(device) outp_train = outp_train.to(device) inp_valid = inp_valid.to(device) outp_valid = outp_valid.to(device) dist_file = os.path.join(args.directory, "distance.txt") error_file = os.path.join(args.directory, "error.txt") with open(error_file, "w") as err_f: with open(dist_file, "w") as f: # dist = 0.0 # for k, v in tqdm(dist_mapping.items(), desc="Evaluating distance"): # item = sample_map[k].sample(n=1) # pop = nn.evaluate(np.matrix(item.iloc[:, 1:item.shape[1] - 1]), # np.matrix(item.iloc[:, item.shape[1] - 1:item.shape[1]]))[0] # # dist += abs(v - pop) # # dist /= 2.0 # f.write(f"{dist}\n") # f.flush() err_f.write("{} {}\n".format(optim_err_train, optim_err)) for _ in tqdm(range(args.iterations), desc="Running iterations"): train_loader = torch.utils.data.DataLoader( torch.utils.data.TensorDataset(inp_train, outp_train), batch_size=args.mini_batch, shuffle=True) for inp, target in tqdm(train_loader, desc="Running minibatches"): nn.backpropagation_learn(inp, target, args.learning_rate, show_progress=True, stochastic=False) dist = 0.0 err = 0.0 for k, v in tqdm(dist_mapping.items(), desc="Evaluating distance"): item = sample_map[k].sample(n=1) inp = torch.from_numpy( np.matrix(item.iloc[:, 1:item.shape[1] - 1])) outp = torch.from_numpy( np.matrix(item.iloc[:, item.shape[1] - 1:item.shape[1]])) err += nn.evaluate(inp, outp) pop = float( nn( torch.Tensor( np.matrix(item.iloc[:, 1:item.shape[1] - 1])).double())) pop = np.exp(-pop) - 10**-15 dist += abs(v - pop) err /= len(dist_mapping) dist /= 2.0 prev_dist = dist f.write(f"{dist} {err}\n") f.flush() train_err = nn.evaluate(inp_train, outp_train) valid_err = nn.evaluate(inp_valid, outp_valid) err_f.write("{} {}\n".format(train_err, valid_err)) err_f.flush() if args.pickle_file is not None: filename = "nn_{0}.p".format(args.pickle_file) filename = os.path.join(args.directory, filename) with open(filename, "wb") as pickle_file: pickle.dump(nn, pickle_file) cache_file = os.path.join(args.directory, "cache_hit.txt") with open(cache_file, "w") as f: popularities = [] for k, v in tqdm(dist_mapping.items(), desc="Evaluating distance"): item = sample_map[k].sample(n=1) pop = float( nn( torch.Tensor(np.matrix(item.iloc[:, 1:item.shape[1] - 1])).double())) pop = 
np.exp(-pop) - 10**-15 # tmp = np.matrix(item.iloc[:, 1:item.shape[1] - 1]) # tmp = np.exp(-tmp) - 10 ** -15 # transform from log # pop = float(np.mean(tmp, axis=1)) # tmp = np.exp(-np.matrix(item.iloc[:, -1:])) - 10 ** -15 # transform from log # pop = float(tmp) popularities.append((k, pop)) mean_val = np.mean([x[1] for x in popularities]) median_val = np.median([x[1] for x in popularities]) print("Popularity mean: {}".format(mean_val)) print("Popularity median: {}".format(median_val)) stat_file = os.path.join(args.directory, "stat.txt") with open(stat_file, "w") as f_stat: f_stat.write("Popularity mean: {}".format(mean_val)) f_stat.write("Popularity median: {}".format(median_val)) pops_sorted = list( sorted(popularities, key=lambda x: x[1], reverse=True)) pop_order_predicted = [x[0] for x in pops_sorted] order_file = os.path.join(args.directory, "order.txt") with open(order_file, "w") as f1: for item in pops_sorted: f1.write("{0} {1} {2}\n".format(item[0], item[1], dist_mapping[item[0]])) pred_items_real_pops = [dist_mapping[i] for i in pop_order_predicted] distrib_pop_ordered = sorted(dist_mapping.values(), reverse=True) theory_hit = 0.0 practice_hit = 0.0 for distrib_pop, pred_item_pop in zip(distrib_pop_ordered, pred_items_real_pops): theory_hit += distrib_pop practice_hit += pred_item_pop f.write(f"{theory_hit} {practice_hit}\n")
# df['Count'][i] = randint(0, df['Count'].max())
# df['Lat'][i] = int(df["Lat"][i])
# df['Lng'][i] = int(df["Lng"][i])

# Saving the dataframe from above
with open('dictionary.pickle', 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('dictionary.pickle', 'rb') as handle:
    data = pickle.load(handle)

np.random.seed(1234)

# Converting the data to binary
data.loc[data['Count'] >= 1.0, 'Count'] = 1.0

# 60/20/20 train/validation/test split of the shuffled rows
train, validate, test = np.split(data.sample(frac=1, random_state=134),
                                 [int(.6 * len(df)), int(.8 * len(df))])

# We need to drop the count values for the training, validation, and test sets
x_train = train.drop(['Count'], axis=1).values
y_train = train['Count'].values
# use the validation split (not train) for the validation arrays
x_val = validate.drop(['Count'], axis=1).values
y_val = validate['Count'].values
x_test = test.drop(['Count'], axis=1).values
y_test = test['Count'].values

# Compute the mean and standard deviation of each column of the tensors,
# then standardise: (data - mean) / std
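# The standardisation step described in the comment above is not implemented in
# this snippet. A minimal sketch, assuming the training-split statistics are
# used to scale all three splits:
import torch

x_train_t = torch.tensor(x_train, dtype=torch.float32)
x_val_t = torch.tensor(x_val, dtype=torch.float32)
x_test_t = torch.tensor(x_test, dtype=torch.float32)

# column-wise statistics computed on the training split only
mean = x_train_t.mean(dim=0)
std = x_train_t.std(dim=0)
std[std == 0] = 1.0  # guard against constant columns

x_train_t = (x_train_t - mean) / std
x_val_t = (x_val_t - mean) / std
x_test_t = (x_test_t - mean) / std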
def main(): parser = argparse.ArgumentParser() parser.add_argument("input", type=str, help="input dataset") parser.add_argument("directory", type=str, help="directory to store data files") parser.add_argument("-i", "--iterations", type=int, help="iterations to do", default=1000) parser.add_argument("-l", "--learning_rate", type=float, help="learning rate", default=0.01) parser.add_argument("-s", "--sample", type=int, help="number of samples to use from dataset. If not passed - whole dataset is used", default=None) parser.add_argument("-es", "--eval_sample", type=int, help="number of samples to use from for evaluation", default=None) parser.add_argument("-mb", "--mini_batch", type=int, help="minibatch size, 1000 is default", default=1000) parser.add_argument("-mbl", "--mini_batch_log", type=int, help="after how many batches evaluate the error", default=100) parser.add_argument("-tvs", "--train_validation_split", type=float, help="train - validation split fraction", default=0.8) parser.add_argument("-pf", "--pickle_file", type=int, help="pickle file index to dump neural network state after learning", default=None) parser.add_argument("-uf", "--unpickle_file", type=int, help="pickle file index to restore neural network state from at the beginning", default=None) parser.add_argument("--seed", help="seed for item sampling", type=int) parser.add_argument("-fc", "--force_cpu", help="force cpu execution for PyTorch", action="store_true") args = parser.parse_args() if not os.path.exists(args.directory): os.makedirs(args.directory) if args.seed: torch.manual_seed(args.seed) np.random.seed(args.seed) if torch.cuda.is_available(): torch.cuda.manual_seed(args.seed) data = pd.read_csv(args.input, header=None, index_col=None, names=None) if args.sample: data = data.sample(n=args.sample) n = len(data) train_size = n * args.train_validation_split train_data = data.sample(n=int(train_size)) valid_data = data.drop(train_data.index) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if args.force_cpu: device = "cpu" print("Running on: {0}".format(device)) if args.unpickle_file is not None: filename = "dlstm_{0}.p".format(args.unpickle_file) filename = os.path.join(args.directory, filename) with open(filename, "rb") as unpickle_file: nn = pickle.load(unpickle_file) else: layers = [inputs_num, 16, 16, outputs_num] nn = LSTMSoftmax(layers) if torch.cuda.is_available(): nn.to(device) inp_train = np.matrix(train_data.iloc[:, :inputs_num]).astype(float) outp_train = np.matrix(train_data.iloc[:, inputs_num:]) inp_valid = np.matrix(valid_data.iloc[:, :inputs_num]).astype(float) outp_valid = np.matrix(valid_data.iloc[:, inputs_num:]) inp_train = torch.from_numpy(inp_train).type(torch.FloatTensor) outp_train = torch.from_numpy(outp_train).type(torch.FloatTensor) inp_valid = torch.from_numpy(inp_valid).type(torch.FloatTensor) outp_valid = torch.from_numpy(outp_valid).type(torch.FloatTensor) if torch.cuda.is_available(): inp_train = inp_train.to(device) outp_train = outp_train.to(device) inp_valid = inp_valid.to(device) outp_valid = outp_valid.to(device) log_counter = args.mini_batch_log error_file = os.path.join(args.directory, "error.txt") with open(error_file, "w") as f: for _ in tqdm(range(args.iterations), desc="Running iterations"): train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(inp_train, outp_train), batch_size=args.mini_batch, shuffle=True) for inp, target in tqdm(train_loader, desc="Running minibatches"): nn.backpropagation_learn(inp, target, args.learning_rate, 
show_progress=True, stochastic=False) log_counter -= 1 if log_counter == 0: log_counter = args.mini_batch_log if args.eval_sample is None: train_err = nn.evaluate(inp_train, outp_train) valid_err = nn.evaluate(inp_valid, outp_valid) else: train_tmp = train_data.sample(n=args.eval_sample) valid_tmp = valid_data.sample(n=args.eval_sample) inp_train_tmp = np.matrix(train_tmp.iloc[:, :inputs_num]).astype(float) outp_train_tmp = np.matrix(train_tmp.iloc[:, inputs_num:]) inp_valid_tmp = np.matrix(valid_tmp.iloc[:, :inputs_num]).astype(float) outp_valid_tmp = np.matrix(valid_tmp.iloc[:, inputs_num:]) inp_train_tmp = torch.from_numpy(inp_train_tmp).type(torch.FloatTensor) outp_train_tmp = torch.from_numpy(outp_train_tmp).type(torch.FloatTensor) inp_valid_tmp = torch.from_numpy(inp_valid_tmp).type(torch.FloatTensor) outp_valid_tmp = torch.from_numpy(outp_valid_tmp).type(torch.FloatTensor) train_err = nn.evaluate(inp_train_tmp, outp_train_tmp) valid_err = nn.evaluate(inp_valid_tmp, outp_valid_tmp) f.write("{} {}\n".format(train_err, valid_err)) f.flush() if args.pickle_file is not None: filename = "dlstm_{0}.p".format(args.pickle_file) filename = os.path.join(args.directory, filename) with open(filename, "wb") as pickle_file: pickle.dump(nn, pickle_file)
def __init__(self, config): """ :param config: """ self.config = config if config.data_type == "SENTEMO": #Init self.word2idx = {} self.idx2word = {} self.vocab = set() #Read Data if self.config.mode == 'test': self.word2idx = pickle.load( open(self.config.out_dir + 'word2idx.pkl', "rb")) self.idx2word = pickle.load( open(self.config.out_dir + 'idx2word.pkl', "rb")) self.vocab = pickle.load( open(self.config.out_dir + 'vocab.pkl', "rb")) vocab_size = pickle.load( open(self.config.out_dir + 'vocab_size.pkl', "rb")) self.config.vocab_size = vocab_size['embedded_dim'] test_data = np.load(self.config.out_dir + 'test_data.npy') test_labels = np.load(self.config.out_dir + 'test_labels.npy') test = SENTEMO_Data(test_data, test_labels) self.test_loader = DataLoader(test, batch_size=config.batch_size, shuffle=True, drop_last=True) self.test_iterations = (len(test) + self.config.batch_size ) // self.config.batch_size else: data = self.load_from_pickle( directory=self.config.SENT_EMO_Path) data["token_size"] = data["text"].apply( lambda x: len(x.split(' '))) data = data.loc[data['token_size'] < 70].copy() # sampling data = data.sample(n=50000) # construct vocab and indexing self.create_index(data["text"].values.tolist()) # vectorize to tensor input_tensor = [[self.word2idx[s] for s in es.split(' ')] for es in data["text"].values.tolist()] max_length_inp = self.max_length(input_tensor) # inplace padding input_tensor = [ self.pad_sequences(x, max_length_inp) for x in input_tensor ] ### convert targets to one-hot encoding vectors emotions = list(set(data.emotions.unique())) # binarizer mlb = preprocessing.MultiLabelBinarizer() data_labels = [ set(emos) & set(emotions) for emos in data[['emotions']].values ] bin_emotions = mlb.fit_transform(data_labels) target_tensor = np.array(bin_emotions.tolist()) # Creating training and validation sets using an 80-20 split input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split( input_tensor, target_tensor, test_size=0.2) # Split the validataion further to obtain a holdout dataset (for testing) -- split 50:50 input_tensor_val, input_tensor_test, target_tensor_val, target_tensor_test = train_test_split( input_tensor_val, target_tensor_val, test_size=0.5) #for Infernce self.test_data = input_tensor_test self.test_labels = target_tensor_test #Init Transforms self.input_transform = standard_transforms.Compose([ standard_transforms.ToTensor(), ]) self.target_transform = standard_transforms.Compose([ standard_transforms.ToTensor(), ]) #Creeate Datasets train = SENTEMO_Data( input_tensor_train, target_tensor_train ) #, input_transform=self.input_transform, target_transform=self.target_transform) valid = SENTEMO_Data( input_tensor_val, target_tensor_val ) #, input_transform=self.input_transform, target_transform=self.target_transform) test = SENTEMO_Data( input_tensor_test, target_tensor_test ) #, input_transform=self.input_transform, target_transform=self.target_transform) self.train_loader = DataLoader( train, batch_size=config.batch_size, shuffle=True, drop_last=True, ) self.valid_loader = DataLoader( valid, batch_size=config.batch_size, shuffle=True, drop_last=True, ) self.test_loader = DataLoader( test, batch_size=config.batch_size, shuffle=True, drop_last=True, ) self.train_iterations = (len(train) + self.config.batch_size ) // self.config.batch_size self.valid_iterations = (len(valid) + self.config.batch_size ) // self.config.batch_size self.test_iterations = (len(test) + self.config.batch_size ) // self.config.batch_size 
self.config.vocab_size = len(self.word2idx) elif config.data_type == "SEM_EVAL_OC" or config.data_type == "SEM_EVAL_OC_Translated" or config.data_type == "SEM_EVAL_OC_Translated_TestOnly": #Init self.word2idx = {} self.idx2word = {} self.vocab = set() if self.config.mode == 'test' and not config.data_type == "SEM_EVAL_OC_Translated": self.word2idx = pickle.load( open(self.config.out_dir + 'word2idx.pkl', "rb")) self.idx2word = pickle.load( open(self.config.out_dir + 'idx2word.pkl', "rb")) self.vocab = pickle.load( open(self.config.out_dir + 'vocab.pkl', "rb")) vocab_size = pickle.load( open(self.config.out_dir + 'vocab_size.pkl', "rb")) self.config.vocab_size = vocab_size['embedded_dim'] test_data = np.load(self.config.out_dir + 'test_data.npy') test_labels = np.load(self.config.out_dir + 'test_labels.npy') test = SENTEMO_Data(test_data, test_labels) self.test_loader = DataLoader(test, batch_size=config.batch_size, shuffle=True, drop_last=True) self.test_iterations = (len(test) + self.config.batch_size ) // self.config.batch_size elif self.config.mode == 'test' and config.data_type == "SEM_EVAL_OC_Translated_TestOnly": self.word2idx = pickle.load( open(self.config.out_dir + 'word2idx.pkl', "rb")) self.idx2word = pickle.load( open(self.config.out_dir + 'idx2word.pkl', "rb")) self.vocab = pickle.load( open(self.config.out_dir + 'vocab.pkl', "rb")) vocab_size = pickle.load( open(self.config.out_dir + 'vocab_size.pkl', "rb")) self.config.vocab_size = vocab_size['embedded_dim'] test_data = np.load(self.config.out_dir + 'test_data_es.npy') test_labels = np.load(self.config.out_dir + 'test_labels_es.npy') test = SENTEMO_Data(test_data, test_labels) self.test_loader = DataLoader(test, batch_size=config.batch_size, shuffle=True, drop_last=True) self.test_iterations = (len(test) + self.config.batch_size ) // self.config.batch_size elif self.config.mode == 'test' and config.data_type == "SEM_EVAL_OC_Translated": data = pd.read_csv(self.config.translated_data) if self.config.remove_emoji == 'remove': data['text'] = data['text'].apply( lambda x: emoji_pattern.sub(r'', x)) elif self.config.remove_emoji == 'replace': data['text'] = data['text'].apply( lambda x: emoji.demojize(x)) if self.config.spacy_token_preprocess == True: if self.config.lang == 'en': nlp = spacy.load('en_core_web_sm') elif self.config.lang == 'es': nlp = spacy.load('es_core_news_md') tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab) data['text'] = data['text'].apply(lambda x: ' '.join( [token.text_with_ws for token in nlp(x)])) if self.config.remove_capital == True: data['text'] = data['text'].apply(lambda x: ' '.join( [word.lower() for word in x.split()])) if self.config.remove_stopwords == True: if self.config.lang == 'en': nlp = spacy.load('en_core_web_sm') spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS elif self.config.lang == 'es': nlp = spacy.load('es_core_news_md') spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS data['text'] = data['text'].apply(lambda x: ' '.join([ word for word in x.split() if word not in (spacy_stopwords) ])) data["token_size"] = data["text"].apply( lambda x: len(x.split(' '))) data = data.loc[data['token_size'] < 80].copy() self.word2idx = pickle.load( open(self.config.out_dir + 'word2idx.pkl', "rb")) self.idx2word = pickle.load( open(self.config.out_dir + 'idx2word.pkl', "rb")) self.vocab = pickle.load( open(self.config.out_dir + 'vocab.pkl', "rb")) vocab_size = pickle.load( open(self.config.out_dir + 'vocab_size.pkl', "rb")) self.config.vocab_size = vocab_size['embedded_dim'] 
#self.create_index(data["text"].values.tolist()) input_tensor = [[ self.word2idx[s] for s in es.split(' ') if s in self.word2idx.keys() ] for es in data["text"].values.tolist()] max_length_inp = self.max_length(input_tensor) input_tensor = [ self.pad_sequences(x, max_length_inp) for x in input_tensor ] emotions = list(set(data.emotions.unique())) # binarizer mlb = preprocessing.MultiLabelBinarizer() data_labels = [ set(emos) & set(emotions) for emos in data[['emotions']].values ] bin_emotions = mlb.fit_transform(data_labels) target_tensor = np.array(bin_emotions.tolist()) test = SENTEMO_Data(input_tensor, target_tensor) self.test_loader = DataLoader( test, batch_size=config.batch_size, shuffle=True, drop_last=True, ) self.test_iterations = (len(test) + self.config.batch_size ) // self.config.batch_size else: if self.config.load_stored == 'LOAD_npy': train_tensor = np.load(self.config.out_dir + 'train_data.npy', allow_pickle=True) target_tensor_train = np.load(self.config.out_dir + 'train_labels.npy', allow_pickle=True) train_SEMEVAL_tensor = np.load(self.config.out_dir + 'SE_train_data.npy', allow_pickle=True) target_SEMEVAL_tensor_train = np.load( self.config.out_dir + 'SE_train_labels.npy', allow_pickle=True) valid_tensor = np.load(self.config.out_dir + 'val_data.npy', allow_pickle=True) target_tensor_val = np.load(self.config.out_dir + 'val_labels.npy', allow_pickle=True) my_list = ['anger', 'joy', 'fear', 'sadness'] SENTEMO_DataFrame = self.load_from_pickle( directory=self.config.SENT_EMO_Path) SENTEMO_DataFrame['emotions'] = SENTEMO_DataFrame[ 'emotions'].apply(lambda x: x if x in my_list else np.NaN) SENTEMO_DataFrame = SENTEMO_DataFrame.dropna() SENTEMO_DataFrame = pd.DataFrame({ "emotions": SENTEMO_DataFrame["emotions"], "text": SENTEMO_DataFrame["text"] }) SENTEMO_DataFrame['emotions'] = SENTEMO_DataFrame[ 'emotions'].apply(lambda x: my_list.index(x)) self.word2idx = pickle.load( open(self.config.out_dir + 'word2idx.pkl', "rb")) self.idx2word = pickle.load( open(self.config.out_dir + 'idx2word.pkl', "rb")) self.vocab = pickle.load( open(self.config.out_dir + 'vocab.pkl', "rb")) vocab_size = len(self.word2idx) self.config.vocab_size = vocab_size train = SENTEMO_Data(train_tensor, target_tensor_train) train_SE = SENTEMO_Data(train_SEMEVAL_tensor, target_SEMEVAL_tensor_train) valid = SENTEMO_Data(valid_tensor, target_tensor_val) self.train_loader = DataLoader( train, batch_size=config.batch_size * 128, shuffle=True, drop_last=True) self.train_SE_loader = DataLoader( train_SE, batch_size=config.batch_size, shuffle=True, drop_last=True) self.valid_loader = DataLoader(valid, batch_size=1, shuffle=True, drop_last=False) self.train_iterations = ( len(train) + (self.config.batch_size * 128)) // ( self.config.batch_size * 128) self.train_SE_iterations = ( len(train_SE) + self.config.batch_size) // self.config.batch_size self.valid_iterations = len(valid) else: anger0_x, anger0_y = self.parse_oc( self.config.Train_OC_Anger) fear0_x, fear0_y = self.parse_oc(self.config.Train_OC_Fear) joy0_x, joy0_y = self.parse_oc(self.config.Train_OC_Joy) sadness0_x, sadness0_y = self.parse_oc( self.config.Train_OC_Sadness) anger1_x, anger1_y = self.parse_oc( self.config.Valid_OC_Anger) fear1_x, fear1_y = self.parse_oc(self.config.Valid_OC_Fear) joy1_x, joy1_y = self.parse_oc(self.config.Valid_OC_Joy) sadness1_x, sadness1_y = self.parse_oc( self.config.Valid_OC_Sadness) if self.config.add_extra_data == 'SENTEMO': my_list = ['anger', 'joy', 'fear', 'sadness'] SENTEMO_DataFrame = self.load_from_pickle( 
directory=self.config.SENT_EMO_Path) SENTEMO_DataFrame['emotions'] = SENTEMO_DataFrame[ 'emotions'].apply(lambda x: x if x in my_list else np.NaN) SENTEMO_DataFrame = SENTEMO_DataFrame.dropna() SENTEMO_DataFrame = pd.DataFrame({ "emotions": SENTEMO_DataFrame["emotions"], "text": SENTEMO_DataFrame["text"] }) SENTEMO_DataFrame['emotions'] = SENTEMO_DataFrame[ 'emotions'].apply(lambda x: my_list.index(x)) #Preparing dataframes pd_anger = pd.DataFrame({"emotions": anger0_y}) pd_anger["text"] = anger0_x pd_joy = pd.DataFrame({"emotions": joy0_y}) pd_joy["text"] = joy0_x pd_fear = pd.DataFrame({"emotions": fear0_y}) pd_fear["text"] = fear0_x pd_sad = pd.DataFrame({"emotions": sadness0_y}) pd_sad["text"] = sadness0_x pd_anger["emotions"] = pd_anger["emotions"].apply( lambda x: x[1]) pd_anger["emotions"] = pd_anger["emotions"][ pd_anger["emotions"] > self.config.emo_threshold] pd_anger = pd_anger.dropna() pd_anger["emotions"] = pd_anger["emotions"].apply( lambda x: 0) pd_joy["emotions"] = pd_joy["emotions"].apply( lambda x: x[1]) pd_joy["emotions"] = pd_joy["emotions"][ pd_joy["emotions"] > self.config.emo_threshold] pd_joy = pd_joy.dropna() pd_joy["emotions"] = pd_joy["emotions"].apply(lambda x: 1) pd_fear["emotions"] = pd_fear["emotions"].apply( lambda x: x[1]) pd_fear["emotions"] = pd_fear["emotions"][ pd_fear["emotions"] > self.config.emo_threshold] pd_fear = pd_fear.dropna() pd_fear["emotions"] = pd_fear["emotions"].apply( lambda x: 2) pd_sad["emotions"] = pd_sad["emotions"].apply( lambda x: x[1]) pd_sad["emotions"] = pd_sad["emotions"][ pd_sad["emotions"] > self.config.emo_threshold] pd_sad = pd_sad.dropna() pd_sad["emotions"] = pd_sad["emotions"].apply(lambda x: 3) train_data = pd.concat( [pd_anger, pd_joy, pd_fear, pd_sad, SENTEMO_DataFrame], ignore_index=True) train_SEMEVAL_data = pd.concat( [pd_anger, pd_joy, pd_fear, pd_sad], ignore_index=True) pd_anger = pd.DataFrame({"emotions": anger1_y}) pd_anger["text"] = anger1_x pd_joy = pd.DataFrame({"emotions": joy1_y}) pd_joy["text"] = joy1_x pd_fear = pd.DataFrame({"emotions": fear1_y}) pd_fear["text"] = fear1_x pd_sad = pd.DataFrame({"emotions": sadness1_y}) pd_sad["text"] = sadness1_x pd_anger["emotions"] = pd_anger["emotions"].apply( lambda x: x[1]) pd_anger["emotions"] = pd_anger["emotions"][ pd_anger["emotions"] > self.config.emo_threshold] pd_anger = pd_anger.dropna() pd_anger["emotions"] = pd_anger["emotions"].apply( lambda x: 0) pd_joy["emotions"] = pd_joy["emotions"].apply( lambda x: x[1]) pd_joy["emotions"] = pd_joy["emotions"][ pd_joy["emotions"] > self.config.emo_threshold] pd_joy = pd_joy.dropna() pd_joy["emotions"] = pd_joy["emotions"].apply(lambda x: 1) pd_fear["emotions"] = pd_fear["emotions"].apply( lambda x: x[1]) pd_fear["emotions"] = pd_fear["emotions"][ pd_fear["emotions"] > self.config.emo_threshold] pd_fear = pd_fear.dropna() pd_fear["emotions"] = pd_fear["emotions"].apply( lambda x: 2) pd_sad["emotions"] = pd_sad["emotions"].apply( lambda x: x[1]) pd_sad["emotions"] = pd_sad["emotions"][ pd_sad["emotions"] > self.config.emo_threshold] pd_sad = pd_sad.dropna() pd_sad["emotions"] = pd_sad["emotions"].apply(lambda x: 3) valid_data = pd.concat([pd_anger, pd_joy, pd_fear, pd_sad], ignore_index=True) if self.config.TRAINING_DATA == 'STRONG': train_data = train_SEMEVAL_data.sample( frac=1).reset_index(drop=True) else: train_data = train_data.sample(frac=1).reset_index( drop=True) train_SEMEVAL_data = train_SEMEVAL_data.sample( frac=1).reset_index(drop=True) valid_data = valid_data.sample(frac=1).reset_index( drop=True) if 
self.config.remove_emoji == 'remove': train_data['text'] = train_data['text'].apply( lambda x: emoji_pattern.sub(r'', x)) train_SEMEVAL_data['text'] = train_SEMEVAL_data[ 'text'].apply(lambda x: emoji_pattern.sub(r'', x)) valid_data['text'] = valid_data['text'].apply( lambda x: emoji_pattern.sub(r'', x)) elif self.config.remove_emoji == 'replace': train_data['text'] = train_data['text'].apply( lambda x: emoji.demojize(x)) train_SEMEVAL_data['text'] = train_SEMEVAL_data[ 'text'].apply(lambda x: emoji.demojize(x)) valid_data['text'] = valid_data['text'].apply( lambda x: emoji.demojize(x)) if self.config.spacy_token_preprocess == True: if self.config.lang == 'en': nlp = spacy.load('en_core_web_sm') elif self.config.lang == 'es': nlp = spacy.load('es_core_news_md') tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab) train_data['text'] = train_data['text'].apply( lambda x: ' '.join( [token.text_with_ws for token in nlp(x)])) train_SEMEVAL_data['text'] = train_SEMEVAL_data[ 'text'].apply(lambda x: ' '.join( [token.text_with_ws for token in nlp(x)])) valid_data['text'] = valid_data['text'].apply( lambda x: ' '.join( [token.text_with_ws for token in nlp(x)])) if self.config.remove_capital == True: train_data['text'] = train_data['text'].apply( lambda x: ' '.join( [word.lower() for word in x.split()])) train_SEMEVAL_data['text'] = train_SEMEVAL_data[ 'text'].apply(lambda x: ' '.join( [word.lower() for word in x.split()])) valid_data['text'] = valid_data['text'].apply( lambda x: ' '.join( [word.lower() for word in x.split()])) if self.config.remove_stopwords == True: if self.config.lang == 'en': nlp = spacy.load('en_core_web_sm') spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS elif self.config.lang == 'es': nlp = spacy.load('es_core_news_md') spacy_stopwords = spacy.lang.es.stop_words.STOP_WORDS train_data['text'] = train_data['text'].apply( lambda x: ' '.join([ word for word in x.split() if word not in (spacy_stopwords) ])) train_SEMEVAL_data['text'] = train_SEMEVAL_data[ 'text'].apply(lambda x: ' '.join([ word for word in x.split() if word not in (spacy_stopwords) ])) valid_data['text'] = valid_data['text'].apply( lambda x: ' '.join([ word for word in x.split() if word not in (spacy_stopwords) ])) train_data["token_size"] = train_data["text"].apply( lambda x: len(x.split(' '))) train_SEMEVAL_data["token_size"] = train_SEMEVAL_data[ "text"].apply(lambda x: len(x.split(' '))) valid_data["token_size"] = valid_data["text"].apply( lambda x: len(x.split(' '))) train_data = train_data.loc[ train_data['token_size'] < 100].copy() self.create_index(train_data["text"].values.tolist()) print("Vocab Size: '{}'".format(len(self.word2idx))) train_tensor = [[ self.word2idx[s] for s in es.split(' ') ] for es in train_data["text"].values.tolist()] max_length_inp = self.max_length(train_tensor) train_tensor = [ self.pad_sequences(x, max_length_inp) for x in train_tensor ] emotions = list(set(train_data.emotions.unique())) train_SEMEVAL_tensor = [[ self.word2idx[s] for s in es.split(' ') ] for es in train_SEMEVAL_data["text"].values.tolist()] max_length_inp = self.max_length(train_SEMEVAL_tensor) train_SEMEVAL_tensor = [ self.pad_sequences(x, max_length_inp) for x in train_SEMEVAL_tensor ] valid_tensor = [[ self.word2idx[s] for s in es.split(' ') if s in self.word2idx.keys() ] for es in valid_data["text"].values.tolist()] max_length_inp = self.max_length(valid_tensor) valid_tensor = [ self.pad_sequences(x, max_length_inp) for x in valid_tensor ] # binarizer mlb = preprocessing.MultiLabelBinarizer() train_labels = 
[ set(emos) & set(emotions) for emos in train_data[['emotions']].values ] bin_emotions = mlb.fit_transform(train_labels) target_tensor_train = np.array(bin_emotions.tolist()) train_SEMEVAL_labels = [ set(emos) & set(emotions) for emos in train_SEMEVAL_data[['emotions']].values ] bin_emotions = mlb.fit_transform(train_SEMEVAL_labels) target_SEMEVAL_tensor_train = np.array( bin_emotions.tolist()) valid_labels = [ set(emos) & set(emotions) for emos in valid_data[['emotions']].values ] bin_emotions = mlb.fit_transform(valid_labels) target_tensor_val = np.array(bin_emotions.tolist()) #Saving for reading later np.save(self.config.out_dir + 'train_data.npy', train_tensor, allow_pickle=True) np.save(self.config.out_dir + 'train_labels.npy', target_tensor_train, allow_pickle=True) np.save(self.config.out_dir + 'SE_train_data.npy', train_SEMEVAL_tensor, allow_pickle=True) np.save(self.config.out_dir + 'SE_train_labels.npy', target_SEMEVAL_tensor_train, allow_pickle=True) np.save(self.config.out_dir + 'val_data.npy', valid_tensor, allow_pickle=True) np.save(self.config.out_dir + 'val_labels.npy', target_tensor_val, allow_pickle=True) self.convert_to_pickle( self.word2idx, self.config.out_dir + 'word2idx.pkl') self.convert_to_pickle( self.idx2word, self.config.out_dir + 'idx2word.pkl') self.convert_to_pickle(self.vocab, self.config.out_dir + 'vocab.pkl') self.config.vocab_size = len(self.word2idx) vocab_size = {'embedded_dim': self.config.vocab_size} train = SENTEMO_Data(train_tensor, target_tensor_train) train_SE = SENTEMO_Data(train_SEMEVAL_tensor, target_SEMEVAL_tensor_train) valid = SENTEMO_Data(valid_tensor, target_tensor_val) self.train_loader = DataLoader( train, batch_size=config.batch_size, shuffle=True, drop_last=True) self.train_SE_loader = DataLoader( train_SE, batch_size=config.batch_size, shuffle=True, drop_last=True) self.valid_loader = DataLoader(valid, batch_size=1, shuffle=True, drop_last=False) self.train_iterations = ( len(train) + self.config.batch_size) // self.config.batch_size self.train_SE_iterations = ( len(train_SE) + self.config.batch_size) // self.config.batch_size self.valid_iterations = len(valid) self.config.vocab_size = len(self.word2idx) elif self.config.data_type == 'IEMOCAP': raise NotImplementedError("This mode is not implemented YET") #utterances, videoSpeakers, videoLabels, videoText, videoAudio, videoVisual, transcripts, scripts, testVid = self.load_from_pickle(directory=self.config.pickle_path, encoding=self.config.pickle_encoding) #Create Tokenizer #self.tokenizer = spacy.load('en_core_web_sm') #Loop through all data and do tokenization #self.data_seq_len = [] #self.data_text = [] #for vid in scripts: # self.data_seq_len.append(len(utterances[vid])) # self.data_text.append(transcripts[vid]) #Create Vocab #Padding else: raise Exception( "Please specify in the json a specified mode in data_mode")