import csv
import os
import pickle as pkl
from collections import Counter

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

# Project-local helpers used below (Data, Regression, get_ranges, eval_on_test,
# significance_testing, plotting). The module paths are assumptions; adjust
# them to the repository layout, e.g.:
# from dataset import Data
# from model import Regression
# from utils import get_ranges, eval_on_test, significance_testing, plotting


def load(FLAGS):
    """ Load all data and store it in either a list (old) or in a dataset
    class (new).
    """
    questions = []
    queries = []
    answers = []
    impression_lvls = []
    engagement_lvls = []
    click_probs = []

    np.random.seed(42)

    filename_dataset = (
        f"Data/dataset_filename={FLAGS.filename}_expanded={FLAGS.expanded}"
        f"_balance={FLAGS.balance}_impression={FLAGS.impression}"
        f"_reduced_classes={FLAGS.reduced_classes}_embedder={FLAGS.embedder}"
        f"_negative_samples={FLAGS.negative_samples}.p"
    )

    # Check that the data folder and file exist
    if not os.path.exists(FLAGS.folder):
        raise OSError(f"Folder {FLAGS.folder} does not exist")
    if not os.path.exists(FLAGS.folder + FLAGS.filename):
        raise OSError(f"File {FLAGS.folder + FLAGS.filename} does not exist")

    with open(FLAGS.folder + FLAGS.filename) as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")

        # Skip the first line (column labels)
        next(tsvreader, None)

        for line in tsvreader:
            # Skip the instances that have a low impression level
            if FLAGS.impression and line[7] == "low":
                continue

            # Add values to the data lists
            queries.append(line[0])
            questions.append(line[1])
            answers.append([line[i] for i in range(2, 7)])
            impression_lvls.append(line[7])
            if FLAGS.reduced_classes:
                engagement_lvls.append(0 if int(line[8]) == 0 else 1)
            else:
                engagement_lvls.append(int(line[8]))
            click_probs.append([float(line[i]) for i in range(9, 14)])

    # Attempt to fix the class imbalance, assuming class 0 is too large
    if FLAGS.balance:
        # Index the locations of zeros and non-zeros
        engagement_lvls = np.array(engagement_lvls)
        zero_indices = np.where(engagement_lvls == 0)[0]
        non_zero_indices = np.where(engagement_lvls != 0)[0]

        # Get the median size of the engagement-level classes
        if FLAGS.reduced_classes:
            median_size = int(Counter(engagement_lvls)[1])
        else:
            median_size = int(
                np.median(list(Counter(engagement_lvls).values())))

        # Subsample the zero class down to the median class size
        sampled_indices = np.random.choice(zero_indices, median_size,
                                           replace=False)
        indices = np.concatenate((sampled_indices, non_zero_indices))

        # Update the data lists based on the sampled indices
        queries = [queries[i] for i in indices]
        questions = [questions[i] for i in indices]
        answers = [answers[i] for i in indices]
        impression_lvls = [impression_lvls[i] for i in indices]
        engagement_lvls = [engagement_lvls[i] for i in indices]
        click_probs = [click_probs[i] for i in indices]

    if FLAGS.expanded and FLAGS.negative_samples:
        # Get values for sampling
        n_questions = len(questions)
        ranges = get_ranges(queries)

        sampled_question_indices = []
        for r in ranges:
            # Draw negative samples from outside each query's own range
            samples = np.random.choice(
                [i for i in range(n_questions) if i not in r],
                FLAGS.sample_size, replace=False)
            sampled_question_indices.append(samples)

            # Set the engagement level to 2 for the question with maximum
            # engagement in the range, and to 1 for the others
            max_engagement = np.max([engagement_lvls[i] for i in r])
            for i in r:
                if engagement_lvls[i] == max_engagement:
                    engagement_lvls[i] = 2
                else:
                    engagement_lvls[i] = 1

    # Set the language model
    if FLAGS.embedder == "Bert":
        # Flatten the answers to load them into the embedder
        answers = [i for sublist in answers for i in sublist]

        embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
        question_embeds = embedder.encode(questions, convert_to_tensor=False,
                                          show_progress_bar=True,
                                          batch_size=128, num_workers=4)
        query_embeds = embedder.encode(queries, convert_to_tensor=False,
                                       show_progress_bar=True,
                                       batch_size=128, num_workers=4)
        answer_embeds = embedder.encode(answers, convert_to_tensor=False,
                                        show_progress_bar=True,
                                        batch_size=128, num_workers=4)

        query_embeds = torch.from_numpy(query_embeds)
        question_embeds = torch.from_numpy(question_embeds)
        answer_embeds = torch.from_numpy(answer_embeds)

        print(query_embeds.shape)
        print(question_embeds.shape)
        print(answer_embeds.shape)

        # Regroup the flattened answers per question (5 answers each)
        answers = list(zip(*[iter(answers)] * 5))

        if FLAGS.expanded and FLAGS.negative_samples:
            # Convert to lists so the embeddings can be extended
            answer_embeds = list(
                answer_embeds.reshape(query_embeds.shape[0], -1))
            question_embeds = list(question_embeds)
            query_embeds = list(query_embeds)

            # Extend the data with the negative samples
            for r, samples in zip(ranges, sampled_question_indices):
                queries.extend([queries[r[0]]] * len(samples))
                questions.extend([questions[i] for i in samples])
                answers.extend([answers[i] for i in samples])
                impression_lvls.extend([impression_lvls[i] for i in samples])
                engagement_lvls.extend([0] * len(samples))
                click_probs.extend([click_probs[i] for i in samples])
                query_embeds.extend([query_embeds[r[0]]] * len(samples))
                question_embeds.extend([question_embeds[i] for i in samples])
                answer_embeds.extend([answer_embeds[i] for i in samples])

            # Turn the embeddings back into torch tensors
            query_embeds = torch.stack(query_embeds)
            question_embeds = torch.stack(question_embeds)
            answer_embeds = torch.stack(answer_embeds)

            print(query_embeds.shape)
            print(question_embeds.shape)
            print(answer_embeds.shape)

    elif FLAGS.embedder == "TFIDF":
        # Initialize the vectorizer
        if FLAGS.expanded:
            with open(f"{FLAGS.folder}TFIDF_vocab.p", "rb") as f:
                vocab = pkl.load(f)
            vectorizer = TfidfVectorizer(vocabulary=vocab)
        else:
            vectorizer = TfidfVectorizer()

        if FLAGS.expanded and FLAGS.negative_samples:
            # Extend the data with the negative samples
            for r, samples in zip(ranges, sampled_question_indices):
                queries.extend([queries[r[0]]] * len(samples))
                questions.extend([questions[i] for i in samples])
                answers.extend([answers[i] for i in samples])
                impression_lvls.extend([impression_lvls[i] for i in samples])
                engagement_lvls.extend([0] * len(samples))
                click_probs.extend([click_probs[i] for i in samples])

        # Create the corpus: a list of strings, one per data instance
        corpus = [
            " ".join([queries[i], questions[i], " ".join(answers[i])])
            for i in range(len(queries))
        ]

        # This yields a sparse scipy matrix
        X = vectorizer.fit_transform(corpus)

        if not FLAGS.expanded:
            with open(f"{FLAGS.folder}TFIDF_vocab.p", "wb") as f:
                pkl.dump(vectorizer.vocabulary_, f)

        # Convert the sparse scipy matrix to a sparse torch tensor; snippet from
        # https://ray075hl.github.io/ray075hl.github.io/sparse_matrix_pytorch/
        X = X.tocoo().astype(np.float32)
        indices = torch.from_numpy(np.vstack((X.row, X.col))).long()
        values = torch.from_numpy(X.data)
        shape = torch.Size(X.shape)
        X = torch.sparse_coo_tensor(indices, values, shape)
        print(f"shape of X: {X.shape}")
    else:
        print(f"Embedder {FLAGS.embedder} does not exist")
        return

    # Either return the dataset with all attributes plus the predicted
    # engagement levels, or return the dataset for regression with only the
    # questions, queries, and answers
    if FLAGS.expanded:
        # Load the regression net and perform a forward pass on the data,
        # yielding the predicted engagement levels
        if FLAGS.embedder == "Bert":
            answer_embeds = answer_embeds.reshape(query_embeds.shape[0], -1)
            input_matrix = torch.cat(
                (query_embeds, question_embeds, answer_embeds), dim=1)

            nn = Regression(n_inputs=input_matrix.shape[1],
                            n_hidden=[300, 32],
                            dropout_percentages=[0.0, 0.0],
                            n_classes=1,
                            batchnorm=True)
            nn.load_state_dict(torch.load("Models/Best_regression_model.pt"))
            nn.eval()
            with torch.no_grad():
                preds = nn(input_matrix).squeeze()
        elif FLAGS.embedder == "TFIDF":
            nn = Regression(n_inputs=X.shape[1],
                            n_hidden=[300, 32],
                            dropout_percentages=[0.0, 0.0],
                            n_classes=1,
                            batchnorm=True)
            # TODO: this loads a Bert checkpoint; replace with the correct
            # TFIDF model
            nn.load_state_dict(torch.load(
                "Models/Regression_Bert_SGD_0.0001_1e-05_300, 32_0.0, 0.0_True_40.pt"
            ))
            nn.eval()
            with torch.no_grad():
                preds = nn(X).squeeze()

        # Store everything in a Data object
        dataset = Data(queries, questions, answers, impression_lvls,
                       engagement_lvls, click_probs, preds)

        # Save the dataset
        with open(filename_dataset, "wb") as f:
            pkl.dump(dataset, f, protocol=4)

    # Return the dataset for regression
    else:
        dataset = []
        if FLAGS.embedder == "Bert":
            for i, (query, question) in tqdm(
                    enumerate(zip(query_embeds, question_embeds))):
                # Reshape the five answer embeddings into one flat vector
                # (renamed from `answers` to avoid shadowing the answer list)
                answer = answer_embeds[i * 5:i * 5 + 5].reshape(-1)

                engagement_lvl = torch.Tensor(
                    [int(engagement_lvls[i])]).float()
                inp = torch.cat((query, question, answer), 0)

                # Add the datapoint to the dataset
                dataset.append((inp, engagement_lvl))
        elif FLAGS.embedder == "TFIDF":
            for i, inp in enumerate(X):
                dataset.append(
                    (inp, torch.Tensor([int(engagement_lvls[i])]).float()))

        # Save the dataset
        with open(filename_dataset, "wb") as f:
            pkl.dump(dataset, f)
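
# `load` relies on a helper `get_ranges(queries)` that is not defined in this
# file. Below is a minimal sketch, assuming it returns, for each distinct
# query, the list of consecutive row indices that share that query. This is an
# illustrative reconstruction, not necessarily the repository's implementation.
def get_ranges(queries):
    """ Group consecutive rows that share the same query into index ranges. """
    ranges = []
    start = 0
    for i in range(1, len(queries) + 1):
        # Close the current range when the query changes (or at the end)
        if i == len(queries) or queries[i] != queries[start]:
            ranges.append(list(range(start, i)))
            start = i
    return ranges
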
def train():
    """ Performs training and evaluation of the Regression model. """
    # Set the random seeds for reproducibility
    np.random.seed(10)
    torch.manual_seed(10)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Get the number of units in each hidden layer
    if FLAGS.dnn_hidden_units:
        dnn_hidden_units = [int(units)
                            for units in FLAGS.dnn_hidden_units.split(",")]
    else:
        dnn_hidden_units = []

    # Convert the dropout percentages
    dropout_probs = [float(prob) for prob in FLAGS.dropout_probs.split(',')]

    # Pad with zeros or truncate so the number of dropout probabilities
    # matches the number of hidden layers
    if len(dropout_probs) < len(dnn_hidden_units):
        dropout_probs += [0.0] * (len(dnn_hidden_units) - len(dropout_probs))
    else:
        dropout_probs = dropout_probs[:len(dnn_hidden_units)]

    # Use the GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device:", device)

    # Extract all data and divide it into train, validation and test dataloaders
    dataset_filename = (
        f"dataset_filename=MIMICS-Click.tsv_expanded=False_balance=True"
        f"_impression={FLAGS.impression}"
        f"_reduced_classes={FLAGS.reduced_classes}"
        f"_embedder={FLAGS.embedder}.p"
    )
    with open(os.path.join(FLAGS.data_dir, dataset_filename), "rb") as f:
        dataset = pkl.load(f)

    len_all = len(dataset)
    train_len, valid_len = int(0.7 * len_all), int(0.15 * len_all)
    test_len = len_all - train_len - valid_len
    splits = [train_len, valid_len, test_len]
    train_data, valid_data, test_data = random_split(dataset, splits)

    train_dl = DataLoader(train_data, batch_size=FLAGS.batch_size,
                          shuffle=True, drop_last=True)
    valid_dl = DataLoader(valid_data, batch_size=FLAGS.batch_size,
                          shuffle=True, drop_last=True)
    test_dl = DataLoader(test_data, batch_size=FLAGS.batch_size,
                         shuffle=True, drop_last=True)

    with open(f"{FLAGS.data_dir}/test_dl.pt", "wb") as f:
        pkl.dump(test_dl, f)

    # Initialize the MLP and the loss function
    input_size = next(iter(train_dl))[0].shape[1]  # 5376 for BERT embeddings
    nn = Regression(input_size, dnn_hidden_units, dropout_probs, 1,
                    FLAGS.neg_slope, FLAGS.batchnorm).to(device)
    loss_function = torch.nn.MSELoss()

    if FLAGS.verbose:
        print(f"neural net:\n {[param.data for param in nn.parameters()]}")

    # Initialize the optimizer
    if FLAGS.optimizer == "SGD":
        optimizer = torch.optim.SGD(nn.parameters(),
                                    lr=FLAGS.learning_rate,
                                    weight_decay=FLAGS.weightdecay,
                                    momentum=FLAGS.momentum)
    elif FLAGS.optimizer == "Adam":
        optimizer = torch.optim.Adam(nn.parameters(),
                                     lr=FLAGS.learning_rate,
                                     amsgrad=FLAGS.amsgrad,
                                     weight_decay=FLAGS.weightdecay)
    elif FLAGS.optimizer == "AdamW":
        optimizer = torch.optim.AdamW(nn.parameters(),
                                      lr=FLAGS.learning_rate,
                                      amsgrad=FLAGS.amsgrad,
                                      weight_decay=FLAGS.weightdecay)
    elif FLAGS.optimizer == "RMSprop":
        optimizer = torch.optim.RMSprop(nn.parameters(),
                                        lr=FLAGS.learning_rate,
                                        weight_decay=FLAGS.weightdecay,
                                        momentum=FLAGS.momentum)

    # Initialization for plotting and metrics
    training_losses = []
    valid_losses = []

    initial_train_loss = eval_on_test(nn, loss_function, train_dl, device)
    training_losses.append(initial_train_loss)
    initial_valid_loss = eval_on_test(nn, loss_function, valid_dl, device)
    valid_losses.append(initial_valid_loss)

    # Construct a name for saving models and figures
    variables_string = (
        f"regression_{FLAGS.embedder}_{FLAGS.impression}"
        f"_{FLAGS.reduced_classes}_{FLAGS.optimizer}_{FLAGS.learning_rate}"
        f"_{FLAGS.weightdecay}_{FLAGS.momentum}_{FLAGS.dnn_hidden_units}"
        f"_{FLAGS.dropout_probs}_{FLAGS.batchnorm}_{FLAGS.nr_epochs}"
    )

    overall_batch = 0
    # Initialized here so the final print does not fail if validation never runs
    optimal_batch = 0
    min_valid_loss = float("inf")

    # Training loop
    for epoch in range(FLAGS.nr_epochs):
        print(f"\nEpoch: {epoch}")
        for x, y in train_dl:
            nn.train()

            # Put the batch on the device
            x = x.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            # Forward pass
            pred = nn(x)

            # Compute the loss and backpropagate
            loss = loss_function(pred, y)
            loss.backward()

            # Update the weights
            optimizer.step()

            # Save the training loss
            training_losses.append(loss.item())

            # Periodically evaluate on the validation set
            if overall_batch % FLAGS.eval_freq == 0 and overall_batch != 0:
                valid_loss = eval_on_test(nn, loss_function, valid_dl, device)
                valid_losses.append(valid_loss)
                print(f"Training loss: {loss.item()} / "
                      f"Valid loss: {valid_loss}")

                # Save the model with the lowest validation loss so far
                if valid_loss < min_valid_loss:
                    print(f"Model is saved in epoch {epoch}, "
                          f"overall batch: {overall_batch}")
                    torch.save(nn.state_dict(),
                               f"Models/Regression_{variables_string}.pt")
                    min_valid_loss = valid_loss
                    optimal_batch = overall_batch

            overall_batch += 1

    # Load the optimal model (lowest validation loss) and evaluate it on the
    # test set
    optimal_nn = Regression(input_size, dnn_hidden_units, dropout_probs, 1,
                            FLAGS.neg_slope, FLAGS.batchnorm).to(device)
    optimal_nn.load_state_dict(
        torch.load(f"Models/Regression_{variables_string}.pt"))

    test_loss, test_pred, test_true = eval_on_test(optimal_nn, loss_function,
                                                   test_dl, device,
                                                   verbose=FLAGS.verbose,
                                                   return_preds=True)

    # Save the test predictions of the regressor
    with open(f"Predictions/regression_test_preds{FLAGS.embedder}"
              f"_{FLAGS.reduced_classes}_{FLAGS.impression}.pt", "wb") as f:
        pkl.dump(test_pred, f)

    print(f"Loss on test set of optimal model "
          f"(batch {optimal_batch}): {test_loss}")

    significance_testing(test_pred, test_true, loss_function, FLAGS)

    if FLAGS.plotting:
        plotting(training_losses, valid_losses, test_loss, variables_string,
                 optimal_batch, FLAGS)