def make_figure_three(
    *, args, topn_results_obs, topn_results_counter_diss, topn_results_counter_suff
):
    # Use the results passed in by produce_results() instead of re-loading them from
    # pickle here (the previous reload silently overwrote the arguments).
    # topn_results_counter_diss is accepted for API symmetry with the other
    # figure/table builders but is not used in this figure.
    plt.figure(figsize=(5, 4))

    # Per-rank hit probabilities and their binomial standard errors.
    p_obs = sum(topn_results_obs) / len(topn_results_obs)
    error_obs = np.sqrt(p_obs * (1 - p_obs) / len(topn_results_obs))
    p_counter = sum(topn_results_counter_suff) / len(topn_results_counter_suff)
    error_counter = np.sqrt(
        p_counter * (1 - p_counter) / len(topn_results_counter_suff)
    )

    xmarks = [i + 1 for i in range(len(p_obs))]
    plt.plot(xmarks, 1 - p_obs, label="Associative", color="blue")
    plt.plot(xmarks, 1 - p_counter, label="Counterfactual", color="seagreen")
    plt.plot(xmarks, 1 - (1 - p_counter) / (1 - p_obs), linestyle="--", color="black")

    # Shade +/- 2 standard errors (roughly a 95% interval) around each curve.
    plt.fill_between(
        xmarks,
        1 - p_obs - 2 * error_obs,
        1 - p_obs + 2 * error_obs,
        alpha=0.2,
        edgecolor="#1B2ACC",
        facecolor="#089FFF",
        linewidth=0,
        linestyle="None",
        antialiased=True,
    )
    plt.fill_between(
        xmarks,
        1 - p_counter - 2 * error_counter,
        1 - p_counter + 2 * error_counter,
        alpha=0.2,
        edgecolor="#1B2ACC",
        facecolor="seagreen",
        linewidth=0,
        linestyle="None",
        antialiased=True,
    )

    plt.xticks([i + 1 for i in range(16)])
    plt.xlim(1, 15)
    plt.ylim(0, 0.5)
    # Save before show(); calling show() first can leave an empty canvas in the PDF.
    plt.savefig(args.results / "algo_vs_algo.pdf")
    plt.show()
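# The figure above (and most functions in this file) relies on a small pickle I/O
# helper. A minimal sketch of what `load_from_pickle` and the `save_to_pickle`
# counterpart used elsewhere are assumed to look like -- the real project helpers
# may have different signatures:
import pickle


def load_from_pickle(path):
    """Load and return whatever Python object was pickled at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


def save_to_pickle(obj, path):
    """Pickle `obj` to `path`, overwriting any existing file."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)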
def preprocess_data(train_files, config):
    x_train, y_train, x_dev, y_dev = [], [], [], []
    for file in tqdm(train_files, desc="preprocessing_data"):
        cache_file_path = file.replace("data", "cache").replace(
            ".csv",
            "_fw{}_pw{}_pad{}_ts{}_cache.pickle".format(
                config["feature_window_size"], config["prediction_window_size"],
                config["pad_size"], config["test_size"]))
        if os.path.exists(cache_file_path):
            print("Load cache from {}".format(cache_file_path))
            x_window_train, y_window_train, x_window_dev, y_window_dev = \
                utils.load_from_pickle(cache_file_path)
        else:
            x_train_single, y_train_single, x_dev_single, y_dev_single = \
                read_data_from_file(file, config["test_size"])
            x_window_train, y_window_train, x_window_dev, y_window_dev = \
                prepare_data(x_train_single, y_train_single, x_dev_single,
                             y_dev_single, config)
            print("Save cache to {}".format(cache_file_path))
            utils.save_to_pickle(
                (x_window_train, y_window_train, x_window_dev, y_window_dev),
                cache_file_path)
        x_train.extend(x_window_train)
        y_train.extend(y_window_train)
        x_dev.extend(x_window_dev)
        y_dev.extend(y_window_dev)
    return x_train, y_train, x_dev, y_dev
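# preprocess_data() above implements a compute-or-load cache keyed on the window,
# padding and split configuration. A hedged, generic sketch of the same idiom as a
# reusable helper (names here are illustrative, not part of the project API):
import os
import pickle


def cached(cache_path, compute_fn):
    """Return the pickled value at cache_path, computing and caching it if absent."""
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as handle:
            return pickle.load(handle)
    value = compute_fn()
    os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
    with open(cache_path, "wb") as handle:
        pickle.dump(value, handle)
    return value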
def get_bot_response():
    user_text = request.args.get('msg')
    custom_answer = find_custom_answer(user_text, threshold=COS_SIM_THRESHOLD)
    if custom_answer:
        reply_text = custom_answer
    else:
        helper_data = load_from_pickle(TMP_FILENAME_FOR_DIALOGUE_HELPER_DATA)
        restart_dialogue = helper_data["restart_dialogue"]
        chat_history_ids = helper_data["chat_history_ids"]
        if any(w == user_text.strip().lower() for w in restart_keywords):
            reply_text = "Ok, let's start from scratch, I am ready"
            restart_dialogue = True
        elif any(w == user_text.strip().lower() for w in exit_keywords):
            reply_text = "Ok, bye! Just waiting if you type something..."
            restart_dialogue = True
        else:
            reply_text, chat_history_ids = dialog_gpt(user_text, chat_history_ids,
                                                      restart_dialogue)
            restart_dialogue = False
        helper_data = {
            "restart_dialogue": restart_dialogue,
            "chat_history_ids": chat_history_ids
        }
        save_to_pickle(helper_data, TMP_FILENAME_FOR_DIALOGUE_HELPER_DATA)
    return reply_text
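# get_bot_response() above reads the query string via Flask's `request`, so it is
# presumably registered as a view on a Flask app. A minimal wiring sketch, assuming
# a route of "/get"; the actual app object and route are not shown in this file:
from flask import Flask

app = Flask(__name__)


@app.route("/get")
def bot_response_view():
    # Delegate to the handler above; it pulls `msg` from request.args itself.
    return get_bot_response()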
def _load(self):
    """ Load previous robot objects, currently using pickle. """
    try:
        robot_list = load_from_pickle()
    except IOError:
        pass  # Just pass, self.robots need not be modified
    else:
        # Update instead of assignment in case loading somehow happens after some bots are already created
        self.robots.update({bot.name: bot for bot in robot_list})
def __init__(self,
             logger,
             config,
             data_name,
             data_path,
             embed_path=None,
             user_dict=None,
             vocab_path=None,
             stop_word=None,
             max_len=50,
             query_max_len=20,
             target_max_len=20,
             test_split=0.0,
             training=True):
    self.logger = logger
    self.reset = config.reset
    self._data_dir = Path('data') / data_name
    self.query_max_len = query_max_len
    self.target_max_len = target_max_len
    self.max_len = max_len
    if training:
        embedding_path = self._data_dir / embed_path
        print(embedding_path.absolute())
        self._embedding = Embedding(str(embedding_path), logger=logger)
    print("Begin to build the segmenter, feature engineering and n-grams")
    self._segment = Segment_jieba(user_dict=str(self._data_dir / user_dict))
    if training:
        print("Begin to build vocab")
        self._vocab = Vocab(str(self._data_dir / 'RAW' / vocab_path),
                            self._segment, self._embedding)
        self.word2idx, self.idx2word = self._vocab.word2idx, self._vocab.idx2word
        dump_to_pickle(str(self._data_dir / 'vocab.pkl'),
                       (self.word2idx, self.idx2word), self.reset)
    else:
        print("Load the vocab")
        (self.word2idx, self.idx2word) = load_from_pickle(
            str(self._data_dir / 'vocab.pkl'))
    self.vocab_size = len(self.word2idx)
    if training:
        filename = str(self._data_dir / 'RAW' / data_path)
        # train_test_split and exist
        self._get_train_and_test(filename, test_split)
def produce_results(*, args):
    print(f"> Producing results from {str(args.results.absolute())}")
    topn_results_obs = load_from_pickle(args.results / RESULTS_OBS_FILE)
    topn_results_counter_diss = load_from_pickle(
        args.results / RESULTS_CF_DISSABLEMENT_FILE)
    topn_results_counter_suff = load_from_pickle(
        args.results / RESULTS_CF_SUFFICIENCY_FILE)
    make_supplementary_table_one(
        args=args,
        topn_results_obs=topn_results_obs,
        topn_results_counter_diss=topn_results_counter_diss,
        topn_results_counter_suff=topn_results_counter_suff,
    )
    make_figure_three(
        args=args,
        topn_results_obs=topn_results_obs,
        topn_results_counter_diss=topn_results_counter_diss,
        topn_results_counter_suff=topn_results_counter_suff,
    )
    make_table_one_and_supplementary_table_two(
        args=args,
        topn_results_obs=topn_results_obs,
        topn_results_counter_diss=topn_results_counter_diss,
        topn_results_counter_suff=topn_results_counter_suff,
    )
    df_results, doc_topn = make_supplementary_table_three(
        args=args,
        topn_results_obs=topn_results_obs,
        topn_results_counter_diss=topn_results_counter_diss,
        topn_results_counter_suff=topn_results_counter_suff,
    )
    make_table_two(args=args, df_results=df_results, doc_topn=doc_topn)
    make_figure_four(args=args, df_results=df_results)
def main(): # Read all the data instances task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data, subtasks_list = get_multitask_instances_for_valid_tasks( task_instances_dict, tag_statistics) if args.retrain: logging.info("Creating and training the model from 'bert-base-cased' ") # Create the save_directory if not exists make_dir_if_not_exists(args.save_directory) # Initialize tokenizer and model with pretrained weights tokenizer = BertTokenizer.from_pretrained('bert-base-cased') config = BertConfig.from_pretrained('bert-base-cased') config.subtasks = subtasks_list # print(config) model = MultiTaskBertForCovidEntityClassification.from_pretrained( 'bert-base-cased', config=config) # Add new tokens in tokenizer new_special_tokens_dict = { "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"] } # new_special_tokens_dict = {"additional_special_tokens": ["<E>", "</E>"]} tokenizer.add_special_tokens(new_special_tokens_dict) # Add the new embeddings in the weights print("Embeddings type:", model.bert.embeddings.word_embeddings.weight.data.type()) print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) embedding_size = model.bert.embeddings.word_embeddings.weight.size(1) new_embeddings = torch.FloatTensor( len(new_special_tokens_dict["additional_special_tokens"]), embedding_size).uniform_(-0.1, 0.1) # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1) print("new_embeddings shape:", new_embeddings.size()) new_embedding_weight = torch.cat( (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0) model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) # Update model config vocab size model.config.vocab_size = model.config.vocab_size + len( new_special_tokens_dict["additional_special_tokens"]) else: # Load the tokenizer and model from the save_directory tokenizer = BertTokenizer.from_pretrained(args.save_directory) model = MultiTaskBertForCovidEntityClassification.from_pretrained( args.save_directory) # print(model.state_dict().keys()) # TODO save and load the subtask classifier weights separately # Load from individual state dicts for subtask in model.subtasks: model.classifiers[subtask].load_state_dict( torch.load( os.path.join(args.save_directory, f"{subtask}_classifier.bin"))) # print(model.config) # exit() model.to(device) # Explicitly move the classifiers to device for subtask, classifier in model.classifiers.items(): classifier.to(device) entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0] logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment train_data, dev_data, test_data = split_multitask_instances_in_train_dev_test( data) random.shuffle(train_data) # shuffle happens in-place logging.info("Train Data:") total_train_size, pos_subtasks_train_size, neg_subtasks_train_size = log_multitask_data_statistics( train_data, model.subtasks) logging.info("Dev Data:") total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics( dev_data, model.subtasks) logging.info("Test Data:") total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics( test_data, model.subtasks) logging.info("\n") model_config["train_data"] = { "size": total_train_size, "pos": 
pos_subtasks_train_size, "neg": neg_subtasks_train_size } model_config["dev_data"] = { "size": total_dev_size, "pos": pos_subtasks_dev_size, "neg": neg_subtasks_dev_size } model_config["test_data"] = { "size": total_test_size, "pos": pos_subtasks_test_size, "neg": neg_subtasks_test_size } # Extract subtasks data for dev and test dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks) test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks) # Load the instances into pytorch dataset train_dataset = COVID19TaskDataset(train_data) dev_dataset = COVID19TaskDataset(dev_data) test_dataset = COVID19TaskDataset(test_data) logging.info("Loaded the datasets into Pytorch datasets") tokenize_collator = TokenizeCollator(tokenizer, model.subtasks, entity_start_token_id) train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator) dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) test_dataloader = DataLoader(test_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) logging.info("Created train and test dataloaders with batch aggregation") # Only retrain if needed if args.retrain: print('DO RETRAIN') ################################################################################################## # NOTE: Training Tutorial Reference # https://mccormickml.com/2019/07/22/BERT-fine-tuning/#41-bertforsequenceclassification ################################################################################################## # Create an optimizer training schedule for the BERT text classification model # NOTE: AdamW is a class from the huggingface library (as opposed to pytorch) # I believe the 'W' stands for 'Weight Decay fix" # Recommended Schedule for BERT fine-tuning as per the paper # Batch size: 16, 32 # Learning rate (Adam): 5e-5, 3e-5, 2e-5 # Number of epochs: 2, 3, 4 optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) logging.info("Created model optimizer") # Number of training epochs. The BERT authors recommend between 2 and 4. # We chose to run for 4, but we'll see later that this may be over-fitting the # training data. epochs = args.n_epochs # Total number of training steps is [number of batches] x [number of epochs]. # (Note that this is not the same as the number of training samples). total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. # NOTE: num_warmup_steps = 0 is the Default value in run_glue.py scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # We'll store a number of quantities such as training and validation loss, # validation accuracy, and timings. training_stats = [] logging.info(f"Initiating training loop for {args.n_epochs} epochs...") # Measure the total training time for the whole run. total_start_time = time.time() # Find the accumulation steps accumulation_steps = args.batch_size / POSSIBLE_BATCH_SIZE # Loss trajectory for epochs epoch_train_loss = list() # Dev validation trajectory dev_subtasks_validation_statistics = { subtask: list() for subtask in model.subtasks } for epoch in range(epochs): pbar = tqdm(train_dataloader) logging.info(f"Initiating Epoch {epoch+1}:") # Reset the total loss for each epoch. 
total_train_loss = 0 train_loss_trajectory = list() # Reset timer for each epoch start_time = time.time() model.train() dev_log_frequency = 5 n_steps = len(train_dataloader) dev_steps = int(n_steps / dev_log_frequency) for step, batch in enumerate(pbar): # Upload labels of each subtask to device for subtask in model.subtasks: subtask_labels = batch["gold_labels"][subtask] subtask_labels = subtask_labels.to(device) # print("HAHAHAHAH:", subtask_labels.is_cuda) batch["gold_labels"][subtask] = subtask_labels # print("HAHAHAHAH:", batch["gold_labels"][subtask].is_cuda) # Forward input_dict = { "input_ids": batch["input_ids"].to(device), "entity_start_positions": batch["entity_start_positions"].to(device), "labels": batch["gold_labels"] } input_ids = batch["input_ids"] entity_start_positions = batch["entity_start_positions"] gold_labels = batch["gold_labels"] batch_data = batch["batch_data"] loss, logits = model(**input_dict) # loss = loss / accumulation_steps # Accumulate loss total_train_loss += loss.item() # Backward: compute gradients loss.backward() if (step + 1) % accumulation_steps == 0: # Calculate elapsed time in minutes and print loss on the tqdm bar elapsed = format_time(time.time() - start_time) avg_train_loss = total_train_loss / (step + 1) # keep track of changing avg_train_loss train_loss_trajectory.append(avg_train_loss) pbar.set_description( f"Epoch:{epoch+1}|Batch:{step}/{len(train_dataloader)}|Time:{elapsed}|Avg. Loss:{avg_train_loss:.4f}|Loss:{loss.item():.4f}" ) # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Update parameters optimizer.step() # Clean the model's previous gradients model.zero_grad() # Reset gradients tensors # Update the learning rate. scheduler.step() pbar.update() if (step + 1) % dev_steps == 0: # Perform validation with the model and log the performance logging.info("Running Validation...") # Put the model in evaluation mode--the dropout layers behave differently # during evaluation. 
model.eval() dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[ subtask] dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores) logging.info( f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}" ) dev_subtasks_validation_statistics[subtask].append( (epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN)) # logging.info("DEBUG:Validation on Test") # dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task + "_dev", True) # for subtask in model.subtasks: # dev_subtask_data = test_subtasks_data[subtask] # dev_subtask_prediction_scores = dev_prediction_scores[subtask] # dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(dev_subtask_data, dev_subtask_prediction_scores) # logging.info(f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}") # dev_subtasks_validation_statistics[subtask].append((epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN)) # Put the model back in train setting model.train() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) training_time = format_time(time.time() - start_time) # Record all statistics from this epoch. training_stats.append({ 'epoch': epoch + 1, 'Training Loss': avg_train_loss, 'Training Time': training_time }) # Save the loss trajectory epoch_train_loss.append(train_loss_trajectory) logging.info( f"Training complete with total Train time:{format_time(time.time()- total_start_time)}" ) log_list(training_stats) # Save the model and the Tokenizer here: logging.info( f"Saving the model and tokenizer in {args.save_directory}") model.save_pretrained(args.save_directory) # Save each subtask classifiers weights to individual state dicts for subtask, classifier in model.classifiers.items(): classifier_save_file = os.path.join(args.save_directory, f"{subtask}_classifier.bin") logging.info( f"Saving the model's {subtask} classifier weights at {classifier_save_file}" ) torch.save(classifier.state_dict(), classifier_save_file) tokenizer.save_pretrained(args.save_directory) # Plot the train loss trajectory in a plot train_loss_trajectory_plot_file = os.path.join( args.output_dir, "train_loss_trajectory.png") logging.info( f"Saving the Train loss trajectory at {train_loss_trajectory_plot_file}" ) plot_train_loss(epoch_train_loss, train_loss_trajectory_plot_file) # TODO: Plot the validation performance # Save dev_subtasks_validation_statistics else: logging.info("No training needed. 
Directly going to evaluation!") # Save the model name in the model_config file model_config["model"] = "MultiTaskBertForCovidEntityClassification" model_config["epochs"] = args.n_epochs # Find best threshold for each subtask based on dev set performance thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset( test_dataloader, model, device, args.task, True) dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_test_F1s = {subtask: 0.0 for subtask in model.subtasks} best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks} test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} # for subtask in model.subtasks: # test_subtask_data = test_subtasks_data[subtask] # test_subtask_prediction_scores = test_prediction_scores[subtask] # for t in thresholds: # test_F1, test_P, test_R, test_TP, test_FP, test_FN = get_TP_FP_FN(test_subtask_data, test_subtask_prediction_scores, THRESHOLD=t) # test_subtasks_t_F1_P_Rs[subtask].append((t, test_F1, test_P, test_R, test_TP + test_FN, test_TP, test_FP, test_FN)) # if test_F1 > best_test_F1s[subtask]: # best_test_thresholds[subtask] = t # best_test_F1s[subtask] = test_F1 # logging.info(f"Subtask:{subtask:>15}") # log_list(test_subtasks_t_F1_P_Rs[subtask]) # logging.info(f"Best Test Threshold for subtask: {best_test_thresholds[subtask]}\t Best test F1: {best_test_F1s[subtask]}") for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[subtask] for t in thresholds: dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores, THRESHOLD=t) dev_subtasks_t_F1_P_Rs[subtask].append( (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN)) if dev_F1 > best_dev_F1s[subtask]: best_dev_thresholds[subtask] = t best_dev_F1s[subtask] = dev_F1 logging.info(f"Subtask:{subtask:>15}") log_list(dev_subtasks_t_F1_P_Rs[subtask]) logging.info( f"Best Dev Threshold for subtask: {best_dev_thresholds[subtask]}\t Best dev F1: {best_dev_F1s[subtask]}" ) # Save the best dev threshold and dev_F1 in results dict results["best_dev_threshold"] = best_dev_thresholds results["best_dev_F1s"] = best_dev_F1s results["dev_t_F1_P_Rs"] = dev_subtasks_t_F1_P_Rs # Evaluate on Test logging.info("Testing on test dataset") # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task) predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset( test_dataloader, model, device, args.task) # Test for subtask in model.subtasks: logging.info(f"Testing the trained classifier on subtask: {subtask}") # print(len(test_dataloader)) # print(len(prediction_scores[subtask])) # print(len(test_subtasks_data[subtask])) results[subtask] = dict() cm = metrics.confusion_matrix(gold_labels[subtask], predicted_labels[subtask]) classification_report = metrics.classification_report( gold_labels[subtask], predicted_labels[subtask], output_dict=True) logging.info(cm) logging.info( metrics.classification_report(gold_labels[subtask], predicted_labels[subtask])) results[subtask]["CM"] = 
cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results[subtask]["Classification Report"] = classification_report # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation) EM_score, F1_score, total = get_raw_scores(test_subtasks_data[subtask], prediction_scores[subtask]) logging.info("Word overlap based SQuAD evaluation style metrics:") logging.info(f"Total number of cases: {total}") logging.info(f"EM_score: {EM_score}") logging.info(f"F1_score: {F1_score}") results[subtask]["SQuAD_EM"] = EM_score results[subtask]["SQuAD_F1"] = F1_score results[subtask]["SQuAD_total"] = total pos_EM_score, pos_F1_score, pos_total = get_raw_scores( test_subtasks_data[subtask], prediction_scores[subtask], positive_only=True) logging.info(f"Total number of Positive cases: {pos_total}") logging.info(f"Pos. EM_score: {pos_EM_score}") logging.info(f"Pos. F1_score: {pos_F1_score}") results[subtask]["SQuAD_Pos. EM"] = pos_EM_score results[subtask]["SQuAD_Pos. F1"] = pos_F1_score results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total # New evaluation suggested by Alan F1, P, R, TP, FP, FN = get_TP_FP_FN( test_subtasks_data[subtask], prediction_scores[subtask], THRESHOLD=best_dev_thresholds[subtask]) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results[subtask]["F1"] = F1 results[subtask]["P"] = P results[subtask]["R"] = R results[subtask]["TP"] = TP results[subtask]["FP"] = FP results[subtask]["FN"] = FN N = TP + FN results[subtask]["N"] = N # # Top predictions in the Test case # prediction_scores[subtask] = np.array(prediction_scores[subtask]) # sorted_prediction_ids = np.argsort(-prediction_scores[subtask]) # K = 200 # logging.info("Top {} predictions:".format(K)) # logging.info("\t".join(["Tweet", "BERT model input", "candidate chunk", "prediction score", "predicted label", "gold label", "gold chunks"])) # for i in range(K): # instance_id = sorted_prediction_ids[i] # # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label # tweet = test_subtasks_data[subtask][instance_id][0].replace("\n", " ") # chunk = test_subtasks_data[subtask][instance_id][1] # tokenized_tweet_with_masked_chunk = test_subtasks_data[subtask][instance_id][6] # if chunk in ["AUTHOR OF THE TWEET", "NEAR AUTHOR OF THE TWEET"]: # # First element of the text will be considered as AUTHOR OF THE TWEET or NEAR AUTHOR OF THE TWEET # bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> </E>") # # print(tokenized_tweet_with_masked_chunk) # # print(bert_model_input_text) # # exit() # else: # bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> " + chunk + " </E>") # list_to_print = [tweet, bert_model_input_text, chunk, str(prediction_scores[subtask][instance_id]), str(predicted_labels[subtask][instance_id]), str(test_subtasks_data[subtask][instance_id][-1]), str(test_subtasks_data[subtask][instance_id][-2])] # logging.info("\t".join(list_to_print)) # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") 
save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)
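# The training loop above simulates a larger batch via gradient accumulation:
# gradients from `accumulation_steps` small batches are summed before a single
# optimizer step. A minimal, self-contained sketch of the pattern (model, loader
# and optimizer here are toy stand-ins, not the project objects). Note that the
# per-batch loss is usually divided by the number of accumulation steps so the
# effective gradient matches one large batch; the loop above leaves that division
# commented out.
import torch


def train_with_accumulation(model, dataloader, optimizer, accumulation_steps):
    model.train()
    optimizer.zero_grad()
    for step, (inputs, targets) in enumerate(dataloader):
        loss = torch.nn.functional.cross_entropy(model(inputs), targets)
        # Scale so that the accumulated gradients average over the large batch.
        (loss / accumulation_steps).backward()
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()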
def train(choice, dirname, window): # -------------------------------------------------------------------------------------------------------------------- dimensionality = 50 # No need to adjust, unless you want to experiment with custom embeddings print("Dimensionality:", dimensionality) regex = re.compile(r"[+-.]?\d+[-.,\d+:]*(th|st|nd|rd)?") if choice == 'imm': base = '_imm' elif choice == 'prewin': base = '_prewin' style = 'test' mlmr_dir = dirname seq_length = window # Adjust to 5 for PreWin and 5, 10, 50 for baseline results neg = load_from_pickle("{}/wiki_LOCATION_{}{}.pkl".format( mlmr_dir, style, base)) pos = load_from_pickle("{}/wiki_INSTITUTE_{}{}.pkl".format( mlmr_dir, style, base)) if path.exists("{}/wiki_EVENT_{}{}.pkl".format(mlmr_dir, style, base)): pos.extend( load_from_pickle("{}/wiki_EVENT_{}{}.pkl".format( mlmr_dir, style, base))) if path.exists("{}/wiki_TEAM_{}{}.pkl".format(mlmr_dir, style, base)): pos.extend( load_from_pickle("{}/wiki_TEAM_{}{}.pkl".format( mlmr_dir, style, base))) if path.exists("{}/wiki_ARTIFACT_{}{}.pkl".format(mlmr_dir, style, base)): pos.extend( load_from_pickle("{}/wiki_ARTIFACT_{}{}.pkl".format( mlmr_dir, style, base))) print("Sequence Length: 2 times ", seq_length) A = [] dep_labels = {u"<u>"} for coll in [neg, pos]: for l in coll: A.append(l) dep_labels.update(set(l[1][-seq_length:] + l[3][:seq_length])) random.shuffle(A) X_L, D_L, X_R, D_R, Y = [], [], [], [], [] for a in A: X_L.append(a[0][-seq_length:]) D_L.append(a[1][-seq_length:]) X_R.append(a[2][:seq_length]) D_R.append(a[3][:seq_length]) Y.append(a[4]) print('No of training examples: ', len(X_L)) dump_to_pickle("dep_labels.pkl", dep_labels) dep_labels = load_from_pickle("dep_labels.pkl") # -------------------------------------------------------------------------------------------------------------------- vocabulary = {u"<u>", u"0.0"} vocab_limit = 100000 print('Vocabulary Size: ', vocab_limit) print("Building sequences...") count = 0 vectors_glove = {u'<u>': np.ones(dimensionality)} # Please supply your own embeddings, see README.md for details for line in codecs.open("glove.6B.50d.txt", encoding="utf-8"): tokens = line.split() vocabulary.add(tokens[0]) vectors_glove[tokens[0]] = [float(x) for x in tokens[1:]] count += 1 if count >= vocab_limit: break vectors_glove[u"0.0"] = np.zeros(dimensionality) word_to_index = dict([(w, i) for i, w in enumerate(vocabulary)]) dep_to_index = dict([(w, i) for i, w in enumerate(dep_labels)]) for x_l, x_r, d_l, d_r in zip(X_L, X_R, D_L, D_R): for i, w in enumerate(x_l): if w != u"0.0": w = regex.sub(u"1", w) if w in word_to_index: x_l[i] = word_to_index[w] else: x_l[i] = word_to_index[u"<u>"] for i, w in enumerate(x_r): if w != u"0.0": w = regex.sub(u"1", w) if w in word_to_index: x_r[i] = word_to_index[w] else: x_r[i] = word_to_index[u"<u>"] for i, w in enumerate(d_l): arr = np.zeros(len(dep_labels)) if w in dep_to_index: arr[dep_to_index[w]] = 1 else: arr[dep_to_index[u"<u>"]] = 1 d_l[i] = arr for i, w in enumerate(d_r): arr = np.zeros(len(dep_labels)) if w in dep_to_index: arr[dep_to_index[w]] = 1 else: arr[dep_to_index[u"<u>"]] = 1 d_r[i] = arr X_L = np.asarray(X_L) X_R = np.asarray(X_R) D_L = np.asarray(D_L) D_R = np.asarray(D_R) Y = np.asarray(Y) # convert labels to one-hot format num_classes = Y.max() + 1 one_hot = np.zeros((Y.size, num_classes)) one_hot[np.arange(Y.size), Y] = 1 Y = one_hot weights = np.zeros((len(vocabulary), dimensionality)) for w in vocabulary: if w in vectors_glove: weights[word_to_index[w]] = vectors_glove[w] weights = 
np.array([weights]) print(u"Done...") # -------------------------------------------------------------------------------------------------------------------- print(u'Building model...') first_input = Input(shape=(seq_length, )) a = Embedding(len(vocabulary), dimensionality, input_length=(seq_length, ), embeddings_initializer=Constant(weights))(first_input) b = LSTM(units=15)(a) first_output = Dropout(0.2)(b) model_left = Model(inputs=first_input, outputs=first_output) second_input = Input(shape=(seq_length, len(dep_labels))) a = TimeDistributed(Dense(units=15))(second_input) b = Dropout(0.2)(a) second_output = Flatten()(b) dep_left = Model(inputs=second_input, outputs=second_output) third_input = Input(shape=(seq_length, )) a = Embedding(len(vocabulary), dimensionality, input_length=(seq_length, ), embeddings_initializer=Constant(weights))(third_input) b = LSTM(units=15, go_backwards=True)(a) third_output = Dropout(0.2)(b) model_right = Model(inputs=third_input, outputs=third_output) fourth_input = Input(shape=(seq_length, len(dep_labels))) a = TimeDistributed(Dense(units=15))(fourth_input) b = Dropout(0.2)(a) fourth_output = Flatten()(b) dep_right = Model(inputs=fourth_input, outputs=fourth_output) a = concatenate([first_output, second_output, third_output, fourth_output]) b = Dense(10)(a) c = Dense(num_classes, activation='softmax')(b) merged_model = Model( inputs=[first_input, second_input, third_input, fourth_input], outputs=c) merged_model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['accuracy']) print(u"Done...") # -------------------------------------------------------------------------------------------------------------------- checkpoint = ModelCheckpoint(filepath="lstm.hdf5", verbose=0) merged_model.fit([X_L, D_L, X_R, D_R], Y, batch_size=16, epochs=5, callbacks=[checkpoint], verbose=0)
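# train() above maps each token to an embedding index, normalising digit strings to
# "1" and falling back to the unknown token "<u>". A hedged standalone sketch of
# that lookup; the helper name is illustrative and the regex is copied from the
# code above:
import re

NUM_REGEX = re.compile(r"[+-.]?\d+[-.,\d+:]*(th|st|nd|rd)?")


def token_to_index(token, word_to_index, pad_token="0.0", unk_token="<u>"):
    """Return the embedding index for `token`, mapping numbers to "1" and unknowns to <u>."""
    if token == pad_token:
        return word_to_index[pad_token]
    token = NUM_REGEX.sub("1", token)
    return word_to_index.get(token, word_to_index[unk_token])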
#               collection="test_pd", *args, **kwargs):
#     """ Read a dataset with metadata compute metadata and
#     save to json from a tuple(url,format,id)"""
#
#     d, stringio = response
#     if stringio is not None:
#         extension, mid, url = d['format'], d['id_metadata'], d['url']
#         meta = {}
#         try:
#             if extension == 'csv':
#                 meta = DataExploration(pd.read_csv(stringio)).metadata()
#             if extension == 'html':
#                 meta = DataExploration(pd.read_html(stringio)).metadata()
#             if extension == 'xslx':
#                 meta = DataExploration(pd.read_excel(stringio)).metadata()
#         except Exception as e:
#             logger.exception(e)
#         meta['id_metadata'] = mid
#         meta['url'] = url
#         print(meta)
#         # write_to_json_line(meta, file_path)
#         logger.info('{} : Successful write_to_json_line '.format(url))
#         # db[collection].insert_one(meta)


if __name__ == "__main__":
    # list_d = get_list_csv(DB['datagov'])
    list_d = load_from_pickle('list_urls')
    for d in tqdm.tqdm(list_d):
        get_meta(downloader_s(d), collection="test5")
def main(): # Read all the data instances task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data, subtasks_list = get_multitask_instances_for_valid_tasks( task_instances_dict, tag_statistics) if args.retrain: logging.info("Creating and training the model from 'bert-base-cased' ") # Create the save_directory if not exists make_dir_if_not_exists(args.save_directory) # Initialize tokenizer and model with pretrained weights tokenizer = BertTokenizer.from_pretrained('bert-base-cased') config = BertConfig.from_pretrained('bert-base-cased') config.subtasks = subtasks_list # print(config) model = MultiTaskBertForCovidEntityClassification.from_pretrained( 'bert-base-cased', config=config) # Add new tokens in tokenizer new_special_tokens_dict = { "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"] } # new_special_tokens_dict = {"additional_special_tokens": ["<E>", "</E>"]} tokenizer.add_special_tokens(new_special_tokens_dict) # Add the new embeddings in the weights print("Embeddings type:", model.bert.embeddings.word_embeddings.weight.data.type()) print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) embedding_size = model.bert.embeddings.word_embeddings.weight.size(1) new_embeddings = torch.FloatTensor( len(new_special_tokens_dict["additional_special_tokens"]), embedding_size).uniform_(-0.1, 0.1) # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1) print("new_embeddings shape:", new_embeddings.size()) new_embedding_weight = torch.cat( (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0) model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) # Update model config vocab size model.config.vocab_size = model.config.vocab_size + len( new_special_tokens_dict["additional_special_tokens"]) else: # Load the tokenizer and model from the save_directory tokenizer = BertTokenizer.from_pretrained(args.save_directory) model = MultiTaskBertForCovidEntityClassification.from_pretrained( args.save_directory) # print(model.state_dict().keys()) # TODO save and load the subtask classifier weights separately # Load from individual state dicts for subtask in model.subtasks: model.classifiers[subtask].load_state_dict( torch.load( os.path.join(args.save_directory, f"{subtask}_classifier.bin"))) # print(model.config) # exit() model.to(device) # Explicitly move the classifiers to device for subtask, classifier in model.classifiers.items(): classifier.to(device) entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0] logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() test_data = data logging.info("Test Data:") total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics( test_data, model.subtasks) logging.info("\n") # model_config["train_data"] = {"size":total_train_size, "pos":pos_subtasks_train_size, "neg":neg_subtasks_train_size} # model_config["dev_data"] = {"size":total_dev_size, "pos":pos_subtasks_dev_size, "neg":neg_subtasks_dev_size} model_config["test_data"] = { "size": total_test_size, "pos": pos_subtasks_test_size, "neg": neg_subtasks_test_size } # Extract subtasks data for dev and test #dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks) test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks) # Load the instances into pytorch 
dataset # train_dataset = COVID19TaskDataset(train_data) # dev_dataset = COVID19TaskDataset(dev_data) test_dataset = COVID19TaskDataset(test_data) logging.info("Loaded the datasets into Pytorch datasets") tokenize_collator = TokenizeCollator(tokenizer, model.subtasks, entity_start_token_id) # train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator) # dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) test_dataloader = DataLoader(test_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) logging.info("Created train and test dataloaders with batch aggregation") # Save the model name in the model_config file model_config["model"] = "MultiTaskBertForCovidEntityClassification" model_config["epochs"] = args.n_epochs # Find best threshold for each subtask based on dev set performance thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset( test_dataloader, model, device, args.task, True) #dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(dev_dataloader, model, device, args.task + "_dev", True) best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_test_F1s = {subtask: 0.0 for subtask in model.subtasks} best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks} test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} # Evaluate on Test logging.info("Testing on test dataset") # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task) predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset( test_dataloader, model, device, args.task) # Test for subtask in model.subtasks: logging.info(f"Testing the trained classifier on subtask: {subtask}") # print(len(test_dataloader)) # print(len(prediction_scores[subtask])) # print(len(test_subtasks_data[subtask])) results[subtask] = dict() cm = metrics.confusion_matrix(gold_labels[subtask], predicted_labels[subtask]) classification_report = metrics.classification_report( gold_labels[subtask], predicted_labels[subtask], output_dict=True) logging.info(cm) logging.info( metrics.classification_report(gold_labels[subtask], predicted_labels[subtask])) results[subtask]["CM"] = cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results[subtask]["Classification Report"] = classification_report # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation) EM_score, F1_score, total = get_raw_scores(test_subtasks_data[subtask], prediction_scores[subtask]) logging.info("Word overlap based SQuAD evaluation style metrics:") logging.info(f"Total number of cases: {total}") logging.info(f"EM_score: {EM_score}") logging.info(f"F1_score: {F1_score}") results[subtask]["SQuAD_EM"] = EM_score results[subtask]["SQuAD_F1"] = F1_score results[subtask]["SQuAD_total"] = total pos_EM_score, pos_F1_score, pos_total = get_raw_scores( test_subtasks_data[subtask], prediction_scores[subtask], positive_only=True) logging.info(f"Total number of Positive cases: {pos_total}") logging.info(f"Pos. 
EM_score: {pos_EM_score}") logging.info(f"Pos. F1_score: {pos_F1_score}") results[subtask]["SQuAD_Pos. EM"] = pos_EM_score results[subtask]["SQuAD_Pos. F1"] = pos_F1_score results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total # New evaluation suggested by Alan F1, P, R, TP, FP, FN = get_TP_FP_FN( test_subtasks_data[subtask], prediction_scores[subtask], THRESHOLD=best_dev_thresholds[subtask]) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results[subtask]["F1"] = F1 results[subtask]["P"] = P results[subtask]["R"] = R results[subtask]["TP"] = TP results[subtask]["FP"] = FP results[subtask]["FN"] = FN N = TP + FN results[subtask]["N"] = N # # Top predictions in the Test case # prediction_scores[subtask] = np.array(prediction_scores[subtask]) # sorted_prediction_ids = np.argsort(-prediction_scores[subtask]) # K = 200 # logging.info("Top {} predictions:".format(K)) # logging.info("\t".join(["Tweet", "BERT model input", "candidate chunk", "prediction score", "predicted label", "gold label", "gold chunks"])) # for i in range(K): # instance_id = sorted_prediction_ids[i] # # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label # tweet = test_subtasks_data[subtask][instance_id][0].replace("\n", " ") # chunk = test_subtasks_data[subtask][instance_id][1] # tokenized_tweet_with_masked_chunk = test_subtasks_data[subtask][instance_id][6] # if chunk in ["AUTHOR OF THE TWEET", "NEAR AUTHOR OF THE TWEET"]: # # First element of the text will be considered as AUTHOR OF THE TWEET or NEAR AUTHOR OF THE TWEET # bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> </E>") # # print(tokenized_tweet_with_masked_chunk) # # print(bert_model_input_text) # # exit() # else: # bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> " + chunk + " </E>") # list_to_print = [tweet, bert_model_input_text, chunk, str(prediction_scores[subtask][instance_id]), str(predicted_labels[subtask][instance_id]), str(test_subtasks_data[subtask][instance_id][-1]), str(test_subtasks_data[subtask][instance_id][-2])] # logging.info("\t".join(list_to_print)) # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)
def main(): task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data = extract_instances_for_current_subtask(task_instances_dict, args.sub_task) logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment train_data, dev_data, test_data = split_instances_in_train_dev_test(data) random.shuffle(train_data) # shuffle happens in-place logging.info("Train Data:") total_train_size, pos_train_size, neg_train_size = log_data_statistics( train_data) logging.info("Dev Data:") total_dev_size, pos_dev_size, neg_dev_size = log_data_statistics(dev_data) logging.info("Test Data:") total_test_size, pos_test_size, neg_test_size = log_data_statistics( test_data) logging.info("\n") model_config["train_data"] = { "size": total_train_size, "pos": pos_train_size, "neg": neg_train_size } model_config["dev_data"] = { "size": total_dev_size, "pos": pos_dev_size, "neg": neg_dev_size } model_config["test_data"] = { "size": total_test_size, "pos": pos_test_size, "neg": neg_test_size } # Extract n-gram features from the train data # Returned ngrams will be dict of dict # TODO: update the feature extractor feature2i, i2feature = create_ngram_features_from(train_data) logging.info( f"Total number of features extracted from train = {len(feature2i)}, {len(i2feature)}" ) model_config["features"] = {"size": len(feature2i)} # Extract Feature vectors and labels from train and test data train_X, train_Y = convert_data_to_feature_vector_and_labels( train_data, feature2i) dev_X, dev_Y = convert_data_to_feature_vector_and_labels( dev_data, feature2i) test_X, test_Y = convert_data_to_feature_vector_and_labels( test_data, feature2i) logging.info( f"Train Data Features = {train_X.shape} and Labels = {len(train_Y)}") logging.info( f"Dev Data Features = {dev_X.shape} and Labels = {len(dev_Y)}") logging.info( f"Test Data Features = {test_X.shape} and Labels = {len(test_Y)}") model_config["train_data"]["features_shape"] = train_X.shape model_config["train_data"]["labels_shape"] = len(train_Y) model_config["dev_data"]["features_shape"] = dev_X.shape model_config["dev_data"]["labels_shape"] = len(dev_Y) model_config["test_data"]["features_shape"] = test_X.shape model_config["test_data"]["labels_shape"] = len(test_Y) # Train logistic regression classifier logging.info("Training the Logistic Regression classifier") lr = LogisticRegression(solver='lbfgs') lr.fit(train_X, train_Y) model_config["model"] = "LogisticRegression(solver='lbfgs')" # Find best threshold based on dev set performance thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] dev_prediction_probs = lr.predict_proba(dev_X)[:, 1] dev_t_F1_P_Rs = list() best_threshold_based_on_F1 = 0.5 best_dev_F1 = 0.0 for t in thresholds: dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_data, dev_prediction_probs, THRESHOLD=t) dev_t_F1_P_Rs.append( (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN)) if dev_F1 > best_dev_F1: best_threshold_based_on_F1 = t best_dev_F1 = dev_F1 log_list(dev_t_F1_P_Rs) logging.info( f"Best Threshold: {best_threshold_based_on_F1}\t Best dev F1: {best_dev_F1}" ) # Save the best dev threshold and dev_F1 in results dict results["best_dev_threshold"] = best_threshold_based_on_F1 results["best_dev_F1"] = best_dev_F1 results["dev_t_F1_P_Rs"] = dev_t_F1_P_Rs # y_pred = (clf.predict_proba(X_test)[:,1] >= 0.3).astype(bool) # Test logging.info("Testing the 
trained classifier") predictions = lr.predict(test_X) probs = lr.predict_proba(test_X) test_Y_prediction_probs = probs[:, 1] cm = metrics.confusion_matrix(test_Y, predictions) classification_report = metrics.classification_report(test_Y, predictions, output_dict=True) logging.info(cm) logging.info(metrics.classification_report(test_Y, predictions)) results["CM"] = cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results["Classification Report"] = classification_report # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation) EM_score, F1_score, total = get_raw_scores(test_data, test_Y_prediction_probs) logging.info("Word overlap based SQuAD evaluation style metrics:") logging.info(f"Total number of cases: {total}") logging.info(f"EM_score: {EM_score}") logging.info(f"F1_score: {F1_score}") results["SQuAD_EM"] = EM_score results["SQuAD_F1"] = F1_score results["SQuAD_total"] = total pos_EM_score, pos_F1_score, pos_total = get_raw_scores( test_data, test_Y_prediction_probs, positive_only=True) logging.info(f"Total number of Positive cases: {pos_total}") logging.info(f"Pos. EM_score: {pos_EM_score}") logging.info(f"Pos. F1_score: {pos_F1_score}") results["SQuAD_Pos. EM"] = pos_EM_score results["SQuAD_Pos. F1"] = pos_F1_score results["SQuAD_Pos. EM_F1_total"] = pos_total # New evaluation suggested by Alan F1, P, R, TP, FP, FN = get_TP_FP_FN(test_data, test_Y_prediction_probs, THRESHOLD=best_threshold_based_on_F1) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results["F1"] = F1 results["P"] = P results["R"] = R results["TP"] = TP results["FP"] = FP results["FN"] = FN N = TP + FN results["N"] = N # Top predictions in the Test case sorted_prediction_ids = np.argsort(-test_Y_prediction_probs) K = 30 logging.info("Top {} predictions:".format(K)) for i in range(K): instance_id = sorted_prediction_ids[i] # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label list_to_print = [ test_data[instance_id][0], test_data[instance_id][6], test_data[instance_id][1], str(test_Y_prediction_probs[instance_id]), str(test_Y[instance_id]), str(test_data[instance_id][-1]), str(test_data[instance_id][-2]) ] logging.info("\t".join(list_to_print)) # Top feature analysis coefs = lr.coef_[0] K = 10 sorted_feature_ids = np.argsort(-coefs) logging.info("Top {} features:".format(K)) for i in range(K): feature_id = sorted_feature_ids[i] logging.info(f"{i2feature[feature_id]}\t{coefs[feature_id]}") # Plot the precision recall curve save_figure_file = os.path.join(args.output_dir, "Precision Recall Curve.png") logging.info(f"Saving precision recall curve at {save_figure_file}") disp = plot_precision_recall_curve(lr, test_X, test_Y) disp.ax_.set_title('2-class Precision-Recall curve') disp.ax_.figure.savefig(save_figure_file) # Save the model and features in pickle file model_and_features_save_file = os.path.join(args.output_dir, "model_and_features.pkl") logging.info( f"Saving LR model and features at {model_and_features_save_file}") save_in_pickle((lr, feature2i, i2feature), model_and_features_save_file) # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") 
results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)
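# The logistic-regression baseline above picks its decision threshold by sweeping
# candidate values on the dev predictions and keeping the one with the best F1
# (via get_TP_FP_FN). A compact, generic sketch of the same idea with scikit-learn;
# the helper name and arguments are illustrative, not part of the project API:
import numpy as np
from sklearn.metrics import f1_score


def best_f1_threshold(y_true, y_scores,
                      thresholds=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    """Return (threshold, F1) maximising F1 of `y_scores >= threshold` against y_true."""
    best_t, best_f1 = 0.5, 0.0
    for t in thresholds:
        f1 = f1_score(y_true, (np.asarray(y_scores) >= t).astype(int))
        if f1 > best_f1:
            best_t, best_f1 = t, f1
    return best_t, best_f1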
def train(choice, dirname, window): # -------------------------------------------------------------------------------------------------------------------- dimensionality = BERT_DIMENSIONS # No need to adjust, unless you want to experiment with custom embeddings print("Dimensionality:", dimensionality) if choice == 'imm': base = '_imm' elif choice == 'prewin': base = '_prewin' style = 'train' mlmr_dir = dirname seq_length = window # Adjust to 5 for PreWin and 5, 10, 50 for baseline results neg = load_from_hdf5("{}/wiki_LOCATION_{}{}.hdf5".format( mlmr_dir, style, base)) pos = load_from_hdf5("{}/wiki_INSTITUTE_{}{}.hdf5".format( mlmr_dir, style, base)) if path.exists("{}/wiki_EVENT_{}{}.hdf5".format(mlmr_dir, style, base)): pos.extend( load_from_hdf5("{}/wiki_EVENT_{}{}.hdf5".format( mlmr_dir, style, base))) if path.exists("{}/wiki_TEAM_{}{}.hdf5".format(mlmr_dir, style, base)): pos.extend( load_from_hdf5("{}/wiki_TEAM_{}{}.hdf5".format( mlmr_dir, style, base))) if path.exists("{}/wiki_ARTIFACT_{}{}.hdf5".format(mlmr_dir, style, base)): pos.extend( load_from_hdf5("{}/wiki_ARTIFACT_{}{}.hdf5".format( mlmr_dir, style, base))) print("Sequence Length: 2 times ", seq_length) A = [] dep_labels = {u"<u>"} for coll in [neg, pos]: for l in coll: A.append(l) dep_labels.update(set(l[0][-seq_length:] + l[2][:seq_length])) random.shuffle(A) D_L, E_L, D_R, E_R, Y = [], [], [], [], [] for a in A: D_L.append(a[0][-seq_length:]) E_L.append(a[1][-seq_length:]) D_R.append(a[2][:seq_length]) E_R.append(a[3][:seq_length]) Y.append(a[4]) print('No of training examples: ', len(D_L)) dump_to_pickle("dep_labels.pkl", dep_labels) dep_labels = load_from_pickle("dep_labels.pkl") # -------------------------------------------------------------------------------------------------------------------- print("Building sequences...") dep_to_index = dict([(w, i) for i, w in enumerate(dep_labels)]) for d_l, d_r in zip(D_L, D_R): for i, w in enumerate(d_l): arr = np.zeros(len(dep_labels)) if w in dep_to_index: arr[dep_to_index[w]] = 1 else: arr[dep_to_index[u"<u>"]] = 1 d_l[i] = arr for i, w in enumerate(d_r): arr = np.zeros(len(dep_labels)) if w in dep_to_index: arr[dep_to_index[w]] = 1 else: arr[dep_to_index[u"<u>"]] = 1 d_r[i] = arr D_L = np.asarray(D_L) D_R = np.asarray(D_R) E_L = torch.stack(E_L).detach() E_R = torch.stack(E_R).detach() Y = np.asarray(Y) # convert labels to one-hot format num_classes = Y.max() + 1 one_hot = np.zeros((Y.size, num_classes)) one_hot[np.arange(Y.size), Y] = 1 Y = one_hot print(u"Done...") # -------------------------------------------------------------------------------------------------------------------- print(u'Building model...') first_input = Input(shape=(seq_length, dimensionality)) a = LSTM(units=15)(first_input) first_output = Dropout(0.2)(a) model_left = Model(inputs=first_input, outputs=first_output) second_input = Input(shape=(seq_length, len(dep_labels))) a = TimeDistributed(Dense(units=15))(second_input) b = Dropout(0.2)(a) second_output = Flatten()(b) dep_left = Model(inputs=second_input, outputs=second_output) third_input = Input(shape=(seq_length, dimensionality)) a = LSTM(units=15, go_backwards=True)(third_input) third_output = Dropout(0.2)(a) model_right = Model(inputs=third_input, outputs=third_output) fourth_input = Input(shape=(seq_length, len(dep_labels))) a = TimeDistributed(Dense(units=15))(fourth_input) b = Dropout(0.2)(a) fourth_output = Flatten()(b) dep_right = Model(inputs=fourth_input, outputs=fourth_output) a = concatenate([first_output, second_output, third_output, 
fourth_output]) b = Dense(10)(a) c = Dense(num_classes, activation='softmax')(b) merged_model = Model( inputs=[first_input, second_input, third_input, fourth_input], outputs=c) merged_model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['accuracy']) print(u"Done...") # -------------------------------------------------------------------------------------------------------------------- checkpoint = ModelCheckpoint(filepath="lstm.hdf5", verbose=0) merged_model.fit([E_L, D_L, E_R, D_R], Y, batch_size=16, epochs=5, callbacks=[checkpoint], verbose=0)
def test(choice, dirname, window): # -------------------------------------------------------------------------------------------------------------------- dimensionality = BERT_DIMENSIONS # No need to adjust, unless you want to experiment with custom embeddings seq_length = 5 # Adjust to 5 for PreWin and 5, 10, 50 for baseline results print("Dimensionality:", dimensionality) if choice == 'imm': base = '_imm' elif choice == 'prewin': base = '_prewin' style = 'test' mlmr_dir = dirname seq_length = window # Adjust to 5 for PreWin and 5, 10, 50 for baseline results neg = load_from_hdf5("{}/wiki_LOCATION_{}{}.hdf5".format( mlmr_dir, style, base)) pos = load_from_hdf5("{}/wiki_INSTITUTE_{}{}.hdf5".format( mlmr_dir, style, base)) if path.exists("{}/wiki_EVENT_{}{}.hdf5".format(mlmr_dir, style, base)): pos.extend( load_from_hdf5("{}/wiki_EVENT_{}{}.hdf5".format( mlmr_dir, style, base))) if path.exists("{}/wiki_TEAM_{}{}.hdf5".format(mlmr_dir, style, base)): pos.extend( load_from_hdf5("{}/wiki_TEAM_{}{}.hdf5".format( mlmr_dir, style, base))) if path.exists("{}/wiki_ARTIFACT_{}{}.hdf5".format(mlmr_dir, style, base)): pos.extend( load_from_hdf5("{}/wiki_ARTIFACT_{}{}.hdf5".format( mlmr_dir, style, base))) print("Sequence Length: 2 times ", seq_length) D_L, E_L, D_R, E_R, Y = [], [], [], [], [] for a in copy.deepcopy(neg + pos): D_L.append(a[0][-seq_length:]) E_L.append(a[1][-seq_length:]) D_R.append(a[2][:seq_length]) E_R.append(a[3][:seq_length]) Y.append(a[4]) print('No of test examples: ', len(D_L)) dep_labels = load_from_pickle("dep_labels.pkl") # -------------------------------------------------------------------------------------------------------------------- print("Building sequences...") dep_to_index = dict([(w, i) for i, w in enumerate(dep_labels)]) for d_l, d_r in zip(D_L, D_R): for i, w in enumerate(d_l): arr = np.zeros(len(dep_labels)) if w in dep_to_index: arr[dep_to_index[w]] = 1 else: arr[dep_to_index[u"<u>"]] = 1 d_l[i] = arr for i, w in enumerate(d_r): arr = np.zeros(len(dep_labels)) if w in dep_to_index: arr[dep_to_index[w]] = 1 else: arr[dep_to_index[u"<u>"]] = 1 d_r[i] = arr D_L = np.asarray(D_L) D_R = np.asarray(D_R) E_L = torch.stack(E_L).detach() E_R = torch.stack(E_R).detach() Y = np.asarray(Y) # convert labels to one-hot format num_classes = Y.max() + 1 one_hot = np.zeros((Y.size, num_classes)) one_hot[np.arange(Y.size), Y] = 1 Y = one_hot print(u"Done...") # -------------------------------------------------------------------------------------------------------------------- print(u'Building model...') first_input = Input(shape=(seq_length, dimensionality)) a = LSTM(units=15)(first_input) first_output = Dropout(0.2)(a) model_left = Model(inputs=first_input, outputs=first_output) second_input = Input(shape=(seq_length, len(dep_labels))) a = TimeDistributed(Dense(units=15))(second_input) b = Dropout(0.2)(a) second_output = Flatten()(b) dep_left = Model(inputs=second_input, outputs=second_output) third_input = Input(shape=(seq_length, dimensionality)) a = LSTM(units=15, go_backwards=True)(third_input) third_output = Dropout(0.2)(a) model_right = Model(inputs=third_input, outputs=third_output) fourth_input = Input(shape=(seq_length, len(dep_labels))) a = TimeDistributed(Dense(units=15))(fourth_input) b = Dropout(0.2)(a) fourth_output = Flatten()(b) dep_right = Model(inputs=fourth_input, outputs=fourth_output) a = concatenate([first_output, second_output, third_output, fourth_output]) b = Dense(10)(a) c = Dense(num_classes, activation='softmax')(b) merged_model = Model( 
inputs=[first_input, second_input, third_input, fourth_input], outputs=c) merged_model.load_weights("lstm.hdf5") merged_model.compile(loss='categorical_crossentropy', optimizer='adagrad', metrics=['accuracy']) print(u"Done...") # -------------------------------------------------------------------------------------------------------------------- predictions = merged_model.predict_on_batch([E_L, D_L, E_R, D_R]) y_pred = predictions.argmax(axis=1) one_hot = np.zeros((y_pred.size, num_classes)) one_hot[np.arange(y_pred.size), y_pred] = 1 y_pred = one_hot print('Macro-averaged metrics: ', precision_recall_fscore_support(Y, y_pred, average='macro')) print('Micro-averaged metrics: ', precision_recall_fscore_support(Y, y_pred, average='micro'))
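# Assumed end-to-end usage of the BERT-embedding variant above (the directory name
# and window size are illustrative): train() fits the model and writes lstm.hdf5
# plus dep_labels.pkl; test() reloads both to score the held-out split.
# if __name__ == "__main__":
#     train("prewin", "mlmr_hdf5", 5)
#     test("prewin", "mlmr_hdf5", 5)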
def load_loss_gradients(n_samples, filename, savedir, relpath=DATA):
    path = relpath + savedir + filename + "_samp=" + str(n_samples) + "_lossGrads.pkl"
    return load_from_pickle(path=path)
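# Hypothetical pathlib-based variant (not part of the original) of the path
# construction in load_loss_gradients above; it builds the same
# "<filename>_samp=<n>_lossGrads.pkl" name but joins directories explicitly,
# so callers do not have to embed trailing slashes in relpath/savedir.
from pathlib import Path


def loss_gradients_path(n_samples, filename, savedir, relpath="data/"):
    # relpath defaults to a placeholder here; the original uses its DATA constant.
    return Path(relpath) / savedir / f"{filename}_samp={n_samples}_lossGrads.pkl"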
def main(): # Read all the data instances task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data, subtasks_list = get_multitask_instances_for_valid_tasks( task_instances_dict, tag_statistics) data = add_marker_for_loss_ignore( data, 1.0 if args.loss_for_no_consensus else 0.0) if args.retrain: if args.large_bert: model_name = "bert-large-cased" elif args.covid_bert: model_name = "digitalepidemiologylab/covid-twitter-bert" else: model_name = "bert-base-cased" logging.info("Creating and training the model from '" + model_name + "'") # Create the save_directory if not exists make_dir_if_not_exists(args.save_directory) # Initialize tokenizer and model with pretrained weights tokenizer = BertTokenizer.from_pretrained(model_name) config = BertConfig.from_pretrained(model_name) config.subtasks = subtasks_list model = MultiTaskBertForCovidEntityClassification.from_pretrained( model_name, config=config) # Add new tokens in tokenizer new_special_tokens_dict = { "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"] } tokenizer.add_special_tokens(new_special_tokens_dict) # Add the new embeddings in the weights print("Embeddings type:", model.bert.embeddings.word_embeddings.weight.data.type()) print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) embedding_size = model.bert.embeddings.word_embeddings.weight.size(1) new_embeddings = torch.FloatTensor( len(new_special_tokens_dict["additional_special_tokens"]), embedding_size).uniform_(-0.1, 0.1) # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1) print("new_embeddings shape:", new_embeddings.size()) new_embedding_weight = torch.cat( (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0) model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) # Update model config vocab size model.config.vocab_size = model.config.vocab_size + len( new_special_tokens_dict["additional_special_tokens"]) else: # Load the tokenizer and model from the save_directory tokenizer = BertTokenizer.from_pretrained(args.save_directory) model = MultiTaskBertForCovidEntityClassification.from_pretrained( args.save_directory) # Load from individual state dicts for subtask in model.subtasks: model.classifiers[subtask].load_state_dict( torch.load( os.path.join(args.save_directory, f"{subtask}_classifier.bin"))) model.to(device) if args.wandb: wandb.watch(model) # Explicitly move the classifiers to device for subtask, classifier in model.classifiers.items(): classifier.to(device) for subtask, classifier in model.context_vectors.items(): classifier.to(device) entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0] entity_end_token_id = tokenizer.convert_tokens_to_ids(["</E>"])[0] logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment train_data, dev_data = split_multitask_instances_in_train_dev(data) random.shuffle(train_data) # shuffle happens in-place logging.info("Train Data:") total_train_size, pos_subtasks_train_size, neg_subtasks_train_size = log_multitask_data_statistics( train_data, model.subtasks) logging.info("Dev Data:") total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics( dev_data, model.subtasks) #logging.info("Test Data:") #total_test_size, pos_subtasks_test_size, 
neg_subtasks_test_size = log_multitask_data_statistics(test_data, model.subtasks) logging.info("\n") model_config["train_data"] = { "size": total_train_size, "pos": pos_subtasks_train_size, "neg": neg_subtasks_train_size } model_config["dev_data"] = { "size": total_dev_size, "pos": pos_subtasks_dev_size, "neg": neg_subtasks_dev_size } #model_config["test_data"] = {"size":total_test_size, "pos":pos_subtasks_test_size, "neg":neg_subtasks_test_size} # Extract subtasks data for dev and test train_subtasks_data = split_data_based_on_subtasks(train_data, model.subtasks) dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks) #test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks) # Load the instances into pytorch dataset train_dataset = COVID19TaskDataset(train_data) dev_dataset = COVID19TaskDataset(dev_data) #test_dataset = COVID19TaskDataset(test_data) logging.info("Loaded the datasets into Pytorch datasets") tokenize_collator = TokenizeCollator(tokenizer, model.subtasks, entity_start_token_id, entity_end_token_id) train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator) dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) #test_dataloader = DataLoader(test_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) logging.info("Created train and test dataloaders with batch aggregation") # Only retrain if needed if args.retrain: optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) logging.info("Created model optimizer") #if args.sentence_level_classify: # args.n_epochs += 2 epochs = args.n_epochs # Total number of training steps is [number of batches] x [number of epochs]. total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. # NOTE: num_warmup_steps = 0 is the Default value in run_glue.py scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # We'll store a number of quantities such as training and validation loss, validation accuracy, and timings. training_stats = [] print("\n\n\n ====== Training for task", args.task, "=============\n\n\n") logging.info(f"Initiating training loop for {args.n_epochs} epochs...") print(model.state_dict().keys()) total_start_time = time.time() # Find the accumulation steps accumulation_steps = args.batch_size / POSSIBLE_BATCH_SIZE # Dev validation trajectory epoch_train_loss = list() train_subtasks_validation_statistics = { subtask: list() for subtask in model.subtasks } dev_subtasks_validation_statistics = { subtask: list() for subtask in model.subtasks } best_dev_F1 = 0 for epoch in range(epochs): logging.info(f"Initiating Epoch {epoch+1}:") # Reset the total loss for each epoch. 
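# Editorial note (assumption, not original code): the loop below implements
# gradient accumulation -- each mini-batch of size POSSIBLE_BATCH_SIZE
# contributes gradients via loss.backward(), and optimizer.step()/zero_grad()
# only run every accumulation_steps batches, emulating an effective batch of
# args.batch_size. Integer division, e.g.
# accumulation_steps = max(1, args.batch_size // POSSIBLE_BATCH_SIZE),
# would avoid the float modulus in the "(step + 1) % accumulation_steps" check.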
total_train_loss = 0 train_loss_trajectory = list() # Reset timer for each epoch start_time = time.time() model.train() dev_log_frequency = 5 n_steps = len(train_dataloader) dev_steps = int(n_steps / dev_log_frequency) for step, batch in enumerate(train_dataloader): # Upload labels of each subtask to device for subtask in model.subtasks: subtask_labels = batch["gold_labels"][subtask] subtask_labels = subtask_labels.to(device) batch["gold_labels"][subtask] = subtask_labels batch["label_ignore_loss"][subtask] = batch[ "label_ignore_loss"][subtask].to(device) # Forward input_dict = { "input_ids": batch["input_ids"].to(device), "entity_start_positions": batch["entity_start_positions"].to(device), "entity_end_positions": batch["entity_end_positions"].to(device), "labels": batch["gold_labels"], "label_weight": batch["label_ignore_loss"] } input_ids = batch["input_ids"] entity_start_positions = batch["entity_start_positions"] gold_labels = batch["gold_labels"] batch_data = batch["batch_data"] loss, logits = model(**input_dict) # Accumulate loss total_train_loss += loss.item() # Backward: compute gradients loss.backward() if (step + 1) % accumulation_steps == 0: # Calculate elapsed time in minutes and print loss on the tqdm bar elapsed = format_time(time.time() - start_time) avg_train_loss = total_train_loss / (step + 1) # keep track of changing avg_train_loss train_loss_trajectory.append(avg_train_loss) if (step + 1) % (accumulation_steps * 20) == 0: print( f"Epoch:{epoch+1}|Batch:{step}/{len(train_dataloader)}|Time:{elapsed}|Avg. Loss:{avg_train_loss:.4f}|Loss:{loss.item():.4f}" ) torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() # Clean the model's previous gradients model.zero_grad() scheduler.step() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) # Perform validation with the model and log the performance print("\n") logging.info("Running Validation...") # Put the model in evaluation mode--the dropout layers behave differently during evaluation. 
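# Editorial note: the validation block below pools TP/FP/FN counts over the
# non-ignored subtasks and derives precision/recall/F1 from the pooled totals.
# Metrics computed this way are conventionally called micro-averaged; the
# variable names dev_macro_P / dev_macro_R / dev_macro_F1 are kept as in the
# original code.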
model.eval() dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) wandb_log_dict = {"Train Loss": avg_train_loss} print("Dev Set:") collect_TP_FP_FN = {"TP": 0, "FP": 0, "FN": 0} for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[subtask] dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores, task=subtask) if subtask not in IGNORE_TASKS: collect_TP_FP_FN["TP"] += dev_TP collect_TP_FP_FN["FP"] += dev_FP collect_TP_FP_FN["FN"] += dev_FN else: print("IGNORE: ", end="") print( f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}" ) dev_subtasks_validation_statistics[subtask].append( (epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN)) wandb_log_dict["Dev_ " + subtask + "_F1"] = dev_F1 wandb_log_dict["Dev_ " + subtask + "_P"] = dev_P wandb_log_dict["Dev_ " + subtask + "_R"] = dev_R dev_macro_P = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] + collect_TP_FP_FN["FP"]) dev_macro_R = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] + collect_TP_FP_FN["FN"]) dev_macro_F1 = (2 * dev_macro_P * dev_macro_R) / (dev_macro_P + dev_macro_R) print(collect_TP_FP_FN) print("dev_macro_P:", dev_macro_P, "\ndev_macro_R:", dev_macro_R, "\ndev_macro_F1:", dev_macro_F1, "\n") wandb_log_dict["Dev_macro_F1"] = dev_macro_F1 wandb_log_dict["Dev_macro_P"] = dev_macro_P wandb_log_dict["Dev_macro_R"] = dev_macro_R if args.wandb: wandb.log(wandb_log_dict) if dev_macro_F1 > best_dev_F1: best_dev_F1 = dev_macro_F1 print("NEW BEST F1:", best_dev_F1, " Saving checkpoint now.") torch.save(model.state_dict(), args.output_dir + "/ckpt.pth") #print(model.state_dict().keys()) #model.save_pretrained(args.save_directory) model.train() training_time = format_time(time.time() - start_time) # Record all statistics from this epoch. training_stats.append({ 'epoch': epoch + 1, 'Training Loss': avg_train_loss, 'Training Time': training_time }) # Save the loss trajectory epoch_train_loss.append(train_loss_trajectory) print("\n\n") logging.info( f"Training complete with total Train time:{format_time(time.time()- total_start_time)}" ) log_list(training_stats) model.load_state_dict(torch.load(args.output_dir + "/ckpt.pth")) model.eval() # Save the model and the Tokenizer here: #logging.info(f"Saving the model and tokenizer in {args.save_directory}") #model.save_pretrained(args.save_directory) # Save each subtask classifiers weights to individual state dicts #for subtask, classifier in model.classifiers.items(): # classifier_save_file = os.path.join(args.save_directory, f"{subtask}_classifier.bin") # logging.info(f"Saving the model's {subtask} classifier weights at {classifier_save_file}") # torch.save(classifier.state_dict(), classifier_save_file) #tokenizer.save_pretrained(args.save_directory) # Plot the train loss trajectory in a plot #train_loss_trajectory_plot_file = os.path.join(args.output_dir, "train_loss_trajectory.png") #logging.info(f"Saving the Train loss trajectory at {train_loss_trajectory_plot_file}") #print(epoch_train_loss) # TODO: Plot the validation performance # Save dev_subtasks_validation_statistics else: raise logging.info("No training needed. 
Directly going to evaluation!") # Save the model name in the model_config file model_config["model"] = "MultiTaskBertForCovidEntityClassification" model_config["epochs"] = args.n_epochs # Find best threshold for each subtask based on dev set performance thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] #test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task, True) dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_test_F1s = {subtask: 0.0 for subtask in model.subtasks} best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks} #test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[subtask] for t in thresholds: dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores, THRESHOLD=t, task=subtask) dev_subtasks_t_F1_P_Rs[subtask].append( (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN)) if dev_F1 > best_dev_F1s[subtask]: best_dev_thresholds[subtask] = t best_dev_F1s[subtask] = dev_F1 logging.info(f"Subtask:{subtask:>15}") log_list(dev_subtasks_t_F1_P_Rs[subtask]) logging.info( f"Best Dev Threshold for subtask: {best_dev_thresholds[subtask]}\t Best dev F1: {best_dev_F1s[subtask]}" ) # Save the best dev threshold and dev_F1 in results dict results["best_dev_threshold"] = best_dev_thresholds results["best_dev_F1s"] = best_dev_F1s results["dev_t_F1_P_Rs"] = dev_subtasks_t_F1_P_Rs # Evaluate on Test logging.info("Testing on eval dataset") predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task) # Test for subtask in model.subtasks: logging.info(f"\nTesting the trained classifier on subtask: {subtask}") results[subtask] = dict() cm = metrics.confusion_matrix(gold_labels[subtask], predicted_labels[subtask]) classification_report = metrics.classification_report( gold_labels[subtask], predicted_labels[subtask], output_dict=True) logging.info(cm) logging.info( metrics.classification_report(gold_labels[subtask], predicted_labels[subtask])) results[subtask]["CM"] = cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results[subtask]["Classification Report"] = classification_report # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation) EM_score, F1_score, total = get_raw_scores(dev_subtasks_data[subtask], prediction_scores[subtask]) logging.info("Word overlap based SQuAD evaluation style metrics:") logging.info(f"Total number of cases: {total}") logging.info(f"EM_score: {EM_score}") logging.info(f"F1_score: {F1_score}") results[subtask]["SQuAD_EM"] = EM_score results[subtask]["SQuAD_F1"] = F1_score results[subtask]["SQuAD_total"] = total pos_EM_score, pos_F1_score, pos_total = get_raw_scores( dev_subtasks_data[subtask], prediction_scores[subtask], positive_only=True) logging.info(f"Total number of Positive cases: {pos_total}") logging.info(f"Pos. EM_score: {pos_EM_score}") logging.info(f"Pos. 
F1_score: {pos_F1_score}") results[subtask]["SQuAD_Pos. EM"] = pos_EM_score results[subtask]["SQuAD_Pos. F1"] = pos_F1_score results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total # New evaluation suggested by Alan F1, P, R, TP, FP, FN = get_TP_FP_FN( dev_subtasks_data[subtask], prediction_scores[subtask], THRESHOLD=best_dev_thresholds[subtask], task=subtask) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results[subtask]["F1"] = F1 results[subtask]["P"] = P results[subtask]["R"] = R results[subtask]["TP"] = TP results[subtask]["FP"] = FP results[subtask]["FN"] = FN N = TP + FN results[subtask]["N"] = N # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)
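# Editorial sketch of the per-subtask threshold sweep performed above. The
# original scores chunk predictions with get_TP_FP_FN; this stand-in (assumed
# scikit-learn) reduces it to thresholding binary prediction scores so the
# selection logic is visible in isolation.
import numpy as np
from sklearn.metrics import precision_recall_fscore_support


def best_dev_threshold(y_true, scores,
                       thresholds=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    best_t, best_f1 = 0.5, 0.0
    for t in thresholds:
        y_pred = (np.asarray(scores) >= t).astype(int)
        _, _, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average="binary", zero_division=0)
        if f1 > best_f1:
            best_t, best_f1 = t, f1
    return best_t, best_f1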
def predict(vedio_path, st, base_dir, ver): data_dir = os.path.join(base_dir, 'data') known_faces_dir = os.path.join(data_dir, 'known_faces' + ver) full_vedios_dir = os.path.join(data_dir, 'full_vedio') pkl_files_dir = os.path.join(data_dir, "pkl_files") known_face_names = load_from_pickle( os.path.join(pkl_files_dir, "known_face_names" + ver + ".pkl")) known_face_encodings = load_from_pickle( os.path.join(pkl_files_dir, "known_face_encodings" + ver + ".pkl")) st.video(vedio_path) video_capture = cv2.VideoCapture(vedio_path) fps = int(video_capture.get(cv2.CAP_PROP_FPS)) vst = st.number_input("time(secs) to start the vedio from", min_value=0, max_value=5 * 60, value=15, step=1) ### vet = st.number_input("time(secs) to exit", min_value=0, max_value=5 * 60, value=25, step=1) num_frames_per_sec = st.number_input( "number of frames to process per second", min_value=1, max_value=fps, value=1, step=1) show = st.radio("show vedio", ["no", "yes"], index=1) ### if show == "yes": i = -1 # make empty containers to repalce outputs image_location = st.empty() image_text = st.empty() time_text = st.empty() while video_capture.isOpened(): i += 1 if int(i / fps) < vst: ret, frame = video_capture.read() continue if int(i / fps) > vet: break # Grab a single frame of video ret, frame = video_capture.read() # Convert the image from BGR color (which OpenCV uses) to RGB color (which face_recognition uses) rgb_frame = frame[:, :, ::-1] # process if i % int(fps / num_frames_per_sec) == 0: face_locations = fr_fl(rgb_frame, **fr_fl_dict) if not len(face_locations) == 1: image_location.image(rgb_frame) time_text.text(f"time(sec): {int(i/fps)}") image_text.text("no face") continue face_encoding = fr_fe(rgb_frame, known_face_locations=face_locations, **fr_fe_dict)[0] face_distances = face_recognition.face_distance( known_face_encodings, face_encoding) best_match_index = np.argmin(face_distances) name = known_face_names[best_match_index] top, right, bottom, left = face_locations[0] # Draw a box around the face cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2) # Draw a label with a name below the face cv2.rectangle(frame, (left, bottom - 35), (right, bottom), (0, 0, 255), cv2.FILLED) font = cv2.FONT_HERSHEY_DUPLEX cv2.putText(frame, name, (left + 6, bottom - 6), font, 1.0, (255, 255, 255), 1) image_location.image(frame[:, :, ::-1]) image_text.text(f"predicted name: {name}") time_text.text(f"time(sec): {int(i/fps)}") # Hit 'q' on the keyboard to quit! if cv2.waitKey(10) & 0xFF == ord('q'): break # Release handle to the webcam video_capture.release() cv2.destroyAllWindows() # clear outputs image_location.empty() time_text.empty() image_text.empty()
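# Editorial sketch of the per-frame identification step in predict() above.
# fr_fl / fr_fe appear to wrap face_recognition.face_locations /
# face_encodings; this stand-in calls the face_recognition package directly
# and, as above, skips frames that do not contain exactly one detected face.
import numpy as np
import face_recognition


def identify_face(rgb_frame, known_face_encodings, known_face_names):
    locations = face_recognition.face_locations(rgb_frame)
    if len(locations) != 1:
        return None  # no face, or more than one face, in this frame
    encoding = face_recognition.face_encodings(
        rgb_frame, known_face_locations=locations)[0]
    distances = face_recognition.face_distance(known_face_encodings, encoding)
    return known_face_names[int(np.argmin(distances))]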
def main_try(args): task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data, subtasks_list = get_multitask_instances_for_valid_tasks( task_instances_dict, tag_statistics) data = add_marker_for_loss_ignore(data, 1.0 if False else 0.0) model_name = "digitalepidemiologylab/covid-twitter-bert" print("\n\n===========\n\n", subtasks_list, "\n\n===========\n\n") tokenizer = BertTokenizer.from_pretrained(model_name) config = BertConfig.from_pretrained(model_name) config.subtasks = subtasks_list model = MultiTaskBertForCovidEntityClassification.from_pretrained( model_name, config=config) # Add new tokens in tokenizer new_special_tokens_dict = { "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"] } tokenizer.add_special_tokens(new_special_tokens_dict) # Add the new embeddings in the weights print("Embeddings type:", model.bert.embeddings.word_embeddings.weight.data.type()) print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) embedding_size = model.bert.embeddings.word_embeddings.weight.size(1) new_embeddings = torch.FloatTensor( len(new_special_tokens_dict["additional_special_tokens"]), embedding_size).uniform_(-0.1, 0.1) # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1) print("new_embeddings shape:", new_embeddings.size()) new_embedding_weight = torch.cat( (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0) model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) # Update model config vocab size model.config.vocab_size = model.config.vocab_size + len( new_special_tokens_dict["additional_special_tokens"]) model.load_state_dict(torch.load(args.save_path + "ckpt.pth")) print("loaded_model") model.to("cuda") entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0] entity_end_token_id = tokenizer.convert_tokens_to_ids(["</E>"])[0] print(f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment dev_data = data print("Dev Data:") total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics( dev_data, model.subtasks) model_config["dev_data"] = { "size": total_dev_size, "pos": pos_subtasks_dev_size, "neg": neg_subtasks_dev_size } # Extract subtasks data for dev and test dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks) # Load the instances into pytorch dataset dev_dataset = COVID19TaskDataset(dev_data) tokenize_collator = TokenizeCollator(tokenizer, model.subtasks, entity_start_token_id, entity_end_token_id) dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, collate_fn=tokenize_collator) print("Created dev dataloaders with batch aggregation") dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) # print(dev_predicted_labels['age'][0], dev_prediction_scores['age'][0], dev_gold_labels['age'][0]) assert dev_predicted_labels.keys() == dev_prediction_scores.keys() assert dev_predicted_labels.keys() == dev_gold_labels.keys() for st in dev_gold_labels.keys(): print(st, ":", len(dev_predicted_labels[st]), len(dev_prediction_scores[st]), len(dev_gold_labels[st])) dev_threshold = json.load(open(args.save_path + "/results.json", "r"))['best_dev_threshold'] print(dev_threshold) # 
[print(k, v) for k,v in get_chunk_tweet_id(dev_subtasks_data['age'], dev_prediction_scores['age'], dev_threshold['age']).items()] dev_pred_chunks = {} for subtask in subtasks_list: if subtask not in IGNORE_TASKS: dev_pred_chunks[subtask] = get_chunk_tweet_id( dev_subtasks_data[subtask], dev_prediction_scores[subtask], dev_threshold[subtask]) json_save_predicts(dev_pred_chunks, args.output_dir + "/" + args.task + ".json", {k: v for k, v in question_keys_and_tags}) collect_TP_FP_FN = {"TP": 0.0001, "FP": 0.0001, "FN": 0.0001} for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[subtask] dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores, dev_threshold[subtask], task=subtask) if subtask not in IGNORE_TASKS: collect_TP_FP_FN["TP"] += dev_TP collect_TP_FP_FN["FP"] += dev_FP collect_TP_FP_FN["FN"] += dev_FN else: print("IGNORE: ", end="") print( f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}" ) dev_macro_P = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] + collect_TP_FP_FN["FP"]) dev_macro_R = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] + collect_TP_FP_FN["FN"]) dev_macro_F1 = (2 * dev_macro_P * dev_macro_R) / (dev_macro_P + dev_macro_R) print(collect_TP_FP_FN) print("dev_macro_P:", dev_macro_P, "\ndev_macro_R:", dev_macro_R, "\ndev_macro_F1:", dev_macro_F1, "\n")
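# Editorial sketch of the pooled metrics printed at the end of main_try():
# TP/FP/FN are summed over the non-ignored subtasks and precision, recall and
# F1 are derived from the pooled counts (the 0.0001 initial values above only
# guard against division by zero; explicit checks are used here instead).
def pooled_precision_recall_f1(tp, fp, fn):
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    return precision, recall, f1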
def main(): task_instances_dict = load_from_pickle(args.data_file) data = extract_instances_for_current_subtask(task_instances_dict, args.sub_task) logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment train_data, dev_data, test_data = split_instances_in_train_dev_test(data) random.shuffle(train_data) # shuffle happens in-place logging.info("Train Data:") total_train_size, pos_train_size, neg_train_size = log_data_statistics( train_data) logging.info("Dev Data:") total_dev_size, pos_dev_size, neg_dev_size = log_data_statistics(dev_data) logging.info("Test Data:") total_test_size, pos_test_size, neg_test_size = log_data_statistics( test_data) # logging.info("\n") model_config["train_data"] = { "size": total_train_size, "pos": pos_train_size, "neg": neg_train_size } model_config["dev_data"] = { "size": total_dev_size, "pos": pos_dev_size, "neg": neg_dev_size } model_config["test_data"] = { "size": total_test_size, "pos": pos_test_size, "neg": neg_test_size } # Extract n-gram features from the train data # Returned ngrams will be dict of dict # TODO: update the feature extractor feature2i, i2feature = create_ngram_features_from(train_data) logging.info( f"Total number of features extracted from train = {len(feature2i)}, {len(i2feature)}" ) model_config["features"] = {"size": len(feature2i)} # Extract Feature vectors and labels from train and test data train_X, train_Y = convert_data_to_feature_vector_and_labels( train_data, feature2i) dev_X, dev_Y = convert_data_to_feature_vector_and_labels( dev_data, feature2i) test_X, test_Y = convert_data_to_feature_vector_and_labels( test_data, feature2i) logging.info( f"Train Data Features = {train_X.shape} and Labels = {len(train_Y)}") logging.info( f"Dev Data Features = {dev_X.shape} and Labels = {len(dev_Y)}") logging.info( f"Test Data Features = {test_X.shape} and Labels = {len(test_Y)}") model_config["train_data"]["features_shape"] = train_X.shape model_config["train_data"]["labels_shape"] = len(train_Y) model_config["dev_data"]["features_shape"] = dev_X.shape model_config["dev_data"]["labels_shape"] = len(dev_Y) model_config["test_data"]["features_shape"] = test_X.shape model_config["test_data"]["labels_shape"] = len(test_Y) # Train logistic regression classifier logging.info("Training the Logistic Regression classifier") lr = LogisticRegression(solver='lbfgs', max_iter=1000) lr.fit(train_X, train_Y) model_config["model"] = "LogisticRegression(solver='lbfgs')" # # Find best threshold based on dev set performance # thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] # # dev_prediction_probs = lr.predict_proba(dev_X)[:, 1] # dev_t_F1_P_Rs = list() # best_threshold_based_on_F1 = 0.5 # best_dev_F1 = 0.0 # for t in thresholds: # dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(dev_data, dev_prediction_probs, THRESHOLD=t) # dev_t_F1_P_Rs.append((t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN)) # if dev_F1 > best_dev_F1: # best_threshold_based_on_F1 = t # best_dev_F1 = dev_F1 # # log_list(dev_t_F1_P_Rs) # # logging.info(f"Best Threshold: {best_threshold_based_on_F1}\t Best dev F1: {best_dev_F1}") # # Save the best dev threshold and dev_F1 in results dict # results["best_dev_threshold"] = best_threshold_based_on_F1 # results["best_dev_F1"] = best_dev_F1 # results["dev_t_F1_P_Rs"] = dev_t_F1_P_Rs # # y_pred = (clf.predict_proba(X_test)[:,1] >= 0.3).astype(bool) # Test 
logging.info("Testing the trained classifier") predictions = lr.predict(test_X) probs = lr.predict_proba(test_X) test_Y_prediction_probs = probs[:, 1] cm = metrics.confusion_matrix(test_Y, predictions) classification_report = metrics.classification_report(test_Y, predictions, output_dict=True) logging.info(cm) logging.info(metrics.classification_report(test_Y, predictions)) results["CM"] = cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results["Classification Report"] = classification_report # evaluation script F1, P, R, TP, FP, FN = get_TP_FP_FN(test_data, test_Y_prediction_probs, THRESHOLD=0.5) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results["F1"] = F1 results["P"] = P results["R"] = R results["TP"] = TP results["FP"] = FP results["FN"] = FN N = TP + FN results["N"] = N # Save the model and features in pickle file model_and_features_save_file = os.path.join(args.output_dir, "model_and_features.pkl") logging.info( f"Saving LR model and features at {model_and_features_save_file}") save_in_pickle((lr, feature2i, i2feature), model_and_features_save_file) # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)