def main(args):
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Prepare BiDAF model (must already be trained)
    log.info('Building BiDAF model (should be pretrained)')
    bidaf_model = BiDAF(word_vectors=word_vectors,   # todo: these word vectors shouldn't matter,
                        hidden_size=args.hidden_size)  # since they will be loaded in during load_model?
                        # drop_prob=args.drop_prob)  # no drop probability since we are not training
    # bidaf_model = nn.DataParallel(bidaf_model, args.gpu_ids)
    # log.info(f'Loading checkpoint from {args.load_path}...')
    # bidaf_model = util.load_model(bidaf_model, args.load_path, args.gpu_ids, return_step=False)  # don't need step since we aren't training
    # bidaf_model = bidaf_model.to(device)
    bidaf_model.eval()  # evaluation only (vs. training)

    # Set up the Paraphraser model
    # ema = util.EMA(bidaf_model, args.ema_decay)

    # Get saver
    # saver = util.CheckpointSaver(args.save_dir,
    #                              max_checkpoints=args.max_checkpoints,
    #                              metric_name=args.metric_name,
    #                              maximize_metric=args.maximize_metric,
    #                              log=log)

    # Get optimizer and scheduler
    # optimizer = optim.Adadelta(model.parameters(), args.lr,
    #                            weight_decay=args.l2_wd)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    a = np.array([[5, 0, 3, 0], [4, 0, 1, 0]])
    c_idx = torch.from_numpy(a).long()
    pp(c_idx[c_idx.nonzero()])
def predict(args, cw_idxs, qn_idxs):
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    y_pred = model(cw_idxs, qn_idxs)
    return y_pred
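# Usage sketch (not part of the original script): turning the output of predict() into
# answer spans. This assumes predict() returns the (log_p1, log_p2) pair that BiDAF
# produces, and reuses util.discretize exactly as the evaluation loops below do. The
# helper name predict_spans and its arguments are placeholders introduced here.
def predict_spans(args, cw_idxs, qn_idxs):
    log_p1, log_p2 = predict(args, cw_idxs, qn_idxs)
    p1, p2 = log_p1.exp(), log_p2.exp()
    # Pick the most probable (start, end) pair with end >= start and length <= max_ans_len
    starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)
    return starts, ends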
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) models = {} if args.use_ensemble: total_models = 0 for model_name in ['bidaf', 'bidafextra', 'fusionnet']: models_list = [] for model_file in glob.glob( f'{args.load_path}/{model_name}-*/{args.ensemble_models}'): # Get model log.info('Building model...') if model_name == 'bidaf': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif model_name == 'bidafextra': model = BiDAFExtra(word_vectors=word_vectors, args=args) elif model_name == 'fusionnet': model = FusionNet(word_vectors=word_vectors, args=args) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {model_file}...') model = util.load_model(model, model_file, gpu_ids, return_step=False) # Load each model on CPU (have plenty of RAM ...) model = model.cpu() model.eval() models_list.append(model) models[model_name] = models_list total_models += len(models_list) log.info(f'Using an ensemble of {total_models} models') else: device, gpu_ids = util.get_available_devices() # Get model log.info('Building model...') if args.model == 'bidaf': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif args.model == 'bidafextra': model = BiDAFExtra(word_vectors=word_vectors, args=args) elif args.model == 'fusionnet': model = FusionNet(word_vectors=word_vectors, args=args) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() models[args.model] = [model] # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) p1s = [] p2s = [] for model_name in models: for model in models[model_name]: # Move model to GPU to evaluate model = model.to(device) # Forward if model_name == 'bidaf': log_p1, log_p2 = model.to(device)(cw_idxs, qw_idxs) else: log_p1, log_p2 = model.to(device)(cw_idxs, qw_idxs, cw_pos, cw_ner, cw_freq, cqw_extra) log_p1, log_p2 = log_p1.cpu(), log_p2.cpu() if not args.use_ensemble: y1, y2 = y1.to(device), y2.to(device) log_p1, log_p2 = log_p1.to(device), log_p2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Move model back to CPU to release GPU memory model = model.cpu() # Get F1 and EM scores p1, p2 = log_p1.exp().unsqueeze( -1).cpu(), log_p2.exp().unsqueeze(-1).cpu() p1s.append(p1), 
p2s.append(p2) best_ps = torch.max( torch.cat([ torch.cat(p1s, -1).unsqueeze(-1), torch.cat(p2s, -1).unsqueeze(-1) ], -1), -2)[0] p1, p2 = best_ps[:, :, 0], best_ps[:, :, 1] starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
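# A minimal sketch (an assumption, not code from the original repo) that isolates the
# ensemble combination step used above: for every context position, take the
# element-wise maximum of each model's start and end probabilities across the ensemble.
# torch.stack over an explicit "model" dimension makes the intent of the nested
# torch.cat / torch.max expression easier to see.
import torch

def combine_by_max(p1_list, p2_list):
    """p1_list, p2_list: lists of [batch, ctx_len] probability tensors, one per model."""
    p1 = torch.stack(p1_list, dim=-1).max(dim=-1)[0]  # [batch, ctx_len]
    p2 = torch.stack(p2_list, dim=-1).max(dim=-1)[0]
    return p1, p2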
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) ch_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, ch_vectors=ch_vectors, hidden_size=args.hidden_size) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
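# Rough sketch of what a span-discretization step typically does; util.discretize comes
# from the starter code and may differ in detail, so treat this as an assumption rather
# than its implementation. The idea: form the joint probability p1[i] * p2[j], keep only
# legal spans (j >= i, length <= max_len), and take the argmax.
import torch

def discretize_sketch(p1, p2, max_len=15):
    """p1, p2: [batch, ctx_len] start/end probabilities."""
    joint = p1.unsqueeze(2) * p2.unsqueeze(1)  # [batch, ctx_len, ctx_len]
    # Mask out spans that end before they start or are longer than max_len
    legal = torch.triu(torch.ones_like(joint)) - torch.triu(torch.ones_like(joint), diagonal=max_len)
    joint = joint * legal
    ctx_len = joint.size(1)
    flat_idx = joint.view(joint.size(0), -1).argmax(dim=-1)
    starts, ends = flat_idx // ctx_len, flat_idx % ctx_len
    return starts, ends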
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') if args.model == 'bidaf': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif args.model == 'bidafextra': model = BiDAFExtra(word_vectors=word_vectors, args=args) elif args.model == 'fusionnet': model = FusionNet(word_vectors=word_vectors, args=args) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # print("*"*80) # print(len(dataset.question_idxs)) # for question_idx in dataset.question_idxs: # print(question_idx) # print("*" * 80) # print(self.question_idxs[question_idx]) # self.question_idxs[idx] # print("data_loader: ",data_loader) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) # create statistics # print("*"*80) # print(len(gold_dict)) # print(gold_dict['1']['question']) count_questions_type = defaultdict(lambda: 0) audit_trail_from_question_type = defaultdict(lambda: []) list_of_interrogative_pronouns = [ "what", "whose", "why", "which", "where", "when", "how", "who", "whom" ] for index in range(1, len(gold_dict)): # transform the question in lower case to simplify the analysis, thus losing the benefit of the capital letters # possibly indicating the position of the interrogative pronoun in the sentence. 
question_lower_case = gold_dict[str(index)]['question'].lower() list_question_lower_case_with_punctuation = question_lower_case.translate( {ord(i): " " for i in "'"}).split() # question_lower_case = [] for item in list_question_lower_case_with_punctuation: question_lower_case.append( item.translate({ord(i): "" for i in ",.<>!@£$%^&*()_-+=?"})) # defining a variable for the first word first_word_question_lower_case = question_lower_case[0] # defining variable for the second word second_word_question_lower_case = question_lower_case[1] # defining variable for the first and second word combined_first_and_second_words = first_word_question_lower_case + " " + second_word_question_lower_case #printing on the screen test for debugging purpose # Analyzing the sentence if first_word_question_lower_case in list_of_interrogative_pronouns: count_questions_type[first_word_question_lower_case] += 1 audit_trail_from_question_type[ first_word_question_lower_case].append(str(index)) # composed question starting by in elif first_word_question_lower_case == "in": if second_word_question_lower_case in list_of_interrogative_pronouns and second_word_question_lower_case != "whose": count_questions_type[combined_first_and_second_words] += 1 audit_trail_from_question_type[ combined_first_and_second_words].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) # composed question starting by by elif first_word_question_lower_case == "by": if second_word_question_lower_case in list_of_interrogative_pronouns \ and second_word_question_lower_case !="whom"\ and second_word_question_lower_case !="which"\ and second_word_question_lower_case !="when"\ and second_word_question_lower_case !="how": count_questions_type[combined_first_and_second_words] += 1 audit_trail_from_question_type[ combined_first_and_second_words].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) #if pronoun =="": # print(">>", question_lower_case) # print("@@@", gold_dict[str(index)]['question']) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) # if pronoun =="": # print(">>", question_lower_case.split()) # print() #if first_word_question_lower_case == "if": # print(">>", question_lower_case.split()) # print(count_questions_type) # if gold_dict[str(index)]['question'].lower().split()[0] == "in": # print(gold_dict[str(index)]['question']) reverse_dict_by_value = OrderedDict( sorted(count_questions_type.items(), key=lambda x: x[1])) # print(count_questions_type) total_questions = sum(count_questions_type.values()) # print(reverse_dict) #for k, v in reverse_dict_by_value.items(): # print( "%s: %s and in percentage: %s" % (k, v, 100*v/total_questions)) #print(audit_trail_from_question_type) # exit() with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward if args.model == 'bidaf': log_p1, log_p2 = model(cw_idxs, qw_idxs) else: log_p1, log_p2 = model(cw_idxs, 
qw_idxs, cw_pos, cw_ner, cw_freq, cqw_extra) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) # Printing information for questions without interrogative pronouns """" print("len(gold_dict): ", len(gold_dict)) print("len(pred_dict): ", len(pred_dict)) print("Is gold_dict.keys() identical to pred_dict.keys(): ", gold_dict.keys()==pred_dict.keys()) if gold_dict.keys()!=pred_dict.keys(): for key in gold_dict.keys(): if key not in pred_dict.keys(): print("key ", key, " missing in pred_dict.keys(") """ results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Computing the F1 score for each type of question # # audit_trail_from_question_type[pronoun].append(str(index)) # create a list of the types of questions by extracting the keys from the dict audit_trail_from_question_type types_of_questions = list(audit_trail_from_question_type.keys()) gold_dict_per_type_of_questions = defaultdict(lambda: []) pred_dict_per_type_of_questions = {} gold_dict_per_type_of_questions_start = {} pred_dict_per_type_of_questions_start = {} gold_dict_per_type_of_questions_middle = {} pred_dict_per_type_of_questions_middle = {} gold_dict_per_type_of_questions_end = {} pred_dict_per_type_of_questions_end = {} for type_of_questions in types_of_questions: #gold_pred = {key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions]} #lst_pred = {key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions]} # Create two dictionnaries for each type of sentence for gold_dict_per_type_of_questions and pred_dict_per_type_of_questions gold_dict_per_type_of_questions[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } # print(type_of_questions," F1 score: ", util.eval_dicts(gold_dict_per_type_of_questions[type_of_questions], pred_dict_per_type_of_questions[type_of_questions], args.use_squad_v2)['F1']) gold_dict_per_type_of_questions_start[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_start[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } gold_dict_per_type_of_questions_middle[type_of_questions] = { key: value for key, value in gold_dict.items() if key in 
audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_middle[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } gold_dict_per_type_of_questions_end[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_end[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } for key, value in gold_dict.items(): #if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys(): if key in audit_trail_from_question_type[ type_of_questions] and type_of_questions != "" and key in pred_dict_per_type_of_questions[ type_of_questions]: """ print("type_of_questions: ",type_of_questions) print("key: ", key) print("question: ", value["question"]) sub_index = value["question"].lower().find(type_of_questions) print("sub_index: ",sub_index) test_fc = value["question"].lower().find(type_of_questions) print("present type of the var: ",type(test_fc)) #print("question: ", value["question"][str(key)]) print("length of the question: ", len(value["question"])) print('Position of the interrogative pronoun in the question:', ) """ # Create two dictionnaries for each type of sentence based at the start of the sentence if value["question"].lower().find( type_of_questions) == 1 or value["question"].lower( ).find(type_of_questions) == 0: #print("BEGINNING") if type_of_questions != "": try: del gold_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del gold_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass #pred_dict_per_type_of_questions_start[type_of_questions] = {key: pred_dict[key] for key in # gold_dict_per_type_of_questions_start[ # type_of_questions].keys()} elif value["question"].lower( ).find(type_of_questions) >= len( value["question"]) - len(type_of_questions) - 5: #print("END") if type_of_questions != "": try: del gold_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del gold_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass #print("type_of_questions: ",type_of_questions) #sub_index = value["question"].lower().find(type_of_questions) #print("sub_index: ", sub_index) #print("len(value['question']) - len(type_of_questions) - 2: ", len(value["question"])-len(type_of_questions)-2) #start_string = len(value["question"])-len(type_of_questions)-6 #end_string = len(value["question"])-1 #print("extract at the end: ", value["question"][start_string:end_string]) else: #print("MIDDLE") if type_of_questions != "": try: del gold_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del gold_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass try: del 
pred_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass pass """ if type_of_questions != "": gold_dict_per_type_of_questions_start[type_of_questions] = {key: value for key, value in gold_dict.items() if (key in audit_trail_from_question_type[type_of_questions] \ and (value["question"].lower().find(type_of_questions) <= 1) \ and key in pred_dict_per_type_of_questions[type_of_questions]) } """ """ for key in gold_dict_per_type_of_questions_start[type_of_questions].keys(): print("key:: ", key ) print("type(key):: ", type(key) ) print("pred_dict[,key,] : ", pred_dict[key]) print("@@@@@@@@@@@@@@@@@@@@@@@@") pred_dict_per_type_of_questions_start[type_of_questions] = {key: pred_dict[key] for key in gold_dict_per_type_of_questions_start[type_of_questions].keys()} #pred_dict_per_type_of_questions_start[type_of_questions] = {key: value for key, value in pred_dict.items() if key in list(gold_dict_per_type_of_questions_start[type_of_questions].keys()) } # Create two dictionnaries for each type of sentence based at the end of the sentence gold_dict_per_type_of_questions_end[type_of_questions] = {key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] \ and value["question"].lower().find(type_of_questions) >= len(value["question"])-len(type_of_questions)-2 \ and key in pred_dict_per_type_of_questions[type_of_questions]} pred_dict_per_type_of_questions_end[type_of_questions] = {key: pred_dict[key] for key in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} #print("*"*80) # Create two dictionnaries for each type of sentence based at the middle of the sentencecount_questions_type gold_dict_per_type_of_questions_middle[type_of_questions] = {key: value for key, value in gold_dict.items() if key not in list(gold_dict_per_type_of_questions_start[type_of_questions].keys()) \ and key not in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} pred_dict_per_type_of_questions_middle[type_of_questions] = {key: pred_dict[key] for key in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} else: gold_dict_per_type_of_questions_start[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_start[""] = pred_dict_per_type_of_questions[""] gold_dict_per_type_of_questions_end[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_end[""] = pred_dict_per_type_of_questions[""] gold_dict_per_type_of_questions_middle[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_middle[""] = pred_dict_per_type_of_questions[""] """ positions_in_question = ["beginning", "middle", "end"] # print(type_of_questions," F1 score: ", util.eval_dicts(gold_dict_per_type_of_questions[type_of_questions], pred_dict_per_type_of_questions[type_of_questions], args.use_squad_v2)['F1']) list_beginning = [ util.eval_dicts( gold_dict_per_type_of_questions_start[type_of_questions], pred_dict_per_type_of_questions_start[type_of_questions], args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] list_middle = [ util.eval_dicts( gold_dict_per_type_of_questions_middle[type_of_questions], pred_dict_per_type_of_questions_middle[type_of_questions], args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] list_end = [ util.eval_dicts( gold_dict_per_type_of_questions_end[type_of_questions], pred_dict_per_type_of_questions_end[type_of_questions], args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] #for type_of_questions in 
types_of_questions: # print("gold_dict_per_type_of_questions_start[type_of_questions]: ",gold_dict_per_type_of_questions_start[type_of_questions]) # print("pred_dict_per_type_of_questions[type_of_questions]: ",pred_dict_per_type_of_questions[type_of_questions]) F1 = np.array([list_beginning, list_middle, list_end]) m, n = F1.shape value_to_ignore = [] for i in range(m): for j in range(n): if F1[i, j] == "NA" or F1[i, j] == 0: value_to_ignore.append((i, j)) print("value to ignore: ", value_to_ignore) #F1 = np.array([[0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0], # [0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0], # [0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0]]) data_label = copy.deepcopy(F1) for row in data_label: for column_idx in range(len(row)): if row[column_idx] == "NA": row[column_idx] = "" # print question without interrogative pronoun required for the second part of the analysis: for key, value in gold_dict.items(): if key in audit_trail_from_question_type[ ""] and key in pred_dict.keys(): print("question: ", gold_dict_per_type_of_questions['']) print("golden answers: ", ) print("prediction: ", pred_dict[key]) print() fig, ax = plt.subplots() types_of_questions[types_of_questions.index( "")] = "Implicit question without interrogative pronoun" im, cbar = heatmap(F1, positions_in_question, types_of_questions, ax=ax, \ cmap="YlGn", cbarlabel="F1 scores") texts = annotate_heatmap(im, data=data_label, valfmt="{x:.1f}", ignore=value_to_ignore) fig.tight_layout() plt.show() # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
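# The question-type analysis above calls find_first_interrogative_pronoun, which is
# defined elsewhere in the project. A plausible minimal version (an assumption, shown
# only to make the analysis self-contained) scans the tokenized, lower-cased question
# and returns the first interrogative pronoun it contains, or "" for implicit questions.
def find_first_interrogative_pronoun(list_of_interrogative_pronouns, question_tokens):
    for token in question_tokens:
        if token in list_of_interrogative_pronouns:
            return token
    return ""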
def test(args): # Set up logging log = util.get_logger(args.save_dir, args.name) log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}") device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info("Loading embeddings...") word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info("Building model...") model = BiDAF( word_vectors=word_vectors, hidden_size=args.hidden_size, use_glove=args.use_glove, ) model = nn.DataParallel(model, gpu_ids) log.info(f"Loading checkpoint from {args.load_path}...") model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info("Building dataset...") record_file = vars(args)[f"{args.split}_record_file"] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader( dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn, ) # Evaluate log.info(f"Evaluating on {args.split} split...") nll_meter = stats.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f"{args.split}_eval_file"] with open(eval_file, "r") as fh: gold_dict = json_load(fh) with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != "test": # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens( gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2, ) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != "test": results = {"NLL": nll_meter.avg} results.update(eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)) # Log to console results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items()) log.info(f"{args.split.title()} {results_str}") # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize( tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals, ) # Write submission file if args.split == "dev": sub_path = join(args.save_dir, "val" + "_" + args.sub_file) else: sub_path = join(args.save_dir, args.split + "_" + args.sub_file) log.info(f"Writing submission file to {sub_path}...") eval.write_submission(sub_path, sub_dict)
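# eval.write_submission is referenced above but its body is not shown here. The other
# evaluation scripts in this section write the submission CSV inline, so a sketch that
# is consistent with that pattern (an assumption about eval.py, not its actual
# contents) would be:
import csv

def write_submission(sub_path, sub_dict):
    with open(sub_path, "w", newline="", encoding="utf-8") as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=",")
        csv_writer.writerow(["Id", "Predicted"])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])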
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') nbr_model = 0 if (args.load_path_baseline): model_baseline = Baseline(word_vectors=word_vectors, hidden_size=100) model_baseline = nn.DataParallel(model_baseline, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_baseline}...') model_baseline = util.load_model(model_baseline, args.load_path_baseline, gpu_ids, return_step=False) model_baseline = model_baseline.to(device) model_baseline.eval() nll_meter_baseline = util.AverageMeter() nbr_model += 1 save_prob_baseline_start = [] save_prob_baseline_end = [] if (args.load_path_bidaf): model_bidaf = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size) model_bidaf = nn.DataParallel(model_bidaf, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_bidaf}...') model_bidaf = util.load_model(model_bidaf, args.load_path_bidaf, gpu_ids, return_step=False) model_bidaf = model_bidaf.to(device) model_bidaf.eval() nll_meter_bidaf = util.AverageMeter() nbr_model += 1 save_prob_bidaf_start = [] save_prob_bidaf_end = [] if (args.load_path_bidaf_fusion): model_bidaf_fu = BiDAF_fus(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size) model_bidaf_fu = nn.DataParallel(model_bidaf_fu, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_bidaf_fusion}...') model_bidaf_fu = util.load_model(model_bidaf_fu, args.load_path_bidaf_fusion, gpu_ids, return_step=False) model_bidaf_fu = model_bidaf_fu.to(device) model_bidaf_fu.eval() nll_meter_bidaf_fu = util.AverageMeter() nbr_model += 1 save_prob_bidaf_fu_start = [] save_prob_bidaf_fu_end = [] if (args.load_path_qanet): model_qanet = QANet(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet = nn.DataParallel(model_qanet, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet}...') model_qanet = util.load_model(model_qanet, args.load_path_qanet, gpu_ids, return_step=False) model_qanet = model_qanet.to(device) model_qanet.eval() nll_meter_qanet = util.AverageMeter() nbr_model += 1 save_prob_qanet_start = [] save_prob_qanet_end = [] if (args.load_path_qanet_old): model_qanet_old = QANet_old(word_vectors=word_vectors, char_vectors=char_vectors, device=device, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks) model_qanet_old = nn.DataParallel(model_qanet_old, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_old}...') model_qanet_old = util.load_model(model_qanet_old, args.load_path_qanet_old, gpu_ids, return_step=False) model_qanet_old = model_qanet_old.to(device) model_qanet_old.eval() nll_meter_qanet_old = 
util.AverageMeter() nbr_model += 1 save_prob_qanet_old_start = [] save_prob_qanet_old_end = [] if (args.load_path_qanet_inde): model_qanet_inde = QANet_independant_encoder( word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet_inde = nn.DataParallel(model_qanet_inde, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_inde}...') model_qanet_inde = util.load_model(model_qanet_inde, args.load_path_qanet_inde, gpu_ids, return_step=False) model_qanet_inde = model_qanet_inde.to(device) model_qanet_inde.eval() nll_meter_qanet_inde = util.AverageMeter() nbr_model += 1 save_prob_qanet_inde_start = [] save_prob_qanet_inde_end = [] if (args.load_path_qanet_s_e): model_qanet_s_e = QANet_S_E(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet_s_e = nn.DataParallel(model_qanet_s_e, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_s_e}...') model_qanet_s_e = util.load_model(model_qanet_s_e, args.load_path_qanet_s_e, gpu_ids, return_step=False) model_qanet_s_e = model_qanet_s_e.to(device) model_qanet_s_e.eval() nll_meter_qanet_s_e = util.AverageMeter() nbr_model += 1 save_prob_qanet_s_e_start = [] save_prob_qanet_s_e_end = [] # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) y1, y2 = y1.to(device), y2.to(device) l_p1, l_p2 = [], [] # Forward if (args.load_path_baseline): log_p1_baseline, log_p2_baseline = model_baseline( cw_idxs, cc_idxs) loss_baseline = F.nll_loss(log_p1_baseline, y1) + F.nll_loss( log_p2_baseline, y2) nll_meter_baseline.update(loss_baseline.item(), batch_size) l_p1 += [log_p1_baseline.exp()] l_p2 += [log_p2_baseline.exp()] if (args.save_probabilities): save_prob_baseline_start += [ log_p1_baseline.exp().detach().cpu().numpy() ] save_prob_baseline_end += [ log_p2_baseline.exp().detach().cpu().numpy() ] if (args.load_path_qanet): log_p1_qanet, log_p2_qanet = model_qanet( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet = F.nll_loss(log_p1_qanet, y1) + F.nll_loss( log_p2_qanet, y2) nll_meter_qanet.update(loss_qanet.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet.exp()] l_p2 += [log_p2_qanet.exp()] if (args.save_probabilities): save_prob_qanet_start += [ log_p1_qanet.exp().detach().cpu().numpy() ] save_prob_qanet_end += [ 
log_p2_qanet.exp().detach().cpu().numpy() ] if (args.load_path_qanet_old): log_p1_qanet_old, log_p2_qanet_old = model_qanet_old( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_old = F.nll_loss(log_p1_qanet_old, y1) + F.nll_loss( log_p2_qanet_old, y2) nll_meter_qanet_old.update(loss_qanet_old.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet_old.exp()] l_p2 += [log_p2_qanet_old.exp()] if (args.save_probabilities): save_prob_qanet_old_start += [ log_p1_qanet_old.exp().detach().cpu().numpy() ] save_prob_qanet_old_end += [ log_p2_qanet_old.exp().detach().cpu().numpy() ] if (args.load_path_qanet_inde): log_p1_qanet_inde, log_p2_qanet_inde = model_qanet_inde( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_inde = F.nll_loss( log_p1_qanet_inde, y1) + F.nll_loss(log_p2_qanet_inde, y2) nll_meter_qanet_inde.update(loss_qanet_inde.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet_inde.exp()] l_p2 += [log_p2_qanet_inde.exp()] if (args.save_probabilities): save_prob_qanet_inde_start += [ log_p1_qanet_inde.exp().detach().cpu().numpy() ] save_prob_qanet_inde_end += [ log_p2_qanet_inde.exp().detach().cpu().numpy() ] if (args.load_path_qanet_s_e): log_p1_qanet_s_e, log_p2_qanet_s_e = model_qanet_s_e( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_s_e = F.nll_loss(log_p1_qanet_s_e, y1) + F.nll_loss( log_p2_qanet_s_e, y2) nll_meter_qanet_s_e.update(loss_qanet_s_e.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet_s_e.exp()] l_p2 += [log_p2_qanet_s_e.exp()] if (args.save_probabilities): save_prob_qanet_s_e_start += [ log_p1_qanet_s_e.exp().detach().cpu().numpy() ] save_prob_qanet_s_e_end += [ log_p2_qanet_s_e.exp().detach().cpu().numpy() ] if (args.load_path_bidaf): log_p1_bidaf, log_p2_bidaf = model_bidaf( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_bidaf = F.nll_loss(log_p1_bidaf, y1) + F.nll_loss( log_p2_bidaf, y2) nll_meter_bidaf.update(loss_bidaf.item(), batch_size) l_p1 += [log_p1_bidaf.exp()] l_p2 += [log_p2_bidaf.exp()] if (args.save_probabilities): save_prob_bidaf_start += [ log_p1_bidaf.exp().detach().cpu().numpy() ] save_prob_bidaf_end += [ log_p2_bidaf.exp().detach().cpu().numpy() ] if (args.load_path_bidaf_fusion): log_p1_bidaf_fu, log_p2_bidaf_fu = model_bidaf_fu( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_bidaf_fu = F.nll_loss(log_p1_bidaf_fu, y1) + F.nll_loss( log_p2_bidaf_fu, y2) nll_meter_bidaf_fu.update(loss_bidaf_fu.item(), batch_size) l_p1 += [log_p1_bidaf_fu.exp()] l_p2 += [log_p2_bidaf_fu.exp()] if (args.save_probabilities): save_prob_bidaf_fu_start += [ log_p1_bidaf_fu.exp().detach().cpu().numpy() ] save_prob_bidaf_fu_end += [ log_p2_bidaf_fu.exp().detach().cpu().numpy() ] p1, p2 = l_p1[0], l_p2[0] for i in range(1, nbr_model): p1 += l_p1[i] p2 += l_p2[i] p1 /= nbr_model p2 /= nbr_model starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid if (args.load_path_qanet): progress_bar.set_postfix(NLL=nll_meter_qanet.avg) elif (args.load_path_bidaf): progress_bar.set_postfix(NLL=nll_meter_bidaf.avg) elif (args.load_path_bidaf_fusion): progress_bar.set_postfix(NLL=nll_meter_bidaf_fu.avg) elif (args.load_path_qanet_old): progress_bar.set_postfix(NLL=nll_meter_qanet_old.avg) elif (args.load_path_qanet_inde): progress_bar.set_postfix(NLL=nll_meter_qanet_inde.avg) elif (args.load_path_qanet_s_e): progress_bar.set_postfix(NLL=nll_meter_qanet_s_e.avg) else: progress_bar.set_postfix(NLL=nll_meter_baseline.avg) idx2pred, 
uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) if (args.save_probabilities): if (args.load_path_baseline): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_baseline_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_baseline_end, fp) if (args.load_path_bidaf): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_end, fp) if (args.load_path_bidaf_fusion): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_fu_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_fu_end, fp) if (args.load_path_qanet): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_end, fp) if (args.load_path_qanet_old): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_old_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_old_end, fp) if (args.load_path_qanet_inde): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_inde_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_inde_end, fp) if (args.load_path_qanet_s_e): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_s_e_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_s_e_end, fp) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) if (args.load_path_qanet): meter_avg = nll_meter_qanet.avg elif (args.load_path_bidaf): meter_avg = nll_meter_bidaf.avg elif (args.load_path_bidaf_fusion): meter_avg = nll_meter_bidaf_fu.avg elif (args.load_path_qanet_inde): meter_avg = nll_meter_qanet_inde.avg elif (args.load_path_qanet_s_e): meter_avg = nll_meter_qanet_s_e.avg elif (args.load_path_qanet_old): meter_avg = nll_meter_qanet_old.avg else: meter_avg = nll_meter_baseline.avg results_list = [('NLL', meter_avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
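# The ensemble script above can dump per-batch start/end probabilities with pickle when
# args.save_probabilities is set. A minimal offline consumer (an assumption about how
# those dumps might later be combined; the file names match the pickle.dump calls
# above) reloads several runs and averages them batch by batch before discretizing:
import pickle
import numpy as np

def load_and_average(prob_dirs, which="start"):
    """prob_dirs: list of save_dir paths, each containing probs_start / probs_end."""
    runs = []
    for d in prob_dirs:
        with open(f"{d}/probs_{which}", "rb") as fp:
            runs.append(pickle.load(fp))  # list of [batch, ctx_len] arrays
    # Average the models' probabilities for each batch position
    return [np.mean(batch_group, axis=0) for batch_group in zip(*runs)]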
def test_model(questions, context, use_squad_v2=True, model_path="../save/training-02/best.pth.tar"): # Set up logging #args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) #log = util.get_logger(args.save_dir, args.name) #log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') #args = get_test_args() device, gpu_ids = util.get_available_devices() batch_size = 64 * max(1, len(gpu_ids)) # Get embeddings #print('Loading embeddings...') word_vectors = util.torch_from_json('../data/word_emb.json') # Get model #print('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=100) model = nn.DataParallel(model, gpu_ids) #model_path = "../save/training-02/best.pth.tar" #print(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, model_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader #print('Building dataset...') #record_file = vars(args)[f'{args.split}_record_file'] # my code start here # this is a simple approch when dealing with the user date # according to your approch of creating the interface you can change this code # and also you have to check the function "process_file" in the setup.py file processed_questions = [] for index, question in enumerate(questions): processed_question = { "question": question, "id": index, "answers": [{ "answer_start": 0, "text": "never mind" }] } processed_questions.append(processed_question) source = {"paragraphs": [{"qas": processed_questions, "context": context}]} word_counter, char_counter = Counter(), Counter() with open("../data/word2idx.json", "r") as f1: word2idx_dict = json.load(f1) with open("../data/char2idx.json", "r") as f2: char2idx_dict = json.load(f2) my_test_examples, my_test_eval = process_file(source, "my_test", word_counter, char_counter) npz = build_features(my_test_examples, "my_test", word2idx_dict, char2idx_dict, is_test=True) #my code end here dataset = SQuAD(npz, use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4, collate_fn=collate_fn) # Evaluate #print(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission #eval_file = vars(args)[f'{args.split}_eval_file'] gold_dict = my_test_eval #print("gold_dict", gold_dict) #print("data_loader", data_loader) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, 15, use_squad_v2) print("starts ", starts, " ends ", ends) # Log info progress_bar.update(batch_size) #if args.split != 'test': # No labels for the test set, so NLL would be invalid #progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) #print("my evaluation ....") #for el in pred_dict: #print(el, pred_dict[el]) #for el in sub_dict: #print(el, sub_dict[el]) return pred_dict
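# Example call for test_model above. The context and questions are made-up illustrative
# inputs; the checkpoint path falls back to the default in the function signature.
if __name__ == "__main__":
    context = ("The Normans were the people who in the 10th and 11th centuries "
               "gave their name to Normandy, a region in France.")
    questions = ["Where is Normandy located?", "When did the Normans arrive?"]
    predictions = test_model(questions, context)
    for qid, answer in predictions.items():
        print(qid, "->", answer)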
def main(args): # Set up logging and devices (unchanged from train.py) args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) # train only, not in test device, args.gpu_ids = util.get_available_devices() # todo(small): should this be args (compare test_para) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # args.py: default size is 64 # Set random seed (unchanged) - train only log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Prepare BiDAF model (must already trained) log.info('Building BiDAF model (should be pretrained)') bidaf_model = BiDAF(word_vectors=word_vectors, # todo: these word vectors shouldn't matter? hidden_size=args.hidden_size) # since they will be loaded in during load_model? #drop_prob=args.drop_prob) # no drop probability since we are not training bidaf_model = nn.DataParallel(bidaf_model, args.gpu_ids) if args.short_test: args.hidden_size = 5 elif not args.load_path: log.info("Trying to trian paraphraser withou bidaf model. " "First train BiDAF and then specify the load path. Exiting") exit(1) else: log.info(f'Loading checkpoint from {args.load_path}...') bidaf_model = util.load_model(bidaf_model, args.load_path, args.gpu_ids, return_step=False) # don't need step since we aren't training bidaf_model = bidaf_model.to(device) bidaf_model.eval() # we eval only (vs train) # todo: Setup the Paraphraser model paraphaser_model = Paraphraser(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) # Get data loader log.info('Building dataset...') # New for paraphrase: squad_paraphrase has extra fields train_dataset = SQuAD_paraphrase(args.train_record_file, args.use_squad_v2) # train.npz (from setup.py, build_features()) train_loader = data.DataLoader(train_dataset, # this dataloader used for all epoch iteration batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn_para) dev_dataset = SQuAD_paraphrase(args.dev_record_file, args.use_squad_v2) # dev.npz (same as above) dev_loader = data.DataLoader(dev_dataset, # dev.npz used in evaluate() fcn batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn_para) # todo: this is just for looking at the paraphrases idx2word_dict = load(args.idx2word_file) #Get saver # saver = util.CheckpointSaver(args.save_dir, # max_checkpoints=args.max_checkpoints, # metric_name=args.metric_name, # maximize_metric=args.maximize_metric, # log=log) #Get optimizer and scheduler # ema = util.EMA(paraphaser_model, args.ema_decay) # optimizer = optim.Adadelta(paraphaser_model.parameters(), args.lr, # weight_decay=args.l2_wd) # scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Train step = 0 log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, cphr_idxs, qphr_idxs, qphr_types, ids in train_loader: # Setup for forward # note that cc_idxs, qc_idxs are not used! 
(character indices) cw_idxs = cw_idxs.to(device) # todo what does this actually do qw_idxs = qw_idxs.to(device) cphr_idxs = cphr_idxs.to(device) qphr_idxs = qphr_idxs.to(device) qphr_types = qphr_types.to(device) batch_size = cw_idxs.size(0) # if args.short_test: # print(f'batch size: {batch_size}') # for i, type in enumerate(cphr_idxs[0]): # print(f'type: {i}') # pp(type) # for x in (qphr_idxs[0], qphr_types[0]): # pp(x) # return paraphrased = paraphaser_model(qphr_idxs, qphr_types, cphr_idxs) for idx, p in enumerate(paraphrased): # enumerate over batch_size non_zeros = p[p.nonzero()].squeeze() #paraphrased[idx] = non_zeros sentence_as_list = [idx2word_dict[str(w.item())] for w in non_zeros] pp(" ".join(sentence_as_list)) #pp([idx2word_dict[w] for w in non_zeros]) if args.short_test: return optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) # // is floor division ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, # call eval with dev_loader args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
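# The paraphraser training loop above references optimizer, scheduler, ema and saver,
# but their setup is still commented out in main(), so the loop would fail as written.
# A sketch of the missing plumbing, restored from those commented-out lines (the helper
# name build_training_plumbing is introduced here; hyperparameters all come from args,
# as in the commented code):
def build_training_plumbing(args, paraphaser_model, log):
    ema = util.EMA(paraphaser_model, args.ema_decay)
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)
    optimizer = optim.Adadelta(paraphaser_model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    return ema, saver, optimizer, scheduler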
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) #log = util.get_logger(args.save_dir, args.name) #log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings print('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model print('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) model = nn.DataParallel(model, gpu_ids) print(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader print('Building dataset...') #record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD("./data/my_test.npz", args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate print(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission #eval_file = vars(args)[f'{args.split}_eval_file'] with open("./data/my_test_eval.json", 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: print("viewing the dataset") print(cw_idxs, cc_idxs, qw_idxs, qc_idxs) # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) #if args.split != 'test': # No labels for the test set, so NLL would be invalid #progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) print("my evaluation ....") for el in pred_dict: print(el, pred_dict[el]) for el in sub_dict: print(el, sub_dict[el])
def main(args): args.save_dir = util.get_save_dir(args.save_dir, "exp1_training", training=False) log = get_logger(args.logging_dir, "exp1_training") log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, c.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') dataset = SQuAD(args.test_record_file, True) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.datasplit} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission with open(args.test_eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, c.max_ans_len, True) # Log info progress_bar.update(batch_size) # Not using the unlabeled test set # if args.split != 'test': # # No labels for the test set, so NLL would be invalid # progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), True) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) results = util.eval_dicts(gold_dict, pred_dict, True) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.datasplit} {results_str}') # Log to TensorBoard tbx = SummaryWriter(c.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.test_eval_file, step=0, split=args.datasplit, num_visuals=args.num_visuals)
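# --- Illustrative sketch only (assumed behaviour of util.convert_tokens) ---
# The script above maps predicted token indices back to answer strings with util.convert_tokens.
# Its implementation is not shown here; the sketch below assumes the eval file stores, per example
# id, the raw 'context', a list of per-token character 'spans', and a 'uuid'.
def convert_tokens_sketch(gold_dict, ids, starts, ends, use_squad_v2=True):
    """Map (start, end) token indices to answer text for each example id."""
    idx2pred, uuid2pred = {}, {}
    for qid, y_start, y_end in zip(ids, starts, ends):
        example = gold_dict[str(qid)]
        context, spans, uuid = example['context'], example['spans'], example['uuid']
        if use_squad_v2 and (y_start == 0 or y_end == 0):
            # Assumption: index 0 is reserved for "no answer" in the SQuAD 2.0 setup.
            answer = ''
        else:
            char_start = spans[y_start][0]
            char_end = spans[y_end][1]
            answer = context[char_start:char_end]
        idx2pred[str(qid)] = answer
        uuid2pred[uuid] = answer
    return idx2pred, uuid2pred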
def main(args): # Load TF-IDF from pickle scorer = TFIDF([]) scorer.get_from_pickle() # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get data loader log.info('Building dataset...') record_file = vars(args)['{}_record_file'.format(args.split)] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, char_vocab_size= 1376, hidden_size=args.hidden_size) model = nn.DataParallel(model, gpu_ids) log.info('Loading checkpoint from {}...'.format(args.load_path)) model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Evaluate log.info('Evaluating on {} split...'.format(args.split)) nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)['{}_eval_file'.format(args.split)] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs,qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) if (args.use_tfidf): # Apply TF-IDF filtering to pred_dict tf_idf_threshold = 2 tf_idf_common_threshold = 1 for key, value in pred_dict.items(): if value != "": tf_idf_score = scorer.normalized_additive_idf_ignore_common_words( value, threshold_frequency=tf_idf_common_threshold) if tf_idf_score < tf_idf_threshold: pred_dict[key] = '' pass # print ("pred_dict: {}, pruned".format(tf_idf_score)) else: pass # print ("pred_dict: {}, kept".format(tf_idf_score)) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('{} {}'.format(args.split.title(), results_str)) # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = 
join(args.save_dir, args.split + '_' + args.sub_file) log.info('Writing submission file to {}...'.format(sub_path)) with open(sub_path, 'w') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
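# --- Illustrative sketch only (hypothetical scorer, not the project's TFIDF class) ---
# The TF-IDF filtering above blanks out predictions whose length-normalized additive IDF score,
# ignoring very common words, falls below a threshold. The real
# `normalized_additive_idf_ignore_common_words` and the semantics of its `threshold_frequency`
# argument are not visible here; a minimal scorer built on the same idea might look like this:
import math
from collections import Counter


class AdditiveIdfScorer:
    """Hypothetical stand-in for the pickled TF-IDF scorer (semantics assumed)."""

    def __init__(self, documents, common_df_ratio=0.1):
        # documents: list of token lists used to estimate document frequencies.
        self.num_docs = max(1, len(documents))
        self.doc_freq = Counter(w for doc in documents for w in set(doc))
        self.common_df_ratio = common_df_ratio  # words above this document-frequency ratio are "common"

    def score(self, text):
        words = [w for w in text.lower().split()
                 if self.doc_freq[w] / self.num_docs <= self.common_df_ratio]
        if not words:
            return 0.0  # a prediction made only of common words carries little information
        idf_sum = sum(math.log((1 + self.num_docs) / (1 + self.doc_freq[w])) for w in words)
        return idf_sum / len(words)  # length-normalized ("additive") IDF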
def main(args, actions=None): """" actions is a tuple (action, number of actions to be taken) action can be either: "substitute", "delete" or "add". number of actions to be taken: the number of words to apply the "substitute", "delete" or "add" action. """ # check that actions parameters received #print("actions: ",actions) # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') if args.model == 'bidaf': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif args.model == 'bidafextra': model = BiDAFExtra(word_vectors=word_vectors, args=args) elif args.model == 'fusionnet': model = FusionNet(word_vectors=word_vectors, args=args) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # print("*"*80) #print(len(dataset.question_idxs)) #for question_idx in dataset.question_idxs: # print(question_idx) # print("*" * 80) #print(self.question_idxs[question_idx]) #self.question_idxs[idx] # print("data_loader: ",data_loader) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) count_questions_type = defaultdict(lambda: 0) audit_trail_from_question_type = defaultdict(lambda: []) list_of_interrogative_pronouns = [ "what", "whose", "why", "which", "where", "when", "how", "who", "whom" ] for index in range(1, len(gold_dict)): # transform the question in lower case to simplify the analysis, thus losing the benefit of the capital letters # possibly indicating the position of the interrogative pronoun in the sentence. 
question_lower_case = gold_dict[str(index)]['question'].lower() list_question_lower_case_with_punctuation = question_lower_case.translate( {ord(i): " " for i in "'"}).split() question_lower_case = [] for item in list_question_lower_case_with_punctuation: question_lower_case.append( item.translate({ord(i): "" for i in ",.<>!@£$%^&*()_-+=?"})) # defining a variable for the first word first_word_question_lower_case = question_lower_case[0] # defining variable for the second word second_word_question_lower_case = question_lower_case[1] # defining variable for the first and second word combined_first_and_second_words = first_word_question_lower_case + " " + second_word_question_lower_case # Analyzing the sentence if first_word_question_lower_case in list_of_interrogative_pronouns: count_questions_type[first_word_question_lower_case] += 1 audit_trail_from_question_type[ first_word_question_lower_case].append(str(index)) # composed question starting by in elif first_word_question_lower_case == "in": if second_word_question_lower_case in list_of_interrogative_pronouns and second_word_question_lower_case != "whose": count_questions_type[combined_first_and_second_words] += 1 audit_trail_from_question_type[ combined_first_and_second_words].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) # composed question starting by by elif first_word_question_lower_case == "by": if second_word_question_lower_case in list_of_interrogative_pronouns \ and second_word_question_lower_case != "whom" \ and second_word_question_lower_case != "which" \ and second_word_question_lower_case != "when" \ and second_word_question_lower_case != "how": count_questions_type[combined_first_and_second_words] += 1 audit_trail_from_question_type[ combined_first_and_second_words].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) reverse_dict_by_value = OrderedDict( sorted(count_questions_type.items(), key=lambda x: x[1])) total_questions = sum(count_questions_type.values()) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader: # Setup for forward # ********************************************************** # # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> # # Where we make the modification: if actions[0] == "substitute": # swap two random tokens within each context of the batch (the swap stays within the same sequence) batch_size = cw_idxs.size()[0] number_of_actions = actions[1] for _ in range(number_of_actions): length_index_batch = cw_idxs.size()[1] for i in range(batch_size): tensor_with_zero_value = (( cw_idxs[i] == 0).nonzero()).squeeze() try: first_zero_value = torch.min( tensor_with_zero_value) except: first_zero_value = length_index_batch if first_zero_value > 2: select_item_idx_1 = random.randint( 0, first_zero_value - 1) select_item_idx_2 = random.randint( 0, first_zero_value - 1) save_value_1 = copy.deepcopy( cw_idxs[i, select_item_idx_1]) cw_idxs[i, select_item_idx_1] = 
cw_idxs[ i, select_item_idx_2] cw_idxs[i, select_item_idx_2] = save_value_1 elif actions[0] == "delete": # substitute to random token in each question of the batch (substitution is made within the same sentence: batch_size = cw_idxs.size()[0] number_of_actions = actions[1] for _ in range(number_of_actions): length_index_batch = cw_idxs.size()[1] for i in range(batch_size): tensor_with_zero_value = (( cw_idxs[i] == 0).nonzero()).squeeze() try: first_zero_value = torch.min( tensor_with_zero_value) except: first_zero_value = length_index_batch if first_zero_value > 2: print("debug:", i) print("> size before amendment of cw_idxs", cw_idxs.size()) print(cw_idxs[i]) select_item_idx_1 = random.randint( 0, first_zero_value - 1) #print("section 1 :", cw_idxs[i, 0:select_item_idx_1) #print("remove:", cw_idxs[i,0:select_item_idx_1) #print("section 2 :", cw_idxs[i, 0:select_item_idx_1) cw_idxs[i, :] = torch.cat( (cw_idxs[i, 0:select_item_idx_1], cw_idxs[i, select_item_idx_1 + 1:], torch.tensor([0])), -1) print("> size before amendment of cw_idxs", cw_idxs.size()) print(cw_idxs[i]) elif actions[0] == "add": batch_size = cw_idxs.size()[0] number_of_actions = actions[1] for _ in range(number_of_actions): length_index_batch = cw_idxs.size()[1] for i in range(batch_size): tensor_with_zero_value = (( cw_idxs[i] == 0).nonzero()).squeeze() try: first_zero_value = torch.min( tensor_with_zero_value) except: first_zero_value = length_index_batch if first_zero_value > 2: select_item_idx_1 = random.randint( 0, first_zero_value - 1) cw_idxs[i, select_item_idx_1] = random.randint( 1, 50000) elif actions[0] == "add2": # substitute to random token in each question of the batch (substitution is made within the same sentence: batch_size = cw_idxs.size()[0] number_of_actions = actions[1] for _ in range(number_of_actions): length_index_batch = cw_idxs.size()[1] for i in range(batch_size): tensor_with_zero_value = (( cw_idxs[i] == 0).nonzero()).squeeze() try: first_zero_value = torch.min( tensor_with_zero_value) except: first_zero_value = length_index_batch if first_zero_value > 2: select_item_idx_1 = random.randint( 0, first_zero_value - 1) select_item_idx_2 = random.randint( 0, first_zero_value - 1) cw_idxs[i, select_item_idx_1] = random.randint( 1, 50000) cw_idxs[i, select_item_idx_2] = random.randint( 1, 50000) else: print("Incorrect command: exiting") exit() cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward if args.model == 'bidaf': log_p1, log_p2 = model(cw_idxs, qw_idxs) else: log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner, cw_freq, cqw_extra) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) # Printing information for questions without interrogative pronouns """" print("len(gold_dict): ", len(gold_dict)) print("len(pred_dict): ", len(pred_dict)) 
print("Is gold_dict.keys() identical to pred_dict.keys(): ", gold_dict.keys()==pred_dict.keys()) if gold_dict.keys()!=pred_dict.keys(): for key in gold_dict.keys(): if key not in pred_dict.keys(): print("key ", key, " missing in pred_dict.keys(") """ results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Computing the F1 score for each type of question print("for ", actions, ": ", results['F1']) # create a list of the types of questions by extracting the keys from the dict audit_trail_from_question_type # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]]) return results['F1']
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # NEW : load the tag embeddings pos_vectors = util.torch_from_json(args.pos_emb_file) ner_vectors = util.torch_from_json(args.ner_emb_file) # Choose model log.info('Building model {}...'.format(args.name)) if 'baseline' in args.name: model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif args.name == 'BiDAF_char': model = BiDAF_char(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size) # NEW elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_unfrozen') or ( args.name == 'BiDAF_tag_loss') or (args.name == 'BiDAF_tag_unfrozen_loss'): model = BiDAF_tag(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size) elif (args.name == 'BiDAF_tag_ext') or (args.name == 'BiDAF_tag_ext_unfrozen'): model = BiDAF_tag_ext(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size) elif args.name == 'coattn': model = CoattentionModel(hidden_dim=args.hidden_size, embedding_matrix=word_vectors, train_word_embeddings=False, dropout=0.35, pooling_size=16, number_of_iters=4, number_of_layers=2, device=device) else: raise NameError('No model named ' + args.name) model = nn.DataParallel(model, gpu_ids) log.info('Loading checkpoint from {}...'.format(args.load_path)) model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)['{}_record_file'.format(args.split)] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info('Evaluating on {} split...'.format(args.split)) nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)['{}_eval_file'.format(args.split)] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, cpos_idxs, cner_idxs, cw_ems, cw_tfs, qw_idxs, qc_idxs, qpos_idxs, qner_idxs, qw_ems, qw_tfs, y1, y2, ids in data_loader: # NEW # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward if 'baseline' in args.name: log_p1, log_p2 = model(cw_idxs, qw_idxs) elif args.name == 'BiDAF_char': # Additional setup for forward cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) elif args.name == 'coattn': max_c_len = cw_idxs.size(1) max_q_len = qw_idxs.size(1) c_len = [] q_len = [] for i in range(cw_idxs.size(0)): if len((cw_idxs[i] == 0).nonzero()) != 0: c_len_i = (cw_idxs[i] == 0).nonzero()[0].item() else: c_len_i = cw_idxs.size(1) if len((qw_idxs[i] == 0).nonzero()) != 0: q_len_i = (qw_idxs[i] == 0).nonzero()[0].item() else: q_len_i = qw_idxs.size(1) 
c_len.append(int(c_len_i)) q_len.append(int(q_len_i)) c_len = torch.Tensor(c_len).int() q_len = torch.Tensor(q_len).int() num_examples = int(cw_idxs.size(0) / len(gpu_ids)) log_p1, log_p2 = model(max_c_len, max_q_len, cw_idxs, qw_idxs, c_len, q_len, num_examples, True, False) # NEW elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_unfrozen') or ( args.name == 'BiDAF_tag_loss') or (args.name == 'BiDAF_tag_unfrozen_loss'): # Additional setup for forward cc_idxs = cc_idxs.to(device) cpos_idxs = cpos_idxs.to(device) cner_idxs = cner_idxs.to(device) qc_idxs = qc_idxs.to(device) qpos_idxs = qpos_idxs.to(device) qner_idxs = qner_idxs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs) elif (args.name == 'BiDAF_tag_ext') or (args.name == 'BiDAF_tag_ext_unfrozen'): # Additional setup for forward cc_idxs = cc_idxs.to(device) cpos_idxs = cpos_idxs.to(device) cner_idxs = cner_idxs.to(device) cw_ems = cw_ems.to(device) cw_tfs = cw_tfs.to(device) qc_idxs = qc_idxs.to(device) qpos_idxs = qpos_idxs.to(device) qner_idxs = qner_idxs.to(device) qw_ems = qw_ems.to(device) qw_tfs = qw_tfs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs, cw_ems, qw_ems, cw_tfs, qw_tfs) else: raise NameError('No model named ' + args.name) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('{} {}'.format(args.split.title(), results_str)) # More detailed error analysis results = util.analysis_dicts(gold_dict, pred_dict, args.use_squad_v2) results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('{} {}'.format(args.split.title(), results_str)) # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info('Writing submission file to {}...'.format(sub_path)) with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
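# --- Illustrative sketch only (standard SQuAD metrics; util.eval_dicts is assumed to follow them) ---
# The results logged above come from util.eval_dicts, which is assumed to compute the usual SQuAD
# scores: EM is exact string match after normalization, and F1 is token-level overlap between the
# prediction and the gold answer.
import re
import string
from collections import Counter


def normalize_answer(s):
    """Lowercase, strip punctuation and articles, and collapse whitespace."""
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())


def exact_match(prediction, gold):
    return float(normalize_answer(prediction) == normalize_answer(gold))


def f1_score(prediction, gold):
    pred_tokens = normalize_answer(prediction).split()
    gold_tokens = normalize_answer(gold).split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)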