def main(args):
    with open(args.input_path, encoding='utf-8') as f:
        data = json.load(f)

    output_data = []
    print('processing input')
    for sample_id in tqdm(data):
        sample = data[sample_id]
        question = sample['question']
        answers = sample['answer']
        predictions = sample['prediction'][0:args.top_k]
        titles = sample['title'][0:args.top_k]
        evidences = sample['evidence'][0:args.top_k]
        scores = sample['score'][0:args.top_k]

        is_impossible = []
        start_pos = []
        end_pos = []
        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score

        # Check whether each prediction matches a golden answer in the answer list
        for pred_idx, pred in enumerate(predictions):
            if pred != "" and drqa_metric_max_over_ground_truths(match_fn, pred, answers):
                is_impossible.append(False)
                answer_start = find_substring_and_return_first(pred, evidences[pred_idx])
                start_pos.append(answer_start)
                end_pos.append(answer_start + len(pred) - 1)
            else:
                is_impossible.append(True)
                start_pos.append(-1)
                end_pos.append(-1)

        # Could add the score here if the reader is later made to take the score into account
        output_data.append({
            'question': question,
            'answer': answers,
            'prediction': predictions,
            'title': [title[0] for title in titles],
            'evidence': evidences,
            'is_impossible': is_impossible,
            'start_pos': start_pos,
            'end_pos': end_pos,
            'score': scores,
        })

    with open(args.output_path, 'w', encoding='utf-8') as f:
        json.dump({'data': output_data}, f)

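# `find_substring_and_return_first` is called above but not defined in this file.
# The block below is a minimal sketch of the assumed behavior (the real helper may
# normalize text or handle misses differently); `_find_substring_and_return_first_sketch`
# is a hypothetical name used only for illustration.
def _find_substring_and_return_first_sketch(pred, evidence):
    # Return the character offset of the first occurrence of `pred` in `evidence`,
    # or -1 when the prediction does not appear in the evidence passage.
    return evidence.find(pred)
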
def get_phrase_vecs(mips, questions, answers, outs, top_k=100, regex=False):
    assert mips is not None

    # Get phrases and vectors
    phrase_idxs = [[(out_['doc_idx'], out_['start_idx'], out_['end_idx'],
                     out_['answer'], out_['start_vec'], out_['end_vec'])
                    for out_ in out] for out in outs]

    # Pad/truncate to top_k * 2 (two separate top-k from start/end)
    for b_idx, phrase_idx in enumerate(phrase_idxs):
        while len(phrase_idxs[b_idx]) < top_k * 2:
            phrase_idxs[b_idx].append((-1, 0, 0, '', np.zeros(768), np.zeros(768)))
        phrase_idxs[b_idx] = phrase_idxs[b_idx][:top_k * 2]

    flat_phrase_idxs = [phrase for phrase_idx in phrase_idxs for phrase in phrase_idx]
    doc_idxs = [int(phrase_idx_[0]) for phrase_idx_ in flat_phrase_idxs]
    start_idxs = [int(phrase_idx_[1]) for phrase_idx_ in flat_phrase_idxs]
    end_idxs = [int(phrase_idx_[2]) for phrase_idx_ in flat_phrase_idxs]
    phrases = [phrase_idx_[3] for phrase_idx_ in flat_phrase_idxs]
    start_vecs = [phrase_idx_[4] for phrase_idx_ in flat_phrase_idxs]
    end_vecs = [phrase_idx_[5] for phrase_idx_ in flat_phrase_idxs]

    start_vecs = np.stack(
        # [mips.dequant(mips.offset, mips.scale, start_vec)  # Use this for IVFSQ4
        [start_vec for start_vec, start_idx in zip(start_vecs, start_idxs)])
    end_vecs = np.stack(
        # [mips.dequant(mips.offset, mips.scale, end_vec)  # Use this for IVFSQ4
        [end_vec for end_vec, end_idx in zip(end_vecs, end_idxs)])

    # Zero out vectors belonging to padded (dummy) phrases
    zero_mask = np.array([[1] if doc_idx >= 0 else [0] for doc_idx in doc_idxs])
    start_vecs = start_vecs * zero_mask
    end_vecs = end_vecs * zero_mask

    # Find targets based on exact string match (punctuation included)
    match_fn = drqa_regex_match_score if regex else drqa_exact_match_score
    targets = [[
        drqa_metric_max_over_ground_truths(match_fn, phrase[3], answer)
        for phrase in phrase_idx
    ] for phrase_idx, answer in zip(phrase_idxs, answers)]
    targets = [[ii if val else None for ii, val in enumerate(target)] for target in targets]

    # Reshape to [batch_size, top_k * 2, hidden]
    batch_size = len(answers)
    start_vecs = np.reshape(start_vecs, (batch_size, top_k * 2, -1))
    end_vecs = np.reshape(end_vecs, (batch_size, top_k * 2, -1))
    return start_vecs, end_vecs, targets

def evaluate_results(predictions, qids, questions, answers, args, evidences, scores, titles, se_positions=None):
    # Filter by candidates if a candidate set is provided
    if args.candidate_path is not None:
        candidates = set()
        with open(args.candidate_path) as f:
            for line in f:
                line = line.strip().lower()
                candidates.add(line)
        logger.info(f'{len(candidates)} candidates are loaded from {args.candidate_path}')
        topk_preds = [
            list(filter(lambda x: (x in candidates) or (x.lower() in candidates), a))
            for a in predictions
        ]
        topk_preds = [a[:args.top_k] if len(a) > 0 else [''] for a in topk_preds]
        predictions = topk_preds[:]
        top1_preds = [a[0] for a in topk_preds]
    else:
        predictions = [a[:args.top_k] if len(a) > 0 else [''] for a in predictions]
        top1_preds = [a[0] for a in predictions]
    no_ans = sum([a == '' for a in top1_preds])
    logger.info(f'no_ans/all: {no_ans}, {len(top1_preds)}')
    logger.info(f'Evaluating {len(top1_preds)} answers')

    # Get em/f1
    f1s, ems = [], []
    for prediction, groundtruth in zip(top1_preds, answers):
        if len(groundtruth) == 0:
            f1s.append(0)
            ems.append(0)
            continue
        f1s.append(max([f1_score(prediction, gt)[0] for gt in groundtruth]))
        ems.append(max([exact_match_score(prediction, gt) for gt in groundtruth]))
    final_f1, final_em = np.mean(f1s), np.mean(ems)
    if not args.regex:
        logger.info('EM: %.2f, F1: %.2f' % (final_em * 100, final_f1 * 100))

    def long_answer_em(prediction, groundtruths, se_pos):
        em = []
        for gt in groundtruths:
            start = prediction.find(gt)
            if start != -1 and se_pos[0] >= start and se_pos[1] <= start + len(gt):
                em.append(1)
            else:
                em.append(0)
        return max(em)

    # Top 1/k em (or regex em)
    exact_match_topk = 0
    exact_match_top1 = 0
    f1_score_topk = 0
    f1_score_top1 = 0
    redundant_topk = 0
    pred_out = {}
    if "question_type" in args and "4" in args.question_type:
        exact_match_long_topk = 0
        exact_match_long_top1 = 0
    for i in range(len(predictions)):
        # For debugging
        if i < 10:
            logger.info(f'{i+1}) {questions[i]}')
            logger.info(f'=> groundtruths: {answers[i]}, top 5 prediction: {predictions[i][:5]}')

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score
        em_topk = max([
            drqa_metric_max_over_ground_truths(match_fn, prediction, answers[i])
            for prediction in predictions[i][:args.top_k]
        ])
        em_top1 = drqa_metric_max_over_ground_truths(match_fn, top1_preds[i], answers[i])
        exact_match_topk += em_topk
        exact_match_top1 += em_top1

        if "question_type" in args and "4" in args.question_type:
            em_long_topk = [
                long_answer_em(prediction, answers[i], se)
                for prediction, se in zip(predictions[i][:args.top_k], se_positions[i][:args.top_k])
            ]
            exact_match_long_topk += max(em_long_topk)
            exact_match_long_top1 += em_long_topk[0]
            if i < 10:
                logger.info(f"Top 1 score {exact_match_long_top1}, top 10 score {exact_match_long_topk}")

        # Compute top-k redundancy (could be ill-defined for regex)
        rd_topk = sum([
            drqa_metric_max_over_ground_truths(match_fn, prediction, [predictions[i][0]])
            for prediction in predictions[i][:args.top_k]
        ])
        redundant_topk += rd_topk

        f1_topk = 0
        f1_top1 = 0
        if not args.regex:
            match_fn = lambda x, y: f1_score(x, y)[0]
            f1_topk = max([
                drqa_metric_max_over_ground_truths(match_fn, prediction, answers[i])
                for prediction in predictions[i][:args.top_k]
            ])
            f1_top1 = drqa_metric_max_over_ground_truths(match_fn, top1_preds[i], answers[i])
        f1_score_topk += f1_topk
        f1_score_top1 += f1_top1

        # Score statistics
        assert len(predictions[i]) <= args.top_k
        pred_out[qids[i]] = {
            'question': questions[i],
            'answer': answers[i],
            'prediction': predictions[i],
            'score': scores[i],
            'title': titles[i],
            'evidence': evidences[i] if evidences is not None else '',
            'em_top1': bool(em_top1),
            f'em_top{args.top_k}': bool(em_topk),
            'f1_top1': f1_top1,
            f'f1_top{args.top_k}': f1_topk,
            'se_pos': se_positions[i] if se_positions is not None else (-1, -1),
            'rd_topk': rd_topk,
        }

    if "question_type" in args and "4" in args.question_type:
        exact_match_top1 = exact_match_long_top1
        exact_match_topk = exact_match_long_topk

    total = len(predictions)
    exact_match_top1 = 100.0 * exact_match_top1 / total
    f1_score_top1 = 100.0 * f1_score_top1 / total
    logger.info({'exact_match_top1': exact_match_top1, 'f1_score_top1': f1_score_top1})
    exact_match_topk = 100.0 * exact_match_topk / total
    f1_score_topk = 100.0 * f1_score_topk / total
    logger.info({f'exact_match_top{args.top_k}': exact_match_topk, f'f1_score_top{args.top_k}': f1_score_topk})
    redundant_topk = redundant_topk / total
    logger.info({f'redundancy of top{args.top_k}': redundant_topk})

    # Dump predictions
    if len(args.load_dir) == 0:
        pred_dir = os.path.join(os.environ['SAVE_DIR'], 'pred')
    else:
        pred_dir = os.path.join(args.load_dir, 'pred')
    if not os.path.exists(pred_dir):
        os.makedirs(pred_dir)
    if args.save_pred:
        pred_path = os.path.join(
            pred_dir,
            os.path.splitext(os.path.basename(args.test_path))[0] + f'_{total}_top{args.top_k}.pred')
        logger.info(f'Saving prediction file to {pred_path}')
        with open(pred_path, 'w') as f:
            json.dump(pred_out, f)

    logger.info(
        f"Results for {args.test_path}: exact match top 1: {exact_match_top1} | f1 score top 1: {f1_score_top1} | "
        f"exact match top {args.top_k}: {exact_match_topk} | f1 score top {args.top_k}: {f1_score_topk}"
    )

    if args.wandb:
        wandb.log({
            "Test Path": args.test_path,
            "Model": args.load_dir,
            "Top 1 EM": exact_match_top1,
            "Top 1 F1": f1_score_top1,
            f"Top {args.top_k} EM": exact_match_topk,
            f"Top {args.top_k} F1": f1_score_topk
        })
        wandb_table = wandb.Table(columns=[
            "Model", "Question Type", "Top 1 EM", "Top 1 F1",
            f"Top {args.top_k} EM", f"Top {args.top_k} F1"
        ])
        wandb_table.add_data(args.load_dir, args.question_type, exact_match_top1,
                             f1_score_top1, exact_match_topk, f1_score_topk)
        wandb.log({args.test_path: wandb_table})

    # Evaluate passage retrieval
    if args.eval_psg:
        evaluate_results_psg(pred_path, args)

    return exact_match_top1, f1_score_top1, exact_match_topk, f1_score_topk

def annotate_phrase_vecs(mips, q_ids, questions, answers, titles, phrase_groups, args):
    assert mips is not None
    batch_size = len(answers)

    # Phrase groups are in size of [batch, top_k, values]
    # phrase_groups = [[(
    #     out_['doc_idx'], out_['start_idx'], out_['end_idx'], out_['answer'],
    #     out_['start_vec'], out_['end_vec'], out_['context'], out_['title'])
    #     for out_ in out] for out in outs
    # ]
    dummy_group = {
        'doc_idx': -1,
        'start_idx': 0,
        'end_idx': 0,
        'answer': '',
        'start_vec': np.zeros(768),
        'end_vec': np.zeros(768),
        'context': '',
        'title': ['']
    }

    # Pad phrase groups (two separate top-k coming from start/end, so pad with top_k * 2)
    for b_idx, phrase_idx in enumerate(phrase_groups):
        while len(phrase_groups[b_idx]) < args.top_k * 2:
            phrase_groups[b_idx].append(dummy_group)
        assert len(phrase_groups[b_idx]) == args.top_k * 2

    # Flatten phrase groups
    flat_phrase_groups = [phrase for phrase_group in phrase_groups for phrase in phrase_group]
    doc_idxs = [int(phrase_group['doc_idx']) for phrase_group in flat_phrase_groups]
    start_vecs = [phrase_group['start_vec'] for phrase_group in flat_phrase_groups]
    end_vecs = [phrase_group['end_vec'] for phrase_group in flat_phrase_groups]

    # Stack vectors and zero out padded (dummy) entries
    start_vecs = np.stack(start_vecs)
    end_vecs = np.stack(end_vecs)
    zero_mask = np.array([[1] if doc_idx >= 0 else [0] for doc_idx in doc_idxs])
    start_vecs = start_vecs * zero_mask
    end_vecs = end_vecs * zero_mask

    # Reshape
    start_vecs = np.reshape(start_vecs, (batch_size, args.top_k * 2, -1))
    end_vecs = np.reshape(end_vecs, (batch_size, args.top_k * 2, -1))

    # Dummy targets
    targets = [[None for phrase in phrase_group] for phrase_group in phrase_groups]
    p_targets = [[None for phrase in phrase_group] for phrase_group in phrase_groups]

    # TODO: implement dynamic label_strategy based on the task name (label_strat = dynamic)
    # Annotate for L_phrase
    if 'phrase' in args.label_strat.split(','):
        match_fns = [
            drqa_regex_match_score if args.regex or ('trec' in q_id.lower())
            else drqa_exact_match_score for q_id in q_ids
        ]
        targets = [[
            drqa_metric_max_over_ground_truths(match_fn, phrase['answer'], answer_set)
            for phrase in phrase_group
        ] for phrase_group, answer_set, match_fn in zip(phrase_groups, answers, match_fns)]
        targets = [[ii if val else None for ii, val in enumerate(target)] for target in targets]

    # Annotate for L_doc
    if 'doc' in args.label_strat.split(','):
        p_targets = [[
            any(phrase['title'][0].lower() == tit.lower() for tit in title)
            for phrase in phrase_group
        ] for phrase_group, title in zip(phrase_groups, titles)]
        p_targets = [[ii if val else None for ii, val in enumerate(target)] for target in p_targets]

    return start_vecs, end_vecs, targets, p_targets

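# Hedged illustration (not used by the functions above): the `ii if val else None`
# pattern converts per-phrase boolean match flags into positional targets, keeping
# the index of matching phrases and None elsewhere. The names below are made up
# for the example only.
_example_flags = [True, False, True]
_example_targets = [ii if val else None for ii, val in enumerate(_example_flags)]
assert _example_targets == [0, None, 2]
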
def evaluate_results(predictions, qids, questions, answers, args, evidences, scores, titles, q_tokens=None):
    wandb.init(project="DensePhrases (open)", mode="online" if args.wandb else "disabled")
    wandb.config.update(args)

    # Filter by candidates if a candidate set is provided
    if args.candidate_path is not None:
        candidates = set()
        with open(args.candidate_path) as f:
            for line in f:
                line = line.strip().lower()
                candidates.add(line)
        logger.info(f'{len(candidates)} candidates are loaded from {args.candidate_path}')
        topk_preds = [
            list(filter(lambda x: (x in candidates) or (x.lower() in candidates), a))
            for a in predictions
        ]
        topk_preds = [a[:args.top_k] if len(a) > 0 else [''] for a in topk_preds]
        predictions = topk_preds[:]
        top1_preds = [a[0] for a in topk_preds]
    else:
        predictions = [a[:args.top_k] if len(a) > 0 else [''] for a in predictions]
        top1_preds = [a[0] for a in predictions]
    no_ans = sum([a == '' for a in top1_preds])
    logger.info(f'no_ans/all: {no_ans}, {len(top1_preds)}')
    logger.info(f'Evaluating {len(top1_preds)} answers.')

    # Get em/f1
    f1s, ems = [], []
    for prediction, groundtruth in zip(top1_preds, answers):
        if len(groundtruth) == 0:
            f1s.append(0)
            ems.append(0)
            continue
        f1s.append(max([f1_score(prediction, gt)[0] for gt in groundtruth]))
        ems.append(max([exact_match_score(prediction, gt) for gt in groundtruth]))
    final_f1, final_em = np.mean(f1s), np.mean(ems)
    logger.info('EM: %.2f, F1: %.2f' % (final_em * 100, final_f1 * 100))

    # Top 1/k em (or regex em)
    exact_match_topk = 0
    exact_match_top1 = 0
    f1_score_topk = 0
    f1_score_top1 = 0
    pred_out = {}
    for i in range(len(predictions)):
        # For debugging
        if i < 3:
            logger.info(f'{i+1}) {questions[i]}')
            logger.info(f'=> groundtruths: {answers[i]}, top 5 prediction: {predictions[i][:5]}')

        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score
        em_topk = max([
            drqa_metric_max_over_ground_truths(match_fn, prediction, answers[i])
            for prediction in predictions[i][:args.top_k]
        ])
        em_top1 = drqa_metric_max_over_ground_truths(match_fn, top1_preds[i], answers[i])
        exact_match_topk += em_topk
        exact_match_top1 += em_top1

        f1_topk = 0
        f1_top1 = 0
        if not args.regex:
            match_fn = lambda x, y: f1_score(x, y)[0]
            f1_topk = max([
                drqa_metric_max_over_ground_truths(match_fn, prediction, answers[i])
                for prediction in predictions[i][:args.top_k]
            ])
            f1_top1 = drqa_metric_max_over_ground_truths(match_fn, top1_preds[i], answers[i])
        f1_score_topk += f1_topk
        f1_score_top1 += f1_top1

        pred_out[qids[i]] = {
            'question': questions[i],
            'answer': answers[i],
            'prediction': predictions[i],
            'score': scores[i],
            'title': titles[i],
            'evidence': evidences[i] if evidences is not None else '',
            'em_top1': bool(em_top1),
            f'em_top{args.top_k}': bool(em_topk),
            'f1_top1': f1_top1,
            f'f1_top{args.top_k}': f1_topk,
            'q_tokens': q_tokens[i] if q_tokens is not None else ['']
        }

    total = len(predictions)
    exact_match_top1 = 100.0 * exact_match_top1 / total
    f1_score_top1 = 100.0 * f1_score_top1 / total
    logger.info({'exact_match_top1': exact_match_top1, 'f1_score_top1': f1_score_top1})
    exact_match_topk = 100.0 * exact_match_topk / total
    f1_score_topk = 100.0 * f1_score_topk / total
    logger.info({f'exact_match_top{args.top_k}': exact_match_topk, f'f1_score_top{args.top_k}': f1_score_topk})

    wandb.log({
        "Top1 EM": exact_match_top1,
        "Top1 F1": f1_score_top1,
        "Topk EM": exact_match_topk,
        "Topk F1": f1_score_topk
    })

    # Dump predictions
    if len(args.query_encoder_path) == 0:
        pred_dir = os.path.join(os.environ['DPH_SAVE_DIR'], 'pred')
    else:
        pred_dir = os.path.join(args.query_encoder_path, 'pred')
    if not os.path.exists(pred_dir):
        os.makedirs(pred_dir)

    pred_path = os.path.join(
        pred_dir,
        os.path.splitext(os.path.basename(args.test_path))[0] + f'_{total}.pred')
    logger.info(f'Saving prediction file to {pred_path}')
    with open(pred_path, 'w') as f:
        json.dump(pred_out, f)

    return exact_match_top1

def main(args):
    with open(args.input_path, encoding='utf-8') as f:
        data = json.load(f)

    output_data = []
    nlp = English()
    sentencizer = nlp.add_pipe('sentencizer')
    for i, sample_id in enumerate(tqdm(data, desc="Processing Inputs")):
        sample = data[sample_id]
        question = sample['question']
        answers = sample['answer']
        predictions = sample['prediction'][0:args.top_k]
        titles = sample['title'][0:args.top_k]
        evidences = sample['evidence'][0:args.top_k]
        scores = sample['score'][0:args.top_k]
        se_pos = sample['se_pos'][0:args.top_k]

        f1s = [
            max([f1_score(prediction, gt)[0] for gt in answers])
            for prediction in predictions
        ]
        ems = [
            max([exact_match_score(prediction, gt) for gt in answers])
            for prediction in predictions
        ]

        # Not a useful example if there is no exact match
        if max(ems) < 1:
            continue

        is_impossible = []
        start_pos = []
        end_pos = []
        match_fn = drqa_regex_match_score if args.regex else drqa_exact_match_score

        # Only check whether each prediction matches a golden answer in the answer list
        for pred_idx, pred in enumerate(predictions):
            if pred != "" and drqa_metric_max_over_ground_truths(match_fn, pred, answers):
                is_impossible.append(False)
                answer_start = find_substring_and_return_first(pred, evidences[pred_idx])
                start_pos.append(answer_start)
                end_pos.append(answer_start + len(pred) - 1)
            else:
                is_impossible.append(True)
                start_pos.append(-1)
                end_pos.append(-1)

        # Map each predicted span back to the sentence of the evidence that contains it
        docs = [nlp(evi) for evi in evidences]
        spans = [
            doc.char_span(
                se_pos[j][0],
                se_pos[j][1] if se_pos[j][1] >= se_pos[j][0] else se_pos[j][0],
                alignment_mode='expand')
            for j, doc in enumerate(docs)
        ]
        sents = [
            doc[span[0].sent.start:span[-1].sent.end].text if len(span) > 0 else ""
            for doc, span in zip(docs, spans)
        ]

        # Could add the score here if the reader is later made to take the score into account
        output_data.append({
            'question': question,
            'answer': answers,
            'prediction': predictions,
            'title': [title[0] for title in titles],
            'evidence': evidences,
            'is_impossible': is_impossible,
            'start_pos': start_pos,
            'end_pos': end_pos,
            'score': scores,
            'f1s': f1s,
            'exact_matches': ems,
            'se_pos': se_pos,
            'sentence': sents,
        })

    with open(args.output_path, 'w', encoding='utf-8') as f:
        json.dump({'data': output_data}, f)

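# Hedged example (illustration only, not part of the pipeline above): how the
# sentencizer + char_span logic recovers the sentence containing a predicted span.
# `_demo_sentence_for_span` is a hypothetical helper added for clarity; it assumes
# `start_char`/`end_char` fall inside `text`.
def _demo_sentence_for_span(text, start_char, end_char):
    nlp = English()
    nlp.add_pipe('sentencizer')
    doc = nlp(text)
    # alignment_mode='expand' snaps character offsets outward to token boundaries
    span = doc.char_span(start_char, max(end_char, start_char), alignment_mode='expand')
    if span is None or len(span) == 0:
        return ""
    return doc[span[0].sent.start:span[-1].sent.end].text

# e.g. _demo_sentence_for_span("Paris is in France. Berlin is in Germany.", 20, 26)
# is expected to return "Berlin is in Germany."
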
# Note: `top1` and `topk` below are nested helpers that close over the enclosing
# evaluation loop (`i`, `top1_preds`, `predictions`, `answers`, `args`).
def top1(fn):
    return drqa_metric_max_over_ground_truths(fn, top1_preds[i], answers[i])

def topk(fn):
    return max([
        drqa_metric_max_over_ground_truths(fn, p, answers[i])
        for p in predictions[i][:args.top_k]
    ])