def show_web_paragraphs():
    """Interactively print the two top-ranked paragraphs for random (question, document) pairs.

    Every (question, document) pair of the TriviaQA web training set is
    visited in random order.  Each document is split with
    ``MergeParagraphs(400)`` and scored with ``TopTfIdf.dists``; a pair is only
    shown when at least two paragraphs were ranked and the second-ranked one
    contains an answer span.  For each shown paragraph that has answer spans,
    the answers are coloured with ``bcolors.CYAN`` and tokens matching a
    non-stop-word of the question with ``bcolors.ERROR``.  The function blocks
    on ``input()`` after every displayed pair.
    """
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    stop_words = stop.words

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    for q, d in points:
        # Accent-stripped, lower-cased question words with stop words removed.
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        doc = corpus.evidence.get_document(d.doc_id)
        doc = splitter.split_annotated(doc, d.answer_spans)
        # Used below as a sequence of (paragraph, distance) pairs.
        ranked = ranker.dists(q.question, doc)
        if len(ranked) < 2 or len(ranked[1][0].answer_spans) == 0:
            continue

        print(" ".join(q.question))
        print(q.answer.all_answers)
        for rank, (para, dist) in enumerate(ranked[0:2]):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d, Dist=%.4f" % (para.start, rank, dist))
            if len(para.answer_spans) == 0:
                continue

            # Decide which tokens are question words BEFORE any colour codes
            # are added: once a token is wrapped in an ANSI sequence it no
            # longer equals the plain question word, so answer-boundary tokens
            # would otherwise never be highlighted.
            is_question_word = [
                strip_accents_unicode(w.lower()) in q_words for w in text
            ]
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for ix, matched in enumerate(is_question_word):
                if matched:
                    text[ix] = bcolors.ERROR + text[ix] + bcolors.ENDC
            print(" ".join(text))
        input()
def main():
    """Page through training paragraphs that contain more than three answer spans.

    For every training question and each of its documents with more than three
    answer spans, the document is split with ``MergeParagraphs(400)`` and
    pruned with ``TopTfIdf(stop, 4)``.  Each surviving paragraph that itself
    holds more than three answer spans is printed with the answers wrapped in
    ``{{{ ... }}}``; the function then waits on ``input()``.
    """
    dataset = TriviaQaWebDataset()
    stop_words = NltkPlusStopWords()
    splitter = MergeParagraphs(400)
    selector = TopTfIdf(stop_words, 4)

    print("Loading data..")
    train_questions = dataset.get_train()
    print("Start")

    for question in train_questions:
        for doc in question.all_docs:
            if len(doc.answer_spans) <= 3:
                continue
            document = dataset.evidence.get_document(doc.doc_id)
            paragraphs = splitter.split_annotated(document, doc.answer_spans)
            paragraphs = selector.prune(question.question, paragraphs)
            for para in paragraphs:
                if len(para.answer_spans) <= 3:
                    continue
                print(question.question)
                tokens = flatten_iterable(para.text)
                # Span ends are used as inclusive token indices here.
                for start, end in para.answer_spans:
                    tokens[start] = "{{{" + tokens[start]
                    tokens[end] = tokens[end] + "}}}"
                print(" ".join(tokens))
                input()
def check_preprocess():
    """Sanity-check ``WithIndicators`` encoding on 1000 shuffled dev questions.

    For each sampled question one document and one of its merged paragraphs
    are drawn, and three properties are verified:

    1. Dropping the special tokens from the encoded text gives back the
       original paragraph tokens.
    2. The encoded answer spans, with special tokens dropped, cover the same
       multiset of token tuples as the paragraph's original answer spans.
    3. The spans produced by ``WithIndicators(True)`` equal the spans of
       ``WithIndicators(False)`` that contain no special token.

    All sampling uses one ``RandomState(0)`` so a failure can be reproduced.

    Raises:
        ValueError: if any of the three properties does not hold.
    """
    data = TriviaQaWebDataset()
    merge = MergeParagraphs(400)
    questions = data.get_dev()
    pre = WithIndicators(False)
    remove_cross = WithIndicators(True)
    # Looked up once instead of once per token inside the loops below.
    special_tokens = pre.special_tokens()

    rng = np.random.RandomState(0)
    rng.shuffle(questions)

    for q in tqdm(questions[:1000]):
        doc = rng.choice(q.all_docs, 1)[0]
        text = data.evidence.get_document(doc.doc_id, n_tokens=800)
        paras = merge.split_annotated(text, doc.answer_spans)
        # Use the seeded generator (not the global numpy state) so the sampled
        # paragraph is the same on every run.
        para = paras[rng.randint(0, len(paras))]

        built = pre.encode_extracted_paragraph(q.question, para)

        expected_text = flatten_iterable(para.text)
        if expected_text != [x for x in built.text if x not in special_tokens]:
            raise ValueError(
                "Encoded text differs from the paragraph text (question %s)"
                % q.question_id)

        expected = Counter(
            tuple(expected_text[s:e + 1]) for s, e in para.answer_spans)
        actual = [tuple(built.text[s:e + 1]) for s, e in built.answer_spans]
        actual_cleaned = Counter(
            tuple(z for z in x if z not in special_tokens) for x in actual)
        if actual_cleaned != expected:
            raise ValueError(
                "Encoded answer spans differ from the paragraph answers "
                "(question %s)" % q.question_id)

        r_built = remove_cross.encode_extracted_paragraph(q.question, para)
        rc = Counter(
            tuple(r_built.text[s:e + 1]) for s, e in r_built.answer_spans)
        # Spans of the first encoding that do not include any special token.
        removed = Counter(
            w for w in actual if all(x not in special_tokens for x in w))
        if rc != removed:
            raise ValueError(
                "WithIndicators(True) kept an unexpected set of answer spans "
                "(question %s)" % q.question_id)
def show_stats():
    """Print how often each TF-IDF rank holds an answer on 1000 random training pairs.

    Output, in order: the per-rank ratio ``answers / counts`` (paragraphs with
    an answer at that rank over documents that had a paragraph at that rank),
    then for every observed set of at most two answer-bearing ranks its share
    of the sampled pairs, and finally the share of pairs with more than two
    answer-bearing ranks.
    """
    n_ranks = 6
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, n_ranks)

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    counts = np.zeros(n_ranks)
    answers = np.zeros(n_ranks)
    n_answers = []

    points = points[:1000]
    for q, d in tqdm(points):
        paragraphs = splitter.split_annotated(
            corpus.evidence.get_document(d.doc_id), d.answer_spans)
        ranked = ranker.prune(q.question, paragraphs)
        counts[:len(ranked)] += 1
        # Ranks (0-based) whose paragraph contains at least one answer span.
        answer_ranks = tuple(
            rank for rank, para in enumerate(ranked)
            if len(para.answer_spans) > 0)
        for rank in answer_ranks:
            answers[rank] += 1
        n_answers.append(answer_ranks)

    print(answers / counts)

    n_points = len(points)
    small_sets = Counter(tup for tup in n_answers if len(tup) <= 2)
    other = sum(1 for tup in n_answers if len(tup) > 2)
    for ranks in sorted(small_sets):
        print(ranks, small_sets[ranks] / n_points)
    print(other / n_points)
def contains_question_word():
    """Report how often answer-bearing paragraphs share a word with the question.

    A fixed random sample of 1000 (question, document) pairs is drawn from the
    TriviaQA web dev set.  Each document is split with ``MergeParagraphs(400)``
    and every paragraph that contains an answer span is checked for any
    lower-cased token that is also a non-stop-word of the question.  The 200
    most frequently shared words are printed, followed by the fraction of
    answer-bearing paragraphs that shared at least one word.
    """
    data = TriviaQaWebDataset()
    stop = NltkPlusStopWords(punctuation=True).words
    splits = MergeParagraphs(400)

    questions = data.get_dev()
    pairs = flatten_iterable(
        [(q, doc) for doc in q.all_docs] for q in questions)
    # Sort first so the seeded shuffle below always yields the same sample.
    pairs.sort(key=lambda x: (x[0].question_id, x[1].doc_id))
    # The sample is taken from `pairs`; shuffling `questions` at this point (as
    # the code used to) had no effect because `pairs` was already built.
    np.random.RandomState(0).shuffle(pairs)

    has_token = 0
    total = 0
    used = Counter()

    for q, doc in tqdm(pairs[:1000]):
        text = data.evidence.get_document(doc.doc_id, splits.reads_first_n)
        q_tokens = set(x.lower() for x in q.question)
        q_tokens -= stop
        for para in splits.split_annotated(text, doc.answer_spans):
            if len(para.answer_spans) == 0:
                continue
            # Compare lower-cased tokens both for the hit test and for the
            # per-word counts, so capitalised matches are counted as well.
            shared = [
                tok for tok in (x.lower() for x in flatten_iterable(para.text))
                if tok in q_tokens
            ]
            if shared:
                has_token += 1
                used.update(shared)
            total += 1

    for k, v in used.most_common(200):
        print("%s: %d" % (k, v))
    if total == 0:
        print("No paragraph containing an answer was found")
    else:
        print(has_token / total)
def main():
    """Command-line entry point: train a model on TriviaQA web in the chosen mode.

    The positional ``mode`` selects the paragraph extractor, the train/test
    dataset builders, the evaluators, the number of epochs and the dev/train
    evaluation sample sizes.  The model is written to ``<name>-<timestamp>``
    and the text of this script, prefixed by the mode, is stored as notes.
    """
    parser = argparse.ArgumentParser(description='Train a model on TriviaQA web')
    parser.add_argument('mode', choices=["paragraph-level", "confidence", "merge",
                                         "shared-norm", "sigmoid", "shared-norm-600"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument('-n', '--n_processes', type=int, default=2,
                        help="Number of processes (i.e., select which paragraphs to train on) "
                             "the data with")
    args = parser.parse_args()
    mode = args.mode
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())
    stop = NltkPlusStopWords(True)
    single_paragraph = mode == "paragraph-level"

    # Paragraph extraction: one paragraph per document for "paragraph-level",
    # otherwise the top four, using 600-token paragraphs only for
    # "shared-norm-600".
    if single_paragraph:
        extract = ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 1),
                                         model.preprocessor, intern=True)
    else:
        max_tokens = 600 if mode == "shared-norm-600" else 400
        extract = ExtractMultiParagraphs(MergeParagraphs(max_tokens), TopTfIdf(stop, 4),
                                         model.preprocessor, intern=True)

    if single_paragraph:
        n_epochs = 16
        train = ParagraphAndQuestionsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True))
        test = ParagraphAndQuestionsBuilder(
            ClusteredBatcher(60, ContextLenKey(), False))
        n_dev, n_train = 21000, 12000
        evaluators = [LossEvaluator(), SpanEvaluator([4, 8], "triviaqa")]
    else:
        evaluators = [LossEvaluator(),
                      MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge")]
        # we sample two paragraphs per a (question, doc) pair, so evaluate on fewer questions
        n_dev, n_train = 15000, 8000
        if mode in ("confidence", "sigmoid"):
            # "sigmoid" trains very slowly, do this at your own risk
            n_epochs = 71 if mode == "sigmoid" else 28
            test = RandomParagraphSetDatasetBuilder(120, "flatten", True, 1)
            train = StratifyParagraphsBuilder(
                ClusteredBatcher(60, ContextLenBucketedKey(3), True), 0, 1)
        else:
            n_epochs = 14
            grouping = "merge" if mode == "merge" else "group"
            test = RandomParagraphSetDatasetBuilder(120, grouping, True, 1)
            train = StratifyParagraphSetsBuilder(35, mode == "merge", True, 1)

    corpus = TriviaQaWebDataset()
    params = get_triviaqa_train_params(n_epochs, n_dev, n_train)

    data = PreprocessedData(corpus, extract, train, test, eval_on_verified=False)
    data.preprocess(args.n_processes, 1000)

    # Keep a copy of this script with the run so the configuration is recorded.
    with open(__file__, "r") as f:
        script_text = f.read()
    banner = "*" * 10
    notes = banner + "\nMode: " + args.mode + "\n" + banner + "\n" + script_text

    trainer.start_training(data, model, params, evaluators, model_dir.ModelDir(out), notes)
def main():
    """Command-line entry point: evaluate a trained model on a TriviaQA corpus.

    Steps:
      1. Load the model and the requested corpus split.
      2. Select up to ``--n_paragraphs`` paragraphs per question-document pair
         (web corpora) or per question (open corpora) and preprocess them.
      3. Run the model on every selected paragraph and collect per-paragraph
         predictions in a DataFrame.
      4. Optionally write the official answer file (``--official_output``,
         only for ``web-test`` / ``web-dev``).
      5. Print EM / F1 as a function of the number of paragraphs used and
         optionally save the per-paragraph results (``--paragraph_output``,
         as ``json``, ``pkl`` or ``csv`` depending on the file suffix).

    Raises:
        RuntimeError: unknown corpus name, or the evaluator returned a
            different number of samples than were submitted.
        NotImplementedError: ``--official_output`` requested for a corpus
            other than ``web-test`` / ``web-dev``.
        ValueError: unknown filter name or unrecognised output file suffix.
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument('-p', '--paragraph_output', type=str,
                        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str,
                        help="Build an offical output file with the model's"
                             " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument('--n_processes', type=int, default=None,
                        help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with")
    # Parsed as a string so the literal "latest" handled below is accepted;
    # with type=int argparse rejected it and that branch was unreachable.
    parser.add_argument('-i', '--step', type=str, default=None,
                        help="checkpoint step to load, or 'latest'; by default the best weights "
                             "are used if they exist, otherwise the latest checkpoint")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    # `async` is a reserved keyword since Python 3.7, so `args.async` is a
    # syntax error; the flag keeps its name but is stored under another one.
    parser.add_argument('-a', '--async', type=int, default=10, dest="async_encoding")
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None, choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev", "web-train",
                                 "open-dev", "open-train"],
                        default="web-verified-dev")
    args = parser.parse_args()

    # Fail before the (expensive) evaluation when the official output cannot be
    # produced for the chosen corpus.
    official_source = None
    if args.official_output is not None:
        if args.corpus == 'web-test':
            official_source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
        elif args.corpus == "web-dev":
            official_source = join(TRIVIA_QA, "qa", "web-dev.json")
        else:
            raise NotImplementedError(
                "Official output is only supported for the web-test and web-dev corpora")

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        corpus = dataset.evidence
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()
    else:
        dataset = TriviaQaOpenDataset()
        corpus = dataset.evidence
        if args.corpus == "open-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()

    splitter = MergeParagraphs(args.tokens)

    per_document = not args.corpus.startswith("open")

    filter_name = args.filter
    if filter_name is None:
        filter_name = "linear" if args.corpus.startswith("open") else "tfidf"

    print("Selecting %d paragraphs using %s method per %s" % (
        args.n_paragraphs, filter_name,
        ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        # Sort before the seeded shuffle so the sample is reproducible.
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor, require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(DocumentParagraphQuestion(q.question_id, p.doc_id,
                                                  (p.start, p.end), q.question, p.text,
                                                  ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(model,
                              [RecordParagraphSpanPrediction(args.max_answer_len, True)],
                              {args.corpus: test_questions}, ResourceLoader(), checkpoint,
                              not args.no_ema, args.async_encoding)[args.corpus]

    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        # I didn't store the unormalized filenames exactly, so unfortunately we have to reload
        # the source data to get exact filename to output an official test script
        fns = {}
        print("Loading proper filenames")
        with open(official_source) as f:
            source_data = json.load(f)["Data"]

        for point in source_data:
            for doc in point["EntityPages"]:
                filename = doc["Filename"]
                fn = join("wikipedia", filename[:filename.rfind(".")])
                fn = normalize_wiki_filename(fn)
                fns[(point["QuestionId"], fn)] = filename

        # Keep, per (question, document) key, the answer with the highest score.
        answers = {}
        scores = {}
        columns = ["question_id", "doc_id", "text_answer", "predicted_score"]
        for q_id, doc_id, txt, score in df[columns].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if filename.startswith("web"):
                true_name = filename[4:] + ".txt"
            else:
                true_name = fns[(q_id, filename)]

            key = q_id + "--" + true_name
            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    f1 = compute_model_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_model_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += [[str(i + 1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1))]
    print_table(table)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        if output_file.endswith("json"):
            with open(output_file, "w") as f:
                json.dump(evaluation.per_sample, f)
        elif output_file.endswith("pkl"):
            with open(output_file, "wb") as f:
                pickle.dump(evaluation.per_sample, f)
        elif output_file.endswith("csv"):
            df.to_csv(output_file, index=False)
        else:
            raise ValueError("Unrecognized file format")
# NOTE(review): headless fragment. This span has no enclosing `def`; it uses
# `questions`, `pairs`, `data`, `splits` and `stop` without assigning them, so
# it looks like the tail of a function whose header is not visible here
# (compare contains_question_word) -- TODO confirm and drop the duplicate.
# The statements count, over the first 1000 `pairs`, the answer-bearing
# paragraphs that share a non-stop-word with the question, then print the 200
# most common shared words and the ratio has_token / total.
# The trailing `__main__` guard calls `paragraph_stats`, which is not defined
# in the visible part of the file -- presumably imported or defined elsewhere.
np.random.RandomState(0).shuffle(questions) has_token = 0 total = 0 used = Counter() for q, doc in tqdm(pairs[:1000]): text = data.evidence.get_document(doc.doc_id, splits.reads_first_n) q_tokens = set(x.lower() for x in q.question) q_tokens -= stop for para in splits.split_annotated(text, doc.answer_spans): # if para.start == 0: # continue if len(para.answer_spans) == 0: continue if any(x.lower() in q_tokens for x in flatten_iterable(para.text)): has_token += 1 for x in flatten_iterable(para.text): if x in q_tokens: used[x] += 1 # else: # print_questions(q.question, q.answer.all_answers, para.text, para.answer_spans) # input() total += 1 for k, v in used.most_common(200): print("%s: %d" % (k, v)) print(has_token / total) if __name__ == "__main__": paragraph_stats(TriviaQaWebDataset(), MergeParagraphs(400), 1000)
def main():
    """Command-line entry point: evaluate a model on TriviaQA data and log the results.

    Steps:
      1. Load the model and the requested corpus split (web, wiki or open).
      2. Select up to ``--n_paragraphs`` paragraphs per question-document pair
         (web corpora) or per question (all other corpora) and preprocess them.
      3. Run the model and collect per-paragraph predictions in a DataFrame.
      4. Optionally write the official answer file (``--official_output``) and
         the per-paragraph csv (``--paragraph_output``).
      5. Compute ranked EM / F1 scores, send them to ``ElasticLogger``, write a
         flattened per-span table to ``results.csv`` and print the score table.

    Raises:
        AssertionError: unknown corpus name, or ``--official_output`` requested
            for a web corpus other than ``web-test`` / ``web-dev``.
        ValueError: unknown filter name.
        RuntimeError: the evaluator returned a different number of samples
            than were submitted.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument(
        '-p', '--paragraph_output', type=str,
        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str,
                        help="Build an offical output file with the model's"
                             " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument(
        '--n_processes', type=int, default=None,
        help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with")
    # Parsed as a string so the literal "latest" handled below is accepted;
    # with type=int argparse rejected it and that branch was unreachable.
    parser.add_argument('-i', '--step', type=str, default=None,
                        help="checkpoint step to load, or 'latest'; by default the best weights "
                             "are used if they exist, otherwise the latest checkpoint")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    # `async` is a reserved keyword since Python 3.7, so `args.async` is a
    # syntax error; the flag keeps its name but is stored under another one.
    parser.add_argument('-a', '--async', type=int, default=10, dest="async_encoding")
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-b', '--batch_size', type=int, default=200,
        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev", "web-train",
                                 "open-dev", "open-train", "wiki-dev", "wiki-test"],
                        default="web-verified-dev")
    parser.add_argument("-s", "--source_dir", type=str, default=None,
                        help="where to take input files")
    # The help text used to repeat the one of --source_dir.
    parser.add_argument("--n_span_per_q", type=int, default=1,
                        help="value assigned to docqa.config.SPANS_PER_QUESTION")
    args = parser.parse_args()

    # --source_dir is optional; fall back to the corpus name for logging
    # instead of crashing on None.split().
    if args.source_dir is not None:
        dataset_name = args.source_dir.split('/')[-1]
    else:
        dataset_name = args.corpus
    model_name = args.model.split('/')[-1]
    ElasticLogger().write_log('INFO', 'Start Evaluation',
                              context_dict={'model': model_name, 'dataset': dataset_name})

    per_document = args.corpus.startswith("web")

    # Fail before the (expensive) evaluation when the official output cannot be
    # produced for the chosen corpus.
    official_source = None
    if args.official_output is not None and per_document:
        if args.corpus == 'web-test':
            official_source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
        elif args.corpus == "web-dev":
            official_source = join(TRIVIA_QA, "qa", "web-dev.json")
        else:
            raise AssertionError(
                "Official output is only supported for web-test and web-dev among web corpora")

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()
    elif args.corpus.startswith("wiki"):
        dataset = TriviaQaWikiDataset()
        if args.corpus == "wiki-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "wiki-test":
            test_questions = dataset.get_test()
        else:
            raise AssertionError()
    else:
        dataset = TriviaQaOpenDataset(args.source_dir)
        if args.corpus == "open-dev":
            # just loading the pkl that was saved in build_span_corpus
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        filter_name = "tfidf" if per_document else "linear"

    print("Selecting %d paragraphs using method \"%s\" per %s" % (
        args.n_paragraphs, filter_name,
        ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    docqa.config.SPANS_PER_QUESTION = args.n_span_per_q
    if n_questions is not None:
        # Sort before the seeded shuffle so the sample is reproducible.
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(DocumentParagraphQuestion(q.question_id, p.doc_id,
                                                  (p.start, p.end), q.question, p.text,
                                                  ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions}, ResourceLoader(), checkpoint,
        not args.no_ema, args.async_encoding)[args.corpus]

    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        fns = {}
        if per_document:
            # I didn't store the unormalized filenames exactly, so unfortunately we have to reload
            # the source data to get exact filename to output an official test script
            print("Loading proper filenames")
            with open(official_source) as f:
                source_data = json.load(f)["Data"]

            for point in source_data:
                for doc in point["EntityPages"]:
                    filename = doc["Filename"]
                    fn = join("wikipedia", filename[:filename.rfind(".")])
                    fn = normalize_wiki_filename(fn)
                    fns[(point["QuestionId"], fn)] = filename

        # Keep, per key, the answer with the highest score.
        # NOTE(review): the flattening further down zips over `predicted_score`,
        # so it looks like a sequence per row; `prev_score < score` would then
        # be ambiguous for more than one span per question -- TODO confirm.
        answers = {}
        scores = {}
        columns = ["question_id", "doc_id", "text_answer", "predicted_score"]
        for q_id, doc_id, txt, score in df[columns].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if per_document:
                if filename.startswith("web"):
                    true_name = filename[4:] + ".txt"
                else:
                    true_name = fns[(q_id, filename)]
                # Alon Patch for triviaqa test results
                true_name = true_name.replace('TriviaQA_Org/', '')
                key = q_id + "--" + true_name
            else:
                key = q_id

            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        df.to_csv(output_file, index=False)

    print("Computing scores")

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    # Rank paragraphs by the best of their per-span scores.
    df_scores = df.copy(deep=True)
    df_scores['predicted_score'] = df_scores['predicted_score'].apply(
        lambda x: pd.Series(x).max())

    em = compute_ranked_scores(df_scores, "predicted_score", "text_em", group_by)
    f1 = compute_ranked_scores(df_scores, "predicted_score", "text_f1", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += [[str(i + 1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1))]

    table_df = pd.DataFrame(table[1:], columns=table[0]).drop(['N Paragraphs'], axis=1)
    # `.ix` was removed from pandas; plain label lookup on the Series is
    # equivalent here.  The cells are "%.4f" strings, so max() is lexicographic.
    best = table_df.max()
    ElasticLogger().write_log('INFO', 'Results',
                              context_dict={'model': model_name,
                                            'dataset': dataset_name,
                                            'max_EM': best['EM'],
                                            'max_F1': best['F1'],
                                            'result_table': str(table_df)})

    # One output row per predicted span instead of one per paragraph.
    df_flat = []
    for _, row in df.iterrows():
        for text_answer, predicted_span, predicted_score in zip(
                row['text_answer'], row['predicted_span'], row['predicted_score']):
            flat_row = dict(row.copy())
            flat_row.update({
                'text_answer': text_answer,
                'predicted_span': predicted_span,
                'predicted_score': predicted_score,
            })
            df_flat.append(flat_row)

    results_df = pd.DataFrame(df_flat)
    results_df = results_df.sort_values(by=['question_id', 'predicted_score'], ascending=False)
    results_df = results_df.set_index(['question_id', 'text_answer'])
    results_df[['question', 'predicted_score', 'text_em']].to_csv('results.csv')

    print_table(table)
def main():
    """Command-line entry point: train a model in one of the unfiltered-TriviaQA modes.

    The positional ``mode`` selects the number of epochs and the train/test
    dataset builders.  Paragraphs are chosen per question with
    ``ShallowOpenWebRanker(16)`` over ``MergeParagraphs(--n_tokens)``.  The
    model is written to ``<name>-<timestamp>`` and the text of this script,
    prefixed by the mode, is stored as notes.
    """
    parser = argparse.ArgumentParser(
        description='Train a model on TriviaQA unfiltered')
    parser.add_argument(
        'mode',
        choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument("-t", "--n_tokens", default=400, type=int,
                        help="Paragraph size")
    parser.add_argument(
        '-n', '--n_processes', type=int, default=2,
        help="Number of processes (i.e., select which paragraphs to train on) "
             "the data with")
    args = parser.parse_args()
    mode = args.mode
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())

    extract = ExtractMultiParagraphsPerQuestion(
        MergeParagraphs(args.n_tokens), ShallowOpenWebRanker(16),
        model.preprocessor, intern=True)

    evaluators = [
        LossEvaluator(),
        MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge"),
    ]
    oversample = [1] * 4

    # Modes trained on individually sampled paragraphs, with their epoch counts.
    single_paragraph_epochs = {"paragraph": 120, "confidence": 160, "sigmoid": 640}

    if mode in single_paragraph_epochs:
        n_epochs = single_paragraph_epochs[mode]
        test = RandomParagraphSetDatasetBuilder(120, "flatten", True, oversample)
        batcher = ClusteredBatcher(60, ContextLenBucketedKey(3), True)
        if mode == "paragraph":
            train = StratifyParagraphsBuilder(batcher, oversample, only_answers=True)
        else:
            train = StratifyParagraphsBuilder(batcher, oversample)
    else:
        n_epochs = 80
        grouping = "merge" if mode == "merge" else "group"
        test = RandomParagraphSetDatasetBuilder(120, grouping, True, oversample)
        train = StratifyParagraphSetsBuilder(30, mode == "merge", True, oversample)

    # NOTE(review): the parser description says "unfiltered" and the extractor
    # is the per-question open ranker, yet the web dataset is loaded here --
    # confirm this is the intended corpus.
    corpus = TriviaQaWebDataset()

    params = TrainParams(
        SerializableOptimizer("Adadelta", dict(learning_rate=1)),
        num_epochs=n_epochs, ema=0.999, max_checkpoints_to_keep=2,
        async_encoding=10, log_period=30, eval_period=1800, save_period=1800,
        eval_samples=dict(dev=None, train=6000))

    data = PreprocessedData(corpus, extract, train, test, eval_on_verified=False)
    data.preprocess(args.n_processes, 1000)

    # Keep a copy of this script with the run so the configuration is recorded.
    with open(__file__, "r") as f:
        script_text = f.read()
    notes = "Mode: " + args.mode + "\n" + script_text

    trainer.start_training(data, model, params, evaluators, model_dir.ModelDir(out), notes)
def _encode_answer_paragraph(corpus, pre, question, doc, start, end):
    """Load tokens ``start``..``end`` of ``doc`` and encode them with their answer spans.

    Only answer spans lying inside the paragraph are kept, shifted so they are
    relative to ``start``.
    """
    para = extract_paragraph(corpus.evidence.get_document(doc.doc_id), start, end)
    spans = doc.answer_spans
    # The upper bound must be the paragraph end; comparing the span end with
    # `start` (as the code used to) can never hold together with the
    # lower-bound test, so no answer was ever selected.
    # NOTE(review): assumes `end` is exclusive -- TODO confirm.
    inside = np.logical_and(spans[:, 0] >= start, spans[:, 1] < end)
    answers = spans[inside] - start
    return pre.encode_extracted_paragraph(
        question.question,
        ExtractedParagraphWithAnswers(para, start, end, answers))


def main():
    """Command-line entry point: show cases where a more confident paragraph gave a worse answer.

    Reads a csv of per-paragraph predictions (``answers``), groups it per
    question (``open``) or per question-document pair (``web``) and walks each
    group in rank order.  Whenever a paragraph has a higher confidence than the
    best one so far but a lower F1, both paragraphs are printed and the
    function waits on ``input()``.  ``squad`` delegates to
    ``show_squad_errors``.

    Raises:
        ValueError: unknown ``question_source``.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('answers', help='answer file')
    parser.add_argument('question_source')
    args = parser.parse_args()

    print("Loading answers..")
    answer_df = pd.read_csv(args.answers)

    print("Loading questions..")
    if args.question_source == "open":
        corpus = TriviaQaOpenDataset()
        questions = {q.question_id: q for q in corpus.get_dev()}
    elif args.question_source == "web":
        corpus = TriviaQaWebDataset()
        questions = {}
        for q in corpus.get_dev():
            for d in q.all_docs:
                questions[(q.question_id, d.doc_id)] = q
    elif args.question_source == "squad":
        show_squad_errors(args.answers)
        return
    else:
        raise ValueError()

    pre = WithIndicators()

    answer_df.sort_values(["question_id", "rank"], inplace=True)

    if args.question_source == "open":
        # Group by a scalar key: a length-1 list yields 1-tuples as group keys
        # on recent pandas, which would not match the `questions` dict.
        groups = answer_df.groupby("question_id")
    else:
        groups = answer_df.groupby(["question_id", "doc_id"])

    grouped = list(groups)
    np.random.shuffle(grouped)

    for key, group in grouped:
        q = questions[key]
        cur_best_score = group.text_f1.iloc[0]
        cur_best_conf = group.predicted_score.iloc[0]
        cur_best_ix = group.index[0]
        for ix in group.index[1:]:
            conf = group.predicted_score.loc[ix]
            if conf <= cur_best_conf:
                continue
            score = group.text_f1.loc[ix]
            if score < cur_best_score:
                # We hurt our selves!
                print("Oh no!")
                print(" ".join(q.question))
                print(q.answer.all_answers)
                print("Best score was %.4f (conf=%.4f), but now is %.4f (conf=%.4f)" % (
                    cur_best_score, cur_best_conf, score, conf))

                d1 = [d for d in q.all_docs if d.doc_id == group.doc_id.loc[cur_best_ix]][0]
                p1 = _encode_answer_paragraph(
                    corpus, pre, q, d1,
                    group.para_start.loc[cur_best_ix], group.para_end.loc[cur_best_ix])

                d2 = [d for d in q.all_docs if d.doc_id == group.doc_id.loc[ix]][0]
                p2 = _encode_answer_paragraph(
                    corpus, pre, q, d2,
                    group.para_start.loc[ix], group.para_end.loc[ix])

                p1_s = group.predicted_start.loc[cur_best_ix]
                p1_e = group.predicted_end.loc[cur_best_ix]
                p2_s = group.predicted_start.loc[ix]
                p2_e = group.predicted_end.loc[ix]
                print(" ".join(display_para(p1.text, p1.answer_spans, q.question, p1_s, p1_e)))
                print()
                print(" ".join(display_para(p2.text, p2.answer_spans, q.question, p2_s, p2_e)))
                input()
            else:
                cur_best_score = score
                cur_best_ix = ix
                cur_best_conf = conf