def main():
    data = TriviaQaWebDataset()
    stop = NltkPlusStopWords()
    splitter = MergeParagraphs(400)
    selector = TopTfIdf(stop, 4)

    print("Loading data..")
    train = data.get_train()
    print("Start")
    for q in train:
        for doc in q.all_docs:
            if len(doc.answer_spans) > 3:
                text = splitter.split_annotated(
                    data.evidence.get_document(doc.doc_id), doc.answer_spans)
                text = selector.prune(q.question, text)
                for para in text:
                    if len(para.answer_spans) > 3:
                        print(q.question)
                        text = flatten_iterable(para.text)
                        for s, e in para.answer_spans:
                            text[s] = "{{{" + text[s]
                            text[e] = text[e] + "}}}"
                        print(" ".join(text))
                        input()
def show_web_paragraphs():
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    stop_words = stop.words

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    for q, d in points:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        doc = corpus.evidence.get_document(d.doc_id)
        doc = splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.dists(q.question, doc)
        if len(ranked) < 2 or len(ranked[1][0].answer_spans) == 0:
            continue
        print(" ".join(q.question))
        print(q.answer.all_answers)
        for i, (para, dist) in enumerate(ranked[0:2]):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d, Dist=%.4f" % (para.start, i, dist))
            if len(para.answer_spans) == 0:
                continue
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for i, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[i] = bcolors.ERROR + text[i] + bcolors.ENDC
            print(" ".join(text))
        input()
async def setup_qa(app, loop):
    # To play nice with aiohttp's async ClientSession objects, we need to construct the QaSystem
    # inside the event loop.
    if args.debug == "dummy_qa":
        qa = DummyQa()
    else:
        qa = QaSystem(
            args.wiki_cache,
            MergeParagraphs(args.tokens),
            ShallowOpenWebRanker(args.n_paragraphs),
            args.voc,
            model,
            loader,
            bing_api_key,
            bing_version=args.bing_version,
            tagme_api_key=tagme_api_key,
            n_dl_threads=args.n_dl_threads,
            blacklist_trivia_sites=args.blacklist_trivia_sites,
            download_timeout=args.download_timeout,
            span_bound=span_bound,
            tagme_threshold=None if (tagme_api_key is None) else args.tagme_thresh,
            n_web_docs=args.n_web,
        )
    app.qa = qa
def prepare_data(model, train_config, dataset_oversampling, n_processes):
    extract = ExtractMultiParagraphsPerQuestion(
        MergeParagraphs(train_config.n_tokens),
        ShallowOpenWebRanker(train_config.num_paragraphs),
        model.preprocessor,
        intern=True
    )
    trivia_qa_test = RandomParagraphSetDatasetBuilder(
        train_config.test_batch_size,
        "merge" if train_config.trivia_qa_mode == "merge" else "group",
        True,
        train_config.oversample
    )
    trivia_qa_train = StratifyParagraphSetsBuilder(
        train_config.train_batch_size,
        train_config.trivia_qa_mode == "merge",
        True,
        train_config.oversample
    )

    datas = []
    for name, sampling in dataset_oversampling.items():
        for s in range(sampling):
            ds = TriviaQaSpanCorpus(name)
            ds.corpus_name = ds.corpus_name + '_{}'.format(s)
            datas.append(ds)

    data = MultiDataset(datas)
    data = PreprocessedData(data, extract, trivia_qa_train, trivia_qa_test, eval_on_verified=False)
    data.preprocess(n_processes, 1000)
    return data
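# Usage sketch (illustrative only): `dataset_oversampling` maps a TriviaQA corpus name to the
# number of times that corpus should be repeated in the merged training set, so {"web": 2, "wiki": 1}
# would include the web corpus twice. The corpus names and `train_config` fields shown here are
# assumptions inferred from the call above, not a fixed API.
#
#   data = prepare_data(model, train_config,
#                       dataset_oversampling={"web": 2, "wiki": 1},
#                       n_processes=4)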
def find_answer(documents, raw_question):
    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]

    global best_spans, conf

    # Split each document into paragraphs on blank/indented lines
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(raw_question)
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    if len(documents) == 1:
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    context = [flatten_iterable(x.text) for x in context]

    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    encoded = model.encode(data, is_train=False)
    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)

    best_para = np.argmax(confid)
    ans = " ".join(context[best_para][spans[best_para][0]:spans[best_para][1] + 1])
    confidence = confid[best_para]
    return ans, confidence
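# `find_answer` relies on module-level globals (`model`, `sess`, `best_spans`, `conf`) that must
# be built once before the first call. A minimal setup sketch, assuming a trained
# ParagraphQuestionModel saved under a hypothetical "/path/to/model" directory and a vocabulary
# `voc` collected from the documents and questions you plan to ask:
#
#   model_dir = ModelDir("/path/to/model")
#   model = model_dir.get_model()
#   model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), voc)
#   sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
#   with sess.as_default():
#       best_spans, conf = model.get_prediction().get_best_span(8)
#   model_dir.restore_checkpoint(sess)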
def show_open_paragraphs(start: int, end: int):
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = ShallowOpenWebRanker(6)
    stop_words = stop.words

    print("Loading train")
    corpus = TriviaQaOpenDataset()
    train = corpus.get_dev()
    np.random.shuffle(train)

    for q in train:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}

        para = []
        for d in q.all_docs:
            doc = corpus.evidence.get_document(d.doc_id)
            para += splitter.split_annotated(doc, d.answer_spans)

        ranked = ranker.prune(q.question, para)
        if len(ranked) < end:
            continue
        ranked = ranked[start:end]
        print(" ".join(q.question))
        print(q.answer.all_answers)
        # `i` is the original rank of the paragraph; iterate the retained slice only,
        # since `ranked` has already been cut down to `end - start` paragraphs
        for i, para in enumerate(ranked, start=start):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d" % (para.start, i))
            if len(para.answer_spans) == 0:
                # print("No Answer!")
                continue
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for i, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[i] = bcolors.ERROR + text[i] + bcolors.ENDC
            print(" ".join(text))
        input()
def check_preprocess():
    data = TriviaQaWebDataset()
    merge = MergeParagraphs(400)
    questions = data.get_dev()
    pre = WithIndicators(False)
    remove_cross = WithIndicators(True)
    rng = np.random.RandomState(0)
    rng.shuffle(questions)

    for q in tqdm(questions[:1000]):
        doc = rng.choice(q.all_docs, 1)[0]
        text = data.evidence.get_document(doc.doc_id, n_tokens=800)

        paras = merge.split_annotated(text, doc.answer_spans)
        para = paras[np.random.randint(0, len(paras))]
        built = pre.encode_extracted_paragraph(q.question, para)

        expected_text = flatten_iterable(para.text)
        if expected_text != [x for x in built.text if x not in pre.special_tokens()]:
            raise ValueError()

        expected = [expected_text[s:e + 1] for s, e in para.answer_spans]
        expected = Counter([tuple(x) for x in expected])

        actual = [tuple(built.text[s:e + 1]) for s, e in built.answer_spans]
        actual_cleaned = Counter(
            tuple(z for z in x if z not in pre.special_tokens()) for x in actual)
        if actual_cleaned != expected:
            raise ValueError()

        r_built = remove_cross.encode_extracted_paragraph(q.question, para)
        rc = Counter(
            tuple(r_built.text[s:e + 1]) for s, e in r_built.answer_spans)
        removed = Counter()
        for w in actual:
            if all(x not in pre.special_tokens() for x in w):
                removed[w] += 1

        if rc != removed:
            raise ValueError()
def prepro(ds, fold, num_tokens_per_group, num_paragraphs, pad=True, n_samples=None):
    fold_funcs = {
        'train': lambda: ds.get_train(),
        'dev': lambda: ds.get_dev(),
        'test': lambda: ds.get_test()
    }
    qs = fold_funcs[fold]()
    if n_samples is not None:
        qs = qs[:n_samples]
    evidence = ds.evidence

    prep = None
    extract = ExtractMultiParagraphsPerQuestion(
        MergeParagraphs(num_tokens_per_group),
        ShallowOpenWebRanker(num_paragraphs),
        prep,
        intern=True)

    answers = {}
    batches = {}
    for q in tqdm(qs, ncols=80, desc='preprocessing'):
        pre = extract.preprocess([q], evidence)
        if len(pre.data) == 0:
            continue
        assert len(pre.data) < 2
        assert q.question_id not in answers
        assert q.question_id not in batches
        mpq = pre.data[0]
        pq_batch = [
            ParagraphAndQuestion(p.get_context(), q.question, None,
                                 q.question_id, p.doc_id)
            for p in mpq.paragraphs  # document paragraph question?
        ]
        if pad:
            for i in range(num_paragraphs - len(pq_batch)):
                pq_batch.append(
                    ParagraphAndQuestion([], q.question, None, q.question_id, None))
        answers[q.question_id] = mpq.answer_text
        batches[q.question_id] = pq_batch

    voc = {w for bs in batches.values() for b in bs for w in b.question}
    voc.update({w for bs in batches.values() for b in bs for w in b.context})
    return answers, batches, voc
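# Usage sketch (illustrative only): preprocess the dev fold of the open-domain TriviaQA corpus
# into padded per-question batches, with 400 tokens per merged paragraph group and 15 paragraphs
# per question. The concrete argument values are assumptions, not defaults of this function.
#
#   answers, batches, voc = prepro(TriviaQaOpenDataset(), 'dev',
#                                  num_tokens_per_group=400, num_paragraphs=15)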
def show_stats():
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)

    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)

    counts = np.zeros(6)
    answers = np.zeros(6)
    n_answers = []

    points = points[:1000]
    for q, d in tqdm(points):
        doc = corpus.evidence.get_document(d.doc_id)
        doc = splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.prune(q.question, doc)
        counts[:len(ranked)] += 1
        for i, para in enumerate(ranked):
            if len(para.answer_spans) > 0:
                answers[i] += 1
        n_answers.append(
            tuple(i for i, x in enumerate(ranked) if len(x.answer_spans) > 0))

    print(answers / counts)

    c = Counter()
    other = 0
    for tup in n_answers:
        if len(tup) <= 2:
            c[tup] += 1
        else:
            other += 1

    for p in sorted(c.keys()):
        print(p, c.get(p) / len(points))
    print(other / len(points))
def contains_question_word():
    data = TriviaQaWebDataset()
    stop = NltkPlusStopWords(punctuation=True).words
    doc_filter = ContainsQuestionWord(NltkPlusStopWords(punctuation=True))
    splits = MergeParagraphs(400)
    # splits = Truncate(400)
    questions = data.get_dev()
    pairs = flatten_iterable([(q, doc) for doc in q.all_docs] for q in questions)
    pairs.sort(key=lambda x: (x[0].question_id, x[1].doc_id))
    np.random.RandomState(0).shuffle(questions)

    has_token = 0
    total = 0
    used = Counter()
    for q, doc in tqdm(pairs[:1000]):
        text = data.evidence.get_document(doc.doc_id, splits.reads_first_n)
        q_tokens = set(x.lower() for x in q.question)
        q_tokens -= stop
        for para in splits.split_annotated(text, doc.answer_spans):
            # if para.start == 0:
            #     continue
            if len(para.answer_spans) == 0:
                continue
            if any(x.lower() in q_tokens for x in flatten_iterable(para.text)):
                has_token += 1
                for x in flatten_iterable(para.text):
                    if x in q_tokens:
                        used[x] += 1
            # else:
            #     print_questions(q.question, q.answer.all_answers, para.text, para.answer_spans)
            #     input()
            total += 1

    for k, v in used.most_common(200):
        print("%s: %d" % (k, v))
    print(has_token / total)
def main():
    data = TriviaQaOpenDataset()
    # data = TriviaQaWebDataset()
    print("Loading...")
    all_questions = data.get_dev()
    questions = [q for q in all_questions
                 if any(len(x.answer_spans) > 0 for x in q.all_docs)]
    print("%d/%d (%.4f) have an answer" %
          (len(questions), len(all_questions), len(questions) / len(all_questions)))
    np.random.shuffle(questions)

    pre = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400),
                                            TopTfIdf(NltkPlusStopWords(), 20),
                                            require_an_answer=False)
    print("Done")

    out = preprocess_par(questions[:2000], data.evidence, pre, 2, 1000)

    n_counts = np.zeros(20)
    n_any = np.zeros(20)
    n_any_all = np.zeros(20)

    for q in out.data:
        for i, p in enumerate(q.paragraphs):
            n_counts[i] += 1
            n_any[i] += len(p.answer_spans) > 0

        for i, p in enumerate(q.paragraphs):
            if len(p.answer_spans) > 0:
                n_any_all[i:] += 1
                break

    print(n_any_all / out.true_len)
    print(n_any / n_counts)
    print(n_counts)
def main(): parser = argparse.ArgumentParser(description='Evaluate a model on TriviaQA data') parser.add_argument('model', help='model directory') parser.add_argument('-p', '--paragraph_output', type=str, help="Save fine grained results for each paragraph in csv format") parser.add_argument('-o', '--official_output', type=str, help="Build an offical output file with the model's" " most confident span for each (question, doc) pair") parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist") parser.add_argument('--n_processes', type=int, default=None, help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with") parser.add_argument('-i', '--step', type=int, default=None, help="checkpoint to load, default to latest") parser.add_argument('-n', '--n_sample', type=int, default=None, help="Number of questions to evaluate on") parser.add_argument('-a', '--async', type=int, default=10) parser.add_argument('-t', '--tokens', type=int, default=400, help="Max tokens per a paragraph") parser.add_argument('-g', '--n_paragraphs', type=int, default=15, help="Number of paragraphs to run the model on") parser.add_argument('-f', '--filter', type=str, default=None, choices=["tfidf", "truncate", "linear"], help="How to select paragraphs") parser.add_argument('-b', '--batch_size', type=int, default=200, help="Batch size, larger sizes might be faster but wll take more memory") parser.add_argument('--max_answer_len', type=int, default=8, help="Max answer span to select") parser.add_argument('-c', '--corpus', choices=["web-dev", "web-test", "web-verified-dev", "web-train", "open-dev", "open-train"], default="web-verified-dev") args = parser.parse_args() model_dir = ModelDir(args.model) model = model_dir.get_model() if args.corpus.startswith('web'): dataset = TriviaQaWebDataset() corpus = dataset.evidence if args.corpus == "web-dev": test_questions = dataset.get_dev() elif args.corpus == "web-test": test_questions = dataset.get_test() elif args.corpus == "web-verified-dev": test_questions = dataset.get_verified() elif args.corpus == "web-train": test_questions = dataset.get_train() else: raise RuntimeError() else: dataset = TriviaQaOpenDataset() corpus = dataset.evidence if args.corpus == "open-dev": test_questions = dataset.get_dev() elif args.corpus == "open-train": test_questions = dataset.get_train() else: raise RuntimeError() splitter = MergeParagraphs(args.tokens) per_document = not args.corpus.startswith("open") filter_name = args.filter if filter_name is None: if args.corpus.startswith("open"): filter_name = "linear" else: filter_name = "tfidf" print("Selecting %d paragraphs using %s method per %s" % (args.n_paragraphs, filter_name, ("question-document pair" if per_document else "question"))) if filter_name == "tfidf": para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs) elif filter_name == "truncate": para_filter = FirstN(args.n_paragraphs) elif filter_name == "linear": para_filter = ShallowOpenWebRanker(args.n_paragraphs) else: raise ValueError() n_questions = args.n_sample if n_questions is not None: test_questions.sort(key=lambda x:x.question_id) np.random.RandomState(0).shuffle(test_questions) test_questions = test_questions[:n_questions] print("Building question/paragraph pairs...") # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor if per_document: prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor, require_an_answer=False) 
else: prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor, require_an_answer=False) prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000) data = [] for q in prepped_data.data: for i, p in enumerate(q.paragraphs): if q.answer_text is None: ans = None else: ans = TokenSpans(q.answer_text, p.answer_spans) data.append(DocumentParagraphQuestion(q.question_id, p.doc_id, (p.start, p.end), q.question, p.text, ans, i)) # Reverse so our first batch will be the largest (so OOMs happen early) questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True) print("Done, starting eval") if args.step is not None: if args.step == "latest": checkpoint = model_dir.get_latest_checkpoint() else: checkpoint = model_dir.get_checkpoint(int(args.step)) else: checkpoint = model_dir.get_best_weights() if checkpoint is not None: print("Using best weights") else: print("Using latest checkpoint") checkpoint = model_dir.get_latest_checkpoint() test_questions = ParagraphAndQuestionDataset(questions, FixedOrderBatcher(args.batch_size, True)) evaluation = trainer.test(model, [RecordParagraphSpanPrediction(args.max_answer_len, True)], {args.corpus:test_questions}, ResourceLoader(), checkpoint, not args.no_ema, args.async)[args.corpus] if not all(len(x) == len(data) for x in evaluation.per_sample.values()): raise RuntimeError() df = pd.DataFrame(evaluation.per_sample) if args.official_output is not None: print("Saving question result") # I didn't store the unormalized filenames exactly, so unfortunately we have to reload # the source data to get exact filename to output an official test script fns = {} print("Loading proper filenames") if args.corpus == 'web-test': source = join(TRIVIA_QA, "qa", "web-test-without-answers.json") elif args.corpus == "web-dev": source = join(TRIVIA_QA, "qa", "web-dev.json") else: raise NotImplementedError() with open(join(source)) as f: data = json.load(f)["Data"] for point in data: for doc in point["EntityPages"]: filename = doc["Filename"] fn = join("wikipedia", filename[:filename.rfind(".")]) fn = normalize_wiki_filename(fn) fns[(point["QuestionId"], fn)] = filename answers = {} scores = {} for q_id, doc_id, start, end, txt, score in df[["question_id", "doc_id", "para_start", "para_end", "text_answer", "predicted_score"]].itertuples(index=False): filename = dataset.evidence.file_id_map[doc_id] if filename.startswith("web"): true_name = filename[4:] + ".txt" else: true_name = fns[(q_id, filename)] key = q_id + "--" + true_name prev_score = scores.get(key) if prev_score is None or prev_score < score: scores[key] = score answers[key] = txt with open(args.official_output, "w") as f: json.dump(answers, f) if per_document: group_by = ["question_id", "doc_id"] else: group_by = ["question_id"] # Print a table of scores as more paragraphs are used df.sort_values(group_by + ["rank"], inplace=True) f1 = compute_model_scores(df, "predicted_score", "text_f1", group_by) em = compute_model_scores(df, "predicted_score", "text_em", group_by) table = [["N Paragraphs", "EM", "F1"]] table += list([str(i+1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1))) print_table(table) output_file = args.paragraph_output if output_file is not None: print("Saving paragraph result") if output_file.endswith("json"): with open(output_file, "w") as f: json.dump(evaluation.per_sample, f) elif output_file.endswith("pkl"): with open(output_file, "wb") as f: pickle.dump(evaluation.per_sample, f) elif output_file.endswith("csv"): 
df.to_csv(output_file, index=False) else: raise ValueError("Unrecognized file format")
def main(): parser = argparse.ArgumentParser(description="Run an ELMo model on user input") # parser.add_argument("model", type=int, help="Model directory") parser.add_argument("question", help="Question to answer") parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+') args = parser.parse_args() # Models path SQUAD_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad' SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad-shared-norm' TRIVIAQA_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-unfiltered-shared-norm' TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-web-shared-norm' models_directory = [ SQUAD_MODEL_DIRECTORY_PATH, SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH, TRIVIAQA_MODEL_DIRECTORY_PATH, TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH ] print("Preprocessing...") # Load the model # model_dir = ModelDir(args.model) model_dir = ModelDir(models_directory[0]) model = model_dir.get_model() if not isinstance(model, ParagraphQuestionModel): raise ValueError("This script is built to work for ParagraphQuestionModel models only") # Read the documents documents = [] for doc in args.documents: if not isfile(doc): raise ValueError(doc + " does not exist") with open(doc, "r") as f: documents.append(f.read()) print("Loaded %d documents" % len(documents)) # Split documents into lists of paragraphs documents = [re.split("\s*\n\s*", doc) for doc in documents] # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat(args.question) # List of words # Now list of document->paragraph->sentence->word documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents] # Now group the document into paragraphs, this returns `ExtractedParagraph` objects # that additionally remember the start/end token of the paragraph within the source document splitter = MergeParagraphs(400) # splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping documents = [splitter.split(doc) for doc in documents] # Now select the top paragraphs using a `ParagraphFilter` if len(documents) == 1: # Use TF-IDF to select top paragraphs from the document selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5) context = selector.prune(question, documents[0]) else: # Use a linear classifier to select top paragraphs among all the documents selector = ShallowOpenWebRanker(n_to_select=10) context = selector.prune(question, flatten_iterable(documents)) print("Select %d paragraph" % len(context)) if model.preprocessor is not None: # Models are allowed to define an additional pre-processing step # This will turn the `ExtractedParagraph` objects back into simple lists of tokens context = [model.preprocessor.encode_text(question, x) for x in context] else: # Otherwise just use flattened text context = [flatten_iterable(x.text) for x in context] print("Setting up model") # Tell the model the batch size (can be None) and vocab to expect, This will load the # needed word vectors and fix the batch size to use when building the graph / encoding the input voc = set(question) for txt in context: voc.update(txt) model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch, confidence scores 
being the pre-softmax logit for the span print("Build tf graph") sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) # We need to use sess.as_default when working with the cuNND stuff, since we need an active # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this. with sess.as_default(): # 8 means to limit the span to size 8 or less best_spans, conf = model.get_prediction().get_best_span(10) # Loads the saved weights model_dir.restore_checkpoint(sess) # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ParagraphAndQuestion(x, question, None, "user-question%d"%i) for i, x in enumerate(context)] print("Starting run") # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs # into numpy arrays, then we use `sess` to run the actual model get the predictions encoded = model.encode(data, is_train=False) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded) # feed_dict -> predictions best_para = np.argmax(conf) # We get output for each paragraph, select the most-confident one to print print("Best Paragraph: " + str(best_para)) para_id = int(str(best_para)) # print("Best Paragraph: \n" + (" ".join((paras[para_id].text)[0]))) print("Best Paragraph: \n" + " ".join(context[para_id])) print("Best span: " + str(best_spans[best_para])) print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1])) print("Confidence: " + str(conf[best_para]))
def main(): parser = argparse.ArgumentParser() parser.add_argument( '--n_processes', type=int, default=1, help= "Number of processes to do the preprocessing (selecting paragraphs+loading context) with" ) parser.add_argument('-a', '--async', type=int, default=10) parser.add_argument('-t', '--tokens', type=int, default=400, help="Max tokens per a paragraph") parser.add_argument('-n', '--n_sample', type=int, default=None, help="Number of questions to evaluate on") parser.add_argument('-g', '--n_paragraphs', type=int, default=15, help="Number of paragraphs to run the model on") parser.add_argument('-f', '--filter', type=str, default=None, choices=["tfidf", "truncate", "linear"], help="How to select paragraphs") parser.add_argument( '-c', '--corpus', choices=[ "en_dev", "en_test", "fr_dev", "fr_test", "de_dev", "de_test", "ru_dev", "ru_test", "pt_dev", "pt_test", "zh_dev", "zh_test", "pl_dev", "pl_test", "uk_dev", "uk_test", "ta_dev", "ta_test", "fr_trans_en_dev", "fr_trans_en_test", "de_trans_en_dev", "de_trans_en_test", "ru_trans_en_dev", "ru_trans_en_test", "pt_trans_en_dev", "pt_trans_en_test", "zh_trans_en_dev", "zh_trans_en_test", "pl_trans_en_dev", "pl_trans_en_test", "uk_trans_en_dev", "uk_trans_en_test", "ta_trans_en_dev", "ta_trans_en_test" ], required=True) args = parser.parse_args() corpus_name = args.corpus[:args.corpus.rfind("_")] eval_set = args.corpus[args.corpus.rfind("_") + 1:] dataset = XQADataset(corpus_name) if eval_set == "dev": test_questions = dataset.get_dev() elif eval_set == "test": test_questions = dataset.get_test() else: raise AssertionError() corpus = dataset.evidence splitter = MergeParagraphs(args.tokens) per_document = args.corpus.startswith( "web") # wiki and web are both multi-document filter_name = args.filter if filter_name is None: # Pick default depending on the kind of data we are using if per_document: filter_name = "tfidf" else: filter_name = "linear" print("Selecting %d paragraphs using method \"%s\" per %s" % (args.n_paragraphs, filter_name, ("question-document pair" if per_document else "question"))) if filter_name == "tfidf": para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs) elif filter_name == "truncate": para_filter = FirstN(args.n_paragraphs) elif filter_name == "linear": para_filter = ShallowOpenWebRanker(args.n_paragraphs) else: raise ValueError() n_questions = args.n_sample if n_questions is not None: test_questions.sort(key=lambda x: x.question_id) np.random.RandomState(0).shuffle(test_questions) test_questions = test_questions[:n_questions] preprocessor = WithIndicators() print("Building question/paragraph pairs...") # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor if per_document: prep = ExtractMultiParagraphs(splitter, para_filter, preprocessor, require_an_answer=False) else: prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, preprocessor, require_an_answer=False) prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000) data = [] for q in prepped_data.data: for i, p in enumerate(q.paragraphs): if q.answer_text is None: ans = None else: ans = TokenSpans(q.answer_text, p.answer_spans) data.append( DocumentParagraphQuestion(q.question_id, p.doc_id, (p.start, p.end), q.question, p.text, ans, i)) # Reverse so our first batch will be the largest (so OOMs happen early) questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True) # dump eval data for bert import pickle pickle.dump(questions, 
open("%s_%d.pkl" % (args.corpus, args.n_paragraphs), "wb"))
def main(Data: pd.DataFrame, nlp, model_dir, model): #parser = argparse.ArgumentParser(description="Run an ELMo model on user input") #parser.add_argument("model", help="Model directory") #parser.add_argument("question", help="Question to answer") #parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+') #args = parser.parse_args() #print("Preprocessing...") # Load the model #model_dir = ModelDir(MODEL_DIR) #model = model_dir.get_model() print(model) if not isinstance(model, ParagraphQuestionModel): raise ValueError( "This script is built to work for ParagraphQuestionModel models only" ) #print(model) # Read the documents documents = [] documents.append(Data.at[0, 'Filetext']) """import pyodbc conn = pyodbc.connect("Driver={ODBC Driver 13 for SQL Server};" "Server=192.168.100.15;" "Database=PharmaAce;" "UID=sa;" "PWD=admin@123;" "Trusted_Connection=no;") cursor=conn.cursor() #(23211,28690,33214,25638,25837,26454,28693,26137,31428,32087) for doc in cursor.execute("select cast(filetext as varchar(max)) as filetext from kpl_tmp"): documents.append(doc[0]) #doc="D:\Document QnA\document-qa-master\Data\Drug_Delivery_Surveying_Global_Competitive_Landscape_BMI.txt" if not isfile(doc): raise ValueError(doc + " does not exist") with open(doc, "r") as f: documents.append(f.read()) """ #print("Loaded %d documents" % len(documents)) #temp=documents[0].split() # Split documents into lists of paragraphs #documents=[" ".join(temp[i:(i+400)]) for i in range(1,len(temp),400)] documents = [re.split("\s*\n\s*", doc) for doc in documents] # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat( Data.at[0, 'Question']) # List of words # Now list of document->paragraph->sentence->word documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents] # Now group the document into paragraphs, this returns `ExtractedParagraph` objects # that additionally remember the start/end token of the paragraph within the source document splitter = MergeParagraphs(400) #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping documents = [splitter.split(doc) for doc in documents] #print(str(len(documents))+" kpl") #kpl # Now select the top paragraphs using a `ParagraphFilter` print(len(documents)) #kpl if len(documents) == 1: # Use TF-IDF to select top paragraphs from the document selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5) context = selector.prune(question, documents[0]) else: # Use a linear classifier to select top paragraphs among all the documents selector = ShallowOpenWebRanker(n_to_select=10) context = selector.prune(question, flatten_iterable(documents)) #print("Select %d paragraph" % len(context)) if model.preprocessor is not None: # Models are allowed to define an additional pre-processing step # This will turn the `ExtractedParagraph` objects back into simple lists of tokens context = [ model.preprocessor.encode_text(question, x) for x in context ] else: # Otherwise just use flattened text context = [flatten_iterable(x.text) for x in context] print("Setting up model") # Tell the model the batch size (can be None) and vocab to expect, This will load the # needed word vectors and fix the batch size to use when building the graph / encoding the input voc = set(question) for txt in context: voc.update(txt) model.set_input_spec(nlp, 
ParagraphAndQuestionSpec(batch_size=len(context)), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch, confidence scores being the pre-softmax logit for the span #print("Build tf graph") #kpl print("after set input spec") sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) # We need to use sess.as_default when working with the cuNND stuff, since we need an active # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this. with sess.as_default(): # 8 means to limit the span to size 8 or less best_spans, conf = model.get_prediction().get_best_span(8) # Loads the saved weights model_dir.restore_checkpoint(sess) print("after loading weights") # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ ParagraphAndQuestion(x, question, None, "user-question%d" % i) for i, x in enumerate(context) ] #print("Starting run") # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs # into numpy arrays, then we use `sess` to run the actual model get the predictions encoded = model.encode( data, is_train=True) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded) # feed_dict -> predictions best_para = np.argmax( conf ) # We get output for each paragraph, select the most-confident one to print #print("Best Paragraph: " + str(best_para)) #print("Best span: " + str(best_spans[best_para])) #print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1])) #print("Confidence: " + str(conf[best_para])) return " ".join( context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1]) #if __name__ == "__main__": # main()
def main(): parser = argparse.ArgumentParser(description='Run the demo server') parser.add_argument('model', help='Models to use') parser.add_argument( '-v', '--voc', help='vocab to use, only words from this file will be used') parser.add_argument('-t', '--tokens', type=int, default=400, help='Number of tokens to use per paragraph') parser.add_argument('--vec_dir', help='Location to find word vectors') parser.add_argument('--n_paragraphs', type=int, default=12, help="Number of paragraphs to run the model on") parser.add_argument('--span_bound', type=int, default=8, help="Max span size to return as an answer") parser.add_argument( '--tagme_api_key', help="Key to use for TAGME (tagme.d4science.org/tagme)") parser.add_argument('--bing_api_key', help="Key to use for bing searches") parser.add_argument('--tagme_thresh', default=0.2, type=float) parser.add_argument('--no_wiki', action="store_true", help="Dont use TAGME") parser.add_argument('--n_web', type=int, default=10, help='Number of web docs to fetch') parser.add_argument('--blacklist_trivia_sites', action="store_true", help="Don't use trivia websites") parser.add_argument('-c', '--wiki_cache', help="Cache wiki articles in this directory") parser.add_argument('--n_dl_threads', type=int, default=5, help="Number of threads to download documents with") parser.add_argument('--request_timeout', type=int, default=60) parser.add_argument('--download_timeout', type=int, default=25) parser.add_argument('--workers', type=int, default=1, help="Number of server workers") parser.add_argument('--debug', default=None, choices=["random_model", "dummy_qa"]) args = parser.parse_args() span_bound = args.span_bound if args.tagme_api_key is not None: tagme_api_key = args.tagme_api_key else: tagme_api_key = environ.get("TAGME_API_KEY") if args.bing_api_key is not None: bing_api_key = args.bing_api_key else: bing_api_key = environ.get("BING_API_KEY") if bing_api_key is None and args.n_web > 0: raise ValueError("If n_web > 0 you must give a BING_API_KEY") if args.debug is None: model = ModelDir(args.model) else: model = RandomPredictor(5, WithIndicators()) if args.vec_dir is not None: loader = LoadFromPath(args.vec_dir) else: loader = ResourceLoader() if args.debug == "dummy_qa": qa = DummyQa() else: qa = QaSystem( args.wiki_cache, MergeParagraphs(args.tokens), ShallowOpenWebRanker(args.n_paragraphs), args.voc, model, loader, bing_api_key, tagme_api_key=tagme_api_key, n_dl_threads=args.n_dl_threads, blacklist_trivia_sites=args.blacklist_trivia_sites, download_timeout=args.download_timeout, span_bound=span_bound, tagme_threshold=None if args.no_wiki else args.tagme_thresh, n_web_docs=args.n_web) logging.propagate = False formatter = logging.Formatter("%(asctime)s: %(levelname)s: %(message)s") handler = logging.StreamHandler() handler.setFormatter(formatter) logging.root.addHandler(handler) logging.root.setLevel(logging.DEBUG) app = Sanic() app.config.REQUEST_TIMEOUT = args.request_timeout @app.route("/answer") async def answer(request): try: question = request.args["question"][0] if question == "": return response.json({'message': 'No question given'}, status=400) spans, paras = await qa.answer_question(question) answers = select_answers(paras, spans, span_bound, 10) return json([x.to_json() for x in answers]) except Exception as e: log.info("Error: " + str(e)) raise ServerError("Server Error", status_code=500) @app.route('/answer-from', methods=['POST']) async def answer_from(request): try: args = ujson.loads(request.body.decode("utf-8")) question = 
args.get("question") if question is None or question == "": return response.json({'message': 'No question given'}, status=400) doc = args["document"] if len(doc) > 500000: raise ServerError("Document too large", status_code=400) spans, paras = qa.answer_with_doc(question, doc) answers = select_answers(paras, spans, span_bound, 10) return json([x.to_json() for x in answers]) except Exception as e: log.info("Error: " + str(e)) raise ServerError("Server Error", status_code=500) app.static('/', './docqa//server/static/index.html') app.static('/about.html', './docqa//service/static/about.html') app.run(host="0.0.0.0", port=8000, workers=args.workers, debug=False)
def getAnswer(self): #parser = argparse.ArgumentParser(description="Run an ELMo model on user input") #parser.add_argument("model", help="Model directory") #parser.add_argument("question", help="Question to answer") #parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+') #args = parser.parse_args() #print("Preprocessing...") # Load the model model_dir = ModelDir(MODEL_DIR) model = model_dir.get_model() if not isinstance(model, ParagraphQuestionModel): raise ValueError( "This script is built to work for ParagraphQuestionModel models only" ) conn = pyodbc.connect(DB_CONN) cursor = conn.cursor() #(23211,28690,33214,25638,25837,26454,28693,26137,31428,32087) query="select cast(filetext as varchar(max)) as filetext, name, type from dbo.UserworkspaceData where objectmasterid= "+\ str(self.ObjectMasterId)+\ " order by id asc" #query="select cast(filetext as varchar(max)) as filetext from kpl_tmp" documents = [] document = "" name = "" filetype = 0 for doc in cursor.execute(query): document = document + doc[0] name = doc[1] filetype = doc[2] #open("E:/kpl.txt","w+").write(document) documents.append(document) #documents.replace("\n\n","\n") #r.sub("",documents) #documents=" ".join(documents.split()) #open("E:\kpl_test.txt","w+").write(document) #doc="D:\Document QnA\document-qa-master\Data\Drug_Delivery_Surveying_Global_Competitive_Landscape_BMI.txt" # ============================================================================= # if not isfile(doc): # raise ValueError(doc + " does not exist") # with open(doc, "r") as f: # documents.append(f.read()) # ============================================================================= #print("Loaded %d documents" % len(documents)) #temp=documents[0].split() # Split documents into lists of paragraphs #documents=[" ".join(temp[i:(i+400)]) for i in range(1,len(temp),400)] documents = [re.split("\s*\n\s*", doc) for doc in documents] # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat( self.Question) # List of words # Now list of document->paragraph->sentence->word documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents] # Now group the document into paragraphs, this returns `ExtractedParagraph` objects # that additionally remember the start/end token of the paragraph within the source document splitter = MergeParagraphs(400) #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping documents = [splitter.split(doc) for doc in documents] #print(str(len(documents))+" kpl") #kpl # Now select the top paragraphs using a `ParagraphFilter` if len(documents) == 1: # Use TF-IDF to select top paragraphs from the document selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5) context = selector.prune(question, documents[0]) else: # Use a linear classifier to select top paragraphs among all the documents selector = ShallowOpenWebRanker(n_to_select=10) context = selector.prune(question, flatten_iterable(documents)) #print("Select %d paragraph" % len(context)) if model.preprocessor is not None: # Models are allowed to define an additional pre-processing step # This will turn the `ExtractedParagraph` objects back into simple lists of tokens context = [ model.preprocessor.encode_text(question, x) for x in context ] else: # Otherwise just use flattened text context = [flatten_iterable(x.text) for x in context] 
#x=open("E:\context.txt","a+") #[x.write(" ".join(cont)) for cont in context] #x.write("\n.......................................................\n") #print("Setting up model") # Tell the model the batch size (can be None) and vocab to expect, This will load the # needed word vectors and fix the batch size to use when building the graph / encoding the input voc = set(question) for txt in context: voc.update(txt) model.set_input_spec(self.nlp, ParagraphAndQuestionSpec(batch_size=len(context)), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch, confidence scores being the pre-softmax logit for the span #print("Build tf graph") #kpl sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) # We need to use sess.as_default when working with the cuNND stuff, since we need an active # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this. with sess.as_default(): # 8 means to limit the span to size 8 or less best_spans, conf = model.get_prediction().get_best_span(8) # Loads the saved weights model_dir.restore_checkpoint(sess) # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ ParagraphAndQuestion(x, question, None, "user-question%d" % i) for i, x in enumerate(context) ] #print("Starting run") # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs # into numpy arrays, then we use `sess` to run the actual model get the predictions encoded = model.encode( data, is_train=True) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run( [best_spans, conf], feed_dict=encoded) # feed_dict -> predictions best_para = np.argmax( conf ) # We get output for each paragraph, select the most-confident one to print #print("Best Paragraph: " + str(best_para)) #print("Best span: " + str(best_spans[best_para])) #print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1])) #print("Confidence: " + str(conf[best_para])) Answer = " ".join(context[best_para] [best_spans[best_para][0]:best_spans[best_para][1] + 1]) print("Confidence: " + str(conf[best_para])) print("Best Paragraph: " + str(best_para)) print("Best span: " + str(best_spans[best_para])) print("Answer text: " + Answer) print(" ".join(context[best_para])) context[best_para][best_spans[best_para][ 0]] = r"<em>" + context[best_para][best_spans[best_para][0]] context[best_para][best_spans[best_para][1]] = context[best_para][ best_spans[best_para][1]] + r"</em>" start = 0 end = len(context[best_para]) positions = [ x for x, n in enumerate(context[best_para] [0:best_spans[best_para][0]]) if n == "." ] if len(positions) >= 2: start = positions[len(positions) - 2] + 1 positions = [ x for x, n in enumerate(context[best_para][best_spans[best_para][1] + 1:]) if n == "." ] if len(positions) > 1: end = best_spans[best_para][1] + 1 + positions[1] d = dict() if conf[best_para] > 10: d["answer"] = Answer else: d["answer"] = "" d["name"] = name d["filetype"] = filetype d["paragraph"] = re.sub(r' (?=\W)', '', " ".join(context[best_para][start:end])) d["ObjectMasterId"] = self.ObjectMasterId return d #if __name__ == "__main__": # main()
def main(): parser = argparse.ArgumentParser( description='Evaluate a model on TriviaQA data') parser.add_argument('model', help='model directory') parser.add_argument( '-p', '--paragraph_output', type=str, help="Save fine grained results for each paragraph in csv format") parser.add_argument('-o', '--official_output', type=str, help="Build an offical output file with the model's" " most confident span for each (question, doc) pair") parser.add_argument('--no_ema', action="store_true", help="Don't use EMA weights even if they exist") parser.add_argument( '--n_processes', type=int, default=None, help= "Number of processes to do the preprocessing (selecting paragraphs+loading context) with" ) parser.add_argument('-i', '--step', type=int, default=None, help="checkpoint to load, default to latest") parser.add_argument('-n', '--n_sample', type=int, default=None, help="Number of questions to evaluate on") parser.add_argument('-a', '--async', type=int, default=10) parser.add_argument('-t', '--tokens', type=int, default=400, help="Max tokens per a paragraph") parser.add_argument('-g', '--n_paragraphs', type=int, default=15, help="Number of paragraphs to run the model on") parser.add_argument('-f', '--filter', type=str, default=None, choices=["tfidf", "truncate", "linear"], help="How to select paragraphs") parser.add_argument( '-b', '--batch_size', type=int, default=200, help="Batch size, larger sizes might be faster but wll take more memory" ) parser.add_argument('--max_answer_len', type=int, default=8, help="Max answer span to select") parser.add_argument('-c', '--corpus', choices=[ "web-dev", "web-test", "web-verified-dev", "web-train", "open-dev", "open-train", "wiki-dev", "wiki-test" ], default="web-verified-dev") parser.add_argument("-s", "--source_dir", type=str, default=None, help="where to take input files") parser.add_argument("--n_span_per_q", type=int, default=1, help="where to take input files") args = parser.parse_args() dataset_name = args.source_dir.split('/')[-1] model_name = args.model.split('/')[-1] ElasticLogger().write_log('INFO', 'Start Evaluation', context_dict={ 'model': model_name, 'dataset': dataset_name }) model_dir = ModelDir(args.model) model = model_dir.get_model() if args.corpus.startswith('web'): dataset = TriviaQaWebDataset() if args.corpus == "web-dev": test_questions = dataset.get_dev() elif args.corpus == "web-test": test_questions = dataset.get_test() elif args.corpus == "web-verified-dev": test_questions = dataset.get_verified() elif args.corpus == "web-train": test_questions = dataset.get_train() else: raise AssertionError() elif args.corpus.startswith("wiki"): dataset = TriviaQaWikiDataset() if args.corpus == "wiki-dev": test_questions = dataset.get_dev() elif args.corpus == "wiki-test": test_questions = dataset.get_test() else: raise AssertionError() else: dataset = TriviaQaOpenDataset(args.source_dir) if args.corpus == "open-dev": # just loading the pkl that was saved in build_span_corpus test_questions = dataset.get_dev() elif args.corpus == "open-train": test_questions = dataset.get_train() else: raise AssertionError() ### ALON debuging #test_questions = test_questions[0:5] corpus = dataset.evidence splitter = MergeParagraphs(args.tokens) per_document = args.corpus.startswith( "web") # wiki and web are both multi-document #per_document = True filter_name = args.filter if filter_name is None: # Pick default depending on the kind of data we are using if per_document: filter_name = "tfidf" else: filter_name = "linear" print("Selecting %d paragraphs using 
method \"%s\" per %s" % (args.n_paragraphs, filter_name, ("question-document pair" if per_document else "question"))) if filter_name == "tfidf": para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs) elif filter_name == "truncate": para_filter = FirstN(args.n_paragraphs) elif filter_name == "linear": para_filter = ShallowOpenWebRanker(args.n_paragraphs) else: raise ValueError() n_questions = args.n_sample docqa.config.SPANS_PER_QUESTION = args.n_span_per_q #n_questions = 1 if n_questions is not None: test_questions.sort(key=lambda x: x.question_id) np.random.RandomState(0).shuffle(test_questions) test_questions = test_questions[:n_questions] print("Building question/paragraph pairs...") # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor if per_document: prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor, require_an_answer=False) else: prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor, require_an_answer=False) prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000) data = [] for q in prepped_data.data: for i, p in enumerate(q.paragraphs): if q.answer_text is None: ans = None else: ans = TokenSpans(q.answer_text, p.answer_spans) data.append( DocumentParagraphQuestion(q.question_id, p.doc_id, (p.start, p.end), q.question, p.text, ans, i)) # Reverse so our first batch will be the largest (so OOMs happen early) questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)), reverse=True) print("Done, starting eval") if args.step is not None: if args.step == "latest": checkpoint = model_dir.get_latest_checkpoint() else: checkpoint = model_dir.get_checkpoint(int(args.step)) else: checkpoint = model_dir.get_best_weights() if checkpoint is not None: print("Using best weights") else: print("Using latest checkpoint") checkpoint = model_dir.get_latest_checkpoint() test_questions = ParagraphAndQuestionDataset( questions, FixedOrderBatcher(args.batch_size, True)) evaluation = trainer.test( model, [RecordParagraphSpanPrediction(args.max_answer_len, True)], {args.corpus: test_questions}, ResourceLoader(), checkpoint, not args.no_ema, args. 
async)[args.corpus] if not all(len(x) == len(data) for x in evaluation.per_sample.values()): raise RuntimeError() df = pd.DataFrame(evaluation.per_sample) if args.official_output is not None: print("Saving question result") fns = {} if per_document: # I didn't store the unormalized filenames exactly, so unfortunately we have to reload # the source data to get exact filename to output an official test script print("Loading proper filenames") if args.corpus == 'web-test': source = join(TRIVIA_QA, "qa", "web-test-without-answers.json") elif args.corpus == "web-dev": source = join(TRIVIA_QA, "qa", "web-dev.json") else: raise AssertionError() with open(join(source)) as f: data = json.load(f)["Data"] for point in data: for doc in point["EntityPages"]: filename = doc["Filename"] fn = join("wikipedia", filename[:filename.rfind(".")]) fn = normalize_wiki_filename(fn) fns[(point["QuestionId"], fn)] = filename answers = {} scores = {} for q_id, doc_id, start, end, txt, score in df[[ "question_id", "doc_id", "para_start", "para_end", "text_answer", "predicted_score" ]].itertuples(index=False): filename = dataset.evidence.file_id_map[doc_id] if per_document: if filename.startswith("web"): true_name = filename[4:] + ".txt" else: true_name = fns[(q_id, filename)] # Alon Patch for triviaqa test results true_name = true_name.replace('TriviaQA_Org/', '') key = q_id + "--" + true_name else: key = q_id prev_score = scores.get(key) if prev_score is None or prev_score < score: scores[key] = score answers[key] = txt with open(args.official_output, "w") as f: json.dump(answers, f) output_file = args.paragraph_output if output_file is not None: print("Saving paragraph result") df.to_csv(output_file, index=False) print("Computing scores") if per_document: group_by = ["question_id", "doc_id"] else: group_by = ["question_id"] # Print a table of scores as more paragraphs are used df.sort_values(group_by + ["rank"], inplace=True) df_scores = df.copy(deep=True) df_scores['predicted_score'] = df_scores['predicted_score'].apply( lambda x: pd.Series(x).max()) em = compute_ranked_scores(df_scores, "predicted_score", "text_em", group_by) f1 = compute_ranked_scores(df_scores, "predicted_score", "text_f1", group_by) table = [["N Paragraphs", "EM", "F1"]] table += list([str(i + 1), "%.4f" % e, "%.4f" % f] for i, (e, f) in enumerate(zip(em, f1))) table_df = pd.DataFrame(table[1:], columns=table[0]).drop(['N Paragraphs'], axis=1) ElasticLogger().write_log('INFO', 'Results', context_dict={'model': model_name, 'dataset': dataset_name, \ 'max_EM':table_df.max().ix['EM'], \ 'max_F1':table_df.max().ix['F1'], \ 'result_table': str(table_df)}) df_flat = [] for id, question in df.iterrows(): for text_answer, predicted_span, predicted_score in zip( question['text_answer'], question['predicted_span'], question['predicted_score']): new_question = dict(question.copy()) new_question.update({ 'text_answer': text_answer, 'predicted_span': predicted_span, 'predicted_score': predicted_score }) df_flat.append(new_question) results_df = pd.DataFrame(df_flat) #Alon: outputing the estimates for all the #results_df = results_df.groupby(['question_id', 'text_answer']).apply(lambda df: df.ix[df['predicted_score'].argmax()]).reset_index(drop=True) results_df.sort_values(by=['question_id', 'predicted_score'], ascending=False).set_index([ 'question_id', 'text_answer' ])[['question', 'predicted_score', 'text_em']].to_csv('results.csv') print_table(table)
def perform_evaluation(model_name: str,
                       dataset_names: List[str],
                       tokens_per_paragraph: int,
                       filter_type: str,
                       n_processes: int,
                       n_paragraphs: int,
                       batch_size: int,
                       checkpoint: str,
                       no_ema: bool,
                       max_answer_len: int,
                       official_output_path: str,
                       paragraph_output_path: str,
                       aggregated_output_path: str,
                       elmo_char_cnn: bool,
                       n_samples: Union[int, None],
                       per_document: bool = False):
    """Perform an evaluation using cape's answer decoder

    A file will be created listing the answers per question ID for each dataset

    :param model_name: path to the model to evaluate
    :param dataset_names: list of strings of datasets to evaluate
    :param tokens_per_paragraph: how big to make paragraph chunks
    :param filter_type: how to select the paragraphs to read
    :param n_processes: how many processes to use when multiprocessing
    :param n_paragraphs: how many paragraphs to read per question
    :param batch_size: how many datapoints to evaluate at once
    :param checkpoint: string, checkpoint to load
    :param no_ema: if true, dont use EMA weights
    :param max_answer_len: the maximum allowable length of an answer in tokens
    :param official_output_path: path to write official output to
    :param paragraph_output_path: path to write paragraph output to
    :param aggregated_output_path: path to write aggregated output to
    :param elmo_char_cnn: if true, uses the elmo CNN to make token embeddings,
        less OOV but requires much more memory
    :param per_document: if false, return best scoring answer to a question,
        if true, the best scoring answer from each document is used instead.
    """
    use_async = True  # note: `async` is a reserved word in Python 3.7+, so don't use it as a name
    corpus_name = 'all'
    print('Setting Up:')
    model_dir = ModelDir(model_name)
    model = model_dir.get_model()
    dataset = get_multidataset(dataset_names)
    splitter = MergeParagraphs(tokens_per_paragraph)
    para_filter = get_para_filter(filter_type, per_document, n_paragraphs)
    test_questions, n_questions = get_questions(per_document, dataset, splitter,
                                                para_filter, model.preprocessor,
                                                n_processes, batch_size)

    print("Starting eval")
    checkpoint = get_checkpoint(checkpoint, model_dir)
    evaluation = test(model,
                      [RecordParagraphSpanPrediction(max_answer_len, True)],
                      {corpus_name: test_questions}, ResourceLoader(), checkpoint,
                      not no_ema, use_async, n_samples, elmo_char_cnn)[corpus_name]

    print('Exporting and Post-processing')
    if not all(len(x) == n_questions for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    compute_and_dump_official_output(df, official_output_path, per_document=per_document)

    print("Saving paragraph result")
    df.to_csv(paragraph_output_path, index=False)

    print("Computing scores")
    agg_df = get_aggregated_df(df, per_document)
    agg_df.to_csv(aggregated_output_path, index=False)
def perform_evaluation(model_config: CapeDocQAMachineReaderConfig,
                       dataset_name: str,
                       tokens_per_paragraph: int,
                       filter_type: str,
                       n_processes: int,
                       n_paragraphs: int,
                       batch_size: int,
                       max_answer_len: int,
                       official_output_path: str,
                       paragraph_output_path: str,
                       aggregated_output_path: str,
                       n_samples: Union[int, None],
                       per_document: bool = False,
                       ):
    """Perform an evaluation using cape's answer decoder

    A file will be created listing the answers per question ID for each dataset

    :param model_config: config of the machine reader model to evaluate
    :param dataset_name: name of the dataset to evaluate
    :param tokens_per_paragraph: how big to make paragraph chunks
    :param filter_type: how to select the paragraphs to read
    :param n_processes: how many processes to use when multiprocessing
    :param n_paragraphs: how many paragraphs to read per question
    :param batch_size: how many datapoints to evaluate at once
    :param max_answer_len: the maximum allowable length of an answer in tokens
    :param official_output_path: path to write official output to
    :param paragraph_output_path: path to write paragraph output to
    :param aggregated_output_path: path to write aggregated output to
    :param per_document: if true, scores each document associated with a question separately,
        if false, just the highest scoring answer from any document is used.
    """
    print('Setting Up:')
    dataset = get_multidataset([dataset_name])
    splitter = MergeParagraphs(tokens_per_paragraph)
    para_filter = get_para_filter(filter_type, per_document, n_paragraphs)
    sess, model, evaluator_runner = build_model_and_evaluator_runner(model_config,
                                                                     max_answer_len,
                                                                     n_paragraphs)
    test_questions, n_questions = get_questions(per_document, dataset, splitter,
                                                para_filter, model.preprocessor,
                                                n_processes, batch_size)
    print('Starting Eval')
    evaluation = evaluator_runner.run_evaluators(sess, test_questions, dataset_name, n_samples, {})

    print('Exporting and Post-processing')
    if not all(len(x) == n_questions for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)
    compute_and_dump_official_output(df, official_output_path)

    print("Saving paragraph result")
    df.to_csv(paragraph_output_path, index=False)

    print("Computing scores")
    agg_df = get_aggregated_df(df, per_document)
    agg_df.to_csv(aggregated_output_path, index=False)

    sess.close()
    del sess
    tf.reset_default_graph()
np.random.RandomState(0).shuffle(questions) has_token = 0 total = 0 used = Counter() for q, doc in tqdm(pairs[:1000]): text = data.evidence.get_document(doc.doc_id, splits.reads_first_n) q_tokens = set(x.lower() for x in q.question) q_tokens -= stop for para in splits.split_annotated(text, doc.answer_spans): # if para.start == 0: # continue if len(para.answer_spans) == 0: continue if any(x.lower() in q_tokens for x in flatten_iterable(para.text)): has_token += 1 for x in flatten_iterable(para.text): if x.lower() in q_tokens: used[x.lower()] += 1 # else: # print_questions(q.question, q.answer.all_answers, para.text, para.answer_spans) # input() total += 1 for k, v in used.most_common(200): print("%s: %d" % (k, v)) print(has_token / total) if __name__ == "__main__": paragraph_stats(TriviaQaWebDataset(), MergeParagraphs(400), 1000)
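The fragment above starts mid-function: the preamble that builds stop, questions and pairs is missing. A minimal sketch of what that setup plausibly looks like, inferred only from the names used and from the paragraph_stats(TriviaQaWebDataset(), MergeParagraphs(400), 1000) call; the signature and variable construction are assumptions:

def paragraph_stats(data, splits, sample):
    # Assumed setup for the counting loop shown above; details are guesses.
    stop = NltkPlusStopWords(True).words            # stop words removed from the question tokens
    questions = list(data.get_train())              # training questions
    pairs = flatten_iterable([(q, d) for d in q.all_docs] for q in questions)  # (question, doc) pairs
    # ... the shuffle / token-overlap counting loop above would follow here,
    # presumably slicing pairs[:sample] instead of the hard-coded 1000.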
def main(): parser = argparse.ArgumentParser(description='Train a model on TriviaQA web') parser.add_argument('mode', choices=["paragraph-level", "confidence", "merge", "shared-norm", "sigmoid", "shared-norm-600"]) parser.add_argument("name", help="Where to store the model") parser.add_argument('-n', '--n_processes', type=int, default=2, help="Number of processes to use when preprocessing the data " "(i.e., selecting which paragraphs to train on)") args = parser.parse_args() mode = args.mode out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S") model = get_model(100, 140, mode, WithIndicators()) stop = NltkPlusStopWords(True) if mode == "paragraph-level": extract = ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 1), model.preprocessor, intern=True) elif mode == "shared-norm-600": extract = ExtractMultiParagraphs(MergeParagraphs(600), TopTfIdf(stop, 4), model.preprocessor, intern=True) else: extract = ExtractMultiParagraphs(MergeParagraphs(400), TopTfIdf(stop, 4), model.preprocessor, intern=True) if mode == "paragraph-level": n_epochs = 16 train = ParagraphAndQuestionsBuilder(ClusteredBatcher(60, ContextLenBucketedKey(3), True)) test = ParagraphAndQuestionsBuilder(ClusteredBatcher(60, ContextLenKey(), False)) n_dev, n_train = 21000, 12000 eval = [LossEvaluator(), SpanEvaluator([4, 8], "triviaqa")] else: eval = [LossEvaluator(), MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge")] # we sample two paragraphs per (question, doc) pair, so evaluate on fewer questions n_dev, n_train = 15000, 8000 if mode == "confidence" or mode == "sigmoid": if mode == "sigmoid": # Trains very slowly, do this at your own risk n_epochs = 71 else: n_epochs = 28 test = RandomParagraphSetDatasetBuilder(120, "flatten", True, 1) train = StratifyParagraphsBuilder(ClusteredBatcher(60, ContextLenBucketedKey(3), True), 0, 1) else: n_epochs = 14 test = RandomParagraphSetDatasetBuilder(120, "merge" if mode == "merge" else "group", True, 1) train = StratifyParagraphSetsBuilder(35, mode == "merge", True, 1) data = TriviaQaWebDataset() params = get_triviaqa_train_params(n_epochs, n_dev, n_train) data = PreprocessedData(data, extract, train, test, eval_on_verified=False) data.preprocess(args.n_processes, 1000) with open(__file__, "r") as f: notes = f.read() notes = "*" * 10 + "\nMode: " + args.mode + "\n" + "*"*10 + "\n" + notes trainer.start_training(data, model, params, eval, model_dir.ModelDir(out), notes)
def main(): parser = argparse.ArgumentParser( description='Train a model on TriviaQA unfiltered') parser.add_argument( 'mode', choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"]) parser.add_argument("name", help="Where to store the model") parser.add_argument("-t", "--n_tokens", default=400, type=int, help="Paragraph size") parser.add_argument( '-n', '--n_processes', type=int, default=2, help="Number of processes to use when preprocessing the data " "(i.e., selecting which paragraphs to train on)") parser.add_argument("-s", "--source_dir", type=str, default=None, help="directory to read input files from") parser.add_argument("--n_epochs", type=int, default=None, help="Maximum number of epochs to train for") parser.add_argument("--char_th", type=int, default=None, help="character count threshold for the char-level embeddings") parser.add_argument("--hl_dim", type=int, default=None, help="hidden layer dim size") parser.add_argument("--regularization", type=int, default=None, help="regularization weight") parser.add_argument("--LR", type=float, default=1.0, help="learning rate") parser.add_argument("--save_every", type=int, default=1800, help="save period") parser.add_argument("--init_from", type=str, default=None, help="model to init from") args = parser.parse_args() mode = args.mode #out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S") out = join('models', args.name) char_th = 100 hl_dim = 140 if args.char_th is not None: print(args.char_th) char_th = int(args.char_th) out += '--th' + str(char_th) if args.hl_dim is not None: print(args.hl_dim) hl_dim = int(args.hl_dim) out += '--hl' + str(hl_dim) if args.init_from is None: model = get_model(char_th, hl_dim, mode, WithIndicators()) else: md = model_dir.ModelDir(args.init_from) model = md.get_model() extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens), ShallowOpenWebRanker(16), model.preprocessor, intern=True) eval = [ LossEvaluator(), MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge", per_doc=False) ] oversample = [1] * 4 if mode == "paragraph": n_epochs = 120 test = RandomParagraphSetDatasetBuilder(120, "flatten", True, oversample) train = StratifyParagraphsBuilder(ClusteredBatcher( 60, ContextLenBucketedKey(3), True), oversample, only_answers=True) elif mode == "confidence" or mode == "sigmoid": if mode == "sigmoid": n_epochs = 640 else: n_epochs = 160 test = RandomParagraphSetDatasetBuilder(120, "flatten", True, oversample) train = StratifyParagraphsBuilder( ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample) else: n_epochs = 80 test = RandomParagraphSetDatasetBuilder( 120, "merge" if mode == "merge" else "group", True, oversample) train = StratifyParagraphSetsBuilder(30, mode == "merge", True, oversample) if args.n_epochs is not None: n_epochs = args.n_epochs out += '--' + str(n_epochs) if args.LR != 1.0: out += '--' + str(args.LR) data = TriviaQaOpenDataset(args.source_dir) async_encoding = 10 #async_encoding = 0 params = TrainParams(SerializableOptimizer("Adadelta", dict(learning_rate=args.LR)), num_epochs=n_epochs, num_of_steps=250000, ema=0.999, max_checkpoints_to_keep=2, async_encoding=async_encoding, log_period=30, eval_period=1800, save_period=args.save_every, eval_samples=dict(dev=None, train=6000), regularization_weight=None) data = PreprocessedData(data, extract, train, test, eval_on_verified=False) data.preprocess(args.n_processes, 1000) with open(__file__, "r") as f: notes = f.read() notes = "Mode: " + args.mode + "\n" + notes if args.init_from is not None: init_from = 
model_dir.ModelDir(args.init_from).get_best_weights() if init_from is None: init_from = model_dir.ModelDir( args.init_from).get_latest_checkpoint() else: init_from = None trainer.start_training(data, model, params, eval, model_dir.ModelDir(out), notes, initialize_from=init_from)
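The TriviaQA-unfiltered training script above encodes its hyperparameters into the output directory name; a small worked example of that naming scheme, using hypothetical flag values:

from os.path import join

out = join('models', 'my-run')   # base directory from the positional `name` argument
char_th, hl_dim = 50, 200        # --char_th 50 --hl_dim 200
out += '--th' + str(char_th)     # models/my-run--th50
out += '--hl' + str(hl_dim)      # models/my-run--th50--hl200
n_epochs = 120                   # --n_epochs 120
out += '--' + str(n_epochs)      # models/my-run--th50--hl200--120
lr = 0.5                         # --LR 0.5 (only appended when it differs from 1.0)
out += '--' + str(lr)            # models/my-run--th50--hl200--120--0.5
print(out)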
def main(): parser = argparse.ArgumentParser() parser.add_argument("corpus", choices=["en", "en_trans_de", "en_trans_zh"]) parser.add_argument( 'mode', choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"]) # Note I haven't tested modes other than `shared-norm` on this corpus, so # some things might need adjusting parser.add_argument("name", help="Where to store the model") parser.add_argument("-t", "--n_tokens", default=400, type=int, help="Paragraph size") parser.add_argument( '-n', '--n_processes', type=int, default=2, help="Number of processes (i.e., select which paragraphs to train on) " "the data with") args = parser.parse_args() mode = args.mode corpus = args.corpus out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S") model = get_model(100, 140, mode, WithIndicators()) extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens), ShallowOpenWebRanker(16), model.preprocessor, intern=True) eval = [ LossEvaluator(), MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge", per_doc=False) ] oversample = [ 1 ] * 2 # Sample the top two answer-containing paragraphs twice if mode == "paragraph": n_epochs = 120 test = RandomParagraphSetDatasetBuilder(120, "flatten", True, oversample) train = StratifyParagraphsBuilder(ClusteredBatcher( 60, ContextLenBucketedKey(3), True), oversample, only_answers=True) elif mode == "confidence" or mode == "sigmoid": if mode == "sigmoid": n_epochs = 640 else: n_epochs = 160 test = RandomParagraphSetDatasetBuilder(120, "flatten", True, oversample) train = StratifyParagraphsBuilder( ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample) else: n_epochs = 80 test = RandomParagraphSetDatasetBuilder( 120, "merge" if mode == "merge" else "group", True, oversample) train = StratifyParagraphSetsBuilder(30, mode == "merge", True, oversample) data = XQADataset(corpus) params = TrainParams(SerializableOptimizer("Adadelta", dict(learning_rate=1)), num_epochs=n_epochs, ema=0.999, max_checkpoints_to_keep=2, async_encoding=10, log_period=30, eval_period=1800, save_period=1800, best_weights=("dev", "b8/question-text-f1"), eval_samples=dict(dev=None, train=6000)) data = PreprocessedData(data, extract, train, test, eval_on_verified=False) data.preprocess(args.n_processes, 1000) with open(__file__, "r") as f: notes = f.read() notes = "Mode: " + args.mode + "\n" + notes trainer.start_training(data, model, params, eval, model_dir.ModelDir(out), notes)
def predict(): json_data = {"success": False, "predictions": []} print("Preprocessing...") # Load the model model_dir = ModelDir( "/home/antriv/conversation_ai/Transfer_Learning/ALLENAI_DocumentQA/document-qa/pretrained_models/models/triviaqa-unfiltered-shared-norm" ) model = model_dir.get_model() if not isinstance(model, ParagraphQuestionModel): raise ValueError( "This script is built to work for ParagraphQuestionModel models only" ) # Load the question question = (flask.request.data).decode("utf-8") # Read the documents documents = [] doclist = ["/home/antriv/data/The-Future-Computed.txt"] for doc in doclist: if not isfile(doc): raise ValueError(doc + " does not exist") with open(doc, "r") as f: documents.append(f.read()) print("Loaded %d documents" % len(documents)) # Split documents into lists of paragraphs documents = [re.split("\s*\n\s*", doc) for doc in documents] # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat(question) # List of words # Now list of document->paragraph->sentence->word documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents] # Now group the document into paragraphs, this returns `ExtractedParagraph` objects # that additionally remember the start/end token of the paragraph within the source document splitter = MergeParagraphs(400) #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping documents = [splitter.split(doc) for doc in documents] # Now select the top paragraphs using a `ParagraphFilter` if len(documents) == 1: # Use TF-IDF to select top paragraphs from the document selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=1000) context = selector.prune(question, documents[0]) else: # Use a linear classifier to select top paragraphs among all the documents selector = ShallowOpenWebRanker(n_to_select=1000) context = selector.prune(question, flatten_iterable(documents)) print("Select %d paragraph" % len(context)) if model.preprocessor is not None: # Models are allowed to define an additional pre-processing step # This will turn the `ExtractedParagraph` objects back into simple lists of tokens context = [ model.preprocessor.encode_text(question, x) for x in context ] else: # Otherwise just use flattened text context = [flatten_iterable(x.text) for x in context] print("Setting up model") # Tell the model the batch size (can be None) and vocab to expect, This will load the # needed word vectors and fix the batch size to use when building the graph / encoding the input voc = set(question) for txt in context: voc.update(txt) model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch, confidence scores being the pre-softmax logit for the span print("Build tf graph") sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) # We need to use sess.as_default when working with the cuNND stuff, since we need an active # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this. 
with sess.as_default(): # 8 means to limit the span to size 8 or less best_spans, conf = model.get_prediction().get_best_span(8) # Loads the saved weights model_dir.restore_checkpoint(sess) # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ ParagraphAndQuestion(x, question, None, "user-question%d" % i) for i, x in enumerate(context) ] print("Starting run") # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs # into numpy arrays, then we use `sess` to run the actual model and get the predictions encoded = model.encode( data, is_train=False) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded) # feed_dict -> predictions best_para = np.argmax( conf ) # We get output for each paragraph, select the most-confident one to print print("Best Paragraph: " + str(best_para)) print("Best span: " + str(best_spans[best_para])) print("Answer text: " + " ".join(context[best_para] [best_spans[best_para][0]:best_spans[best_para][1] + 1])) print("Confidence: " + str(conf[best_para])) y_output = " ".join( context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1]) print(y_output) json_data["predictions"].append(str(y_output)) # indicate that the request was a success json_data["success"] = True # return the data dictionary as a JSON response return flask.jsonify(json_data)
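A minimal sketch of calling the predict() endpoint above from a client; the host, port and route are assumptions, since the flask route registration is not shown in this snippet:

import requests

# Hypothetical client: POST the raw question text, exactly as predict() expects
# (it reads flask.request.data and decodes it as UTF-8).
resp = requests.post("http://localhost:5000/predict",
                     data="Who wrote the novel Moby-Dick?".encode("utf-8"))
print(resp.json())  # -> {"success": true, "predictions": ["..."]}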
def main(): parser = argparse.ArgumentParser() parser.add_argument( "corpus", choices=["en", "fr", "de", "ru", "pt", "zh", "pl", "uk", "ta"]) parser.add_argument( 'mode', choices=["confidence", "merge", "shared-norm", "sigmoid", "paragraph"]) # Note I haven't tested modes other than `shared-norm` on this corpus, so # some things might need adjusting parser.add_argument("-t", "--n_tokens", default=400, type=int, help="Paragraph size") args = parser.parse_args() mode = args.mode corpus = args.corpus model = get_model(100, 140, mode, WithIndicators()) extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(args.n_tokens), ShallowOpenWebRanker(16), model.preprocessor, intern=True) oversample = [ 1 ] * 2 # Sample the top two answer-containing paragraphs twice if mode == "paragraph": n_epochs = 120 test = RandomParagraphSetDatasetBuilder(120, "flatten", True, oversample) train = StratifyParagraphsBuilder(ClusteredBatcher( 60, ContextLenBucketedKey(3), True), oversample, only_answers=True) elif mode == "confidence" or mode == "sigmoid": if mode == "sigmoid": n_epochs = 640 else: n_epochs = 160 test = RandomParagraphSetDatasetBuilder(120, "flatten", True, oversample) train = StratifyParagraphsBuilder( ClusteredBatcher(60, ContextLenBucketedKey(3), True), oversample) else: n_epochs = 80 test = RandomParagraphSetDatasetBuilder( 120, "merge" if mode == "merge" else "group", True, oversample) train = StratifyParagraphSetsBuilder(30, mode == "merge", True, oversample) data = XQADataset(corpus) data = PreprocessedData(data, extract, train, test, eval_on_verified=False) data.preprocess(1, 1000) # dump preprocessed dev data for bert data.cache_preprocess("dev_data_%s.pkl" % args.corpus)
parser = argparse.ArgumentParser() parser.add_argument('--input_file', required=True, help="input file, e.g. train_data.pkl") parser.add_argument('--output_train_file', required=True, help="output train file, e.g. train_output.json") parser.add_argument('--num_epoch', required=True, type=int, help="num_epoch, e.g. 10") args = parser.parse_args() mode = "shared-norm" model = get_model(100, 140, mode, WithIndicators()) extract = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400), ShallowOpenWebRanker(16), model.preprocessor, intern=True) oversample = [1] * 2 train = StratifyParagraphSetsBuilder(30, mode == "merge", True, oversample) test = RandomParagraphSetDatasetBuilder( 120, "merge" if mode == "merge" else "group", True, oversample) data = PreprocessedData(None, extract, train, test, eval_on_verified=False) data.load_preprocess(args.input_file) outputs = [] training_data = data.get_train() for i in range(args.num_epoch):