def main():
    """Interactively display TriviaQA-web paragraphs dense with answer spans.

    Walks the training questions, keeps documents with more than 3 annotated
    answer spans, TF-IDF-prunes their paragraphs, and prints each surviving
    paragraph (again with > 3 spans) with the answers wrapped in {{{ }}}.
    Waits for user input between paragraphs.
    """
    dataset = TriviaQaWebDataset()
    stop = NltkPlusStopWords()
    splitter = MergeParagraphs(400)
    selector = TopTfIdf(stop, 4)
    print("Loading data..")
    train = dataset.get_train()
    print("Start")
    for q in train:
        for doc in q.all_docs:
            if len(doc.answer_spans) > 3:
                paragraphs = splitter.split_annotated(
                    dataset.evidence.get_document(doc.doc_id), doc.answer_spans)
                paragraphs = selector.prune(q.question, paragraphs)
                for para in paragraphs:
                    if len(para.answer_spans) > 3:
                        print(q.question)
                        tokens = flatten_iterable(para.text)
                        # Mark every answer span inline with {{{ ... }}}
                        for s, e in para.answer_spans:
                            tokens[s] = "{{{" + tokens[s]
                            tokens[e] = tokens[e] + "}}}"
                        print(" ".join(tokens))
                        input()
def read_input_data(model):
    """Read tab-separated (document, question) pairs from OPTS.input_file.

    Each line is tokenized, truncated to 400 tokens, and TF-IDF pruned to the
    top 5 paragraphs. Returns (data, vocab): `data` holds per-line tuples of
    (raw document, raw question, pruned contexts, ParagraphAndQuestion
    examples); `vocab` is the set of every token seen.
    """
    tokenizer = NltkAndPunctTokenizer()
    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
    data = []
    vocab = set()
    with open(OPTS.input_file) as f:
        for i, line in enumerate(f):
            try:
                document_raw, question_raw = line.strip().split('\t')
            except ValueError as e:
                # Surface the offending line before re-raising
                print(line.strip())
                print('Error at line %d' % i)
                raise e
            document = re.split("\s*\n\s*", document_raw)
            question = tokenizer.tokenize_paragraph_flat(question_raw)
            doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
            split_doc = splitter.split(doc_toks)
            context = selector.prune(question, split_doc)
            if model.preprocessor is not None:
                # Model-specific preprocessing step, if any
                context = [model.preprocessor.encode_text(question, x)
                           for x in context]
            else:
                context = [flatten_iterable(x.text) for x in context]
            vocab.update(question)
            for txt in context:
                vocab.update(txt)
            ex = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                  for i, x in enumerate(context)]
            data.append((document_raw, question_raw, context, ex))
    return data, vocab
def show_web_paragraphs():
    """Interactively inspect the top-2 TF-IDF ranked paragraphs for random
    TriviaQA-web (question, document) pairs.

    Answer spans are colored cyan; tokens matching a (non-stopword) question
    word are colored with the error color. Pairs whose second-ranked paragraph
    has no answer span are skipped. Waits for user input between questions.
    """
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    stop_words = stop.words
    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)
    for q, d in points:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words.difference_update(stop_words)
        doc = corpus.evidence.get_document(d.doc_id)
        paragraphs = splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.dists(q.question, paragraphs)
        # Only keep examples where the second-best paragraph has an answer
        if len(ranked) < 2 or len(ranked[1][0].answer_spans) == 0:
            continue
        print(" ".join(q.question))
        print(q.answer.all_answers)
        for rank, (para, dist) in enumerate(ranked[0:2]):
            tokens = flatten_iterable(para.text)
            print("Start=%d, Rank=%d, Dist=%.4f" % (para.start, rank, dist))
            if len(para.answer_spans) == 0:
                continue
            for s, e in para.answer_spans:
                tokens[s] = bcolors.CYAN + tokens[s]
                tokens[e] = tokens[e] + bcolors.ENDC
            for j, w in enumerate(tokens):
                if strip_accents_unicode(w.lower()) in q_words:
                    tokens[j] = bcolors.ERROR + tokens[j] + bcolors.ENDC
            print(" ".join(tokens))
        input()
def main():
    """Re-rank SQuAD paragraphs (TF-IDF or vector-distance) and report stats.

    Builds the appropriate ranker from OPTS, preprocesses the chosen split, and
    prints paragraph-length statistics before/after, optionally writing output.
    """
    corpus = SquadCorpus()
    normalizer = WordNormalizer() if OPTS.normalize_before_ranking else None
    if OPTS.use_vec_dist:
        # Vector-distance variant needs pruned GloVe vectors
        word_vecs = corpus.get_pruned_word_vecs('glove.840B.300d')
        prepro = SquadVectorTfIdfRanker(NltkPlusStopWords(True),
                                        OPTS.num_per_orig, True, word_vecs,
                                        word_normalizer=normalizer)
    else:
        prepro = SquadTfIdfRanker(NltkPlusStopWords(True),
                                  OPTS.num_per_orig, True,
                                  word_normalizer=normalizer)
    orig_data = corpus.get_train() if OPTS.split == 'train' else corpus.get_dev()
    # One entry per (paragraph, question) pair so the mean is question-weighted
    orig_lens = [len(p.text[0])
                 for doc in orig_data
                 for p in doc.paragraphs
                 for q in p.questions]
    new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)
    new_lens = [len(p.text) for q in new_data for p in q.paragraphs]
    print('%d original, mean %.2f words' % (len(orig_lens), np.mean(orig_lens)))
    print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
def get_para_filter(filter_name, per_document, n_paragraphs):
    """Build a paragraph filter keeping `n_paragraphs` paragraphs.

    When `filter_name` is None, defaults to "tfidf" for per-document mode and
    "linear" otherwise. Raises ValueError for an unknown name.
    """
    if filter_name is None:
        filter_name = 'tfidf' if per_document else 'linear'
    if filter_name == "tfidf":
        return TopTfIdf(NltkPlusStopWords(punctuation=True), n_paragraphs)
    if filter_name == "truncate":
        return FirstN(n_paragraphs)
    if filter_name == "linear":
        return ShallowOpenWebRanker(n_paragraphs)
    raise ValueError()
def contains_question_word():
    """Measure how often answer-bearing TriviaQA-web dev paragraphs share a
    non-stopword token with their question, and tally which words match.

    Samples 1000 (question, document) pairs, splits each document into
    400-token paragraphs, and prints the 200 most common matching words plus
    the overall match rate.
    """
    data = TriviaQaWebDataset()
    stop = NltkPlusStopWords(punctuation=True).words
    doc_filter = ContainsQuestionWord(NltkPlusStopWords(punctuation=True))
    splits = MergeParagraphs(400)
    questions = data.get_dev()
    pairs = flatten_iterable([(q, doc) for doc in q.all_docs] for q in questions)
    # Sort for determinism, then seeded-shuffle so the 1000-pair sample below is
    # random but reproducible. (Bug fix: previously `questions` was shuffled
    # instead of `pairs`, which had no effect on the sample actually used.)
    pairs.sort(key=lambda x: (x[0].question_id, x[1].doc_id))
    np.random.RandomState(0).shuffle(pairs)
    has_token = 0
    total = 0
    used = Counter()
    for q, doc in tqdm(pairs[:1000]):
        text = data.evidence.get_document(doc.doc_id, splits.reads_first_n)
        q_tokens = set(x.lower() for x in q.question)
        q_tokens -= stop
        for para in splits.split_annotated(text, doc.answer_spans):
            if len(para.answer_spans) == 0:
                continue
            if any(x.lower() in q_tokens for x in flatten_iterable(para.text)):
                has_token += 1
                # NOTE(review): this count is case-sensitive while the any()
                # test above lower-cases — confirm intent
                for x in flatten_iterable(para.text):
                    if x in q_tokens:
                        used[x] += 1
            total += 1
    for k, v in used.most_common(200):
        print("%s: %d" % (k, v))
    print(has_token / total)
def find_answer(documents, raw_question):
    """Answer `raw_question` from a list of raw document strings.

    Relies on module-level `model`, `sess`, `best_spans` and `conf` (the
    prediction tensors). Returns (answer_text, confidence) for the
    most-confident paragraph.
    """
    global best_spans, conf
    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]
    documents = [re.split("\s*\n\s*", doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(raw_question)
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]
    if len(documents) == 1:
        # Single document: TF-IDF paragraph selection
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Multiple documents: linear ranker across all paragraphs
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))
    context = [flatten_iterable(x.text) for x in context]
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]
    encoded = model.encode(data, is_train=False)
    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)
    best_para = np.argmax(confid)
    s, e = spans[best_para]
    ans = " ".join(context[best_para][s:e + 1])
    confidence = confid[best_para]
    return ans, confidence
def main():
    """TF-IDF-rank SQuAD paragraphs and report paragraph-length statistics.

    Processes the split named by OPTS.split and optionally writes the
    preprocessed data to OPTS.out_file.
    """
    corpus = SquadCorpus()
    prepro = SquadTfIdfRanker(NltkPlusStopWords(True), OPTS.num_per_orig, True)
    orig_data = corpus.get_train() if OPTS.split == 'train' else corpus.get_dev()
    # One entry per (paragraph, question) pair so the mean is question-weighted
    orig_lens = [len(p.text[0])
                 for doc in orig_data
                 for p in doc.paragraphs
                 for q in p.questions]
    new_data = preprocess_par(orig_data, corpus.evidence, prepro, n_processes=1)
    new_lens = [len(p.text) for q in new_data for p in q.paragraphs]
    print('%d original, mean %.2f words' % (len(orig_lens), np.mean(orig_lens)))
    print('%d new, mean %.2f words' % (len(new_lens), np.mean(new_lens)))
    if OPTS.out_file:
        write_output(OPTS.split, new_data, OPTS.out_file)
def show_open_paragraphs(start: int, end: int):
    """Interactively show paragraphs ranked in positions [start, end) by the
    shallow open-web ranker for TriviaQA-open dev questions.

    Answer spans are colored cyan and non-stopword question words are colored
    with the error color. Waits for user input between questions.
    """
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = ShallowOpenWebRanker(6)
    stop_words = stop.words
    print("Loading train")
    corpus = TriviaQaOpenDataset()
    train = corpus.get_dev()
    np.random.shuffle(train)
    for q in train:
        q_words = {strip_accents_unicode(w.lower()) for w in q.question}
        q_words = {x for x in q_words if x not in stop_words}
        para = []
        for d in q.all_docs:
            doc = corpus.evidence.get_document(d.doc_id)
            para += splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.prune(q.question, para)
        if len(ranked) < start:
            continue
        ranked = ranked[start:end]
        print(" ".join(q.question))
        print(q.answer.all_answers)
        # Bug fix: iterate the sliced list directly. The old code indexed
        # `ranked[i]` for i in range(start, end) AFTER slicing, which shows the
        # wrong paragraphs (or raises IndexError) whenever start > 0 or fewer
        # than end - start paragraphs survive the slice.
        for rank, para in enumerate(ranked, start=start):
            text = flatten_iterable(para.text)
            print("Start=%d, Rank=%d" % (para.start, rank))
            if len(para.answer_spans) == 0:
                # No answer in this paragraph; skip highlighting/printing
                continue
            for s, e in para.answer_spans:
                text[s] = bcolors.CYAN + text[s]
                text[e] = text[e] + bcolors.ENDC
            for i, w in enumerate(text):
                if strip_accents_unicode(w.lower()) in q_words:
                    text[i] = bcolors.ERROR + text[i] + bcolors.ENDC
            print(" ".join(text))
        input()
def main():
    """Report how often the top-20 TF-IDF paragraphs contain an answer for
    TriviaQA-open dev questions.

    Prints, per rank position: the cumulative chance any of the first k
    paragraphs has an answer, the per-rank answer rate, and the rank counts.
    """
    data = TriviaQaOpenDataset()
    print("Loading...")
    all_questions = data.get_dev()
    questions = [q for q in all_questions
                 if any(len(x.answer_spans) > 0 for x in q.all_docs)]
    print("%d/%d (%.4f) have an answer" %
          (len(questions), len(all_questions),
           len(questions) / len(all_questions)))
    np.random.shuffle(questions)
    pre = ExtractMultiParagraphsPerQuestion(MergeParagraphs(400),
                                            TopTfIdf(NltkPlusStopWords(), 20),
                                            require_an_answer=False)
    print("Done")
    out = preprocess_par(questions[:2000], data.evidence, pre, 2, 1000)
    n_counts = np.zeros(20)
    n_any = np.zeros(20)
    n_any_all = np.zeros(20)
    for q in out.data:
        for i, p in enumerate(q.paragraphs):
            n_counts[i] += 1
            n_any[i] += len(p.answer_spans) > 0
        # Mark every rank >= the first answer-bearing paragraph
        for i, p in enumerate(q.paragraphs):
            if len(p.answer_spans) > 0:
                n_any_all[i:] += 1
                break
    print(n_any_all / out.true_len)
    print(n_any / n_counts)
    print(n_counts)
def show_stats():
    """Print statistics on where answers land among the top-6 TF-IDF ranked
    paragraphs for 1000 random TriviaQA-web (question, document) pairs.

    Reports the per-rank answer rate and the distribution of which rank
    combinations (up to pairs) contain answers.
    """
    splitter = MergeParagraphs(400)
    stop = NltkPlusStopWords(True)
    ranker = TopTfIdf(stop, 6)
    corpus = TriviaQaWebDataset()
    train = corpus.get_train()
    points = flatten_iterable([(q, d) for d in q.all_docs] for q in train)
    np.random.shuffle(points)
    points = points[:1000]
    counts = np.zeros(6)
    answers = np.zeros(6)
    n_answers = []
    for q, d in tqdm(points):
        doc = corpus.evidence.get_document(d.doc_id)
        paragraphs = splitter.split_annotated(doc, d.answer_spans)
        ranked = ranker.prune(q.question, paragraphs)
        counts[:len(ranked)] += 1
        for rank, para in enumerate(ranked):
            if len(para.answer_spans) > 0:
                answers[rank] += 1
        # Record which rank positions held an answer for this pair
        n_answers.append(tuple(i for i, x in enumerate(ranked)
                               if len(x.answer_spans) > 0))
    print(answers / counts)
    c = Counter()
    other = 0
    for tup in n_answers:
        if len(tup) <= 2:
            c[tup] += 1
        else:
            other += 1
    for p in sorted(c.keys()):
        print(p, c.get(p) / len(points))
    print(other / len(points))
def __init__(self, n_to_select):
    """Set up a TF-IDF based paragraph ranker.

    n_to_select: number of paragraphs to keep after ranking.
    """
    self.n_to_select = n_to_select
    # Stop-word list (with punctuation) fed to the TF-IDF vectorizer
    self._stop = NltkPlusStopWords(True).words
    # Accent-stripping keeps unicode variants of the same word together
    self._tfidf = TfidfVectorizer(strip_accents="unicode", stop_words=self._stop)
def main():
    """Train a document-level SQuAD model in one of several multi-paragraph modes.

    CLI: `mode` in {paragraph, confidence, shared-norm, merge, sigmoid} selects
    the training objective / data layout; `name` is the output directory prefix
    (a timestamp is appended).
    """
    parser = argparse.ArgumentParser(
        description='Train a model on document-level SQuAD')
    parser.add_argument(
        'mode',
        choices=["paragraph", "confidence", "shared-norm", "merge", "sigmoid"])
    parser.add_argument("name", help="Output directory")
    args = parser.parse_args()
    mode = args.mode
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")
    corpus = SquadCorpus()
    if mode == "merge":
        # Adds paragraph start tokens, since we will be concatenating paragraphs together
        pre = WithIndicators(True, para_tokens=False, doc_start_token=False)
    else:
        pre = None
    model = get_model(50, 100, args.mode, pre)
    if mode == "paragraph":
        # Run in the "standard" known-paragraph setting
        if model.preprocessor is not None:
            raise NotImplementedError()
        n_epochs = 26
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
        eval_batching = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(corpus, None, train_batching, eval_batching)
        eval = [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")]
    else:
        # Multi-paragraph settings: pick how eval paragraphs are grouped per mode
        eval_set_mode = {
            "confidence": "flatten",
            "sigmoid": "flatten",
            "shared-norm": "group",
            "merge": "merge"
        }[mode]
        eval_dataset = RandomParagraphSetDatasetBuilder(
            100, eval_set_mode, True, 0)
        if mode == "confidence" or mode == "sigmoid":
            if mode == "sigmoid":
                # needs to be trained for a really long time for reasons unknown, even this might be too small
                n_epochs = 100
            else:
                n_epochs = 50  # more epochs since we only "see" the label very other epoch-osh
            train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
            data = PreprocessedData(
                SquadCorpus(),
                SquadTfIdfRanker(NltkPlusStopWords(True), 4, True, model.preprocessor),
                StratifyParagraphsBuilder(train_batching, 1),
                eval_dataset,
                eval_on_verified=False,
            )
        else:
            # shared-norm / merge: paragraph *sets* are stratified together
            n_epochs = 26
            data = PreprocessedData(
                SquadCorpus(),
                SquadTfIdfRanker(NltkPlusStopWords(True), 4, True, model.preprocessor),
                StratifyParagraphSetsBuilder(25, args.mode == "merge", True, 1),
                eval_dataset,
                eval_on_verified=False,
            )
        eval = [LossEvaluator(), MultiParagraphSpanEvaluator(17, "squad")]
        data.preprocess(1)
    # Record this script's own source in the training notes for reproducibility
    with open(__file__, "r") as f:
        notes = f.read()
    notes = args.mode + "\n" + notes
    trainer.start_training(data, model, train_params(n_epochs), eval,
                           model_dir.ModelDir(out), notes)
def main():
    """Answer a user question against local text documents with a pre-trained
    document-qa model (CPU SQuAD model by default), printing the best
    paragraph, span, and confidence."""
    parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    # parser.add_argument("model", type=int, help="Model directory")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    args = parser.parse_args()

    # Models path
    SQUAD_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad'
    SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad-shared-norm'
    TRIVIAQA_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-unfiltered-shared-norm'
    TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-web-shared-norm'
    models_directory = [
        SQUAD_MODEL_DIRECTORY_PATH,
        SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH,
        TRIVIAQA_MODEL_DIRECTORY_PATH,
        TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH
    ]

    print("Preprocessing...")

    # Load the model (hard-coded to the first, plain-SQuAD, model directory)
    # model_dir = ModelDir(args.model)
    model_dir = ModelDir(models_directory[0])
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Read the documents
    documents = []
    for doc in args.documents:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    documents = [re.split("\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect, This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuNND stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 10 means to limit the span to size 10 or less
        best_spans, conf = model.get_prediction().get_best_span(10)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d"%i)
            for i, x in enumerate(context)]

    print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model get the predictions
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(conf)  # We get output for each paragraph, select the most-confident one to print
    print("Best Paragraph: " + str(best_para))
    para_id = int(str(best_para))
    # print("Best Paragraph: \n" + (" ".join((paras[para_id].text)[0])))
    print("Best Paragraph: \n" + " ".join(context[para_id]))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
    print("Confidence: " + str(conf[best_para]))
def main():
    """Prepare XQA evaluation data: select paragraphs per question (or per
    question-document pair), run the model preprocessor, and pickle the
    resulting DocumentParagraphQuestion list for a downstream (BERT) evaluator."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_processes',
        type=int,
        default=1,
        help=
        "Number of processes to do the preprocessing (selecting paragraphs+loading context) with"
    )
    # NOTE(review): `async` is a reserved keyword from Python 3.7, so this
    # attribute can only be read via getattr(args, "async"); it is unused here.
    parser.add_argument('-a', '--async', type=int, default=10)
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-c', '--corpus',
        choices=[
            "en_dev", "en_test", "fr_dev", "fr_test", "de_dev", "de_test",
            "ru_dev", "ru_test", "pt_dev", "pt_test", "zh_dev", "zh_test",
            "pl_dev", "pl_test", "uk_dev", "uk_test", "ta_dev", "ta_test",
            "fr_trans_en_dev", "fr_trans_en_test", "de_trans_en_dev",
            "de_trans_en_test", "ru_trans_en_dev", "ru_trans_en_test",
            "pt_trans_en_dev", "pt_trans_en_test", "zh_trans_en_dev",
            "zh_trans_en_test", "pl_trans_en_dev", "pl_trans_en_test",
            "uk_trans_en_dev", "uk_trans_en_test", "ta_trans_en_dev",
            "ta_trans_en_test"
        ],
        required=True)
    args = parser.parse_args()

    # Corpus names look like "<lang>_<split>"; split off the eval set suffix
    corpus_name = args.corpus[:args.corpus.rfind("_")]
    eval_set = args.corpus[args.corpus.rfind("_") + 1:]
    dataset = XQADataset(corpus_name)
    if eval_set == "dev":
        test_questions = dataset.get_dev()
    elif eval_set == "test":
        test_questions = dataset.get_test()
    else:
        raise AssertionError()

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    # NOTE(review): none of the --corpus choices above start with "web", so
    # per_document is always False here — confirm this is intended.
    per_document = args.corpus.startswith(
        "web")  # wiki and web are both multi-document

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        if per_document:
            filter_name = "tfidf"
        else:
            filter_name = "linear"
    print("Selecting %d paragraphs using method \"%s\" per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True),
                               args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        # Sort first so the seeded shuffle (and thus the sample) is reproducible
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    preprocessor = WithIndicators()
    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs, and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter,
                                                 preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep,
                                  args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(
                DocumentParagraphQuestion(q.question_id, p.doc_id,
                                          (p.start, p.end), q.question, p.text,
                                          ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    # dump eval data for bert
    import pickle
    pickle.dump(questions,
                open("%s_%d.pkl" % (args.corpus, args.n_paragraphs), "wb"))
def main():
    """Evaluate a model on document-level SQuAD and write per-paragraph results.

    Ranks paragraphs per question with TF-IDF, runs the model over every
    (question, paragraph) pair, prints ranked EM/F1 tables, and saves the
    per-sample results as CSV to the `output` argument.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate a model on document-level SQuAD')
    parser.add_argument('model', help='model to use')
    parser.add_argument(
        'output', type=str,
        help="Store the per-paragraph results in csv format in this file")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="(for testing) sample documents")
    parser.add_argument(
        '-s', '--async', type=int, default=10,
        help="Encoding batch asynchronously, queueing up to this many")
    parser.add_argument('-a', '--answer_bound', type=int, default=17,
                        help="Max answer span length")
    parser.add_argument('-p', '--n_paragraphs', type=int, default=None,
                        help="Max number of paragraphs to use")
    parser.add_argument(
        '-b', '--batch_size', type=int, default=200,
        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-c', '--corpus',
                        choices=["dev", "train", "doc-rd-dev"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)
    print("Loading data")

    questions = []
    ranker = SquadTfIdfRanker(NltkPlusStopWords(True), args.n_paragraphs,
                              force_answer=False)

    if args.corpus == "doc-rd-dev":
        docs = SquadCorpus().get_dev()
        if args.n_sample is not None:
            # Sort before the seeded shuffle so the sample is reproducible
            docs.sort(key=lambda x: x.doc_id)
            np.random.RandomState(0).shuffle(docs)
            docs = docs[:args.n_sample]
        print("Fetching document reader docs...")
        doc_rd_versions = get_doc_rd_doc(docs)
        print("Ranking and matching with questions...")
        for doc in tqdm(docs):
            doc_questions = flatten_iterable(x.questions
                                             for x in doc.paragraphs)
            paragraphs = doc_rd_versions[doc.title]
            ranks = ranker.rank([x.words for x in doc_questions],
                                [x.text for x in paragraphs])
            for i, question in enumerate(doc_questions):
                para_ranks = np.argsort(ranks[i])
                for para_rank, para_num in enumerate(
                        para_ranks[:args.n_paragraphs]):
                    # Just use dummy answers spans for these pairs
                    questions.append(
                        RankedParagraphQuestion(
                            question.words,
                            TokenSpans(question.answer.answer_text,
                                       np.zeros((0, 2), dtype=np.int32)),
                            question.question_id, paragraphs[para_num],
                            para_rank, para_num))
        rl = ResourceLoader()
    else:
        if args.corpus == "dev":
            docs = SquadCorpus().get_dev()
        else:
            docs = SquadCorpus().get_train()
        rl = SquadCorpus().get_resource_loader()
        if args.n_sample is not None:
            docs.sort(key=lambda x: x.doc_id)
            np.random.RandomState(0).shuffle(docs)
            docs = docs[:args.n_sample]
        for q in ranker.ranked_questions(docs):
            for i, p in enumerate(q.paragraphs):
                questions.append(
                    RankedParagraphQuestion(
                        q.question, TokenSpans(q.answer_text, p.answer_spans),
                        q.question_id,
                        ParagraphWithInverse([p.text], p.original_text,
                                             p.spans), i, p.paragraph_num))

    print("Split %d docs into %d paragraphs" % (len(docs), len(questions)))

    # Reverse-sort by size so the first batch is the largest (OOMs happen early)
    questions = sorted(questions,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)
    for q in questions:
        if len(q.answer.answer_spans.shape) != 2:
            raise ValueError()

    checkpoint = model_dir.get_best_weights()
    if checkpoint is not None:
        print("Using best weights")
    else:
        print("Using latest checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
        if checkpoint is None:
            raise ValueError("No checkpoints found")

    data = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    model = model_dir.get_model()
    # Bug fix: `args.async` is a SyntaxError on Python 3.7+ because `async` is
    # a reserved keyword; argparse still stores the value under that name, so
    # read it with getattr instead.
    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.answer_bound, True)],
        {args.corpus: data}, rl, checkpoint, not args.no_ema,
        getattr(args, "async"))[args.corpus]

    print("Saving result")
    output_file = args.output
    df = pd.DataFrame(evaluation.per_sample)
    df.sort_values(["question_id", "rank"], inplace=True, ascending=True)
    group_by = ["question_id"]
    f1 = compute_ranked_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_ranked_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i + 1), "%.4f" % e, "%.4f" % f]
                  for i, (e, f) in enumerate(zip(em, f1)))
    print_table(table)
    df.to_csv(output_file, index=False)
def paragraph_stats(corpus, splitter: DocumentSplitter, sample):
    """Print statistics on the paragraphs `splitter` produces for a random
    sample of (question, document) pairs from `corpus`'s dev set.

    Reports how many pairs/paragraphs contain answers and how many paragraphs
    share a non-stopword token with their question; pauses interactively on
    answer-bearing paragraphs with no question-word overlap.
    """
    stop = NltkPlusStopWords(punctuation=True).words
    data = corpus.get_dev()
    pairs = flatten_iterable([(q, doc) for doc in q.all_docs] for q in data)
    chosen = np.random.choice(np.arange(0, len(pairs)), sample, replace=False)
    data = [pairs[i] for i in chosen]
    word_matches = Counter()
    n_para = []
    n_answers = []
    n_question_words = []
    for q, doc in data:
        if len(doc.answer_spans) == 0:
            continue
        q_words = set(w.lower() for w in q.question)
        q_words -= stop
        doc_text = corpus.evidence.get_document(doc.doc_id)
        para = splitter.split_annotated(doc_text, doc.answer_spans)
        n_para.append(len(para))
        n_answers += [len(x.answer_spans) for x in para]
        for x in para:
            match_set = set()
            n_matches = 0
            for word in flatten_iterable(x.text):
                word = word.lower()
                if word in q_words:
                    n_matches += 1
                    match_set.add(word)
            if len(match_set) == 0 and len(x.answer_spans) > 0:
                # Answer-bearing paragraph with zero question-word overlap:
                # show it and wait for the user
                print_paragraph(q, x)
                input()
            word_matches.update(match_set)
            n_question_words.append(n_matches)
    n_answers = np.array(n_answers)
    n_question_words = np.array(n_question_words)
    any_answers = n_answers > 0
    any_question_word = n_question_words > 0
    total_para = len(any_answers)
    total_q = len(n_para)
    no_question_and_answer = any_answers[np.logical_not(any_question_word)]
    print("%d/%d (%.4f) pairs have an answer" %
          (total_q, len(data), total_q / len(data)))
    print("%d para in %d questions (av %.4f)" %
          (sum(n_para), total_q, sum(n_para) / total_q))
    print("%d/%d (%.4f) paragraphs have answers" %
          (any_answers.sum(), total_para, any_answers.mean()))
    print("%d/%d (%.4f) paragraphs have question word" %
          (any_question_word.sum(), total_para, any_question_word.mean()))
    print("%d/%d (%.4f) no question words have answers" %
          (no_question_and_answer.sum(), len(no_question_and_answer),
           no_question_and_answer.mean()))
def main(Data: pd.DataFrame, nlp, model_dir, model):
    """Answer the question in Data.at[0, 'Question'] from the document text in
    Data.at[0, 'Filetext'] with a pre-loaded document-qa model.

    Args:
        Data: DataFrame with 'Filetext' and 'Question' at row 0.
        nlp: language resource object passed through to model.set_input_spec.
        model_dir: ModelDir used to restore the checkpoint.
        model: a ParagraphQuestionModel instance.

    Returns the best answer span as a single string.
    """
    print(model)
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Single document taken from the dataframe
    documents = []
    documents.append(Data.at[0, 'Filetext'])

    # Split each document into a list of paragraphs
    documents = [re.split("\s*\n\s*", doc) for doc in documents]

    # Tokenize the input; the models expect `NltkAndPunctTokenizer` tokens and
    # case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(
        Data.at[0, 'Question'])  # List of words
    # Now list of document -> paragraph -> sentence -> word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Group each document into ~400-token paragraphs; returns
    # `ExtractedParagraph` objects that remember start/end tokens
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]
    print(len(documents))

    # Select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step that
        # turns `ExtractedParagraph` objects back into token lists
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Declare the batch size and vocab so word vectors are loaded and the
    # graph is sized for this input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(nlp,
                         ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)
    print("after set input spec")

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # sess.as_default() is needed for cuDNN-based models; cpu-compatible
    # models do not require it
    with sess.as_default():
        # Limit predicted spans to 8 tokens or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Load the saved weights
    model_dir.restore_checkpoint(sess)
    print("after loading weights")

    # The model takes input as `ContextAndQuestion` objects
    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    # Two steps: encode the batch into a feed_dict, then run the session.
    # Bug fix: encode with is_train=False — this is inference, and
    # is_train=True would enable training-time behavior such as dropout.
    encoded = model.encode(data, is_train=False)
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)

    # Output is per-paragraph; pick the most confident one
    best_para = np.argmax(conf)
    start, end = best_spans[best_para]
    return " ".join(context[best_para][start:end + 1])
def main():
    """Serve an interactive span-prediction demo over HTTP with bottle.

    Loads the model named by ``OPTS.model``, builds the TF graph once at
    startup, then answers POSTed (document, question) pairs with start/end
    span logits converted to probabilities.
    """
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    tokenizer = NltkAndPunctTokenizer()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Optionally cache loaded word vectors on disk so restarts are faster
    if OPTS.reload_vocab:
        loader = ResourceLoader()
    else:
        loader = CachingResourceLoader()

    print('Loading word vectors...')
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None),
                         set([',']),
                         word_vec_loader=loader,
                         allow_update=True)

    print('Starting Tensorflow session...')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        prediction = model.get_prediction()
        # Take 0-th here because we know we only truncate to one paragraph
        start_logits_tf = prediction.start_logits[0]
        end_logits_tf = prediction.end_logits[0]
        none_logit_tf = prediction.none_logit[0]
        model_dir.restore_checkpoint(sess)

    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)

    app = bottle.Bottle()

    @app.route('/')
    def index():
        return bottle.template('index')

    @app.route('/post_query', method='post')
    def post_query():
        document_raw = bottle.request.forms.getunicode('document').strip()
        question_raw = bottle.request.forms.getunicode('question').strip()
        # Raw string: "\s" is an invalid escape sequence in a normal literal
        document = re.split(r"\s*\n\s*", document_raw)
        question = tokenizer.tokenize_paragraph_flat(question_raw)
        doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
        split_doc = splitter.split(doc_toks)
        context = selector.prune(question, split_doc)
        if model.preprocessor is not None:
            context = [model.preprocessor.encode_text(question, x)
                       for x in context]
        else:
            context = [flatten_iterable(x.text) for x in context]
        vocab = set(question)
        for txt in context:
            vocab.update(txt)
        data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                for i, x in enumerate(context)]
        # New words can appear at query time; refresh the embedding matrix
        model.word_embed.update(loader, vocab)
        encoded = model.encode(data, is_train=False)
        start_logits, end_logits, none_logit = sess.run(
            [start_logits_tf, end_logits_tf, none_logit_tf],
            feed_dict=encoded)
        beam, p_na = logits_to_probs(document_raw, context[0], start_logits,
                                     end_logits, none_logit,
                                     beam_size=BEAM_SIZE)
        return bottle.template('results', document=document_raw,
                               question=question_raw, beam=beam, p_na=p_na)

    cur_dir = os.path.abspath(os.path.dirname(__file__))
    bottle.TEMPLATE_PATH.insert(0, os.path.join(cur_dir, 'views'))
    bottle.run(app, host=OPTS.hostname, port=OPTS.port, debug=OPTS.debug)
def getAnswer(self):
    """Answer ``self.Question`` against the documents stored for
    ``self.ObjectMasterId``.

    Loads the document text from the database, selects candidate paragraphs,
    runs the QA model, and returns a dict with the answer (empty if the
    confidence logit is <= 10), the surrounding paragraph marked up with
    ``<em>`` tags, and the document's name/type metadata.
    """
    # Load the model
    model_dir = ModelDir(MODEL_DIR)
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Fetch every file chunk for this object, concatenated in id order.
    # NOTE(review): SQL is built by string concatenation; ObjectMasterId is
    # presumably an internal integer, but a parameterized query would be safer.
    conn = pyodbc.connect(DB_CONN)
    cursor = conn.cursor()
    query = ("select cast(filetext as varchar(max)) as filetext, name, type "
             "from dbo.UserworkspaceData where objectmasterid= " +
             str(self.ObjectMasterId) +
             " order by id asc")
    documents = []
    document = ""
    name = ""
    filetype = 0
    for doc in cursor.execute(query):
        document = document + doc[0]
        name = doc[1]
        filetype = doc[2]
    documents.append(document)

    # Split documents into lists of paragraphs (raw string fixes the
    # previously-invalid "\s" escape sequences)
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input; the model expects case-sensitive NltkAndPunctTokenizer
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(self.Question)  # List of words
    # Now list of document -> paragraph -> sentence -> word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Group each document into `ExtractedParagraph` objects that remember the
    # start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    # Select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the single document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    if model.preprocessor is not None:
        # Models may define an extra pre-processing step; this turns the
        # `ExtractedParagraph` objects back into simple token lists
        context = [model.preprocessor.encode_text(question, x)
                   for x in context]
    else:
        context = [flatten_iterable(x.text) for x in context]

    # Tell the model the batch size and vocab; loads word vectors and fixes
    # the batch size used when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(self.nlp,
                         ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # sess.as_default is needed for the cuDNN models, which require an active
    # session to size each layer; cpu-compatible models don't need it
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)
        # Loads the saved weights
        model_dir.restore_checkpoint(sess)

        data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                for i, x in enumerate(context)]
        # NOTE(review): is_train=True at inference time looks wrong (it enables
        # dropout in most models) -- kept for behavioral parity; confirm intent.
        encoded = model.encode(data, is_train=True)
        best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)

    # One confidence per paragraph; keep the most confident one
    best_para = np.argmax(conf)
    span = best_spans[best_para]
    Answer = " ".join(context[best_para][span[0]:span[1] + 1])
    print("Confidence: " + str(conf[best_para]))
    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(span))
    print("Answer text: " + Answer)
    print(" ".join(context[best_para]))

    # Wrap the answer tokens in <em> for display
    context[best_para][span[0]] = r"<em>" + context[best_para][span[0]]
    context[best_para][span[1]] = context[best_para][span[1]] + r"</em>"

    # Expand the snippet to roughly sentence boundaries: start two sentences
    # ("." tokens) before the span and end two sentences after it
    start = 0
    end = len(context[best_para])
    positions = [x for x, n in enumerate(context[best_para][0:span[0]])
                 if n == "."]
    if len(positions) >= 2:
        start = positions[len(positions) - 2] + 1
    positions = [x for x, n in enumerate(context[best_para][span[1] + 1:])
                 if n == "."]
    if len(positions) > 1:
        end = span[1] + 1 + positions[1]

    d = dict()
    # Only report an answer when the confidence logit clears the threshold
    if conf[best_para] > 10:
        d["answer"] = Answer
    else:
        d["answer"] = ""
    d["name"] = name
    d["filetype"] = filetype
    d["paragraph"] = re.sub(r' (?=\W)', '',
                            " ".join(context[best_para][start:end]))
    d["ObjectMasterId"] = self.ObjectMasterId
    return d
def main():
    """Evaluate a model on TriviaQA data.

    Selects paragraphs per question (or question-document pair), runs the
    model, prints an EM/F1 table as more paragraphs are used, and can write
    an official-format answer file and per-paragraph results.
    """
    parser = argparse.ArgumentParser(description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument('-p', '--paragraph_output', type=str,
                        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str,
                        help="Build an offical output file with the model's"
                             " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument('--n_processes', type=int, default=None,
                        help="Number of processes to do the preprocessing "
                             "(selecting paragraphs+loading context) with")
    parser.add_argument('-i', '--step', type=int, default=None,
                        help="checkpoint to load, default to latest")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    parser.add_argument('-a', '--async', type=int, default=10)
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev",
                                 "web-train", "open-dev", "open-train"],
                        default="web-verified-dev")
    args = parser.parse_args()

    # `async` became a reserved keyword in Python 3.7, so `args.async` is a
    # SyntaxError -- the attribute must be read dynamically.
    async_encoding = getattr(args, "async")

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    # Pick the question set and evidence corpus
    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        corpus = dataset.evidence
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()
    else:
        dataset = TriviaQaOpenDataset()
        corpus = dataset.evidence
        if args.corpus == "open-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()

    splitter = MergeParagraphs(args.tokens)

    per_document = not args.corpus.startswith("open")

    filter_name = args.filter
    if filter_name is None:
        if args.corpus.startswith("open"):
            filter_name = "linear"
        else:
            filter_name = "tfidf"

    print("Selecting %d paragraphs using %s method per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        # Sort for determinism, then shuffle with a fixed seed before sampling
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs,
    # and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter,
                                                 model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep,
                                  args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(DocumentParagraphQuestion(q.question_id, p.doc_id,
                                                  (p.start, p.end), q.question,
                                                  p.text, ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    print("Done, starting eval")

    # NOTE(review): --step is declared type=int, so args.step == "latest" can
    # never be true here; kept for parity with the original control flow.
    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions}, ResourceLoader(), checkpoint,
        not args.no_ema, async_encoding)[args.corpus]

    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        # I didn't store the unormalized filenames exactly, so unfortunately we
        # have to reload the source data to get exact filename to output an
        # official test script
        fns = {}
        print("Loading proper filenames")
        if args.corpus == 'web-test':
            source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
        elif args.corpus == "web-dev":
            source = join(TRIVIA_QA, "qa", "web-dev.json")
        else:
            raise NotImplementedError()

        with open(join(source)) as f:
            # Named `source_data` so the paragraph list `data` isn't clobbered
            source_data = json.load(f)["Data"]
        for point in source_data:
            for doc in point["EntityPages"]:
                filename = doc["Filename"]
                fn = join("wikipedia", filename[:filename.rfind(".")])
                fn = normalize_wiki_filename(fn)
                fns[(point["QuestionId"], fn)] = filename

        answers = {}
        scores = {}
        for q_id, doc_id, start, end, txt, score in df[
                ["question_id", "doc_id", "para_start", "para_end",
                 "text_answer", "predicted_score"]].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if filename.startswith("web"):
                true_name = filename[4:] + ".txt"
            else:
                true_name = fns[(q_id, filename)]
            key = q_id + "--" + true_name
            # Keep only the highest-scoring span per (question, document)
            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    f1 = compute_model_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_model_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i + 1), "%.4f" % e, "%.4f" % f]
                  for i, (e, f) in enumerate(zip(em, f1)))
    print_table(table)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        if output_file.endswith("json"):
            with open(output_file, "w") as f:
                json.dump(evaluation.per_sample, f)
        elif output_file.endswith("pkl"):
            with open(output_file, "wb") as f:
                pickle.dump(evaluation.per_sample, f)
        elif output_file.endswith("csv"):
            df.to_csv(output_file, index=False)
        else:
            raise ValueError("Unrecognized file format")
def main():
    """Evaluate a model on TriviaQA (web/wiki/open), log results to Elastic,
    print a ranked EM/F1 table, and dump flattened per-span scores to
    ``results.csv``.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument(
        '-p', '--paragraph_output', type=str,
        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str,
                        help="Build an offical output file with the model's"
                             " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument(
        '--n_processes', type=int, default=None,
        help="Number of processes to do the preprocessing "
             "(selecting paragraphs+loading context) with")
    parser.add_argument('-i', '--step', type=int, default=None,
                        help="checkpoint to load, default to latest")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    parser.add_argument('-a', '--async', type=int, default=10)
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-b', '--batch_size', type=int, default=200,
        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev",
                                 "web-train", "open-dev", "open-train",
                                 "wiki-dev", "wiki-test"],
                        default="web-verified-dev")
    parser.add_argument("-s", "--source_dir", type=str, default=None,
                        help="where to take input files")
    parser.add_argument("--n_span_per_q", type=int, default=1,
                        help="where to take input files")
    args = parser.parse_args()

    # `async` is a reserved keyword since Python 3.7; `args.async` would be a
    # SyntaxError, so fetch the attribute dynamically.
    async_encoding = getattr(args, "async")

    # Guard against a missing -s/--source_dir (only "open" corpora require it);
    # the original crashed on None.split here for web/wiki runs.
    dataset_name = (args.source_dir.split('/')[-1]
                    if args.source_dir is not None else "")
    model_name = args.model.split('/')[-1]
    ElasticLogger().write_log('INFO', 'Start Evaluation',
                              context_dict={'model': model_name,
                                            'dataset': dataset_name})

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()
    elif args.corpus.startswith("wiki"):
        dataset = TriviaQaWikiDataset()
        if args.corpus == "wiki-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "wiki-test":
            test_questions = dataset.get_test()
        else:
            raise AssertionError()
    else:
        dataset = TriviaQaOpenDataset(args.source_dir)
        if args.corpus == "open-dev":
            # just loading the pkl that was saved in build_span_corpus
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    per_document = args.corpus.startswith("web")  # wiki and web are both multi-document

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        filter_name = "tfidf" if per_document else "linear"

    print("Selecting %d paragraphs using method \"%s\" per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True),
                               args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    docqa.config.SPANS_PER_QUESTION = args.n_span_per_q
    if n_questions is not None:
        # Sort for determinism, then shuffle with a fixed seed before sampling
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs,
    # and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter,
                                                 model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep,
                                  args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(DocumentParagraphQuestion(q.question_id, p.doc_id,
                                                  (p.start, p.end), q.question,
                                                  p.text, ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions}, ResourceLoader(), checkpoint,
        not args.no_ema, async_encoding)[args.corpus]

    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        fns = {}
        if per_document:
            # I didn't store the unormalized filenames exactly, so unfortunately
            # we have to reload the source data to get exact filename to output
            # an official test script
            print("Loading proper filenames")
            if args.corpus == 'web-test':
                source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
            elif args.corpus == "web-dev":
                source = join(TRIVIA_QA, "qa", "web-dev.json")
            else:
                raise AssertionError()

            with open(join(source)) as f:
                # Named `source_data` so the paragraph list `data` isn't clobbered
                source_data = json.load(f)["Data"]
            for point in source_data:
                for doc in point["EntityPages"]:
                    filename = doc["Filename"]
                    fn = join("wikipedia", filename[:filename.rfind(".")])
                    fn = normalize_wiki_filename(fn)
                    fns[(point["QuestionId"], fn)] = filename

        answers = {}
        scores = {}
        for q_id, doc_id, start, end, txt, score in df[[
                "question_id", "doc_id", "para_start", "para_end",
                "text_answer", "predicted_score"]].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if per_document:
                if filename.startswith("web"):
                    true_name = filename[4:] + ".txt"
                else:
                    true_name = fns[(q_id, filename)]
                # Alon Patch for triviaqa test results
                true_name = true_name.replace('TriviaQA_Org/', '')
                key = q_id + "--" + true_name
            else:
                key = q_id

            # Keep only the highest-scoring span per key
            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        df.to_csv(output_file, index=False)

    print("Computing scores")

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    df_scores = df.copy(deep=True)
    df_scores['predicted_score'] = df_scores['predicted_score'].apply(
        lambda x: pd.Series(x).max())

    em = compute_ranked_scores(df_scores, "predicted_score", "text_em", group_by)
    f1 = compute_ranked_scores(df_scores, "predicted_score", "text_f1", group_by)

    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i + 1), "%.4f" % e, "%.4f" % f]
                  for i, (e, f) in enumerate(zip(em, f1)))

    table_df = pd.DataFrame(table[1:],
                            columns=table[0]).drop(['N Paragraphs'], axis=1)
    # `.ix` was removed from pandas; `.loc` is the supported label indexer
    ElasticLogger().write_log('INFO', 'Results',
                              context_dict={'model': model_name,
                                            'dataset': dataset_name,
                                            'max_EM': table_df.max().loc['EM'],
                                            'max_F1': table_df.max().loc['F1'],
                                            'result_table': str(table_df)})

    # Flatten the per-question span lists into one row per predicted span
    df_flat = []
    for _, question in df.iterrows():  # loop index renamed: `id` shadowed the builtin
        for text_answer, predicted_span, predicted_score in zip(
                question['text_answer'], question['predicted_span'],
                question['predicted_score']):
            new_question = dict(question.copy())
            new_question.update({'text_answer': text_answer,
                                 'predicted_span': predicted_span,
                                 'predicted_score': predicted_score})
            df_flat.append(new_question)

    results_df = pd.DataFrame(df_flat)
    # Alon: outputing the estimates for all the spans
    results_df.sort_values(by=['question_id', 'predicted_score'],
                           ascending=False).set_index(
        ['question_id', 'text_answer'])[
        ['question', 'predicted_score', 'text_em']].to_csv('results.csv')

    print_table(table)
def main():
    """Train a TriviaQA-web model in one of several paragraph-handling modes.

    The positional ``mode`` argument picks both the paragraph-extraction
    strategy and the training/eval dataset builders; the trained model and a
    copy of this script are stored under ``<name>-<timestamp>``.
    """
    parser = argparse.ArgumentParser(description='Train a model on TriviaQA web')
    parser.add_argument('mode',
                        choices=["paragraph-level", "confidence", "merge",
                                 "shared-norm", "sigmoid", "shared-norm-600"])
    parser.add_argument("name", help="Where to store the model")
    parser.add_argument('-n', '--n_processes', type=int, default=2,
                        help="Number of processes (i.e., select which paragraphs to train on) "
                             "the data with")
    args = parser.parse_args()
    mode = args.mode

    # Timestamp the output directory so repeated runs never collide
    out = args.name + "-" + datetime.now().strftime("%m%d-%H%M%S")

    model = get_model(100, 140, mode, WithIndicators())

    # Paragraph extraction: single paragraph for paragraph-level training,
    # otherwise multiple paragraphs (600-token chunks for shared-norm-600)
    stop = NltkPlusStopWords(True)
    if mode == "paragraph-level":
        extract = ExtractSingleParagraph(MergeParagraphs(400), TopTfIdf(stop, 1),
                                         model.preprocessor, intern=True)
    else:
        chunk_size = 600 if mode == "shared-norm-600" else 400
        extract = ExtractMultiParagraphs(MergeParagraphs(chunk_size),
                                         TopTfIdf(stop, 4),
                                         model.preprocessor, intern=True)

    # Dataset builders, epoch counts, and evaluators per mode
    if mode == "paragraph-level":
        n_epochs = 16
        train = ParagraphAndQuestionsBuilder(
            ClusteredBatcher(60, ContextLenBucketedKey(3), True))
        test = ParagraphAndQuestionsBuilder(
            ClusteredBatcher(60, ContextLenKey(), False))
        n_dev, n_train = 21000, 12000
        evaluators = [LossEvaluator(), SpanEvaluator([4, 8], "triviaqa")]
    else:
        evaluators = [LossEvaluator(),
                      MultiParagraphSpanEvaluator(8, "triviaqa", mode != "merge")]
        # we sample two paragraphs per a (question, doc) pair, so evaluate on
        # fewer questions
        n_dev, n_train = 15000, 8000

        if mode in ("confidence", "sigmoid"):
            # Trains very slowly in sigmoid mode, do this at your own risk
            n_epochs = 71 if mode == "sigmoid" else 28
            test = RandomParagraphSetDatasetBuilder(120, "flatten", True, 1)
            train = StratifyParagraphsBuilder(
                ClusteredBatcher(60, ContextLenBucketedKey(3), True), 0, 1)
        else:
            n_epochs = 14
            test = RandomParagraphSetDatasetBuilder(
                120, "merge" if mode == "merge" else "group", True, 1)
            train = StratifyParagraphSetsBuilder(35, mode == "merge", True, 1)

    data = TriviaQaWebDataset()
    params = get_triviaqa_train_params(n_epochs, n_dev, n_train)
    data = PreprocessedData(data, extract, train, test, eval_on_verified=False)
    data.preprocess(args.n_processes, 1000)

    # Store this script, prefixed by the chosen mode, alongside the model
    with open(__file__, "r") as f:
        notes = f.read()
    notes = "*" * 10 + "\nMode: " + args.mode + "\n" + "*" * 10 + "\n" + notes

    trainer.start_training(data, model, params, evaluators,
                           model_dir.ModelDir(out), notes)
def predict():
    """Flask endpoint: answer the POSTed question against a fixed document set.

    Returns a JSON payload with ``success`` and a one-element ``predictions``
    list containing the best answer span text.
    """
    json_data = {"success": False, "predictions": []}
    print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(
        "/home/antriv/conversation_ai/Transfer_Learning/ALLENAI_DocumentQA/document-qa/pretrained_models/models/triviaqa-unfiltered-shared-norm"
    )
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Load the question from the raw request body
    question = (flask.request.data).decode("utf-8")

    # Read the documents
    documents = []
    doclist = ["/home/antriv/data/The-Future-Computed.txt"]
    for doc in doclist:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs. Raw string fixes the
    # previously-invalid "\s" escape sequences.
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the models expects data to be tokenized using
    # `NltkAndPunctTokenizer`. Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph`
    # objects that additionally remember the start/end token of the paragraph
    # within the source document
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=1000)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=1000)
        context = selector.prune(question, flatten_iterable(documents))
    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step; this
        # will turn the `ExtractedParagraph` objects back into lists of tokens
        context = [model.preprocessor.encode_text(question, x)
                   for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect. This
    # will load the needed word vectors and fix the batch size to use when
    # building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for
    # each element in the input batch, confidence scores being the pre-softmax
    # logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since
    # we need an active session to figure out the # of parameters needed for
    # each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run. The model takes input in the form of
    # `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of
    # paragraph/context pairs into numpy arrays, then we use `sess` to run the
    # actual model get the predictions
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    # We get output for each paragraph, select the most-confident one to print
    best_para = np.argmax(conf)
    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + " ".join(
        context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1]))
    print("Confidence: " + str(conf[best_para]))

    y_output = " ".join(
        context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1])
    print(y_output)
    json_data["predictions"].append(str(y_output))

    # indicate that the request was a success
    json_data["success"] = True
    # return the data dictionary as a JSON response
    return flask.jsonify(json_data)
def main():
    """Train an ELMo-augmented document-qa model on SQuAD.

    Parses the command line, builds either a standard span-bounds model
    (``loss_mode == 'default'``) or a multi-span confidence model
    (``loss_mode == 'confidence'``), then launches training, saving
    checkpoints under a timestamped copy of ``output_dir``.
    """
    # BUG FIX: the description was being passed as the first positional
    # argument of ArgumentParser, which is `prog` (the program name), not
    # `description` — so `--help` showed a bogus program name and no
    # description. Pass it as the `description` keyword instead.
    parser = argparse.ArgumentParser(description="Train our ELMo model on SQuAD")
    parser.add_argument("loss_mode", choices=['default', 'confidence'])
    parser.add_argument("output_dir")
    parser.add_argument("--dim", type=int, default=90)
    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode", choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    parser.add_argument("--no-tfidf", action='store_true',
                        help="Don't add TF-IDF negative examples")
    args = parser.parse_args()

    # Timestamp the output directory so repeated runs never collide.
    out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    if args.loss_mode == 'default':
        # Single-span bounds loss: predict one (start, end) pair per question.
        n_epochs = 24
        answer_encoder = SingleSpanAnswerEncoder()
        predictor = BoundsPredictor(
            ChainBiMapper(first_layer=recurrent_layer,
                          second_layer=recurrent_layer))
        batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
        data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)
    elif args.loss_mode == 'confidence':
        # Confidence loss: train on multiple paragraphs per question, optionally
        # adding TF-IDF-ranked negative paragraphs (skipped with --no-tfidf).
        # NOTE(review): the epoch counts (15 vs 50) presumably compensate for
        # the differing effective dataset sizes — confirm against training logs.
        if args.no_tfidf:
            prepro = SquadDefault()
            n_epochs = 15
        else:
            prepro = SquadTfIdfRanker(NltkPlusStopWords(True), 4, True)
            n_epochs = 50
        answer_encoder = DenseMultiSpanAnswerEncoder()
        predictor = ConfidencePredictor(
            ChainBiMapper(
                first_layer=recurrent_layer,
                second_layer=recurrent_layer,
            ),
            AttentionEncoder(),
            FullyConnected(80, activation="tanh"),
            aggregate="sum")
        eval_dataset = RandomParagraphSetDatasetBuilder(100, 'flatten', True, 0)
        train_batching = ClusteredBatcher(45, ContextLenBucketedKey(3), True, False)
        data = PreprocessedData(SquadCorpus(), prepro,
                                StratifyParagraphsBuilder(train_batching, 1),
                                eval_dataset,
                                eval_on_verified=False)
        data.preprocess(1)

    params = trainer.TrainParams(
        trainer.SerializableOptimizer("Adadelta", dict(learning_rate=1.0)),
        ema=0.999, max_checkpoints_to_keep=2,
        async_encoding=10, num_epochs=n_epochs, log_period=30,
        eval_period=1200, save_period=1200,
        # Track the dev-set span F1 (17-token bound) for best-weights selection.
        best_weights=("dev", "b17/text-f1"),
        eval_samples=dict(dev=None, train=8000))

    # ELMo language-model features are reduced by a learned layer + dropout;
    # shared between the embedding and pre-attention insertion points below.
    lm_reduce = MapperSeq(
        ElmoLayer(args.l2, layer_norm=False,
                  top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )
    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(answer_encoder),
        lm_model=SquadContextConcatSkip(),
        # --mode controls where the ELMo features are appended.
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d",
                                     word_vec_init_scale=0,
                                     learn_unk=False, cpu=True),
        char_embed=CharWordEmbedder(
            LearnedCharEmbedder(word_size_th=14, char_th=49, char_dim=20,
                                init_scale=0.05, force_cpu=True),
            MaxPool(Conv1d(100, 5, 0.8)),
            shared_parameters=True),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        # A single shared reduce layer is used (lm_reduce=None disables the
        # per-insertion-point variant).
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(
            FullyConnected(dim * 2, activation="relu"),
            ResidualLayer(SequenceMapperSeq(
                VariationalDropoutLayer(0.8),
                recurrent_layer,
                VariationalDropoutLayer(0.8),
                StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                FullyConnected(dim * 2, activation="relu"),
            )),
            VariationalDropoutLayer(0.8)),
        predictor=predictor)

    # Store this script's full source plus the sorted parsed args alongside the
    # model so every run is reproducible from its output directory.
    with open(__file__, "r") as f:
        notes = f.read()
    notes = str(sorted(args.__dict__.items(), key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(
        data, model, params,
        [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")],
        ModelDir(out), notes)
return out if on_token + len(sent) < start: pass on_para += sent on_token += len(sent) if len(on_para) > 0: out.append(on_para) on_para = [] out.append(on_para) if len(flatten_iterable(out)) != end - start: raise ValueError(len(flatten_iterable(out)), end-start) return out stop = NltkPlusStopWords(True).words class bcolors: CORRECT = '\033[94m' ERROR = '\033[91m' CYAN = "\033[96m" ENDC = '\033[0m' def display_para(text: List[str], answers, question, p_start, p_end): words = {w.lower() for w in question if w.lower() not in stop} text = list(text) if answers is not None: for s,e in answers: text[s] = bcolors.CORRECT + text[s] text[e] = text[e] + bcolors.ENDC