def _init(self, loader: ResourceLoader, voc: Iterable[str]):
    """Load pre-trained word vectors and assemble the embedding matrix.

    The matrix stored in `self._word_emb_mat` is laid out as: row 0 an
    all-zero vector, row 1 the `unk_embed` variable, then one row per
    special token (when any are configured), then one row per vocabulary
    word for which a pre-trained vector was found. `self._word_to_ix` maps
    special tokens and words to their row index.

    :param loader: object whose `load_word_vec` returns a word -> vector map
    :param voc: words to load; when None every word the loader returns is used
    """
    # TODO we should not be building variables here
    if voc is None:
        word_to_vec = loader.load_word_vec(self.vec_name)
        voc = set(word_to_vec.keys())
    else:
        word_to_vec = loader.load_word_vec(self.vec_name, voc)

    self._word_to_ix = {}

    # Every vector is taken to share the size of the first one returned
    dim = next(iter(word_to_vec.values())).shape[0]

    null_embed = tf.zeros((1, dim), dtype=tf.float32)
    unk_embed = tf.get_variable(
        shape=(1, dim),
        name="unk_embed",
        dtype=np.float32,
        trainable=self.learn_unk,
        initializer=tf.random_uniform_initializer(
            -self.word_vec_init_scale, self.word_vec_init_scale))
    matrix_list = [null_embed, unk_embed]
    next_ix = 2  # rows 0 and 1 are the null and unk embeddings

    specials = self._special_tokens
    if specials is not None and len(specials) > 0:
        print("Building embeddings for %d special_tokens" % (len(specials)))
        matrix_list.append(tf.get_variable(
            shape=(len(specials), dim),
            name="token_embed",
            dtype=np.float32,
            trainable=True,
            initializer=tf.random_uniform_initializer(
                -self.word_vec_init_scale, self.word_vec_init_scale)))
        for token in specials:
            self._word_to_ix[token] = next_ix
            next_ix += 1

    vectors = []
    for word in voc:
        if word in self._word_to_ix:
            # already added, e.g. after seeing a capitalized version of `word`
            continue
        # Use the exact word when it has a vector, otherwise fall back to
        # its lower-case form
        key = word if word in word_to_vec else word.lower()
        if key in word_to_vec and key not in self._word_to_ix:
            vectors.append(word_to_vec[key])
            self._word_to_ix[key] = next_ix
            next_ix += 1

    print("Had pre-trained word embeddings for %d of %d words" % (len(vectors), len(voc)))

    matrix_list.append(tf.constant(value=np.vstack(vectors)))
    self._word_emb_mat = tf.concat(matrix_list, axis=0)
def test_model_pickle(output_dir):
    """Rebuild the pickled model found in `output_dir` and run it once.

    Loads `model.pkl` from `output_dir`, builds its prediction graph,
    restores every non-"bilm" variable from the latest checkpoint under
    `<output_dir>/save`, initializes the "bilm" variables, and evaluates the
    model on the questions returned by `get_test_questions()`.

    :param output_dir: directory holding `model.pkl` and a `save` checkpoint dir
    :return: the `[start_logits, end_logits]` values produced by `sess.run`
    """
    print("Testing...")
    save_dir = join(output_dir, "save")
    test_questions = get_test_questions()

    # NOTE: unpickling executes arbitrary code; `output_dir` must be trusted.
    with open(join(output_dir, "model.pkl"), "rb") as f:
        model = pickle.load(f)

    sess = tf.Session()
    try:
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))
        pred = model.get_prediction()

        print("Rebuilding")
        all_vars = tf.global_variables() + tf.get_collection(
            tf.GraphKeys.SAVEABLE_OBJECTS)
        # Variables whose name starts with "bilm" are not read from the
        # checkpoint; they are initialized instead.
        lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]

        saver = tf.train.Saver(vars_to_restore)
        saver.restore(sess, tf.train.latest_checkpoint(save_dir))
        sess.run(
            tf.variables_initializer(
                [x for x in all_vars if x.name in lm_var_names]))

        feed = model.encode([test_questions], False)
        cpu_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)
    finally:
        # Release the session's resources even if building/restoring fails;
        # previously the session was never closed.
        sess.close()
    return cpu_out
def set_input_spec(self, input_spec: ParagraphAndQuestionSpec, voc: Set[str],
                   word_vec_loader: ResourceLoader=None):
    """Initialize the embedders and the encoder for the given input spec.

    :param input_spec: spec forwarded to `self.encoder.init`
    :param voc: vocabulary forwarded to the word and char embedders
    :param word_vec_loader: loader handed to the embedders; a default
        `ResourceLoader()` is created when omitted
    :return: the placeholders reported by `self.encoder.get_placeholders()`
    """
    loader = ResourceLoader() if word_vec_loader is None else word_vec_loader

    if self.word_embed is not None:
        self.word_embed.init(loader, voc)

    # The encoder receives the inner char embedder (or None), not the wrapper
    char_embedder = None
    if self.char_embed is not None:
        char_embedder = self.char_embed.embeder
        char_embedder.init(loader, voc)

    self.encoder.init(input_spec, True, self.word_embed, char_embedder)
    self._is_train_placeholder = tf.placeholder(tf.bool, ())
    return self.encoder.get_placeholders()
def build_model_and_evaluator_runner(model_config, max_answer_len, n_paragraphs):
    """Load a pickled model, build its graph and restore its weights.

    The model is unpickled from `model_config.model_pickle_file`, its
    language-model and word-vector file locations are pointed at the paths in
    `model_config`, and its prediction graph is wired to an
    `AysncEvaluatorRunner`. All variables except those whose name starts with
    "bilm" are restored from `model_config.checkpoint_file`; the "bilm"
    variables are initialized instead.

    :param model_config: object providing the file paths read below
    :param max_answer_len: passed to `RecordParagraphSpanPrediction`
    :param n_paragraphs: not used by this function
    :return: tuple `(sess, model, evaluator_runner)`; the caller owns `sess`
    """
    # NOTE: unpickling executes arbitrary code; the pickle must be trusted.
    with open(model_config.model_pickle_file, 'rb') as f:
        model = pickle.load(f)

    model.lm_model.weight_file = model_config.lm_weights_file
    model.lm_model.lm_vocab_file = model_config.vocab_file
    model.lm_model.embed_weights_file = model_config.lm_token_weights_file
    model.lm_model.options_file = model_config.lm_options_file
    model.word_embed.vec_name = model_config.word_vector_file

    vocab_to_ignore = {'<S>', '</S>', '<UNK>', '!!!MAXTERMID'}
    # Read the vocab inside a context manager so the file handle is closed
    # (it used to be opened inline in a comprehension and never closed).
    with open(model_config.vocab_file, encoding="utf-8") as vocab_file:
        tokens = (line.strip() for line in vocab_file)
        vocab_to_init_with = {
            tok for tok in tokens if tok not in vocab_to_ignore
        }

    sess = tf.Session()
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(None, None, None, 14),
            vocab_to_init_with,
            word_vec_loader=ResourceLoader(
                load_vec_fn=lambda x, y: load_word_vectors(
                    x, y, is_path=True)))
        evaluator_runner = AysncEvaluatorRunner(
            [RecordParagraphSpanPrediction(max_answer_len, True)], model, 10)
        # Feed each model placeholder from the matching dequeued tensor
        input_dict = dict(
            zip(model.get_placeholders(), evaluator_runner.dequeue_op))
        pred = model.get_predictions_for(input_dict)
    evaluator_runner.set_input(pred)

    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]
    saver = tf.train.Saver(vars_to_restore)
    saver.restore(sess, model_config.checkpoint_file)
    sess.run(
        tf.variables_initializer(
            [x for x in all_vars if x.name in lm_var_names]))
    return sess, model, evaluator_runner
def _build_model(self):
    """Build the production prediction graph for `self.model`.

    Reads the vocabulary from `self.config.vocab_file` (skipping entries in
    `vocab_to_ignore`), points the word embedder at
    `self.config.word_vector_file`, and builds the graph with `self.sess` as
    the default session.

    :return: tuple `(start_logits, end_logits, context_rep)`
    """
    # Use a context manager so the vocab file handle is closed; it used to be
    # opened inline in a comprehension and never closed.
    with open(self.config.vocab_file, encoding="utf-8") as vocab_file:
        tokens = (line.strip() for line in vocab_file)
        vocab_to_init_with = {
            tok for tok in tokens if tok not in vocab_to_ignore
        }

    self.model.word_embed.vec_name = self.config.word_vector_file

    with self.sess.as_default():
        self.model.set_input_spec(
            ParagraphAndQuestionSpec(None, None, None, 14),
            vocab_to_init_with,
            word_vec_loader=ResourceLoader(
                load_vec_fn=lambda x, y: load_word_vectors(
                    x, y, is_path=True)))
        # Each placeholder is mapped to itself, i.e. callers feed the
        # model's own placeholders directly.
        pred = self.model.get_production_predictions_for(
            {x: x for x in self.model.get_placeholders()})
    return pred.start_logits, pred.end_logits, self.model.context_rep
def set_input_spec(self, input_spec, voc, word_vec_loader=None):
    """Initialize embedders, encoder, batcher and the char-id placeholders.

    When `self.token_lookup` is set, a `TokenBatcher` is used and the
    question/context id placeholders are 2-D. Otherwise a `Batcher` with a
    word size of 50 is used (which also overwrites
    `input_spec.max_word_size`), the placeholders get a trailing word-size
    dimension, and `self.per_sentence` selects whether a sentence-index
    placeholder is created as well.

    :param input_spec: spec providing `batch_size` and `max_word_size`
    :param voc: vocabulary forwarded to the embedders
    :param word_vec_loader: loader for the embedders; defaults to a new
        `ResourceLoader()`
    :return: the result of `self.get_placeholders()`
    """
    loader = word_vec_loader if word_vec_loader is not None else ResourceLoader()

    if self.word_embed is not None:
        self.word_embed.init(loader, voc)
    char_embedder = None
    if self.char_embed is not None:
        char_embedder = self.char_embed.embeder
        char_embedder.init(loader, voc)

    n_batch = input_spec.batch_size
    self.batch_size = n_batch

    self.encoder.init(input_spec, True, self.word_embed, char_embedder)
    self._is_train_placeholder = tf.placeholder(tf.bool, ())

    if not self.token_lookup:
        input_spec.max_word_size = 50  # TODO hack, hard coded from the lm model
        self._batcher = Batcher(self.lm_model.lm_vocab_file, 50)
        self._max_word_size = input_spec.max_word_size
        self._question_char_ids_placeholder = tf.placeholder(
            tf.int32, (n_batch, None, self._max_word_size))
        if not self.per_sentence:
            self._context_char_ids_placeholder = tf.placeholder(
                tf.int32, (n_batch, None, self._max_word_size))
            self._context_sentence_ixs = None
        else:
            self._context_char_ids_placeholder = tf.placeholder(
                tf.int32, (None, None, self._max_word_size))
            self._context_sentence_ixs = tf.placeholder(
                tf.int32, (n_batch, 3, None, 3))
    else:
        self._batcher = TokenBatcher(self.lm_model.lm_vocab_file)
        token_shape = (n_batch, None)
        self._question_char_ids_placeholder = tf.placeholder(tf.int32, token_shape)
        self._context_char_ids_placeholder = tf.placeholder(tf.int32, token_shape)
        self._max_word_size = input_spec.max_word_size
        self._context_sentence_ixs = None

    return self.get_placeholders()
def __init__(self,
             wiki_cache: str,
             paragraph_splitter: DocumentSplitter,
             paragraph_selector: ParagraphFilter,
             vocab: Union[str, Set[str]],
             model: Union[ParagraphQuestionModel, ModelDir],
             loader: ResourceLoader = ResourceLoader(),
             bing_api_key=None,
             tagme_api_key=None,
             blacklist_trivia_sites: bool = False,
             n_dl_threads: int = 5,
             span_bound: int = 8,
             tagme_threshold: Optional[float] = 0.2,
             download_timeout: int = None,
             n_web_docs=10):
    """Set up the searcher, corpus, model graph and session.

    :param wiki_cache: passed to `WikiCorpus`
    :param paragraph_splitter: stored for later use
    :param paragraph_selector: stored for later use
    :param vocab: a set of words, a path to a file with one word per line,
        or None to pass no vocabulary to the model
    :param model: a model, or a `ModelDir` to fetch the model from
    :param loader: resource loader given to `set_input_spec`
    :param bing_api_key: when None, no web searcher/extractor is created
    :param tagme_api_key: stored for later use
    :param blacklist_trivia_sites: stored for later use
    :param n_dl_threads: passed to the text extractor
    :param span_bound: passed to `pred.get_best_span`
    :param tagme_threshold: stored for later use
    :param download_timeout: passed to the text extractor
    :param n_web_docs: stored for later use
    """
    self.log = logging.getLogger('qa_system')

    # Plain configuration values
    self.tagme_api_key = tagme_api_key
    self.tagme_threshold = tagme_threshold
    self.blacklist_trivia_sites = blacklist_trivia_sites
    self.n_web_docs = n_web_docs

    # Web search is only available when an API key was supplied
    if bing_api_key is None:
        self.text_extractor = None
        self.searcher = None
    else:
        self.searcher = AsyncWebSearcher(bing_api_key)
        self.text_extractor = AsyncBoilerpipeCliExtractor(
            n_dl_threads, download_timeout)

    self.wiki_corpus = WikiCorpus(wiki_cache, keep_inverse_mapping=True)
    self.paragraph_splitter = paragraph_splitter
    self.paragraph_selector = paragraph_selector
    self.model_dir = model

    voc = None
    if vocab is not None:
        if not isinstance(vocab, str):
            voc = vocab
        else:
            # A string is treated as a path to a one-word-per-line file
            with open(vocab, "r") as f:
                voc = {line.strip() for line in f}
        self.log.info("Using preset vocab of size %d", len(voc))

    self.log.info("Setting up model...")
    self.model = model.get_model() if isinstance(model, ModelDir) else model

    self.model.set_input_spec(ParagraphAndQuestionSpec(None), voc, loader)

    self.sess = tf.Session()
    with self.sess.as_default():
        pred = self.model.get_prediction()

    # NOTE(review): this is called on the `model` argument rather than on
    # self.model; presumably it requires a ModelDir -- confirm for the
    # plain-model case.
    model.restore_checkpoint(self.sess)

    self.span_scores = pred.get_span_scores()
    self.span, self.score = pred.get_best_span(span_bound)
    self.tokenizer = NltkAndPunctTokenizer()
    self.sess.graph.finalize()
def get_resource_loader(self):
    """Return a `ResourceLoader` built around `self.get_pruned_word_vecs`."""
    load_fn = self.get_pruned_word_vecs
    return ResourceLoader(load_fn)
def main():
    """Evaluate a model on TriviaQA data from the command line.

    Selects paragraphs for each question, runs the model over them, prints a
    table of EM/F1 as more paragraphs are used and optionally writes an
    official-format answer file and/or per-paragraph results.
    """

    def checkpoint_step(value):
        """argparse type: the literal 'latest' or an integer step."""
        return value if value == "latest" else int(value)

    parser = argparse.ArgumentParser(description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument('-p', '--paragraph_output', type=str,
                        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str,
                        help="Build an offical output file with the model's"
                             " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument('--n_processes', type=int, default=None,
                        help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with")
    # Was `type=int`, which made the "latest" branch below unreachable.
    parser.add_argument('-i', '--step', type=checkpoint_step, default=None,
                        help="checkpoint step to load, or 'latest'; defaults to the best"
                             " weights if they exist, otherwise the latest checkpoint")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    # `async` is a reserved word since Python 3.7, so `args.async` no longer
    # parses; store the value under a different attribute name instead.
    parser.add_argument('-a', '--async', type=int, default=10, dest='async_encoding')
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument('-b', '--batch_size', type=int, default=200,
                        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=["web-dev", "web-test", "web-verified-dev", "web-train",
                                 "open-dev", "open-train"],
                        default="web-verified-dev")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        corpus = dataset.evidence
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()
    else:
        dataset = TriviaQaOpenDataset()
        corpus = dataset.evidence
        if args.corpus == "open-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise RuntimeError()

    splitter = MergeParagraphs(args.tokens)

    per_document = not args.corpus.startswith("open")

    filter_name = args.filter
    if filter_name is None:
        if args.corpus.startswith("open"):
            filter_name = "linear"
        else:
            filter_name = "tfidf"

    print("Selecting %d paragraphs using %s method per %s" % (
        args.n_paragraphs, filter_name,
        ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True), args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    if n_questions is not None:
        # Sort before the seeded shuffle so the sample is reproducible
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs,
    # and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter, model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter, model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep, args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(DocumentParagraphQuestion(q.question_id, p.doc_id,
                                                  (p.start, p.end), q.question, p.text,
                                                  ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data, key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model,
        [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions},
        ResourceLoader(), checkpoint, not args.no_ema,
        args.async_encoding)[args.corpus]

    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        # I didn't store the unormalized filenames exactly, so unfortunately
        # we have to reload the source data to get exact filename to output
        # an official test script
        fns = {}
        print("Loading proper filenames")
        if args.corpus == 'web-test':
            source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
        elif args.corpus == "web-dev":
            source = join(TRIVIA_QA, "qa", "web-dev.json")
        else:
            raise NotImplementedError()

        with open(source) as f:
            source_points = json.load(f)["Data"]
        for point in source_points:
            for doc in point["EntityPages"]:
                filename = doc["Filename"]
                fn = join("wikipedia", filename[:filename.rfind(".")])
                fn = normalize_wiki_filename(fn)
                fns[(point["QuestionId"], fn)] = filename

        # Keep, per (question, document) key, the highest scoring answer
        answers = {}
        scores = {}
        for q_id, doc_id, start, end, txt, score in df[
                ["question_id", "doc_id", "para_start", "para_end",
                 "text_answer", "predicted_score"]].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if filename.startswith("web"):
                true_name = filename[4:] + ".txt"
            else:
                true_name = fns[(q_id, filename)]

            key = q_id + "--" + true_name
            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    f1 = compute_model_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_model_scores(df, "predicted_score", "text_em", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += [[str(i + 1), "%.4f" % e, "%.4f" % f]
              for i, (e, f) in enumerate(zip(em, f1))]
    print_table(table)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        # The output format is chosen from the file name's suffix
        if output_file.endswith("json"):
            with open(output_file, "w") as f:
                json.dump(evaluation.per_sample, f)
        elif output_file.endswith("pkl"):
            with open(output_file, "wb") as f:
                pickle.dump(evaluation.per_sample, f)
        elif output_file.endswith("csv"):
            df.to_csv(output_file, index=False)
        else:
            raise ValueError("Unrecognized file format")
def main():
    """Parse the command line, build the QA system and run the demo server.

    API keys are taken from the command line first and from the
    `TAGME_API_KEY` / `BING_API_KEY` environment variables otherwise. Two
    routes are served: `GET /answer` (question from the query string) and
    `POST /answer-from` (JSON body with `question` and `document`).
    """
    parser = argparse.ArgumentParser(description='Run the demo server')
    parser.add_argument('model', help='Models to use')

    parser.add_argument(
        '-v', '--voc',
        help='vocab to use, only words from this file will be used')
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help='Number of tokens to use per paragraph')
    parser.add_argument('--vec_dir', help='Location to find word vectors')
    parser.add_argument('--n_paragraphs', type=int, default=12,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('--span_bound', type=int, default=8,
                        help="Max span size to return as an answer")

    parser.add_argument(
        '--tagme_api_key',
        help="Key to use for TAGME (tagme.d4science.org/tagme)")
    parser.add_argument('--bing_api_key', help="Key to use for bing searches")
    parser.add_argument('--tagme_thresh', default=0.2, type=float)
    parser.add_argument('--no_wiki', action="store_true", help="Dont use TAGME")
    parser.add_argument('--n_web', type=int, default=10,
                        help='Number of web docs to fetch')
    parser.add_argument('--blacklist_trivia_sites', action="store_true",
                        help="Don't use trivia websites")
    parser.add_argument('-c', '--wiki_cache',
                        help="Cache wiki articles in this directory")

    parser.add_argument('--n_dl_threads', type=int, default=5,
                        help="Number of threads to download documents with")
    parser.add_argument('--request_timeout', type=int, default=60)
    parser.add_argument('--download_timeout', type=int, default=25)
    parser.add_argument('--workers', type=int, default=1,
                        help="Number of server workers")
    parser.add_argument('--debug', default=None,
                        choices=["random_model", "dummy_qa"])

    args = parser.parse_args()
    span_bound = args.span_bound

    # Command-line keys win over the environment
    if args.tagme_api_key is not None:
        tagme_api_key = args.tagme_api_key
    else:
        tagme_api_key = environ.get("TAGME_API_KEY")

    if args.bing_api_key is not None:
        bing_api_key = args.bing_api_key
    else:
        bing_api_key = environ.get("BING_API_KEY")
        if bing_api_key is None and args.n_web > 0:
            raise ValueError("If n_web > 0 you must give a BING_API_KEY")

    if args.debug is None:
        model = ModelDir(args.model)
    else:
        model = RandomPredictor(5, WithIndicators())

    if args.vec_dir is not None:
        loader = LoadFromPath(args.vec_dir)
    else:
        loader = ResourceLoader()

    if args.debug == "dummy_qa":
        qa = DummyQa()
    else:
        qa = QaSystem(
            args.wiki_cache,
            MergeParagraphs(args.tokens),
            ShallowOpenWebRanker(args.n_paragraphs),
            args.voc,
            model,
            loader,
            bing_api_key,
            tagme_api_key=tagme_api_key,
            n_dl_threads=args.n_dl_threads,
            blacklist_trivia_sites=args.blacklist_trivia_sites,
            download_timeout=args.download_timeout,
            span_bound=span_bound,
            tagme_threshold=None if args.no_wiki else args.tagme_thresh,
            n_web_docs=args.n_web)

    # NOTE(review): this sets an attribute on the `logging` module itself,
    # which the logging machinery does not read; presumably a logger's
    # `propagate` flag was intended -- confirm. Kept as-is.
    logging.propagate = False
    formatter = logging.Formatter("%(asctime)s: %(levelname)s: %(message)s")
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logging.root.addHandler(handler)
    logging.root.setLevel(logging.DEBUG)

    app = Sanic()
    app.config.REQUEST_TIMEOUT = args.request_timeout

    @app.route("/answer")
    async def answer(request):
        try:
            question = request.args["question"][0]
            if question == "":
                return response.json({'message': 'No question given'}, status=400)
            spans, paras = await qa.answer_question(question)
            answers = select_answers(paras, spans, span_bound, 10)
            return json([x.to_json() for x in answers])
        except Exception as e:
            log.info("Error: " + str(e))
            raise ServerError("Server Error", status_code=500)

    @app.route('/answer-from', methods=['POST'])
    async def answer_from(request):
        try:
            payload = ujson.loads(request.body.decode("utf-8"))
            question = payload.get("question")
            if question is None or question == "":
                return response.json({'message': 'No question given'}, status=400)
            doc = payload["document"]
            if len(doc) > 500000:
                raise ServerError("Document too large", status_code=400)
            spans, paras = qa.answer_with_doc(question, doc)
            answers = select_answers(paras, spans, span_bound, 10)
            return json([x.to_json() for x in answers])
        except ServerError:
            # Let deliberate errors (the 400 above) through unchanged; the
            # blanket handler below used to turn them into a generic 500.
            raise
        except Exception as e:
            log.info("Error: " + str(e))
            raise ServerError("Server Error", status_code=500)

    # NOTE(review): the two static paths use different directories
    # ("server" vs "service") -- confirm both exist.
    app.static('/', './docqa//server/static/index.html')
    app.static('/about.html', './docqa//service/static/about.html')

    app.run(host="0.0.0.0", port=8000, workers=args.workers, debug=False)
def _init(self, loader: ResourceLoader, voc: Iterable[str]):
    """Load pre-trained vectors and build the matrix with placeholder rows.

    The matrix stored in `self._word_emb_mat` is laid out as: row 0 an
    all-zero vector, one row per special token (when any are configured),
    one row per vocabulary word with a pre-trained vector, and finally
    `self.n_placeholders` rows backed by the non-trainable "placeholders"
    variable. `self._placeholder_start` is the index of the first
    placeholder row. When `self.placeholder_flag` is set every row gets one
    extra trailing column: 0 for pre-trained words, 1 for placeholders.

    :param loader: object whose `load_word_vec` returns a word -> vector map
    :param voc: words to load; when None every word the loader returns is used
    """
    # TODO we should not be building variables here
    if voc is None:
        word_to_vec = loader.load_word_vec(self.vec_name)
        voc = set(word_to_vec.keys())
    else:
        word_to_vec = loader.load_word_vec(self.vec_name, voc)

    self._word_to_ix = {}

    dim = next(iter(word_to_vec.values())).shape[0]
    if self.placeholder_flag:
        dim += 1  # room for the flag column

    matrix_list = [tf.zeros((1, dim), dtype=tf.float32)]
    next_ix = 1  # row 0 is the null embedding

    specials = self._special_tokens
    if specials is not None and len(specials) > 0:
        print("Building embeddings for %d special_tokens" % (len(specials)))
        matrix_list.append(tf.get_variable(
            shape=(len(specials), dim),
            name="token_embed",
            dtype=np.float32,
            trainable=True,
            initializer=tf.random_uniform_initializer(
                -self.word_vec_init_scale, self.word_vec_init_scale)))
        for token in specials:
            self._word_to_ix[token] = next_ix
            next_ix += 1

    vectors = []
    for word in voc:
        if word in self._word_to_ix:
            # already added, e.g. after seeing a capitalized version of `word`
            continue
        # Use the exact word when it has a vector, otherwise fall back to
        # its lower-case form
        key = word if word in word_to_vec else word.lower()
        if key in word_to_vec and key not in self._word_to_ix:
            vectors.append(word_to_vec[key])
            self._word_to_ix[key] = next_ix
            next_ix += 1

    print("Had pre-trained word embeddings for %d of %d words" % (len(vectors), len(voc)))

    mat = np.vstack(vectors)
    if self.placeholder_flag:
        # Pre-trained rows carry a 0 in the flag column
        flag_column = np.zeros((len(mat), 1), dtype=np.float32)
        mat = np.concatenate([mat, flag_column], axis=1)
    matrix_list.append(tf.constant(value=mat))

    self._placeholder_start = next_ix

    if not self.placeholder_flag:
        init_fn = tf.random_normal_initializer(stddev=self.placeholder_stddev)
    else:
        def init(shape, dtype=None, partition_info=None):
            # Random values for the embedding columns, 1 in the flag
            # column; the requested `shape` is not consulted.
            out = tf.random_normal((self.n_placeholders, dim - 1),
                                   stddev=self.placeholder_stddev)
            return tf.concat([out, tf.ones((self.n_placeholders, 1))], axis=1)
        init_fn = init

    placeholders = tf.get_variable(
        "placeholders", (self.n_placeholders, mat.shape[1]),
        tf.float32, trainable=False, initializer=init_fn)
    matrix_list.append(placeholders)

    self._word_emb_mat = tf.concat(matrix_list, axis=0)
def convert_saved_graph(model_dir, output_dir):
    """Re-save a model's checkpoint with its cudnn GRU weights remapped.

    Loads the model from `model_dir`, runs it once on the test questions,
    copies the plain files of `model_dir` (except "model.npy") into
    `output_dir`, creates a new variable for every supported spec of each
    "/gru_parameters:0" variable under a "bidirectional_rnn/fw|bw/" name,
    and writes all non-"bilm" variables to `<output_dir>/save`. The session
    is closed and the default graph reset before returning.

    :param model_dir: directory of the source model
    :param output_dir: directory to write to; created if missing
    :return: the `[start_logits, end_logits]` values computed by the
        original graph on the test questions
    """
    print("Load model")
    md = ModelDir(model_dir)
    model = md.get_model()

    # remove the lm models word embeddings - cpu model will use Char-CNN
    model.lm_model.embed_weights_file = None
    dim = model.embed_mapper.layers[1].n_units

    print("Setting up cudnn version")
    sess = tf.Session()
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))
        print("Buiding graph")
        pred = model.get_prediction()

    test_questions = get_test_questions()

    print("Load vars:")
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]
    md.restore_checkpoint(sess, vars_to_restore)
    sess.run(
        tf.variables_initializer(
            [x for x in all_vars if x.name in lm_var_names]))

    feed = model.encode([test_questions], False)
    cuddn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("Done, copying files...")
    if not exists(output_dir):
        mkdir(output_dir)
    for file_name in listdir(model_dir):
        # `listdir` yields bare names, so the check must be made against the
        # path inside `model_dir`; testing the bare name looked in the
        # current working directory and skipped every file.
        src_path = join(model_dir, file_name)
        if isfile(src_path) and file_name != "model.npy":
            copyfile(src_path, join(output_dir, file_name))

    print("Done, mapping tensors...")
    to_save, to_init = [], []
    for x in tf.trainable_variables():
        if x.name.endswith("/gru_parameters:0"):
            key = x.name[:-len("/gru_parameters:0")]
            indim, outdim = get_dims(x, dim)
            c = cudnn_rnn_ops.CudnnGRUSaveable(x, 1, outdim, indim, scope=key)
            for spec in c.specs:
                if spec.name.endswith(("bias_cudnn 0", "bias_cudnn 1")):
                    print('Unsupported spec: ' + spec.name)
                    continue
                if 'forward' in spec.name:
                    new_name = spec.name.replace(
                        'forward/rnn/multi_rnn_cell/cell_0/',
                        'bidirectional_rnn/fw/')
                else:
                    new_name = spec.name.replace(
                        'backward/rnn/multi_rnn_cell/cell_0/',
                        'bidirectional_rnn/bw/')
                # Materialize the spec's current value as a new variable
                v = tf.Variable(sess.run(spec.tensor), name=new_name)
                to_init.append(v)
                to_save.append(v)
        else:
            to_save.append(x)

    save_dir = join(output_dir, "save")
    if not exists(save_dir):
        mkdir(save_dir)

    # save: re-collect the variables so the ones created above are included
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    vars_to_save = [x for x in all_vars if not x.name.startswith("bilm")]
    # `tf.initialize_variables` is the deprecated alias of this call
    sess.run(tf.variables_initializer(to_init))
    saver = tf.train.Saver(vars_to_save)
    saver.save(
        sess,
        join(save_dir, 'checkpoint'),
        global_step=123456789,
        write_meta_graph=False,
    )

    sess.close()
    tf.reset_default_graph()
    return cuddn_out
def _init(self, loader: ResourceLoader, voc: Iterable[str], allow_update=False, do_update=False):
    """Load pre-trained vectors and build (or refresh) the embedding matrix.

    Always rebuilds `self._word_to_ix` and `self.common_word_mat_np` (the
    stacked pre-trained vectors). Unless `do_update` is set, it also creates
    the `unk_embed` variable and the graph tensor `self._word_emb_mat`, laid
    out as: zero row, unk row, special-token rows (if any), word rows.

    :param loader: object whose `load_word_vec` returns a word -> vector map
    :param voc: words to load; when None every word the loader returns is used
    :param allow_update: when set (and `do_update` is not), the word rows
        come from the placeholder `self.common_word_mat` instead of a constant
    :param do_update: when set, the `unk_embed` variable and the final
        matrix are not re-created
    """
    # TODO we should not be building variables here
    # NOTE(review): the nesting around `do_update` below was reconstructed
    # from a whitespace-collapsed source -- confirm against the original file.
    if voc is not None:
        word_to_vec = loader.load_word_vec(self.vec_name, voc)
    else:
        word_to_vec = loader.load_word_vec(self.vec_name)
        voc = set(word_to_vec.keys())

    self._word_to_ix = {}

    # Every vector is taken to share the size of the first one returned
    dim = next(iter(word_to_vec.values())).shape[0]

    null_embed = tf.zeros((1, dim), dtype=tf.float32)
    if not do_update:
        self.unk_embed = tf.get_variable(
            shape=(1, dim),
            name="unk_embed",
            dtype=np.float32,
            trainable=self.learn_unk,
            initializer=tf.random_uniform_initializer(-self.word_vec_init_scale, self.word_vec_init_scale))
    # Rows 0 and 1 are the null and unk embeddings
    ix = 2
    matrix_list = [null_embed, self.unk_embed]

    # NOTE(review): this branch is not guarded by `do_update`, so an update
    # with special tokens would request "token_embed" again -- presumably
    # updates are only used without special tokens; confirm.
    if self._special_tokens is not None and len(self._special_tokens) > 0:
        print("Building embeddings for %d special_tokens" % (len(self._special_tokens)))
        tok_embed = tf.get_variable(shape=(len(self._special_tokens), dim),
                                    name="token_embed",
                                    dtype=np.float32,
                                    trainable=True,
                                    initializer=tf.random_uniform_initializer(-self.word_vec_init_scale, self.word_vec_init_scale))
        matrix_list.append(tok_embed)
        for token in self._special_tokens:
            self._word_to_ix[token] = ix
            ix += 1

    mat = []
    for word in voc:
        if word in self._word_to_ix:
            continue  # in case we already added due after seeing a capitalized version of `word`
        if word in word_to_vec:
            mat.append(word_to_vec[word])
            self._word_to_ix[word] = ix
            ix += 1
        else:
            lower = word.lower()  # Fall back to the lower-case version
            if lower in word_to_vec and lower not in self._word_to_ix:
                mat.append(word_to_vec[lower])
                self._word_to_ix[lower] = ix
                ix += 1

    print("Had pre-trained word embeddings for %d of %d words" % (len(mat), len(voc)))

    # Encoder will feed this as value of self.common_word_mat
    # Allows us to quickly change the vocabulary at test time
    self.common_word_mat_np = np.vstack(mat)
    if not do_update:  # Set up the tf graph only once
        if allow_update:
            # Word rows are supplied at run time through this placeholder
            self.common_word_mat = tf.placeholder(tf.float32, shape=(None, dim), name='common_word_mat')
            matrix_list.append(self.common_word_mat)
        else:
            # Word rows are baked into the graph as a constant
            self.common_word_mat = None
            matrix_list.append(tf.constant(value=self.common_word_mat_np))
        self._word_emb_mat = tf.concat(matrix_list, axis=0)
def main():
    """Evaluate a model on TriviaQA data and log/save the results.

    Selects paragraphs for each question, runs the model over them, logs the
    start and the results through `ElasticLogger`, optionally writes an
    official-format answer file and a per-paragraph csv, writes one row per
    predicted span to `results.csv`, and prints a table of EM/F1 as more
    paragraphs are used.
    """

    def checkpoint_step(value):
        """argparse type: the literal 'latest' or an integer step."""
        return value if value == "latest" else int(value)

    parser = argparse.ArgumentParser(
        description='Evaluate a model on TriviaQA data')
    parser.add_argument('model', help='model directory')
    parser.add_argument(
        '-p', '--paragraph_output', type=str,
        help="Save fine grained results for each paragraph in csv format")
    parser.add_argument('-o', '--official_output', type=str,
                        help="Build an offical output file with the model's"
                             " most confident span for each (question, doc) pair")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    parser.add_argument(
        '--n_processes', type=int, default=None,
        help="Number of processes to do the preprocessing (selecting paragraphs+loading context) with")
    # Was `type=int`, which made the "latest" branch below unreachable.
    parser.add_argument('-i', '--step', type=checkpoint_step, default=None,
                        help="checkpoint step to load, or 'latest'; defaults to the best"
                             " weights if they exist, otherwise the latest checkpoint")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="Number of questions to evaluate on")
    # `async` is a reserved word since Python 3.7, so `args.async` no longer
    # parses; store the value under a different attribute name instead.
    parser.add_argument('-a', '--async', type=int, default=10, dest='async_encoding')
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help="Max tokens per a paragraph")
    parser.add_argument('-g', '--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('-f', '--filter', type=str, default=None,
                        choices=["tfidf", "truncate", "linear"],
                        help="How to select paragraphs")
    parser.add_argument(
        '-b', '--batch_size', type=int, default=200,
        help="Batch size, larger sizes might be faster but wll take more memory")
    parser.add_argument('--max_answer_len', type=int, default=8,
                        help="Max answer span to select")
    parser.add_argument('-c', '--corpus',
                        choices=[
                            "web-dev", "web-test", "web-verified-dev",
                            "web-train", "open-dev", "open-train", "wiki-dev",
                            "wiki-test"
                        ],
                        default="web-verified-dev")
    parser.add_argument("-s", "--source_dir", type=str, default=None,
                        help="where to take input files")
    # The help text used to be a copy of the --source_dir one.
    parser.add_argument("--n_span_per_q", type=int, default=1,
                        help="Number of spans to predict per question")
    args = parser.parse_args()

    # --source_dir is optional; fall back to the corpus name instead of
    # crashing on `None.split`.
    if args.source_dir is not None:
        dataset_name = args.source_dir.split('/')[-1]
    else:
        dataset_name = args.corpus
    model_name = args.model.split('/')[-1]
    ElasticLogger().write_log('INFO', 'Start Evaluation',
                              context_dict={
                                  'model': model_name,
                                  'dataset': dataset_name
                              })

    model_dir = ModelDir(args.model)
    model = model_dir.get_model()

    if args.corpus.startswith('web'):
        dataset = TriviaQaWebDataset()
        if args.corpus == "web-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "web-test":
            test_questions = dataset.get_test()
        elif args.corpus == "web-verified-dev":
            test_questions = dataset.get_verified()
        elif args.corpus == "web-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()
    elif args.corpus.startswith("wiki"):
        dataset = TriviaQaWikiDataset()
        if args.corpus == "wiki-dev":
            test_questions = dataset.get_dev()
        elif args.corpus == "wiki-test":
            test_questions = dataset.get_test()
        else:
            raise AssertionError()
    else:
        dataset = TriviaQaOpenDataset(args.source_dir)
        if args.corpus == "open-dev":
            # just loading the pkl that was saved in build_span_corpus
            test_questions = dataset.get_dev()
        elif args.corpus == "open-train":
            test_questions = dataset.get_train()
        else:
            raise AssertionError()

    corpus = dataset.evidence
    splitter = MergeParagraphs(args.tokens)

    # Only the web corpora are scored per (question, document) pair here
    per_document = args.corpus.startswith("web")

    filter_name = args.filter
    if filter_name is None:
        # Pick default depending on the kind of data we are using
        if per_document:
            filter_name = "tfidf"
        else:
            filter_name = "linear"

    print("Selecting %d paragraphs using method \"%s\" per %s" %
          (args.n_paragraphs, filter_name,
           ("question-document pair" if per_document else "question")))

    if filter_name == "tfidf":
        para_filter = TopTfIdf(NltkPlusStopWords(punctuation=True),
                               args.n_paragraphs)
    elif filter_name == "truncate":
        para_filter = FirstN(args.n_paragraphs)
    elif filter_name == "linear":
        para_filter = ShallowOpenWebRanker(args.n_paragraphs)
    else:
        raise ValueError()

    n_questions = args.n_sample
    docqa.config.SPANS_PER_QUESTION = args.n_span_per_q
    if n_questions is not None:
        # Sort before the seeded shuffle so the sample is reproducible
        test_questions.sort(key=lambda x: x.question_id)
        np.random.RandomState(0).shuffle(test_questions)
        test_questions = test_questions[:n_questions]

    print("Building question/paragraph pairs...")
    # Loads the relevant questions/documents, selects the right paragraphs,
    # and runs the model's preprocessor
    if per_document:
        prep = ExtractMultiParagraphs(splitter, para_filter,
                                      model.preprocessor,
                                      require_an_answer=False)
    else:
        prep = ExtractMultiParagraphsPerQuestion(splitter, para_filter,
                                                 model.preprocessor,
                                                 require_an_answer=False)
    prepped_data = preprocess_par(test_questions, corpus, prep,
                                  args.n_processes, 1000)

    data = []
    for q in prepped_data.data:
        for i, p in enumerate(q.paragraphs):
            if q.answer_text is None:
                ans = None
            else:
                ans = TokenSpans(q.answer_text, p.answer_spans)
            data.append(
                DocumentParagraphQuestion(q.question_id, p.doc_id,
                                          (p.start, p.end), q.question,
                                          p.text, ans, i))

    # Reverse so our first batch will be the largest (so OOMs happen early)
    questions = sorted(data,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)

    print("Done, starting eval")

    if args.step is not None:
        if args.step == "latest":
            checkpoint = model_dir.get_latest_checkpoint()
        else:
            checkpoint = model_dir.get_checkpoint(int(args.step))
    else:
        checkpoint = model_dir.get_best_weights()
        if checkpoint is not None:
            print("Using best weights")
        else:
            print("Using latest checkpoint")
            checkpoint = model_dir.get_latest_checkpoint()

    test_questions = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.max_answer_len, True)],
        {args.corpus: test_questions}, ResourceLoader(), checkpoint,
        not args.no_ema, args.async_encoding)[args.corpus]

    if not all(len(x) == len(data) for x in evaluation.per_sample.values()):
        raise RuntimeError()

    df = pd.DataFrame(evaluation.per_sample)

    if args.official_output is not None:
        print("Saving question result")

        fns = {}
        if per_document:
            # I didn't store the unormalized filenames exactly, so
            # unfortunately we have to reload the source data to get exact
            # filename to output an official test script
            print("Loading proper filenames")
            if args.corpus == 'web-test':
                source = join(TRIVIA_QA, "qa", "web-test-without-answers.json")
            elif args.corpus == "web-dev":
                source = join(TRIVIA_QA, "qa", "web-dev.json")
            else:
                raise AssertionError()

            with open(source) as f:
                source_points = json.load(f)["Data"]
            for point in source_points:
                for doc in point["EntityPages"]:
                    filename = doc["Filename"]
                    fn = join("wikipedia", filename[:filename.rfind(".")])
                    fn = normalize_wiki_filename(fn)
                    fns[(point["QuestionId"], fn)] = filename

        # Keep, per key, the highest scoring answer
        answers = {}
        scores = {}
        for q_id, doc_id, start, end, txt, score in df[[
                "question_id", "doc_id", "para_start", "para_end",
                "text_answer", "predicted_score"
        ]].itertuples(index=False):
            filename = dataset.evidence.file_id_map[doc_id]
            if per_document:
                if filename.startswith("web"):
                    true_name = filename[4:] + ".txt"
                else:
                    true_name = fns[(q_id, filename)]
                # Alon Patch for triviaqa test results
                true_name = true_name.replace('TriviaQA_Org/', '')
                key = q_id + "--" + true_name
            else:
                key = q_id

            prev_score = scores.get(key)
            if prev_score is None or prev_score < score:
                scores[key] = score
                answers[key] = txt

        with open(args.official_output, "w") as f:
            json.dump(answers, f)

    output_file = args.paragraph_output
    if output_file is not None:
        print("Saving paragraph result")
        df.to_csv(output_file, index=False)

    print("Computing scores")

    if per_document:
        group_by = ["question_id", "doc_id"]
    else:
        group_by = ["question_id"]

    # Print a table of scores as more paragraphs are used
    df.sort_values(group_by + ["rank"], inplace=True)
    # Each row's 'predicted_score' is reduced to its maximum before ranking
    df_scores = df.copy(deep=True)
    df_scores['predicted_score'] = df_scores['predicted_score'].apply(
        lambda x: pd.Series(x).max())

    em = compute_ranked_scores(df_scores, "predicted_score", "text_em", group_by)
    f1 = compute_ranked_scores(df_scores, "predicted_score", "text_f1", group_by)
    table = [["N Paragraphs", "EM", "F1"]]
    table += [[str(i + 1), "%.4f" % e, "%.4f" % f]
              for i, (e, f) in enumerate(zip(em, f1))]

    table_df = pd.DataFrame(table[1:], columns=table[0]).drop(
        ['N Paragraphs'], axis=1)
    # `.ix` has been removed from pandas; `.loc` is the label-based equivalent
    column_max = table_df.max()
    ElasticLogger().write_log('INFO', 'Results',
                              context_dict={
                                  'model': model_name,
                                  'dataset': dataset_name,
                                  'max_EM': column_max.loc['EM'],
                                  'max_F1': column_max.loc['F1'],
                                  'result_table': str(table_df)
                              })

    # Flatten: one output row per (answer, span, score) triple of each row
    df_flat = []
    for _, question in df.iterrows():
        for text_answer, predicted_span, predicted_score in zip(
                question['text_answer'], question['predicted_span'],
                question['predicted_score']):
            new_question = dict(question.copy())
            new_question.update({
                'text_answer': text_answer,
                'predicted_span': predicted_span,
                'predicted_score': predicted_score
            })
            df_flat.append(new_question)

    results_df = pd.DataFrame(df_flat)
    results_df.sort_values(
        by=['question_id', 'predicted_score'],
        ascending=False).set_index(['question_id', 'text_answer'])[[
            'question', 'predicted_score', 'text_em'
        ]].to_csv('results.csv')

    print_table(table)
def perform_evaluation(model_name: str,
                       dataset_names: List[str],
                       tokens_per_paragraph: int,
                       filter_type: str,
                       n_processes: int,
                       n_paragraphs: int,
                       batch_size: int,
                       checkpoint: str,
                       no_ema: bool,
                       max_answer_len: int,
                       official_output_path: str,
                       paragraph_output_path: str,
                       aggregated_output_path: str,
                       elmo_char_cnn: bool,
                       n_samples: Union[int, None],
                       per_document: bool = False):
    """Perform an evaluation using cape's answer decoder

    A file will be created listing the answers per question ID for each dataset

    :param model_name: path to the model to evaluate
    :param dataset_names: list of strings of datasets to evaluate
    :param tokens_per_paragraph: how big to make paragraph chunks
    :param filter_type: how to select the paragraphs to read
    :param n_processes: how many processes to use when multiprocessing
    :param n_paragraphs: how many paragraphs to read per question
    :param batch_size: how many datapoints to evaluate at once
    :param checkpoint: string, checkpoint to load
    :param no_ema: if true, dont use EMA weights
    :param max_answer_len: the maximum allowable length of an answer in tokens
    :param official_output_path: path to write official output to
    :param paragraph_output_path: path to write paragraph output to
    :param aggregated_output_path: path to write aggregated output to
    :param elmo_char_cnn: if true, uses the elmo CNN to make token embeddings, less OOV but
        requires much more memory
    :param n_samples: forwarded unchanged to `test` (presumably a cap on the number of
        samples to evaluate, `None` for no cap -- confirm against `test`)
    :param per_document: if false, return best scoring answer to a question, if true,
        the best scoring answer from each document is used instead.
    :raises RuntimeError: if any per-sample output column does not have exactly one
        entry per prepared question
    """
    # `async` has been a reserved keyword since Python 3.7, so it cannot be used
    # as a local name; the value is still passed to `test` positionally.
    use_async = True
    corpus_name = 'all'

    print('Setting Up:')
    model_dir = ModelDir(model_name)
    model = model_dir.get_model()
    dataset = get_multidataset(dataset_names)
    splitter = MergeParagraphs(tokens_per_paragraph)
    para_filter = get_para_filter(filter_type, per_document, n_paragraphs)
    test_questions, n_questions = get_questions(per_document, dataset, splitter,
                                                para_filter, model.preprocessor,
                                                n_processes, batch_size)

    print("Starting eval")
    checkpoint = get_checkpoint(checkpoint, model_dir)
    evaluation = test(model,
                      [RecordParagraphSpanPrediction(max_answer_len, True)],
                      {corpus_name: test_questions},
                      ResourceLoader(), checkpoint, not no_ema,
                      use_async, n_samples, elmo_char_cnn)[corpus_name]

    print('Exporting and Post-processing')
    # Every recorded column must line up with the prepared questions, otherwise
    # the DataFrame built below would silently mis-align rows.
    if not all(len(x) == n_questions for x in evaluation.per_sample.values()):
        raise RuntimeError(
            "Evaluation output size does not match the number of questions")

    df = pd.DataFrame(evaluation.per_sample)
    compute_and_dump_official_output(df, official_output_path,
                                     per_document=per_document)

    print("Saving paragraph result")
    df.to_csv(paragraph_output_path, index=False)

    print("Computing scores")
    agg_df = get_aggregated_df(df, per_document)
    agg_df.to_csv(aggregated_output_path, index=False)
def convert(model_dir, output_dir, best_weights=False):
    """Convert a model that uses CuDNN GRU parameters into a CPU-compatible copy.

    The model in `model_dir` is loaded and run once on a fixed test question.
    Its regular files are copied to `output_dir`, every `.../gru_parameters:0`
    variable is expanded into canonical per-gate variables under renamed
    scopes, and a new checkpoint is written to `<output_dir>/save`. The
    recurrent layers of the model object are then swapped for
    `BiRecurrentMapper(CompatGruCellSpec(dim))`, the model is pickled to
    `<output_dir>/model.pkl`, reloaded, and run again so both outputs can be
    compared on stdout.

    :param model_dir: directory of the model to convert
    :param output_dir: directory to write the converted model to (created if missing)
    :param best_weights: not referenced by this function; kept for interface
        compatibility
    """
    print("Load model")
    md = ModelDir(model_dir)
    model = md.get_model()
    dim = model.embed_mapper.layers[1].n_units
    global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)

    print("Setting up cudnn version")
    sess = tf.Session()
    sess.run(global_step.assign(0))
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))
        print("Buiding graph")
        pred = model.get_prediction()

    test_questions = ParagraphAndQuestion(
        ["Harry", "Potter", "was", "written", "by", "JK"],
        ["Who", "wrote", "Harry", "Potter", "?"],
        None, "test_questions")

    print("Load vars")
    md.restore_checkpoint(sess)
    print("Restore finished")

    feed = model.encode([test_questions], False)
    cuddn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("Done, copying files...")
    if not exists(output_dir):
        mkdir(output_dir)
    for fname in listdir(model_dir):
        # listdir() yields bare names, so the isfile() check has to be made
        # against the path inside model_dir, not the current working directory.
        src = join(model_dir, fname)
        if isfile(src) and fname != "model.npy":
            copyfile(src, join(output_dir, fname))

    print("Done, mapping tensors...")
    to_save = []
    to_init = []
    for x in tf.trainable_variables():
        if x.name.endswith("/gru_parameters:0"):
            key = x.name[:-len("/gru_parameters:0")]
            fw_params = x
            # The input size of the GRU depends on which layer it belongs to
            if "map_embed" in x.name:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, 400)
            elif "chained-out" in x.name:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 4)
            else:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 2)
            params_saveable = cudnn_rnn_ops.RNNParamsSaveable(
                c, c.params_to_canonical, c.canonical_to_params,
                [fw_params], key)

            for spec in params_saveable.specs:
                if spec.name.endswith("bias_cudnn 0") or \
                        spec.name.endswith("bias_cudnn 1"):
                    # ??? What do these even do?
                    continue
                # Rewrite the scope path: drop "cell_0", replace the
                # "forward"/"backward" component with an "fw"/"bw" component
                # further down the path, and rename "multi_rnn_cell" to
                # "bidirectional_rnn".
                name = spec.name.split("/")
                name.remove("cell_0")
                if "forward" in name:
                    ix = name.index("forward")
                    name.insert(ix + 2, "fw")
                else:
                    ix = name.index("backward")
                    name.insert(ix + 2, "bw")
                del name[ix]

                ix = name.index("multi_rnn_cell")
                name[ix] = "bidirectional_rnn"
                name = "/".join(name)
                v = tf.Variable(sess.run(spec.tensor), name=name)
                to_init.append(v)
                to_save.append(v)
        else:
            to_save.append(x)

    other = [
        x for x in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if x not in tf.trainable_variables()
    ]
    print(other)
    # tf.initialize_variables is the deprecated alias of this call
    sess.run(tf.variables_initializer(to_init))
    saver = tf.train.Saver(to_save + other)
    save_dir = join(output_dir, "save")
    if not exists(save_dir):
        mkdir(save_dir)
    saver.save(sess, join(save_dir, "checkpoint"), sess.run(global_step))

    sess.close()
    tf.reset_default_graph()

    print("Updating model...")
    model.embed_mapper.layers = [
        model.embed_mapper.layers[0],
        BiRecurrentMapper(CompatGruCellSpec(dim))
    ]
    model.match_encoder.layers = list(model.match_encoder.layers)
    other = model.match_encoder.layers[1].other
    other.layers = list(other.layers)
    other.layers[1] = BiRecurrentMapper(CompatGruCellSpec(dim))

    pred = model.predictor.predictor
    pred.first_layer = BiRecurrentMapper(CompatGruCellSpec(dim))
    pred.second_layer = BiRecurrentMapper(CompatGruCellSpec(dim))

    with open(join(output_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)

    print("Testing...")
    # Reload the pickle written just above to verify the round trip
    with open(join(output_dir, "model.pkl"), "rb") as f:
        model = pickle.load(f)

    sess = tf.Session()
    model.set_input_spec(
        ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
        ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))
    pred = model.get_prediction()

    print("Rebuilding")
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(save_dir))

    feed = model.encode([test_questions], False)
    cpu_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)
    sess.close()

    print("These should be close:")
    print([np.allclose(a, b) for a, b in zip(cpu_out, cuddn_out)])
    print(cpu_out)
    print(cuddn_out)
def main():
    """Load the model named by `OPTS.model` and serve a small bottle demo app.

    The app exposes `/` (renders the `index` template) and `/post_query`
    (answers a question about a user-supplied document and renders the
    `results` template). Host, port and debug mode come from `OPTS`.

    :raises ValueError: if the loaded model is not a `ParagraphQuestionModel`
    """
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    tokenizer = NltkAndPunctTokenizer()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    if OPTS.reload_vocab:
        loader = ResourceLoader()
    else:
        loader = CachingResourceLoader()

    print('Loading word vectors...')
    # Start from a one-word vocabulary; the embeddings are extended per request
    # through model.word_embed.update() below.
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), {','},
                         word_vec_loader=loader, allow_update=True)

    print('Starting Tensorflow session...')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        prediction = model.get_prediction()
        # Take 0-th here because we know we only truncate to one paragraph
        start_logits_tf = prediction.start_logits[0]
        end_logits_tf = prediction.end_logits[0]
        none_logit_tf = prediction.none_logit[0]
    model_dir.restore_checkpoint(sess)

    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)

    app = bottle.Bottle()

    @app.route('/')
    def index():
        """Render the landing page."""
        return bottle.template('index')

    @app.route('/post_query', method='post')
    def post_query():
        """Answer the posted question against the posted document."""
        document_raw = bottle.request.forms.getunicode('document').strip()
        question_raw = bottle.request.forms.getunicode('question').strip()

        # Raw string: "\s" inside a normal string literal is an invalid escape
        # sequence. One paragraph per non-empty line.
        document = re.split(r"\s*\n\s*", document_raw)
        question = tokenizer.tokenize_paragraph_flat(question_raw)
        doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
        split_doc = splitter.split(doc_toks)
        context = selector.prune(question, split_doc)
        if model.preprocessor is not None:
            context = [
                model.preprocessor.encode_text(question, x) for x in context
            ]
        else:
            context = [flatten_iterable(x.text) for x in context]

        # Make sure every word of this request has an embedding
        vocab = set(question)
        for txt in context:
            vocab.update(txt)
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]
        model.word_embed.update(loader, vocab)

        encoded = model.encode(data, is_train=False)
        start_logits, end_logits, none_logit = sess.run(
            [start_logits_tf, end_logits_tf, none_logit_tf],
            feed_dict=encoded)
        # Only the first selected context is decoded, matching the [0]
        # indexing of the logits above.
        beam, p_na = logits_to_probs(document_raw, context[0], start_logits,
                                     end_logits, none_logit,
                                     beam_size=BEAM_SIZE)
        return bottle.template('results', document=document_raw,
                               question=question_raw, beam=beam, p_na=p_na)

    # Templates live in the "views" directory next to this script
    cur_dir = os.path.abspath(os.path.dirname(__file__))
    bottle.TEMPLATE_PATH.insert(0, os.path.join(cur_dir, 'views'))
    bottle.run(app, host=OPTS.hostname, port=OPTS.port, debug=OPTS.debug)
def main():
    """Parse the command line, build the QA backend and run the Sanic demo server.

    Routes registered:

    * ``GET /answer?question=...`` answers a question through
      ``app.qa.answer_question``.
    * ``POST /answer-from`` answers a question against a document supplied in
      a JSON body with ``question`` and ``document`` fields.
    * ``/`` and ``/about.html`` serve static pages.

    API keys are taken from ``--tagme_api_key`` / ``--bing_api_key`` or, when
    those are not given, from the ``TAGME_API_KEY`` / ``BING_API_KEY``
    environment variables. No credentials are embedded in the source.

    :raises ValueError: if web search is requested (``n_web > 0``) but no Bing
        API key is available
    """
    parser = argparse.ArgumentParser(description='Run the demo server')
    parser.add_argument(
        'model',
        default="/home/antriv/conversation_ai/ALLENAI_DocumentQA/document-qa/models/triviaqa-unfiltered-shared-norm/best-weights",
        help='Models to use')
    parser.add_argument(
        '-v', '--voc',
        default="/home/antriv/conversation_ai/ALLENAI_DocumentQA/document-qa/data/triviaqa/evidence/vocab.txt",
        help='vocab to use, only words from this file will be used')
    parser.add_argument('-t', '--tokens', type=int, default=400,
                        help='Number of tokens to use per paragraph')
    parser.add_argument('--vec_dir', default="/home/antriv/data/glove",
                        help='Location to find word vectors')
    parser.add_argument('--n_paragraphs', type=int, default=15,
                        help="Number of paragraphs to run the model on")
    parser.add_argument('--paragraphs_to_return', type=int, default=10,
                        help="Number of paragraphs return to the frontend")
    parser.add_argument('--span_bound', type=int, default=8,
                        help="Max span size to return as an answer")
    # Secrets must not be baked into the source: default to None so the
    # environment-variable fallback below is actually reachable.
    parser.add_argument(
        '--tagme_api_key', default=None,
        help="Key to use for TAGME (tagme.d4science.org/tagme); "
             "defaults to the TAGME_API_KEY environment variable")
    parser.add_argument(
        '--bing_api_key', default=None,
        help="Key to use for bing searches; "
             "defaults to the BING_API_KEY environment variable")
    parser.add_argument(
        '--bing_version', choices=["v5.0", "v7.0"], default="v7.0",
        help='Version of Bing API to use (must be compatible with the API key)')
    parser.add_argument(
        '--tagme_thresh', default=0.2, type=float,
        help="TAGME threshold for when to use the identified docs")
    parser.add_argument('--n_web', type=int, default=10,
                        help='Number of web docs to fetch')
    parser.add_argument('--blacklist_trivia_sites', action="store_true",
                        help="Don't use trivia websites")
    parser.add_argument(
        '-c', '--wiki_cache',
        default="/home/antriv/conversation_ai/ALLENAI_DocumentQA/document-qa/data/triviaqa/evidence/wikipedia",
        help="Cache wiki articles in this directory")
    parser.add_argument('--n_dl_threads', type=int, default=5,
                        help="Number of threads to download documents with")
    parser.add_argument('--request_timeout', type=int, default=60)
    parser.add_argument('--download_timeout', type=int, default=25,
                        help="how long to wait before timing out downloads")
    parser.add_argument('--workers', type=int, default=1,
                        help="Number of server workers")
    parser.add_argument('--debug', default=None,
                        choices=["random_model", "dummy_qa"])
    args = parser.parse_args()

    span_bound = args.span_bound
    n_to_return = args.paragraphs_to_return

    if args.tagme_api_key is not None:
        tagme_api_key = args.tagme_api_key
    else:
        tagme_api_key = environ.get("TAGME_API_KEY")

    if args.bing_api_key is not None:
        bing_api_key = args.bing_api_key
    else:
        bing_api_key = environ.get("BING_API_KEY")
    if bing_api_key is None and args.n_web > 0:
        raise ValueError("If n_web > 0 you must give a BING_API_KEY")

    if args.debug is None:
        model = ModelDir(args.model)
    else:
        model = RandomPredictor(5, WithIndicators())

    if args.vec_dir is not None:
        loader = LoadFromPath(args.vec_dir)
    else:
        loader = ResourceLoader()

    # Update Sanic's logging to register our class's loggers.
    # NOTE: log_config aliases LOGGING, so the edits below mutate the same
    # dict that is later handed to app.run().
    log_config = LOGGING
    formatter = "%(asctime)s: %(levelname)s: %(message)s"
    log_config["formatters"]['my_formatter'] = {
        'format': formatter,
        'datefmt': '%Y-%m-%d %H:%M:%S',
    }
    log_config['handlers']['stream_handler'] = {
        'class': "logging.StreamHandler",
        'formatter': 'my_formatter',
        'stream': sys.stderr
    }
    log_config['handlers']['file_handler'] = {
        'class': "logging.FileHandler",
        'formatter': 'my_formatter',
        'filename': 'logging.log'
    }
    # It looks like we have to go and name every logger our own code might
    # use in order to register it with Sanic
    for logger_name in ('qa_system', 'downloader', 'server'):
        log_config["loggers"][logger_name] = {
            'level': 'INFO',
            'handlers': ['stream_handler', 'file_handler'],
        }

    app = Sanic()
    app.config.REQUEST_TIMEOUT = args.request_timeout
    log = logging.getLogger('server')

    @app.listener('before_server_start')
    async def setup_qa(app, loop):
        # To play nice with iohttp's async ClientSession objects, we need to
        # construct the QaSystem inside the event loop.
        if args.debug == "dummy_qa":
            qa = DummyQa()
        else:
            qa = QaSystem(
                args.wiki_cache,
                MergeParagraphs(args.tokens),
                ShallowOpenWebRanker(args.n_paragraphs),
                args.voc,
                model,
                loader,
                bing_api_key,
                bing_version=args.bing_version,
                tagme_api_key=tagme_api_key,
                n_dl_threads=args.n_dl_threads,
                blacklist_trivia_sites=args.blacklist_trivia_sites,
                download_timeout=args.download_timeout,
                span_bound=span_bound,
                tagme_threshold=None if (tagme_api_key is None) else args.tagme_thresh,
                n_web_docs=args.n_web,
            )
        app.qa = qa

    @app.listener('after_server_stop')
    async def close_qa(app, loop):
        # Distinct name so this listener does not shadow setup_qa above
        app.qa.close()

    @app.route("/answer")
    async def answer(request):
        try:
            # A missing parameter is a client error, same as an empty one
            if "question" not in request.args:
                return response.json({'message': 'No question given'},
                                     status=400)
            question = request.args["question"][0]
            if question == "":
                return response.json({'message': 'No question given'},
                                     status=400)
            spans, paras = await app.qa.answer_question(question)
            answers = select_answers(paras, spans, span_bound, 10)
            answers = answers[:n_to_return]
            best_span = max(answers[0].answers, key=lambda x: x.conf)
            log.info("Answered \"%s\" (with web search): \"%s\"", question,
                     answers[0].original_text[best_span.start:best_span.end])
            return response.json([x.to_json() for x in answers])
        except Exception as e:
            log.info("Error: " + str(e))
            raise ServerError(e, status_code=500)

    @app.route('/answer-from', methods=['POST'])
    async def answer_from(request):
        try:
            # Named `payload` so the CLI `args` of the enclosing scope is not shadowed
            payload = ujson.loads(request.body.decode("utf-8"))
            question = payload.get("question")
            if question is None or question == "":
                return response.json({'message': 'No question given'},
                                     status=400)
            doc = payload["document"]
            if len(doc) > 500000:
                raise ServerError("Document too large", status_code=400)
            spans, paras = app.qa.answer_with_doc(question, doc)
            answers = select_answers(paras, spans, span_bound, 10)
            answers = answers[:n_to_return]
            best_span = max(answers[0].answers, key=lambda x: x.conf)
            log.info("Answered \"%s\" (with user doc): \"%s\"", question,
                     answers[0].original_text[best_span.start:best_span.end])
            return response.json([x.to_json() for x in answers])
        except ServerError:
            # Deliberate HTTP errors (e.g. the 400 above) must keep their
            # status code instead of being rewrapped as a 500.
            raise
        except Exception as e:
            log.info("Error: " + str(e))
            raise ServerError(e, status_code=500)

    app.static('/', './docqa//server/static/index.html')
    app.static('/about.html', './docqa/server/static/about.html')
    app.run(host="0.0.0.0", port=5000, workers=args.workers, debug=False,
            log_config=LOGGING)
def main():
    """Evaluate a model on document-level SQuAD and write per-paragraph results.

    Questions are paired with ranked paragraphs (TF-IDF via
    `SquadTfIdfRanker`), the model is run with `trainer.test`, a table of
    EM/F1 as more paragraphs are used is printed, and the per-sample results
    are saved as CSV to the `output` argument.

    :raises ValueError: if a question's answer spans are not a 2-D array, or
        if the model directory contains no checkpoint
    """
    parser = argparse.ArgumentParser(
        description='Evaluate a model on document-level SQuAD')
    parser.add_argument('model', help='model to use')
    parser.add_argument(
        'output', type=str,
        help="Store the per-paragraph results in csv format in this file")
    parser.add_argument('-n', '--n_sample', type=int, default=None,
                        help="(for testing) sample documents")
    # `async` is a reserved keyword since Python 3.7, so `args.async` is a
    # syntax error; keep the CLI flag but store it under a usable attribute.
    parser.add_argument(
        '-s', '--async', dest='async_encoding', type=int, default=10,
        help="Encoding batch asynchronously, queueing up to this many")
    parser.add_argument('-a', '--answer_bound', type=int, default=17,
                        help="Max answer span length")
    parser.add_argument('-p', '--n_paragraphs', type=int, default=None,
                        help="Max number of paragraphs to use")
    parser.add_argument(
        '-b', '--batch_size', type=int, default=200,
        help="Batch size, larger sizes can be faster but uses more memory")
    parser.add_argument('-c', '--corpus',
                        choices=["dev", "train", "doc-rd-dev"], default="dev")
    parser.add_argument('--no_ema', action="store_true",
                        help="Don't use EMA weights even if they exist")
    args = parser.parse_args()

    model_dir = ModelDir(args.model)
    print("Loading data")

    questions = []
    ranker = SquadTfIdfRanker(NltkPlusStopWords(True), args.n_paragraphs,
                              force_answer=False)

    if args.corpus == "doc-rd-dev":
        docs = SquadCorpus().get_dev()
        if args.n_sample is not None:
            # Sort first so the seeded shuffle yields the same sample every run
            docs.sort(key=lambda x: x.doc_id)
            np.random.RandomState(0).shuffle(docs)
            docs = docs[:args.n_sample]

        print("Fetching document reader docs...")
        doc_rd_versions = get_doc_rd_doc(docs)
        print("Ranking and matching with questions...")
        for doc in tqdm(docs):
            doc_questions = flatten_iterable(x.questions for x in doc.paragraphs)
            paragraphs = doc_rd_versions[doc.title]
            ranks = ranker.rank([x.words for x in doc_questions],
                                [x.text for x in paragraphs])
            for i, question in enumerate(doc_questions):
                para_ranks = np.argsort(ranks[i])
                for para_rank, para_num in enumerate(
                        para_ranks[:args.n_paragraphs]):
                    # Just use dummy answers spans for these pairs
                    questions.append(
                        RankedParagraphQuestion(
                            question.words,
                            TokenSpans(question.answer.answer_text,
                                       np.zeros((0, 2), dtype=np.int32)),
                            question.question_id, paragraphs[para_num],
                            para_rank, para_num))
        rl = ResourceLoader()
    else:
        if args.corpus == "dev":
            docs = SquadCorpus().get_dev()
        else:
            docs = SquadCorpus().get_train()
        rl = SquadCorpus().get_resource_loader()

        if args.n_sample is not None:
            docs.sort(key=lambda x: x.doc_id)
            np.random.RandomState(0).shuffle(docs)
            docs = docs[:args.n_sample]

        for q in ranker.ranked_questions(docs):
            for i, p in enumerate(q.paragraphs):
                questions.append(
                    RankedParagraphQuestion(
                        q.question,
                        TokenSpans(q.answer_text, p.answer_spans),
                        q.question_id,
                        ParagraphWithInverse([p.text], p.original_text,
                                             p.spans),
                        i, p.paragraph_num))

    print("Split %d docs into %d paragraphs" % (len(docs), len(questions)))

    # Largest inputs first
    questions = sorted(questions,
                       key=lambda x: (x.n_context_words, len(x.question)),
                       reverse=True)
    for q in questions:
        if len(q.answer.answer_spans.shape) != 2:
            raise ValueError()

    checkpoint = model_dir.get_best_weights()
    if checkpoint is not None:
        print("Using best weights")
    else:
        print("Using latest checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
        if checkpoint is None:
            raise ValueError("No checkpoints found")

    data = ParagraphAndQuestionDataset(
        questions, FixedOrderBatcher(args.batch_size, True))

    model = model_dir.get_model()
    evaluation = trainer.test(
        model, [RecordParagraphSpanPrediction(args.answer_bound, True)],
        {args.corpus: data}, rl, checkpoint, not args.no_ema,
        args.async_encoding)[args.corpus]

    print("Saving result")
    output_file = args.output
    df = pd.DataFrame(evaluation.per_sample)
    df.sort_values(["question_id", "rank"], inplace=True, ascending=True)
    group_by = ["question_id"]
    f1 = compute_ranked_scores(df, "predicted_score", "text_f1", group_by)
    em = compute_ranked_scores(df, "predicted_score", "text_em", group_by)
    # One row per number of paragraphs used
    table = [["N Paragraphs", "EM", "F1"]]
    table += list([str(i + 1), "%.4f" % e, "%.4f" % f]
                  for i, (e, f) in enumerate(zip(em, f1)))
    print_table(table)

    df.to_csv(output_file, index=False)
def run():
    """Predict answer spans for a SQuAD file and dump them as JSON.

    Command-line driven: reads the SQuAD data file, loads the model from
    ``--model-path``, restores every variable except those whose name starts
    with "bilm" (which are freshly initialised instead), optionally restores
    EMA weights, and writes a ``{question_id: answer_text}`` mapping to the
    output path.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("squad_path", help="path to squad dev data file")
    cli.add_argument("output_path",
                     help="path where evaluation json file will be written")
    cli.add_argument("--model-path", default="model",
                     help="path to model directory")
    cli.add_argument("--n", type=int, default=None)
    cli.add_argument("-b", "--batch_size", type=int, default=100)
    cli.add_argument("--ema", action="store_true")
    args = cli.parse_args()

    model_dir = ModelDir(args.model_path)
    nltk.data.path.append("nltk_data")

    print("Loading data")
    documents = parse_squad_data(args.squad_path, "", NltkAndPunctTokenizer(),
                                 False)
    batcher = ClusteredBatcher(args.batch_size, ContextLenKey(), False, True)
    dataset = ParagraphAndQuestionDataset(split_docs(documents), batcher)

    print("Done, init model")
    model = model_dir.get_model()

    def load_glove(_name, voc):
        # Always read the GloVe file from VEC_DIR, whatever name is requested
        return load_word_vector_file(join(VEC_DIR, "glove.840B.300d.txt"), voc)

    loader = ResourceLoader(load_glove)

    # Point the language model at the files under LM_DIR
    lm_dir = join(LM_DIR, "squad-context-concat-skip")
    lm = model.lm_model
    lm.lm_vocab_file = join(lm_dir, "squad_train_dev_all_unique_tokens.txt")
    lm.options_file = join(
        lm_dir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm.weight_file = join(
        lm_dir,
        "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm.embed_weights_file = None

    model.set_inputs([dataset], loader)

    print("Done, building graph")
    sess = tf.Session()
    with sess.as_default():
        pred = model.get_prediction()
    best_span = pred.get_best_span(17)[0]

    # Split the graph's variables into "restore from checkpoint" and
    # "initialise from scratch" (the bilm ones).
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    dont_restore_names = set()
    for var in all_vars:
        if var.name.startswith("bilm"):
            dont_restore_names.add(var.name)
    print(sorted(dont_restore_names))
    to_restore = []
    to_initialize = []
    for var in all_vars:
        if var.name in dont_restore_names:
            to_initialize.append(var)
        else:
            to_restore.append(var)

    print("Done, loading weights")
    checkpoint = model_dir.get_best_weights()
    if checkpoint is not None:
        print("Loading best weights")
    else:
        print("Loading most recent checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()

    tf.train.Saver(to_restore).restore(sess, checkpoint)

    if args.ema:
        # Second restore: load the averaged value into each trainable variable
        ema = tf.train.ExponentialMovingAverage(0)
        shadow_to_var = {}
        for var in tf.trainable_variables():
            shadow_to_var[ema.average_name(var)] = var
        tf.train.Saver(shadow_to_var).restore(sess, checkpoint)

    sess.run(tf.variables_initializer(to_initialize))

    print("Done, starting evaluation")
    predictions = {}
    for batch_no, batch in enumerate(dataset.get_epoch(), start=1):
        # Stop once args.n batches have been processed
        if args.n is not None and batch_no == args.n + 1:
            break
        print("On batch: %d" % batch_no)
        feed = model.encode(batch, False)
        spans = sess.run(best_span, feed_dict=feed)
        for span, point in zip(spans, batch):
            start, end = span
            predictions[point.question_id] = point.get_original_text(start, end)

    sess.close()

    print("Done, saving")
    with open(args.output_path, "w") as fh:
        json.dump(predictions, fh)

    print("Mission accomplished!")
def get_resource_loader(self):
    """Return a new `ResourceLoader` constructed with no arguments."""
    loader = ResourceLoader()
    return loader
def run():
    """Predict answer spans for a SQuAD-format file, optionally plotting logits.

    Command-line driven. Loads the model from ``--model_dir``, points its
    language model at the files under ``--lm_dir``, restores all variables
    except those whose name starts with "bilm" (those are initialised
    instead), optionally restores EMA weights, and writes a
    ``{question_id: answer_text}`` JSON mapping to ``output_data``.

    When ``--plot_dir`` is given, the start/end logits are also fetched and
    plotted with matplotlib. The value is only used as an on/off switch in
    this function; nothing is written to that directory here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input_data")
    parser.add_argument("output_data")
    parser.add_argument("--plot_dir", type=str, default=None)
    parser.add_argument("--model_dir", type=str, default="/tmp/model/document-qa")
    parser.add_argument("--lm_dir", type=str, default="/home/castle/data/lm/squad-context-concat-skip")
    parser.add_argument("--glove_dir", type=str, default="/home/castle/data/glove")
    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=30)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    input_data = args.input_data
    output_path = args.output_data
    model_dir = ModelDir(args.model_dir)
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(input_data, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = model_dir.get_model()
    # small hack, just load the vector file at its expected location rather
    # than using the config location
    loader = ResourceLoader(lambda a, b: load_word_vector_file(join(args.glove_dir, "glove.840B.300d.txt"), b))
    lm_model = model.lm_model
    basedir = args.lm_dir
    plotdir = args.plot_dir
    # Override the language-model file locations with the ones under --lm_dir
    lm_model.lm_vocab_file = join(basedir, "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(basedir, "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None

    model.set_inputs([dataset], loader)

    print("Done, building graph")
    sess = tf.Session()
    with sess.as_default():
        pred = model.get_prediction()
    best_span = pred.get_best_span(17)[0]
    if plotdir != None:
        # The raw logits are only needed for plotting
        start_logits_op, end_logits_op = pred.get_logits()

    # Everything named "bilm..." is initialised rather than restored
    all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    dont_restore_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    print(sorted(dont_restore_names))
    vars = [x for x in all_vars if x.name not in dont_restore_names]

    print("Done, loading weights")
    # Prefer the best-weights checkpoint, fall back to the latest one
    checkpoint = model_dir.get_best_weights()
    if checkpoint is None:
        print("Loading most recent checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
    else:
        print("Loading best weights")

    saver = tf.train.Saver(vars)
    saver.restore(sess, checkpoint)

    if args.ema:
        # Restore again, this time mapping each trainable variable's EMA
        # name in the checkpoint onto the variable itself
        ema = tf.train.ExponentialMovingAverage(0)
        saver = tf.train.Saver({ema.average_name(x): x for x in tf.trainable_variables()})
        saver.restore(sess, checkpoint)

    sess.run(tf.variables_initializer([x for x in all_vars if x.name in dont_restore_names]))

    print("Done, starting evaluation")
    out = {}
    for i, batch in enumerate(dataset.get_epoch()):
        # --n limits the number of batches processed
        if args.n is not None and i == args.n:
            break
        print("On batch size [%d], now in %d th batch" % (args.batch_size, i +1))
        enc = model.encode(batch, False)
        if plotdir != None:
            spans, start_logits, end_logits = sess.run([best_span, start_logits_op, end_logits_op], feed_dict=enc)
            for bi, point in enumerate(batch):
                q = ' '.join(point.question)
                c = point.paragraph.get_context()
                gt = ' | '.join(point.answer.answer_text)
                s, e = spans[bi]
                # NOTE: this rebinds `pred` (the prediction object above) to
                # the predicted answer text; `pred` is not used again below.
                pred = point.get_original_text(s, e)
                start_dist = start_logits[bi]
                end_dist = end_logits[bi]
                c_interval = np.arange(0.0, start_dist.shape[0], 1)
                c_label = c
                plt.figure(1)
                # Top panel: start logits, titled with question and gold answers
                plt.subplot(211)
                plt.plot(c_interval, start_dist, color='r')
                plt.title("Q : " + q + " // A : " + gt, fontsize=9)
                plt.text(0, 0, r'Predict : %s [%d:%d]' % (pred, s, e), color='b')
                axes = plt.gca()
                axes.set_ylim([-20, 20])
                # Bottle panel: end logits, x ticks labelled with `c_label`
                plt.subplot(212)
                plt.plot(c_interval, end_dist, color='g')
                plt.xticks(c_interval, c_label, rotation=90, fontsize=5)
                axes = plt.gca()
                axes.set_ylim([-20, 20])
                plt.show()
                # NOTE(review): the original nesting of this `break` (and of the
                # `else` below) could not be recovered with certainty from the
                # collapsed source; as laid out here only the first point of
                # each batch is plotted -- confirm against the upstream script.
                break
        else:
            spans = sess.run(best_span, feed_dict=enc)

        # Record the predicted answer text for every question in the batch
        for (s, e), point in zip(spans, batch):
            out[point.question_id] = point.get_original_text(s, e)

    sess.close()

    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)

    print("Mission accomplished!")