def read_input_data(model):
    data = []
    vocab = set()
    tokenizer = NltkAndPunctTokenizer()
    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
    with open(OPTS.input_file) as f:
        for i, line in enumerate(f):
            try:
                document_raw, question_raw = line.strip().split('\t')
            except ValueError as e:
                print(line.strip())
                print('Error at line %d' % i)
                raise e
            document = re.split(r"\s*\n\s*", document_raw)
            question = tokenizer.tokenize_paragraph_flat(question_raw)
            doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
            split_doc = splitter.split(doc_toks)
            context = selector.prune(question, split_doc)
            if model.preprocessor is not None:
                context = [model.preprocessor.encode_text(question, x)
                           for x in context]
            else:
                context = [flatten_iterable(x.text) for x in context]
            vocab.update(question)
            for txt in context:
                vocab.update(txt)
            ex = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                  for i, x in enumerate(context)]
            data.append((document_raw, question_raw, context, ex))
    return data, vocab
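# Illustrative only (not from the original source): read_input_data above expects
# OPTS.input_file to hold one example per line, with the raw document and the question
# separated by a tab (see the split('\t') call). A hypothetical two-line input:
#
#   The quick brown fox jumps over the lazy dog.\tWhat does the fox jump over?
#   Paris is the capital of France.\tWhat is the capital of France?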
class RandomMachineReaderModel(CapeMachineReaderModelInterface):
    def __init__(self, _):
        self.tokenizer = NltkAndPunctTokenizer()

    def tokenize(self, text):
        tokens = self.tokenizer.tokenize_paragraph_flat(text)
        spans = self.tokenizer.convert_to_spans(text, [tokens])[0]
        return tokens, spans

    def get_document_embedding(self, text):
        # Seed on the document text so the same document always gets the same "embedding"
        np.random.seed(int(hashlib.sha1(text.encode()).hexdigest(), 16) % 10**8)
        document_tokens, _ = self.tokenize(text)
        return np.random.random((len(document_tokens), 240))

    def get_logits(self, question, document_embedding):
        question_tokens, _ = self.tokenize(question)
        n_words = document_embedding.shape[0]
        # Seed on both the question and the document embedding so logits are deterministic
        qseed = int(hashlib.sha1(question.encode()).hexdigest(), 16) % 10**8
        dseed = int(np.sum(document_embedding) * 10**6) % 10**8
        np.random.seed(dseed + qseed)
        start_logits = np.random.random(n_words)
        off = np.random.randint(1, 5)
        end_logits = np.concatenate(
            [np.zeros(off) + np.min(start_logits), start_logits[off:]])
        return start_logits[:n_words], end_logits[:n_words]
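# A minimal usage sketch (illustrative, not part of the original source). Because the
# random seeds are derived from the input text, repeated calls on the same document and
# question return the same values:
#
#   reader = RandomMachineReaderModel(None)
#   doc_emb = reader.get_document_embedding("Paris is the capital of France.")
#   start_logits, end_logits = reader.get_logits("What is the capital of France?", doc_emb)
#   # doc_emb.shape == (number_of_document_tokens, 240)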
def get_doc_rd_doc(docs: List[Document]) -> Dict[str, List[ParagraphWithInverse]]:
    tokenizer = NltkAndPunctTokenizer()
    conn = sqlite3.connect(DOCUMENT_READER_DB)
    c = conn.cursor()
    titles = [clean_title(doc.title) for doc in docs]
    for i, t in enumerate(titles):
        # Had to manually resolve this (due to changes in Wikipedia?)
        if t == "Sky (United Kingdom)":
            titles[i] = "Sky UK"
    title_to_doc_id = {t: doc.title for t, doc in zip(titles, docs)}

    c.execute("CREATE TEMPORARY TABLE squad_docs(id)")
    c.executemany("INSERT INTO squad_docs VALUES (?)", [(x,) for x in titles])
    c.execute("SELECT id, text FROM documents WHERE id IN squad_docs")

    documents = {}
    out = c.fetchall()
    conn.close()

    for title, text in out:
        paragraphs = []
        for para in text.split("\n"):
            para = para.strip()
            if len(para) > 0:
                paragraphs.append(tokenizer.tokenize_with_inverse(para))
        documents[title_to_doc_id[title]] = paragraphs
    return documents
def __init__(self, machine_reader_config):
    self.tokenizer = NltkAndPunctTokenizer()
    self.config = machine_reader_config
    self.model = self._load_model()
    self.sess = tf.Session()
    self.start_logits, self.end_logits, self.context_rep = self._build_model()
    self._initialize()
def test_split_inv(self):
    paras = [
        "One fish two fish. Red fish blue fish",
        "Just one sentence",
        "How will an overhead score? The satisfactory juice returns against an inviting protein. "
        "How can a rat expand? The subway fishes throughout a struggle. The guaranteed herd pictures an "
        "episode into the accustomed damned. The garbage reigns beside the component!",
    ]
    tok = NltkAndPunctTokenizer()
    tokenized = [tok.tokenize_with_inverse(x) for x in paras]
    inv_split = RandomSplitter().split_inverse(tokenized)
    for para in inv_split:
        self.assertTrue(flatten_iterable(para.text) ==
                        [para.original_text[s:e] for s, e in para.spans])
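# Illustrative note (not from the original source): the assertion above checks the
# inverse-mapping invariant -- slicing the original text with each (start, end) span
# must reproduce the flattened token sequence exactly. A small sketch of the same idea,
# assuming the paragraph returned by tokenize_with_inverse exposes .text and .spans:
#
#   text = "One fish two fish."
#   para = NltkAndPunctTokenizer().tokenize_with_inverse(text)
#   tokens = flatten_iterable(para.text)
#   assert tokens == [text[s:e] for s, e in para.spans]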
def build_web_corpus(n_processes, sets_to_build, source_dir, target_dir):
    sets_to_build_dict = {}
    if 'verified' in sets_to_build:
        sets_to_build_dict['verified'] = join(source_dir, "verified-web-dev.json")
    if 'dev' in sets_to_build:
        sets_to_build_dict['dev'] = join(source_dir, "web-dev.json")
    if 'train' in sets_to_build:
        sets_to_build_dict['train'] = join(source_dir, "web-train.json")
    if 'test' in sets_to_build:
        sets_to_build_dict['test'] = join(source_dir, "web-test-without-answers.json")
    build_dataset("web", NltkAndPunctTokenizer(), sets_to_build_dict,
                  FastNormalizedAnswerDetector(), n_processes, out_dir=target_dir)
def main(): parser = argparse.ArgumentParser("Preprocess SQuAD data") #basedir = join(expanduser("~"), "data", "squad") basedir = join(expanduser("~"), "azayats", "data", "squad") parser.add_argument("--train_file", default=join(basedir, "train-v1.1.json")) parser.add_argument("--dev_file", default=join(basedir, "dev-v1.1.json")) if not exists(config.CORPUS_DIR): mkdir(config.CORPUS_DIR) target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME) if exists(target_dir) and len(listdir(target_dir)) > 0: raise ValueError("Files already exist in " + target_dir) args = parser.parse_args() tokenzier = NltkAndPunctTokenizer() print("Parsing train...") train = list(parse_squad_data(args.train_file, "train", tokenzier)) print("Parsing dev...") dev = list(parse_squad_data(args.dev_file, "dev", tokenzier)) print("Saving...") SquadCorpus.make_corpus(train, dev) print("Done")
def main():
    # Namespace(directory='C:/Users/boidiyv/document-qa-master', dump=False, fake=False, verbose=False)
    parser = argparse.ArgumentParser("Preprocess SQuAD data")
    parser.add_argument('--document-qa/docqa/squad', type=Path)
    parser.add_argument("--train_file", default=config.SQUAD_TRAIN)
    parser.add_argument("--dev_file", default=config.SQUAD_DEV)
    # parser.add_argument("--document-qa-master", type=lambda p: Path(p).absolute(),
    #                     default=Path(__file__).absolute().parent / "document-qa-master",
    #                     help="Path to the data directory")

    if not exists(config.CORPUS_DIR):
        mkdir(config.CORPUS_DIR)

    target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME)
    if exists(target_dir) and len(listdir(target_dir)) > 0:
        raise ValueError("Files already exist in " + target_dir)

    args = parser.parse_args('')
    tokenizer = NltkAndPunctTokenizer()

    print("Parsing train...")
    train = list(parse_squad_data(args.train_file, "train", tokenizer))
    print(train)

    print("Parsing dev...")
    dev = list(parse_squad_data(args.dev_file, "dev", tokenizer))

    print("Saving...")
    SquadCorpus.make_corpus(train, dev)
    print("Done")
def main(): parser = argparse.ArgumentParser("Preprocess SQuAD data") parser.add_argument("--train_file", default=config.SQUAD_TRAIN) parser.add_argument("--dev_file", default=config.SQUAD_DEV) parser.add_argument("--weighted-questions", action='store_true') if not exists(config.CORPUS_DIR): mkdir(config.CORPUS_DIR) target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME) if exists(target_dir) and len(listdir(target_dir)) > 0: raise ValueError("Files already exist in " + target_dir) args = parser.parse_known_args()[0] tokenizer = NltkAndPunctTokenizer() print("Parsing train...") train = list( parse_squad_data(args.train_file, "train", tokenizer, weighted_samples=args.weighted_questions)) print("Parsing dev...") dev = list(parse_squad_data(args.dev_file, "dev", tokenizer)) print("Saving...") SquadCorpus.make_corpus(train, dev) print("Done")
def main(): parse = argparse.ArgumentParser("Pre-tokenize the XQA evidence corpus") parse.add_argument("--corpus", choices=[ "en", "fr", "de", "ru", "pt", "zh", "pl", "uk", "ta" "en_trans_de", "en_trans_zh", "fr_trans_en", "de_trans_en", "ru_trans_en", "pt_trans_en", "zh_trans_en", "pl_trans_en", "uk_trans_en", "ta_trans_en" ], required=True) # This is slow, using more processes is recommended parse.add_argument("-n", "--n_processes", type=int, default=1, help="Number of processes to use") parse.add_argument("--wiki_only", action="store_true") args = parse.parse_args() output_dir = join(config.CORPUS_DIR, args.corpus, "evidence") source = join(config.CORPUS_NAME_TO_PATH[args.corpus], "evidence") if args.corpus == "en_trans_zh" or args.corpus == "zh": tokenizer = ChineseTokenizer() else: tokenizer = NltkAndPunctTokenizer() build_tokenized_corpus(source, tokenizer, output_dir, n_processes=args.n_processes, wiki_only=args.wiki_only)
def build_web_corpus(n_processes):
    build_dataset("web", NltkAndPunctTokenizer(),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "web-dev.json"),
                      train=join(TRIVIA_QA, "qa", "web-train.json"),
                      test=join(TRIVIA_QA, "qa", "web-test-without-answers.json")),
                  FastNormalizedAnswerDetector(), n_processes)
def build_wiki_corpus(n_processes):
    build_dataset("wiki", NltkAndPunctTokenizer(),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
                      train=join(TRIVIA_QA, "qa", "wikipedia-train.json"),
                  ),
                  FastNormalizedAnswerDetector(), n_processes)
def find_answer(documents, raw_question):
    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]
    global best_spans, conf

    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(raw_question)
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]
    if len(documents) == 1:
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))
    context = [flatten_iterable(x.text) for x in context]

    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]
    encoded = model.encode(data, is_train=False)
    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)
    best_para = np.argmax(confid)
    ans = " ".join(context[best_para][spans[best_para][0]:spans[best_para][1] + 1])
    confidence = confid[best_para]
    return ans, confidence
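# A minimal usage sketch (illustrative, not part of the original source). find_answer
# relies on module-level globals -- `model`, `sess`, `best_spans`, and `conf` -- being
# set up beforehand by the surrounding script. Assuming that has been done:
#
#   answer, confidence = find_answer(
#       ["Paris is the capital of France.\n\nIt is known for the Eiffel Tower."],
#       "What is the capital of France?")
#   print(answer, confidence)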
def main():
    from build_span_corpus import TriviaQaWebDataset
    from docqa.data_processing.text_utils import NltkAndPunctTokenizer

    dataset = TriviaQaWebDataset()
    qs = dataset.get_train()
    qs = np.random.RandomState(0).choice(qs, 1000, replace=False)
    evaluate_question_detector(qs, dataset.evidence,
                               NltkAndPunctTokenizer().tokenize_paragraph_flat,
                               FastNormalizedAnswerDetector())
def get_random_answer(self):
    time.sleep(1)
    para = NltkAndPunctTokenizer().tokenize_with_inverse(ipso)
    para1 = WebParagraph(para.text, ipso, para.spans,
                         0, 0, 0, "source1", "fake_url1")
    para2 = WebParagraph(para.text, ipso, np.array(para.spans),
                         0, 0, 0, "source2", "fake_url2")
    span_scores = np.random.normal(size=(2, len(para.spans), len(para.spans))) * 5
    return span_scores, [para1, para2]
def build_sample_corpus(n_processes):
    build_dataset("web-sample", NltkAndPunctTokenizer(),
                  dict(
                      dev=join(TRIVIA_QA, "qa", "web-dev.json"),
                      train=join(TRIVIA_QA, "qa", "web-train.json"),
                  ),
                  FastNormalizedAnswerDetector(), n_processes, sample=1000)
def read_input_data(model):
    data = []
    vocab = set()
    tokenizer = NltkAndPunctTokenizer()
    with open(OPTS.input_file) as f:
        json_data = json.load(f)
    for doc in json_data['data']:
        for paragraph in doc['paragraphs']:
            context = tokenizer.tokenize_with_inverse(paragraph['context'])
            if model.preprocessor is not None:
                context = model.preprocessor.encode_text(question, context)
            context = context.get_context()
            vocab.update(context)
            for qa in paragraph['qas']:
                question = tokenizer.tokenize_sentence(qa['question'])
                vocab.update(question)
                ex = [ParagraphAndQuestion(context, question, None, qa['id'])]
                data.append((paragraph['context'], context, ex))
    return data, sorted(list(vocab))
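# Illustrative only (not from the original source): this read_input_data walks the
# standard SQuAD v1.1 JSON layout, roughly:
#
#   {"data": [
#       {"title": "...",
#        "paragraphs": [
#            {"context": "Paris is the capital of France.",
#             "qas": [{"id": "q1", "question": "What is the capital of France?"}]}
#        ]}
#   ]}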
def __init__(self, corpus_name):
    self.corpus_name = corpus_name
    self.dir = join(CORPUS_DIR, self.corpus_name)
    self.tokenizer = NltkAndPunctTokenizer()
    self.detector = FastNormalizedAnswerDetector()
    self._train, self._raw_train = list(), None
    self._dev, self._raw_dev = list(), None
    self.missed_answer = 0
def main(): parse = argparse.ArgumentParser("Pre-tokenize the TriviaQA evidence corpus") parse.add_argument("-o", "--output_dir", type=str, default=join(config.CORPUS_DIR, "triviaqa", "evidence")) parse.add_argument("-s", "--source", type=str, default=join(config.TRIVIA_QA, "evidence")) # This is slow, using more processes is recommended parse.add_argument("-n", "--n_processes", type=int, default=8, help="Number of processes to use") parse.add_argument("--wiki_only", action="store_true") args = parse.parse_args() build_tokenized_corpus(args.source, NltkAndPunctTokenizer(), args.output_dir, n_processes=args.n_processes, wiki_only=args.wiki_only)
def build_unfiltered_corpus(n_processes):
    build_dataset("web-open", NltkAndPunctTokenizer(),
                  dict(
                      dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
                      train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
                      test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json")),
                  answer_detector=FastNormalizedAnswerDetector(),
                  n_process=n_processes)
def triviaqa_prepro(wiki_only, n_processes):
    print('Tokenizing {} corpus:'.format('wiki' if wiki_only else 'wiki and web'))
    build_tokenized_corpus(
        TRIVIA_QA_EVIDENCE,
        NltkAndPunctTokenizer(),
        PREPRO_EVIDENCE_DIR,
        n_processes=n_processes,
        wiki_only=wiki_only,
    )
    print('Preparing wiki corpus:')
    build_wiki_corpus(n_processes)
    if not wiki_only:
        print('Preparing web corpus:')
        build_web_corpus(n_processes)
def __init__(self, cache_dir=None, follow_redirects: bool = True,
             keep_inverse_mapping: bool = False, extract_lists: bool = False,
             tokenizer=NltkAndPunctTokenizer()):
    self.tokenizer = tokenizer
    self.extract_lists = extract_lists
    self.follow_redirects = follow_redirects
    self.cache_dir = cache_dir
    self.keep_inverse_mapping = keep_inverse_mapping
    if cache_dir is not None and not exists(self.cache_dir):
        mkdir(self.cache_dir)
def build_unfiltered_corpus(n_processes, sets_to_build, source_dir, target_dir):
    sets_to_build_dict = {}
    if 'dev' in sets_to_build:
        sets_to_build_dict['dev'] = join(source_dir, "unfiltered-web-dev.json")
    if 'train' in sets_to_build:
        sets_to_build_dict['train'] = join(source_dir, "unfiltered-web-train.json")
    if 'test' in sets_to_build:
        sets_to_build_dict['test'] = join(source_dir, "unfiltered-web-test-without-answers.json")
    build_dataset("web-open", NltkAndPunctTokenizer(), sets_to_build_dict,
                  answer_detector=FastNormalizedAnswerDetector(),
                  n_process=n_processes, out_dir=target_dir)
def __init__(self, cache_dir=None, follow_redirects: bool = True,
             keep_inverse_mapping: bool = False, extract_lists: bool = False,
             tokenizer=NltkAndPunctTokenizer()):
    """
    :param cache_dir: Optional, directory to cache the documents we download
    :param follow_redirects: Follow wiki re-directs
    :param keep_inverse_mapping: Keep track of the inverse mapping of tokens so
        the text can be "untokenized" accurately
    :param extract_lists: Include lists in the extracted articles
    :param tokenizer: Tokenizer to use to tokenize the documents
    """
    self.tokenizer = tokenizer
    self.extract_lists = extract_lists
    self.follow_redirects = follow_redirects
    self.cache_dir = cache_dir
    self.keep_inverse_mapping = keep_inverse_mapping
    if cache_dir is not None and not exists(self.cache_dir):
        mkdir(self.cache_dir)
def build_xqa_corpus(corpus_name, n_processes):
    if corpus_name.startswith("en"):
        files_dict = dict(
            train=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "train.json"),
            dev=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "dev.json"),
            test=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "test.json"))
    else:
        files_dict = dict(
            dev=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "dev.json"),
            test=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "test.json"))
    if corpus_name == "en_trans_zh" or corpus_name == "zh":
        tokenizer = ChineseTokenizer()
    else:
        tokenizer = NltkAndPunctTokenizer()
    build_dataset(corpus_name, tokenizer, files_dict,
                  FastNormalizedAnswerDetector(), n_processes)
def prepro_squad_fold(name, fold, squad_file_paths):
    tokenizer = NltkAndPunctTokenizer()
    dataset_evidence_dir = join(PREPRO_EVIDENCE_DIR, name)
    if not exists(dataset_evidence_dir):
        makedirs(dataset_evidence_dir)

    voc = set()
    squad_docs = [
        d for squad_file_path in squad_file_paths
        for d in parse_squad_data(squad_file_path, fold, tokenizer)
    ]
    questions = []
    file_map = {}
    for document in tqdm(squad_docs, desc=fold, ncols=80):
        for paragraph in document.paragraphs:
            for question in paragraph.questions:
                doc_id = question.question_id
                doc_savename = get_doc_savename(dataset_evidence_dir, doc_id)
                trivia_q = squad_q2triviaqa_q(question)
                with open(doc_savename + '.txt', 'w', encoding='utf8') as f:
                    f.write(dump_paragraph(paragraph))
                words = {w for sent in paragraph.text for w in sent}
                voc.update(words)
                file_map[doc_id] = doc_savename
                questions.append(trivia_q)

    questions_savename = get_questions_savename(name, fold)
    with open(questions_savename, "wb") as f:
        pickle.dump(questions, f)
    return voc, file_map
def getAnswer(self):
    # parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    # parser.add_argument("model", help="Model directory")
    # parser.add_argument("question", help="Question to answer")
    # parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    # args = parser.parse_args()

    # Load the model
    model_dir = ModelDir(MODEL_DIR)
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Read the document text for this object from the database
    conn = pyodbc.connect(DB_CONN)
    cursor = conn.cursor()
    query = "select cast(filetext as varchar(max)) as filetext, name, type " \
            "from dbo.UserworkspaceData where objectmasterid= " + \
            str(self.ObjectMasterId) + \
            " order by id asc"

    documents = []
    document = ""
    name = ""
    filetype = 0
    for doc in cursor.execute(query):
        document = document + doc[0]
        name = doc[1]
        filetype = doc[2]
    documents.append(document)

    # Split each document into lists of paragraphs
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input; the model expects data tokenized with `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(self.Question)  # List of words

    # Now list of document -> paragraph -> sentence -> word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Group the documents into paragraphs; this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs()  # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step.
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use the flattened text
        context = [flatten_iterable(x.text) for x in context]

    # Tell the model the batch size (can be None) and vocab to expect; this will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(self.nlp, ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph; `best_spans` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer.
    # The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Load the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run.
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    # The model is run in two steps: first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model and get the predictions
    encoded = model.encode(data, is_train=True)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    # We get output for each paragraph; select the most-confident one
    best_para = np.argmax(conf)

    Answer = " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1])
    print("Confidence: " + str(conf[best_para]))
    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + Answer)
    print(" ".join(context[best_para]))

    # Wrap the answer span in <em> tags within the selected paragraph
    context[best_para][best_spans[best_para][0]] = r"<em>" + context[best_para][best_spans[best_para][0]]
    context[best_para][best_spans[best_para][1]] = context[best_para][best_spans[best_para][1]] + r"</em>"

    # Trim the paragraph to roughly two sentences before and after the answer span
    start = 0
    end = len(context[best_para])
    positions = [x for x, n in enumerate(context[best_para][0:best_spans[best_para][0]]) if n == "."]
    if len(positions) >= 2:
        start = positions[len(positions) - 2] + 1
    positions = [x for x, n in enumerate(context[best_para][best_spans[best_para][1] + 1:]) if n == "."]
    if len(positions) > 1:
        end = best_spans[best_para][1] + 1 + positions[1]

    d = dict()
    if conf[best_para] > 10:
        d["answer"] = Answer
    else:
        d["answer"] = ""
    d["name"] = name
    d["filetype"] = filetype
    d["paragraph"] = re.sub(r' (?=\W)', '', " ".join(context[best_para][start:end]))
    d["ObjectMasterId"] = self.ObjectMasterId
    return d

# if __name__ == "__main__":
#     main()
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument("squad_path", help="path to squad dev data file")
    parser.add_argument("output_path", help="path where evaluation json file will be written")
    parser.add_argument("--model-path", default="model", help="path to model directory")
    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=100)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    squad_path = args.squad_path
    output_path = args.output_path
    model_dir = ModelDir(args.model_path)
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(squad_path, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(
        pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = model_dir.get_model()
    loader = ResourceLoader(lambda a, b: load_word_vector_file(
        join(VEC_DIR, "glove.840B.300d.txt"), b))
    lm_model = model.lm_model
    basedir = join(LM_DIR, "squad-context-concat-skip")
    lm_model.lm_vocab_file = join(basedir, "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(basedir, "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None
    model.set_inputs([dataset], loader)

    print("Done, building graph")
    sess = tf.Session()
    with sess.as_default():
        pred = model.get_prediction()
        best_span = pred.get_best_span(17)[0]

    all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    dont_restore_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    print(sorted(dont_restore_names))
    vars = [x for x in all_vars if x.name not in dont_restore_names]

    print("Done, loading weights")
    checkpoint = model_dir.get_best_weights()
    if checkpoint is None:
        print("Loading most recent checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
    else:
        print("Loading best weights")
    saver = tf.train.Saver(vars)
    saver.restore(sess, checkpoint)

    if args.ema:
        ema = tf.train.ExponentialMovingAverage(0)
        saver = tf.train.Saver({ema.average_name(x): x for x in tf.trainable_variables()})
        saver.restore(sess, checkpoint)

    sess.run(tf.variables_initializer([x for x in all_vars if x.name in dont_restore_names]))

    print("Done, starting evaluation")
    out = {}
    for i, batch in enumerate(dataset.get_epoch()):
        if args.n is not None and i == args.n:
            break
        print("On batch: %d" % (i + 1))
        enc = model.encode(batch, False)
        spans = sess.run(best_span, feed_dict=enc)
        for (s, e), point in zip(spans, batch):
            out[point.question_id] = point.get_original_text(s, e)

    sess.close()
    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)
    print("Mission accomplished!")
class CapeDocQAMachineReaderModel(CapeMachineReaderModelInterface):
    def __init__(self, machine_reader_config):
        self.tokenizer = NltkAndPunctTokenizer()
        self.config = machine_reader_config
        self.model = self._load_model()
        self.sess = tf.Session()
        self.start_logits, self.end_logits, self.context_rep = self._build_model()
        self._initialize()

    def _load_model(self):
        with open(self.config.model_pickle_file, 'rb') as f:
            model = pickle.load(f)
        model.lm_model.weight_file = self.config.lm_weights_file
        model.lm_model.lm_vocab_file = self.config.vocab_file
        model.lm_model.embed_weights_file = self.config.lm_token_weights_file
        model.lm_model.options_file = self.config.lm_options_file
        return model

    def _build_model(self):
        vocab_to_init_with = {
            line.strip()
            for line in open(self.config.vocab_file, encoding="utf-8")
            if line.strip() not in vocab_to_ignore
        }
        self.model.word_embed.vec_name = self.config.word_vector_file
        with self.sess.as_default():
            self.model.set_input_spec(
                ParagraphAndQuestionSpec(None, None, None, 14),
                vocab_to_init_with,
                word_vec_loader=ResourceLoader(
                    load_vec_fn=lambda x, y: load_word_vectors(x, y, is_path=True)))
            pred = self.model.get_production_predictions_for(
                {x: x for x in self.model.get_placeholders()})
        return pred.start_logits, pred.end_logits, self.model.context_rep

    def _initialize(self):
        all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]
        saver = tf.train.Saver(vars_to_restore)
        saver.restore(self.sess, self.config.checkpoint_file)
        self.sess.run(tf.variables_initializer(
            [x for x in all_vars if x.name in lm_var_names]))

    def tokenize(self, text):
        tokens = self.tokenizer.tokenize_paragraph_flat(text)
        spans = self.tokenizer.convert_to_spans(text, [tokens])[0]
        return tokens, spans

    def get_document_embedding(self, text):
        document_tokens, _ = self.tokenize(text)
        test_question = ParagraphAndQuestion(document_tokens, ['dummy', 'question'],
                                             None, "cape_question", 'cape_document')
        feed = self.model.encode([test_question], False, cached_doc=None)
        return self.sess.run(self.model.context_rep, feed_dict=feed)[0]

    def get_logits(self, question, document_embedding):
        question_tokens, _ = self.tokenize(question)
        n_words = document_embedding.shape[0]
        dummy_document = ['dummy'] * n_words
        test_question = ParagraphAndQuestion(dummy_document, question_tokens,
                                             None, "cape_question", 'cape_document')
        feed = self.model.encode([test_question], False,
                                 cached_doc=document_embedding[np.newaxis, :, :])
        start_logits, end_logits = self.sess.run(
            [self.start_logits, self.end_logits], feed_dict=feed)
        return start_logits[0][:n_words], end_logits[0][:n_words]
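# A minimal usage sketch (illustrative, not part of the original source). The config
# object and the document/question strings are assumptions; the flow simply exercises
# the methods defined above, with a naive greedy span decode added for illustration:
#
#   reader = CapeDocQAMachineReaderModel(machine_reader_config)
#   doc_text = "Paris is the capital of France."
#   doc_emb = reader.get_document_embedding(doc_text)
#   start_logits, end_logits = reader.get_logits("What is the capital of France?", doc_emb)
#   start = int(np.argmax(start_logits))
#   end = start + int(np.argmax(end_logits[start:]))  # naive span decoding, not from the source
#   doc_tokens, _ = reader.tokenize(doc_text)
#   print(" ".join(doc_tokens[start:end + 1]))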
def main(): parser = argparse.ArgumentParser( description="Run an ELMo model on user input") parser.add_argument("model", help="Model directory") parser.add_argument("question", help="Question to answer") parser.add_argument("context", help="Context to answer the question with") args = parser.parse_args() # Tokenize the input, the models expected data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat(args.question) context = tokenizer.tokenize_paragraph_flat(args.context) print("Loading model") model_dir = ModelDir(args.model) model = model_dir.get_model() if not isinstance(model, ElmoQaModel): raise ValueError( "This script is build to work for ElmoQaModel models only") # Important! This tells the language model not to use the pre-computed word vectors, # which are only applicable for the SQuAD dev/train sets. # Instead the language model will use its character-level CNN to compute # the word vectors dynamically. model.lm_model.embed_weights_file = None # Tell the model the batch size and vocab to expect, This will load the needed # word vectors and fix the batch size when building the graph / encoding the input print("Setting up model") voc = set(question) voc.update(context) model.set_input_spec(ParagraphAndQuestionSpec(batch_size=1), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch print("Build tf graph") sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) with sess.as_default(): # 17 means to limit the span to size 17 or less best_spans, conf = model.get_prediction().get_best_span(17) # Now restore the weights, this is a bit fiddly since we need to avoid restoring the # bilm weights, and instead load them from the pre-computed data all_vars = tf.global_variables() + tf.get_collection( tf.GraphKeys.SAVEABLE_OBJECTS) lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")} vars = [x for x in all_vars if x.name not in lm_var_names] model_dir.restore_checkpoint(sess, vars) # Run the initializer of the lm weights, which will load them from the lm directory sess.run( tf.variables_initializer( [x for x in all_vars if x.name in lm_var_names])) # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ParagraphAndQuestion(context, question, None, "user-question1")] print("Starting run") # The model is run in two steps, first it "encodes" the paragraph/context pairs # into numpy arrays, then to use `sess` to run the actual model get the predictions encoded = model.encode( data, is_train=False) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded) # feed_dict -> predictions print("Best span: " + str(best_spans[0])) print("Answer text: " + " ".join(context[best_spans[0][0]:best_spans[0][1] + 1])) print("Confidence: " + str(conf[0]))