def read_input_data(model):
    """Read (document, question) pairs from ``OPTS.input_file`` for the model.

    Every input line must contain exactly two tab-separated fields: the raw
    document text and the raw question.  Each document is split into
    paragraphs, tokenized, truncated by ``Truncate(400)`` and reduced to the
    paragraphs chosen by a ``TopTfIdf`` selector (``n_to_select=5``).

    Args:
        model: Object exposing a ``preprocessor`` attribute.  When it is not
            ``None`` its ``encode_text(question, paragraph)`` is applied to
            every selected paragraph; otherwise the paragraph text is
            flattened with ``flatten_iterable``.

    Returns:
        A ``(data, vocab)`` tuple.  ``data`` holds one
        ``(document_raw, question_raw, context, examples)`` tuple per input
        line, where ``examples`` is a list of ``ParagraphAndQuestion``
        objects, one per selected paragraph.  ``vocab`` is the set
        accumulated from every tokenized question and every context entry.

    Raises:
        ValueError: If a line does not split into exactly two fields.
    """
    data = []
    vocab = set()
    tokenizer = NltkAndPunctTokenizer()
    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    paragraph_break = re.compile(r"\s*\n\s*")

    with open(OPTS.input_file) as f:
        for i, line in enumerate(f):
            try:
                document_raw, question_raw = line.strip().split('\t')
            except ValueError:
                # Show the offending line before propagating the error.
                print(line.strip())
                print('Error at line %d' % i)
                raise

            document = paragraph_break.split(document_raw)
            question = tokenizer.tokenize_paragraph_flat(question_raw)
            doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
            split_doc = splitter.split(doc_toks)
            context = selector.prune(question, split_doc)

            if model.preprocessor is not None:
                context = [model.preprocessor.encode_text(question, x)
                           for x in context]
            else:
                context = [flatten_iterable(x.text) for x in context]

            vocab.update(question)
            for txt in context:
                vocab.update(txt)

            # The example id is numbered by paragraph position, not by line.
            ex = [ParagraphAndQuestion(x, question, None, "user-question%d" % j)
                  for j, x in enumerate(context)]
            data.append((document_raw, question_raw, context, ex))

    return data, vocab
class RandomMachineReaderModel(CapeMachineReaderModelInterface):
    """Machine reader that produces pseudo-random, input-seeded outputs.

    Embeddings and logits are drawn from numpy's global random generator
    after seeding it from the input, so equal inputs give equal outputs.
    """

    def __init__(self, _):
        # The single constructor argument is accepted but ignored.
        self.tokenizer = NltkAndPunctTokenizer()

    @staticmethod
    def _text_seed(text):
        """Return the SHA-1 digest of ``text`` reduced modulo 10**8."""
        digest = hashlib.sha1(text.encode()).hexdigest()
        return int(digest, 16) % 10**8

    def tokenize(self, text):
        """Return ``(tokens, spans)`` for ``text`` using the shared tokenizer."""
        tokens = self.tokenizer.tokenize_paragraph_flat(text)
        spans_per_paragraph = self.tokenizer.convert_to_spans(text, [tokens])
        return tokens, spans_per_paragraph[0]

    def get_document_embedding(self, text):
        """Return a random ``(n_tokens, 240)`` array seeded from ``text``."""
        np.random.seed(self._text_seed(text))
        tokens, _ = self.tokenize(text)
        shape = (len(tokens), 240)
        return np.random.random(shape)

    def get_logits(self, question, document_embedding):
        """Return random ``(start_logits, end_logits)`` of length ``n_words``.

        The generator is seeded from both the question text and the sum of
        the document embedding.
        """
        # The question is tokenized as in the original; the tokens are unused.
        self.tokenize(question)
        n_words = document_embedding.shape[0]

        question_seed = self._text_seed(question)
        document_seed = int(np.sum(document_embedding) * 10**6) % 10**8
        np.random.seed(document_seed + question_seed)

        start_logits = np.random.random(n_words)
        shift = np.random.randint(1, 5)
        # The first `shift` end logits are pinned to the minimum start logit.
        padding = np.zeros(shift) + np.min(start_logits)
        end_logits = np.concatenate([padding, start_logits[shift:]])
        return start_logits[:n_words], end_logits[:n_words]
def find_answer(documents, raw_question):
    """Return the most confident answer span for a question over documents.

    Relies on the module-level names ``model``, ``sess``, ``best_spans`` and
    ``conf``, which are only read here; they must be set up before the call.

    Both the question and the documents are lower-cased before tokenization.

    Args:
        documents: Iterable of raw document strings.
        raw_question: The question as a raw string.

    Returns:
        ``(answer, confidence)``: the answer tokens joined by single spaces
        and the confidence value of the paragraph with the highest one.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    paragraph_break = re.compile(r"\s*\n\s*")
    tokenizer = NltkAndPunctTokenizer()

    question = tokenizer.tokenize_paragraph_flat(raw_question.lower())

    # document -> paragraph -> tokenized paragraph
    documents = [paragraph_break.split(doc.lower()) for doc in documents]
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    if len(documents) == 1:
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    context = [flatten_iterable(x.text) for x in context]
    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    encoded = model.encode(data, is_train=False)
    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)

    best_para = np.argmax(confid)
    span_start, span_end = spans[best_para][0], spans[best_para][1]
    # The predicted span is inclusive, hence the +1 on the end index.
    ans = " ".join(context[best_para][span_start:span_end + 1])
    confidence = confid[best_para]
    return ans, confidence
def getAnswer(self):
    """Answer ``self.Question`` from the text stored for ``self.ObjectMasterId``.

    Loads the model from ``MODEL_DIR``, reads every ``filetext`` row of
    ``dbo.UserworkspaceData`` for the object (ordered by ``id``) and
    concatenates them into one document, selects the most relevant
    paragraphs, runs the model and returns the best span.

    Reads ``self.ObjectMasterId``, ``self.Question`` and ``self.nlp``.

    Returns:
        dict with the keys
        ``answer`` (answer text, or ``""`` when the confidence is not above 10),
        ``name`` and ``filetype`` (taken from the last row read),
        ``paragraph`` (surrounding text with the answer wrapped in
        ``<em>...</em>``) and ``ObjectMasterId``.

    Raises:
        ValueError: If the loaded model is not a ``ParagraphQuestionModel``.
    """
    # Load the model
    model_dir = ModelDir(MODEL_DIR)
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Parameterized query: the id is bound by the driver instead of being
    # concatenated into the SQL text.
    query = ("select cast(filetext as varchar(max)) as filetext, name, type "
             "from dbo.UserworkspaceData where objectmasterid = ? "
             "order by id asc")
    chunks = []
    name = ""
    filetype = 0
    conn = pyodbc.connect(DB_CONN)
    try:
        cursor = conn.cursor()
        for row in cursor.execute(query, self.ObjectMasterId):
            chunks.append(row[0])
            name = row[1]
            filetype = row[2]
    finally:
        # Always release the database connection.
        conn.close()

    # All rows together form a single document.
    documents = ["".join(chunks)]

    # Split documents into lists of paragraphs
    # (raw string: "\s" in a normal literal is an invalid escape sequence).
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the model expects data to be tokenized using
    # `NltkAndPunctTokenizer`. Note the model expects case-sensitive input.
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(self.Question)  # List of words

    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Group the document into paragraphs; this returns `ExtractedParagraph`
    # objects that additionally remember the start/end token of the
    # paragraph within the source document.
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    # Select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step.
        # This turns the `ExtractedParagraph` objects back into token lists.
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    # Tell the model the batch size (can be None) and vocab to expect. This
    # loads the needed word vectors and fixes the batch size used when
    # building the graph / encoding the input.
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(self.nlp,
                         ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)

    # Build the tensorflow graph; `best_spans` and `conf` are tensors holding
    # the predicted span (inclusive) and confidence scores for each element
    # in the input batch, the confidence being the pre-softmax span logit.
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    try:
        # We need sess.as_default when working with the cuDNN stuff, since an
        # active session is needed to figure out the # of parameters for each
        # layer. The cpu-compatible models don't need this.
        with sess.as_default():
            # 8 means to limit the span to size 8 or less
            best_spans, conf = model.get_prediction().get_best_span(8)

        # Loads the saved weights
        model_dir.restore_checkpoint(sess)

        # The model takes input in the form of `ContextAndQuestion` objects
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]

        # The model is run in two steps: first it "encodes" a batch of
        # paragraph/context pairs into numpy arrays, then `sess` runs the
        # model to get the predictions.
        # is_train=False: this is inference, matching the other drivers.
        encoded = model.encode(data, is_train=False)
        best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)
    finally:
        # Free the session's resources; the results are plain numpy arrays.
        sess.close()

    # We get output for each paragraph; use the most confident one.
    best_para = np.argmax(conf)
    span_start = best_spans[best_para][0]
    span_end = best_spans[best_para][1]
    tokens = context[best_para]

    Answer = " ".join(tokens[span_start:span_end + 1])
    print("Confidence: " + str(conf[best_para]))
    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + Answer)
    print(" ".join(tokens))

    # Highlight the answer in place (after `Answer` was extracted).
    tokens[span_start] = r"<em>" + tokens[span_start]
    tokens[span_end] = tokens[span_end] + r"</em>"

    # Trim the paragraph around the answer using "." tokens as boundaries:
    # start just after the second-to-last "." before the span, stop at the
    # second "." after it.
    start = 0
    end = len(tokens)
    periods_before = [
        i for i, tok in enumerate(tokens[0:span_start]) if tok == "."
    ]
    if len(periods_before) >= 2:
        start = periods_before[-2] + 1
    periods_after = [
        i for i, tok in enumerate(tokens[span_end + 1:]) if tok == "."
    ]
    if len(periods_after) > 1:
        end = span_end + 1 + periods_after[1]

    d = dict()
    # Only report an answer when the confidence exceeds 10.
    if conf[best_para] > 10:
        d["answer"] = Answer
    else:
        d["answer"] = ""
    d["name"] = name
    d["filetype"] = filetype
    # Remove the space the join inserted before punctuation tokens.
    d["paragraph"] = re.sub(r' (?=\W)', '', " ".join(tokens[start:end]))
    d["ObjectMasterId"] = self.ObjectMasterId
    return d


# if __name__ == "__main__":
#     main()
def main():
    """Command-line entry point: answer one question over one context string.

    Takes three positional arguments (model directory, question, context),
    runs an ``ElmoQaModel`` on the single question/context pair and prints
    the best span, its text and its confidence.

    Raises:
        ValueError: If the loaded model is not an ``ElmoQaModel``.
    """
    arg_parser = argparse.ArgumentParser(
        description="Run an ELMo model on user input")
    for arg_name, arg_help in (
            ("model", "Model directory"),
            ("question", "Question to answer"),
            ("context", "Context to answer the question with")):
        arg_parser.add_argument(arg_name, help=arg_help)
    args = arg_parser.parse_args()

    # The model expects input tokenized with `NltkAndPunctTokenizer`;
    # note that the input is case-sensitive.
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)
    context = tokenizer.tokenize_paragraph_flat(args.context)

    print("Loading model")
    model_dir = ModelDir(args.model)
    model = model_dir.get_model()
    if not isinstance(model, ElmoQaModel):
        raise ValueError(
            "This script is build to work for ElmoQaModel models only")

    # Important! Do not use the pre-computed word vectors (they only apply to
    # the SQuAD dev/train sets); the language model will instead compute the
    # word vectors dynamically with its character-level CNN.
    model.lm_model.embed_weights_file = None

    # Give the model the batch size and vocabulary to expect. This loads the
    # needed word vectors and fixes the batch size for graph building and
    # input encoding.
    print("Setting up model")
    vocabulary = set(question) | set(context)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=1), vocabulary)

    # Build the tensorflow graph. The two tensors hold the predicted span
    # (inclusive) and the confidence score for each batch element.
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        # 17 means to limit the span to size 17 or less
        span_tensor, conf_tensor = model.get_prediction().get_best_span(17)

    # Restoring is a bit fiddly: the bilm weights must not come from the
    # checkpoint, they are loaded by their own initializer instead.
    every_var = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    bilm_vars = [v for v in every_var if v.name.startswith("bilm")]
    checkpoint_vars = [v for v in every_var
                       if not v.name.startswith("bilm")]
    model_dir.restore_checkpoint(sess, checkpoint_vars)

    # Running the initializer loads the lm weights from the lm directory.
    sess.run(tf.variables_initializer(bilm_vars))

    # The model takes its input as `ContextAndQuestion` objects.
    batch = [ParagraphAndQuestion(context, question, None, "user-question1")]

    print("Starting run")
    # Two steps: encode the pairs into a feed_dict of numpy arrays, then run
    # the graph with `sess` to obtain the predictions.
    feed = model.encode(batch, is_train=False)
    spans, confidences = sess.run([span_tensor, conf_tensor], feed_dict=feed)

    first_span = spans[0]
    answer_tokens = context[first_span[0]:first_span[1] + 1]
    print("Best span:", str(first_span))
    print("Answer text:", " ".join(answer_tokens))
    print("Confidence:", str(confidences[0]))
class CapeDocQAMachineReaderModel(CapeMachineReaderModelInterface):
    """Machine reader backed by a pickled model and a TensorFlow checkpoint.

    The constructor loads the pickled model, builds the prediction graph in
    its own session and restores the weights, so instances are ready to use.
    Document embeddings are computed once with ``get_document_embedding`` and
    passed back into ``get_logits`` for each question.
    """

    def __init__(self, machine_reader_config):
        """Load, build and initialize the model described by the config.

        Args:
            machine_reader_config: Object providing the attributes
                ``model_pickle_file``, ``lm_weights_file``, ``vocab_file``,
                ``lm_token_weights_file``, ``lm_options_file``,
                ``word_vector_file`` and ``checkpoint_file``.
        """
        self.tokenizer = NltkAndPunctTokenizer()
        self.config = machine_reader_config
        self.model = self._load_model()
        self.sess = tf.Session()
        self.start_logits, self.end_logits, self.context_rep = (
            self._build_model())
        self._initialize()

    def _load_model(self):
        """Unpickle the model and point its language model at the config files."""
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # `model_pickle_file` must only ever name a trusted file.
        with open(self.config.model_pickle_file, 'rb') as f:
            model = pickle.load(f)
        model.lm_model.weight_file = self.config.lm_weights_file
        model.lm_model.lm_vocab_file = self.config.vocab_file
        model.lm_model.embed_weights_file = self.config.lm_token_weights_file
        model.lm_model.options_file = self.config.lm_options_file
        return model

    def _build_model(self):
        """Build the prediction graph.

        Returns:
            ``(start_logits, end_logits, context_rep)`` tensors.
        """
        # Read the vocabulary inside a `with` so the file handle is closed.
        with open(self.config.vocab_file, encoding="utf-8") as vocab_file:
            words = (line.strip() for line in vocab_file)
            vocab_to_init_with = {
                word for word in words if word not in vocab_to_ignore
            }

        self.model.word_embed.vec_name = self.config.word_vector_file
        with self.sess.as_default():
            self.model.set_input_spec(
                ParagraphAndQuestionSpec(None, None, None, 14),
                vocab_to_init_with,
                word_vec_loader=ResourceLoader(
                    load_vec_fn=lambda x, y: load_word_vectors(
                        x, y, is_path=True)))
            # Each placeholder is mapped to itself as the prediction input.
            pred = self.model.get_production_predictions_for(
                {x: x for x in self.model.get_placeholders()})
        return pred.start_logits, pred.end_logits, self.model.context_rep

    def _initialize(self):
        """Restore non-``bilm`` variables and run the ``bilm`` initializers."""
        all_vars = tf.global_variables() + tf.get_collection(
            tf.GraphKeys.SAVEABLE_OBJECTS)
        lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        # Variables named "bilm*" are not taken from the checkpoint.
        vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]
        saver = tf.train.Saver(vars_to_restore)
        saver.restore(self.sess, self.config.checkpoint_file)
        self.sess.run(
            tf.variables_initializer(
                [x for x in all_vars if x.name in lm_var_names]))

    def tokenize(self, text):
        """Return ``(tokens, spans)`` for ``text`` using the shared tokenizer."""
        tokens = self.tokenizer.tokenize_paragraph_flat(text)
        spans = self.tokenizer.convert_to_spans(text, [tokens])[0]
        return tokens, spans

    def get_document_embedding(self, text):
        """Return the model's context representation for ``text``.

        A placeholder question is used because only the document side of the
        encoded pair is of interest here.
        """
        document_tokens, _ = self.tokenize(text)
        test_question = ParagraphAndQuestion(document_tokens,
                                             ['dummy', 'question'], None,
                                             "cape_question", 'cape_document')
        feed = self.model.encode([test_question], False, cached_doc=None)
        return self.sess.run(self.model.context_rep, feed_dict=feed)[0]

    def get_logits(self, question, document_embedding):
        """Return ``(start_logits, end_logits)`` for a question and embedding.

        Args:
            question: Raw question string.
            document_embedding: Array as returned by
                ``get_document_embedding``; its first dimension is taken as
                the number of document words.

        Returns:
            Two arrays truncated to the number of document words.
        """
        question_tokens, _ = self.tokenize(question)
        n_words = document_embedding.shape[0]
        # The document tokens are placeholders; the real document content is
        # supplied through `cached_doc`.
        dummy_document = ['dummy'] * n_words
        test_question = ParagraphAndQuestion(dummy_document, question_tokens,
                                             None, "cape_question",
                                             'cape_document')
        feed = self.model.encode(
            [test_question],
            False,
            cached_doc=document_embedding[np.newaxis, :, :])
        start_logits, end_logits = self.sess.run(
            [self.start_logits, self.end_logits], feed_dict=feed)
        return start_logits[0][:n_words], end_logits[0][:n_words]
def main():
    """Command-line entry point: answer a question from text documents.

    Takes a question and one or more document paths, selects the most
    relevant paragraphs, runs the model and prints the best paragraph, the
    best span, the answer text and its confidence.

    The model directory is currently fixed to the first entry of
    ``models_directory``.

    Raises:
        ValueError: If the loaded model is not a ``ParagraphQuestionModel``
            or a document path does not exist.
    """
    parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    args = parser.parse_args()

    # Models path
    SQUAD_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad'
    SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad-shared-norm'
    TRIVIAQA_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-unfiltered-shared-norm'
    TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-web-shared-norm'

    models_directory = [
        SQUAD_MODEL_DIRECTORY_PATH,
        SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH,
        TRIVIAQA_MODEL_DIRECTORY_PATH,
        TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH
    ]

    print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(models_directory[0])
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Read the documents
    documents = []
    for doc in args.documents:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    # (raw string: "\s" in a normal literal is an invalid escape sequence).
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the model expects data to be tokenized using
    # `NltkAndPunctTokenizer`. Note the model expects case-sensitive input.
    tokenizer = NltkAndPunctTokenizer()

    question = tokenizer.tokenize_paragraph_flat(args.question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Group the document into paragraphs; this returns `ExtractedParagraph`
    # objects that additionally remember the start/end token of the
    # paragraph within the source document.
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs()  # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step.
        # This turns the `ExtractedParagraph` objects back into token lists.
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect. This
    # loads the needed word vectors and fixes the batch size used when
    # building the graph / encoding the input.
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Build the tensorflow graph; `best_spans` and `conf` are tensors holding
    # the predicted span (inclusive) and confidence scores for each element
    # in the input batch, the confidence being the pre-softmax span logit.
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need sess.as_default when working with the cuDNN stuff, since an
    # active session is needed to figure out the # of parameters for each
    # layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 10 means to limit the span to size 10 or less
        best_spans, conf = model.get_prediction().get_best_span(10)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # The model takes input in the form of `ContextAndQuestion` objects
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    print("Starting run")
    # The model is run in two steps: first it "encodes" a batch of
    # paragraph/context pairs into numpy arrays, then `sess` runs the model
    # to get the predictions.
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    # We get output for each paragraph, select the most-confident one to print
    best_para = np.argmax(conf)
    print("Best Paragraph: " + str(best_para))
    para_id = int(best_para)  # plain int index into `context`
    print("Best Paragraph: \n" + " ".join(context[para_id]))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
    print("Confidence: " + str(conf[best_para]))
class QaSystem(object):
    """
    End-to-end QA system, uses web-requests to get relevant documents and a model to
    scores candidate answer spans.
    """

    # The split includes the surrounding whitespace to avoid empty paragraphs.
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    _split_regex = re.compile(r"\s*\n\s*")

    def __init__(self,
                 wiki_cache: str,
                 paragraph_splitter: DocumentSplitter,
                 paragraph_selector: ParagraphFilter,
                 vocab: Union[str, Set[str]],
                 model: Union[ParagraphQuestionModel, ModelDir],
                 loader: ResourceLoader = ResourceLoader(),
                 bing_api_key=None,
                 tagme_api_key=None,
                 blacklist_trivia_sites: bool = False,
                 n_dl_threads: int = 5,
                 span_bound: int = 8,
                 tagme_threshold: Optional[float] = 0.2,
                 download_timeout: Optional[int] = None,
                 n_web_docs=10):
        """Set up the retrieval components, build the graph and load weights.

        Args:
            wiki_cache: Passed to ``WikiCorpus`` as its cache location.
            paragraph_splitter: Groups tokenized paragraphs (``split_inverse``).
            paragraph_selector: Picks the paragraphs to score (``prune``).
            vocab: ``None``, a set of words, or a path to a file with one
                word per line.
            model: The model itself or a ``ModelDir`` to load it from.
            loader: Resource loader handed to ``set_input_spec``.
            bing_api_key: When ``None`` no web searcher or text extractor is
                created, and web search is skipped.
            tagme_api_key: Token sent with TagMe requests.
            blacklist_trivia_sites: Skip results whose display URL contains
                'quiz', 'trivia' or 'answer'.
            n_dl_threads: Passed to the text extractor.
            span_bound: Maximum span size passed to ``get_best_span``.
            tagme_threshold: Minimum ``rho`` of a TagMe annotation to use it;
                ``None`` disables the TagMe/Wikipedia step.
            download_timeout: Passed to the text extractor.
            n_web_docs: Number of web results to request; ``0`` disables
                web search.
        """
        self.log = logging.getLogger('qa_system')
        self.tagme_threshold = tagme_threshold
        self.n_web_docs = n_web_docs
        self.blacklist_trivia_sites = blacklist_trivia_sites
        self.tagme_api_key = tagme_api_key

        if bing_api_key is not None:
            self.searcher = AsyncWebSearcher(bing_api_key)
            self.text_extractor = AsyncBoilerpipeCliExtractor(
                n_dl_threads, download_timeout)
        else:
            self.text_extractor = None
            self.searcher = None

        self.wiki_corpus = WikiCorpus(wiki_cache, keep_inverse_mapping=True)
        self.paragraph_splitter = paragraph_splitter
        self.paragraph_selector = paragraph_selector
        self.model_dir = model

        voc = None
        if vocab is not None:
            if isinstance(vocab, str):
                # A string is treated as a path to a one-word-per-line file.
                voc = set()
                with open(vocab, "r") as f:
                    for line in f:
                        voc.add(line.strip())
            else:
                voc = vocab
            self.log.info("Using preset vocab of size %d", len(voc))

        self.log.info("Setting up model...")
        if isinstance(model, ModelDir):
            self.model = model.get_model()
        else:
            self.model = model

        self.model.set_input_spec(ParagraphAndQuestionSpec(None), voc, loader)

        self.sess = tf.Session()
        with self.sess.as_default():
            pred = self.model.get_prediction()

        # NOTE(review): `restore_checkpoint` is called on the constructor
        # argument, which is only known to be valid when it is a `ModelDir` -
        # confirm what should happen when a bare model is passed.
        model.restore_checkpoint(self.sess)

        self.span_scores = pred.get_span_scores()
        self.span, self.score = pred.get_best_span(span_bound)
        self.tokenizer = NltkAndPunctTokenizer()
        # No further graph changes are allowed after construction.
        self.sess.graph.finalize()

    async def answer_question(
            self, question: str) -> Tuple[np.ndarray, List[WebParagraph]]:
        """ Answer a question using web search """
        context = await self.get_question_context(question)
        question = self.tokenizer.tokenize_paragraph_flat(question)
        t0 = time.perf_counter()
        out = self._get_span_scores(question, context)
        self.log.info("Computing answer spans took %.5f seconds" %
                      (time.perf_counter() - t0))
        return out

    def answer_with_doc(self, question: str,
                        doc: str) -> Tuple[np.ndarray, List[WebParagraph]]:
        """ Answer a question using the given text as a document

        Raises:
            ValueError: If the paragraph selector returns no paragraphs.
        """
        self.log.info("Answering question \"%s\" with a given document" %
                      question)

        # Tokenize
        question = self.tokenizer.tokenize_paragraph_flat(question)
        context = [
            self.tokenizer.tokenize_with_inverse(x, False)
            for x in self._split_regex.split(doc)
        ]

        # Split into super-paragraphs
        context = self._split_document(context, "User", None)

        # Select top paragraphs
        context = self.paragraph_selector.prune(question, context)
        if len(context) == 0:
            raise ValueError("Unable to process documents")

        # Select the top answer span
        t0 = time.perf_counter()
        span_scores = self._get_span_scores(question, context)
        self.log.info("Computing answer spans took %.5f seconds" %
                      (time.perf_counter() - t0))
        return span_scores

    def _get_span_scores(self, question: List[str],
                         paragraphs: List[ParagraphWithInverse]):
        """
        Answer a question using the given paragraphs, returns both the span scores and the
        pre-processed paragraphs the span are valid for
        """
        if self.model.preprocessor is not None:
            prepped = []
            for para in paragraphs:
                # Not every paragraph type carries a `spans` attribute.
                spans = getattr(para, "spans", None)

                text, _, inv = self.model.preprocessor.encode_paragraph(
                    [], para.text, para.start == 0,
                    np.zeros((0, 2), dtype=np.int32), spans)
                prepped.append(
                    WebParagraph([text], para.original_text, inv,
                                 para.paragraph_num, para.start, para.end,
                                 para.source_name, para.source_url))
            paragraphs = prepped

        qa_pairs = [
            ParagraphAndQuestion(c.get_context(), question, None, "")
            for c in paragraphs
        ]
        encoded = self.model.encode(qa_pairs, False)
        return self.sess.run(self.span_scores, encoded), paragraphs

    def _split_document(self, para: List[ParagraphWithInverse],
                        source_name: str, source_url: Optional[str]):
        """Group paragraphs with the splitter and wrap them as ``WebParagraph``.

        Paragraph numbers start at 1; ``start``/``end`` are running token
        offsets accumulated from each group's ``n_tokens``.
        """
        tokenized_paragraphs = []
        on_token = 0
        groups = self.paragraph_splitter.split_inverse(para)
        for i, group in enumerate(groups):
            n_tokens = group.n_tokens
            tokenized_paragraphs.append(
                WebParagraph(group.text, group.original_text, group.spans,
                             i + 1, on_token, on_token + n_tokens,
                             source_name, source_url))
            on_token += n_tokens
        return tokenized_paragraphs

    async def _tagme(self, question):
        """Query the TagMe API and return the annotations that have a title."""
        payload = {
            "text": question,
            "long_text": 3,
            "lang": "en",
            "gcube-token": self.tagme_api_key
        }
        async with ClientSession() as sess:
            async with sess.get(url=TAGME_API, params=payload) as resp:
                data = await resp.json()

        return [
            ann_json for ann_json in data["annotations"]
            if "title" in ann_json
        ]

    async def get_question_context(self, question: str) -> List[WebParagraph]:
        """
        Find a set of paragraphs from the web that are relevant to the given question
        """
        tokenized_paragraphs = []
        if self.tagme_threshold is not None:
            self.log.info("Query tagme for %s", question)
            tags = await self._tagme(question)
            t0 = time.perf_counter()
            found = set()
            for tag in tags:
                if tag["rho"] >= self.tagme_threshold:
                    title = tag["title"]
                    if title in found:
                        # Each article is fetched at most once.
                        continue
                    found.add(title)
                    doc = await self.wiki_corpus.get_wiki_article(title)
                    tokenized_paragraphs += self._split_document(
                        doc.paragraphs, "Wikipedia: " + doc.title, doc.url)
            if len(tokenized_paragraphs) > 0:
                self.log.info("Getting wiki docs took %.5f seconds" %
                              (time.perf_counter() - t0))

        if self.n_web_docs > 0 and self.searcher is None:
            # Without a Bing API key there is no searcher or extractor.
            self.log.info("No web searcher configured, skipping web search")
        elif self.n_web_docs > 0:
            t0 = time.perf_counter()
            self.log.info("Running bing search for %s", question)
            search_results = await self.searcher.run_search(
                question, self.n_web_docs)
            t1 = time.perf_counter()
            self.log.info("Completed bing search, took %.5f seconds" %
                          (t1 - t0))
            t0 = t1
            url_to_result = {x["url"]: x for x in search_results}
            self.log.info("Extracting text for %d results",
                          len(search_results))
            text_docs = await self.text_extractor.get_text(
                [x["url"] for x in search_results])

            for doc in text_docs:
                if len(doc.text) == 0:
                    continue
                search_r = url_to_result[doc.url]
                if self.blacklist_trivia_sites:
                    lower = search_r["displayUrl"].lower()
                    if 'quiz' in lower or 'trivia' in lower or 'answer' in lower:
                        # heuristic to ignore trivia sites, recommend by Mandar
                        self.log.debug("Skipping trivia site: " + lower)
                        continue

                paras_text = self._split_regex.split(doc.text.strip())
                paras_tokenized = [
                    self.tokenizer.tokenize_with_inverse(x)
                    for x in paras_text
                ]
                tokenized_paragraphs += self._split_document(
                    paras_tokenized, search_r["displayUrl"], doc.url)

            self.log.info("Completed extracting text, took %.5f seconds." %
                          (time.perf_counter() - t0))

        self.log.info("Have %d paragraphs", len(tokenized_paragraphs))
        if len(tokenized_paragraphs) == 0:
            return []
        question = self.tokenizer.tokenize_sentence(question)
        return self.paragraph_selector.prune(question, tokenized_paragraphs)
def predict():
    """Answer the question in the request body from a fixed document.

    The request body is decoded as UTF-8 and used as the question. The model
    and the document are loaded from hard-coded paths on every call.

    Returns:
        The Flask JSON response for
        ``{"success": True, "predictions": [answer_text]}``.

    Raises:
        ValueError: If the loaded model is not a ``ParagraphQuestionModel``
            or the document file does not exist.
    """
    json_data = {"success": False, "predictions": []}

    print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(
        "/home/antriv/conversation_ai/Transfer_Learning/ALLENAI_DocumentQA/document-qa/pretrained_models/models/triviaqa-unfiltered-shared-norm"
    )
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Load the question
    question = (flask.request.data).decode("utf-8")

    # Read the documents
    documents = []
    doclist = ["/home/antriv/data/The-Future-Computed.txt"]
    for doc in doclist:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    # (raw string: "\s" in a normal literal is an invalid escape sequence).
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the model expects data to be tokenized using
    # `NltkAndPunctTokenizer`. Note the model expects case-sensitive input.
    tokenizer = NltkAndPunctTokenizer()

    question = tokenizer.tokenize_paragraph_flat(question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Group the document into paragraphs; this returns `ExtractedParagraph`
    # objects that additionally remember the start/end token of the
    # paragraph within the source document.
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs()  # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=1000)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=1000)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step.
        # This turns the `ExtractedParagraph` objects back into token lists.
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect. This
    # loads the needed word vectors and fixes the batch size used when
    # building the graph / encoding the input.
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)

    # Build the tensorflow graph; `best_spans` and `conf` are tensors holding
    # the predicted span (inclusive) and confidence scores for each element
    # in the input batch, the confidence being the pre-softmax span logit.
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    try:
        # We need sess.as_default when working with the cuDNN stuff, since an
        # active session is needed to figure out the # of parameters for each
        # layer. The cpu-compatible models don't need this.
        with sess.as_default():
            # 8 means to limit the span to size 8 or less
            best_spans, conf = model.get_prediction().get_best_span(8)

        # Loads the saved weights
        model_dir.restore_checkpoint(sess)

        # The model takes input in the form of `ContextAndQuestion` objects
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]

        print("Starting run")
        # The model is run in two steps: first it "encodes" a batch of
        # paragraph/context pairs into numpy arrays, then `sess` runs the
        # model to get the predictions.
        encoded = model.encode(
            data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
        best_spans, conf = sess.run([best_spans, conf],
                                    feed_dict=encoded)  # feed_dict -> predictions
    finally:
        # A session is created per request; release it once the results
        # (plain numpy arrays) have been fetched.
        sess.close()

    # We get output for each paragraph, select the most-confident one
    best_para = np.argmax(conf)
    span_start = best_spans[best_para][0]
    span_end = best_spans[best_para][1]
    # The predicted span is inclusive, hence the +1 on the end index.
    y_output = " ".join(context[best_para][span_start:span_end + 1])

    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + y_output)
    print("Confidence: " + str(conf[best_para]))
    print(y_output)

    json_data["predictions"].append(str(y_output))

    # indicate that the request was a success
    json_data["success"] = True

    # return the data dictionary as a JSON response
    return flask.jsonify(json_data)
def main(Data: pd.DataFrame, nlp, model_dir, model) -> str:
    """Answer one question against one document and return the answer text.

    The document text and the question are both read from row ``0`` of
    ``Data``. The document is split into paragraphs, tokenized, regrouped
    into paragraphs by ``MergeParagraphs(400)``, pruned to the top
    candidates, and run through ``model`` in a single batch. The span with
    the highest confidence across all candidate paragraphs is returned.

    Args:
        Data: Frame whose row ``0`` holds the document text in column
            ``'Filetext'`` and the question in column ``'Question'``.
        nlp: Passed through unchanged as the first argument of
            ``model.set_input_spec``.
        model_dir: Object whose ``restore_checkpoint(sess)`` loads the saved
            weights into the session.
        model: The model to run; must be a ``ParagraphQuestionModel``.

    Returns:
        The tokens of the most confident answer span, joined by single
        spaces.

    Raises:
        ValueError: If ``model`` is not a ``ParagraphQuestionModel``.
    """
    print(model)
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Exactly one document is read: row 0 of the input frame.
    documents = [Data.at[0, 'Filetext']]

    # Split each document into paragraphs on newlines (surrounding
    # whitespace is swallowed by the pattern).
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input; the model expects data to be tokenized using
    # `NltkAndPunctTokenizer`. Note the model expects case-sensitive input.
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(
        Data.at[0, 'Question'])  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Regroup each document into paragraphs; per the upstream comments this
    # yields `ExtractedParagraph` objects that also remember their start/end
    # token within the source document.
    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]

    # Select the top paragraphs using a `ParagraphFilter`.
    print(len(documents))
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document. With the
        # single-document input above this is the branch that runs.
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the
        # documents (kept for the multi-document case).
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step.
        # This turns the selected paragraphs back into simple token lists.
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use the flattened text.
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size and the vocabulary to expect: every
    # token of the question and of the selected paragraphs.
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(
        nlp, ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    print("after set input spec")
    # The context manager makes `sess` the default session (the upstream
    # comments say the cuDNN layers need an active session while the graph is
    # built) and closes it on exit, so repeated calls do not leak sessions.
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        # `best_spans` and `conf` are tensors holding the predicted span
        # (inclusive) and confidence for each element of the batch.
        # 8 means to limit the span to size 8 or less.
        best_spans, conf = model.get_prediction().get_best_span(8)

        # Loads the saved weights.
        model_dir.restore_checkpoint(sess)
        print("after loading weights")

        # The model takes input in the form of `ContextAndQuestion` objects.
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]

        # The model is run in two steps: first the batch is encoded into a
        # feed_dict, then `sess` runs the graph to get the predictions.
        # This is inference, so the batch is encoded with is_train=False,
        # consistent with the other inference path in this file.
        encoded = model.encode(data, is_train=False)
        best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)

    # One output per paragraph; pick the most confident one.
    best_para = np.argmax(conf)
    span = best_spans[best_para]
    # The predicted span is inclusive, hence the +1 on the end index.
    return " ".join(context[best_para][span[0]:span[1] + 1])


#if __name__ == "__main__":
#    main()