def main():
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")
    input_data, vocab = read_input_data(model)

    print('Loading word vectors...')
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), vocab)

    print('Starting Tensorflow session...')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        prediction = model.get_prediction()
        # Take 0-th here because we know we only truncate to one paragraph
        start_logits_tf = prediction.start_logits[0]
        end_logits_tf = prediction.end_logits[0]
        none_logit_tf = prediction.none_logit[0]
        context_rep_tf = model.context_rep[0]
        m1_tf = model.predictor.m1[0]
        m2_tf = model.predictor.m2[0]
    model_dir.restore_checkpoint(sess)

    with open(OPTS.output_file, 'w') as f:
        for doc_raw, q_raw, context, ex in tqdm(input_data):
            encoded = model.encode(ex, is_train=False)
            start_logits, end_logits, none_logit, context_rep, m1, m2 = sess.run(
                [start_logits_tf, end_logits_tf, none_logit_tf, context_rep_tf, m1_tf, m2_tf],
                feed_dict=encoded)
            beam, p_na = logits_to_probs(
                doc_raw, context[0], start_logits, end_logits, none_logit,
                beam_size=OPTS.beam_size)
            inputs = [context_rep, m1, m2]
            vec = np.concatenate([np.amax(x, axis=0) for x in inputs] +
                                 [np.amin(x, axis=0) for x in inputs] +
                                 [np.mean(x, axis=0) for x in inputs])
            #span_logits = np.add.outer(start_logits, end_logits)
            #all_logits = np.concatenate((np.array([none_logit]), span_logits.flatten()))
            #log_partition = scipy.special.logsumexp(all_logits)
            #vec = np.concatenate([
            #    np.amax(context_rep, axis=0),
            #    np.amin(context_rep, axis=0),
            #    np.mean(context_rep, axis=0),
            #    [np.amax(start_logits), scipy.special.logsumexp(start_logits),
            #     np.amax(end_logits), scipy.special.logsumexp(end_logits),
            #     none_logit, log_partition]
            #])
            out_obj = {'paragraph': doc_raw, 'question': q_raw, 'beam': beam, 'p_na': p_na}
            if not OPTS.no_vec:
                out_obj['vec'] = vec.tolist()
            print(json.dumps(out_obj), file=f)
def getAnswer(self):
    #parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    #parser.add_argument("model", help="Model directory")
    #parser.add_argument("question", help="Question to answer")
    #parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    #args = parser.parse_args()
    #print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(MODEL_DIR)
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    conn = pyodbc.connect(DB_CONN)
    cursor = conn.cursor()
    #(23211,28690,33214,25638,25837,26454,28693,26137,31428,32087)
    query = "select cast(filetext as varchar(max)) as filetext, name, type from dbo.UserworkspaceData where objectmasterid= " + \
            str(self.ObjectMasterId) + \
            " order by id asc"
    #query="select cast(filetext as varchar(max)) as filetext from kpl_tmp"
    documents = []
    document = ""
    name = ""
    filetype = 0
    for doc in cursor.execute(query):
        document = document + doc[0]
        name = doc[1]
        filetype = doc[2]
    #open("E:/kpl.txt","w+").write(document)
    documents.append(document)
    #documents.replace("\n\n","\n")
    #r.sub("",documents)
    #documents=" ".join(documents.split())
    #open("E:\kpl_test.txt","w+").write(document)
    #doc="D:\Document QnA\document-qa-master\Data\Drug_Delivery_Surveying_Global_Competitive_Landscape_BMI.txt"
    # =============================================================================
    # if not isfile(doc):
    #     raise ValueError(doc + " does not exist")
    # with open(doc, "r") as f:
    #     documents.append(f.read())
    # =============================================================================
    #print("Loaded %d documents" % len(documents))
    #temp=documents[0].split()
    # Split documents into lists of paragraphs
    #documents=[" ".join(temp[i:(i+400)]) for i in range(1,len(temp),400)]
    documents = [re.split("\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the model expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(self.Question)  # List of words

    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    #splitter = PreserveParagraphs()  # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]
    #print(str(len(documents))+" kpl")  #kpl

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    #print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    #x=open("E:\context.txt","a+")
    #[x.write(" ".join(cont)) for cont in context]
    #x.write("\n.......................................................\n")
    #print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect. This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(self.nlp, ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    #print("Build tf graph")  #kpl
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    #print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model to get the predictions
    encoded = model.encode(data, is_train=True)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(conf)  # We get output for each paragraph, select the most-confident one to print
    #print("Best Paragraph: " + str(best_para))
    #print("Best span: " + str(best_spans[best_para]))
    #print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
    #print("Confidence: " + str(conf[best_para]))
    Answer = " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1])

    print("Confidence: " + str(conf[best_para]))
    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + Answer)
    print(" ".join(context[best_para]))

    # Highlight the predicted span in the winning paragraph
    context[best_para][best_spans[best_para][0]] = r"<em>" + context[best_para][best_spans[best_para][0]]
    context[best_para][best_spans[best_para][1]] = context[best_para][best_spans[best_para][1]] + r"</em>"

    # Trim the returned paragraph to a window of a couple of sentences around the span
    start = 0
    end = len(context[best_para])
    positions = [x for x, n in enumerate(context[best_para][0:best_spans[best_para][0]]) if n == "."]
    if len(positions) >= 2:
        start = positions[len(positions) - 2] + 1
    positions = [x for x, n in enumerate(context[best_para][best_spans[best_para][1] + 1:]) if n == "."]
    if len(positions) > 1:
        end = best_spans[best_para][1] + 1 + positions[1]

    d = dict()
    if conf[best_para] > 10:
        d["answer"] = Answer
    else:
        d["answer"] = ""
    d["name"] = name
    d["filetype"] = filetype
    d["paragraph"] = re.sub(r' (?=\W)', '', " ".join(context[best_para][start:end]))
    d["ObjectMasterId"] = self.ObjectMasterId

    return d

#if __name__ == "__main__":
#    main()
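# Hypothetical usage of getAnswer above (illustrative only -- the wrapper class that defines
# this method, and the MODEL_DIR / DB_CONN globals it relies on, are not shown in this snippet).
# An object exposing `Question`, `ObjectMasterId`, and `nlp` attributes would call it as:
#   result = handler.getAnswer()
#   print(result["answer"], result["name"], result["paragraph"])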
def main():
    parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    parser.add_argument("model", help="Model directory")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("context", help="Context to answer the question with")
    args = parser.parse_args()

    # Tokenize the input, the model expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)
    context = tokenizer.tokenize_paragraph_flat(args.context)

    print("Loading model")
    model_dir = ModelDir(args.model)
    model = model_dir.get_model()
    if not isinstance(model, ElmoQaModel):
        raise ValueError("This script is built to work for ElmoQaModel models only")

    # Important! This tells the language model not to use the pre-computed word vectors,
    # which are only applicable for the SQuAD dev/train sets.
    # Instead the language model will use its character-level CNN to compute
    # the word vectors dynamically.
    model.lm_model.embed_weights_file = None

    # Tell the model the batch size and vocab to expect. This will load the needed
    # word vectors and fix the batch size when building the graph / encoding the input
    print("Setting up model")
    voc = set(question)
    voc.update(context)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=1), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        # 17 means to limit the span to size 17 or less
        best_spans, conf = model.get_prediction().get_best_span(17)

    # Now restore the weights, this is a bit fiddly since we need to avoid restoring the
    # bilm weights, and instead load them from the pre-computed data
    all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars = [x for x in all_vars if x.name not in lm_var_names]
    model_dir.restore_checkpoint(sess, vars)

    # Run the initializer of the lm weights, which will load them from the lm directory
    sess.run(tf.variables_initializer([x for x in all_vars if x.name in lm_var_names]))

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(context, question, None, "user-question1")]

    print("Starting run")
    # The model is run in two steps, first it "encodes" the paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model to get the predictions
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    print("Best span: " + str(best_spans[0]))
    print("Answer text: " + " ".join(context[best_spans[0][0]:best_spans[0][1] + 1]))
    print("Confidence: " + str(conf[0]))
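# Example invocation of the script above (illustrative only -- the script path and model
# directory are assumptions; in the docqa repository this corresponds to elmo/run_on_user_text.py):
#   python elmo/run_on_user_text.py models/squad-elmo \
#       "Who wrote Harry Potter?" \
#       "Harry Potter was written by JK Rowling."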
def convert_saved_graph(model_dir, output_dir):
    print("Load model")
    md = ModelDir(model_dir)
    model = md.get_model()
    # remove the lm model's word embeddings - the cpu model will use the Char-CNN instead
    model.lm_model.embed_weights_file = None
    dim = model.embed_mapper.layers[1].n_units

    print("Setting up cudnn version")
    sess = tf.Session()
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))

        print("Building graph")
        pred = model.get_prediction()

        test_questions = get_test_questions()

        print("Load vars:")
        all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        vars = [x for x in all_vars if x.name not in lm_var_names]
        md.restore_checkpoint(sess, vars)
        sess.run(tf.variables_initializer([x for x in all_vars if x.name in lm_var_names]))

        feed = model.encode([test_questions], False)
        cudnn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("Done, copying files...")
    if not exists(output_dir):
        mkdir(output_dir)
    for file in listdir(model_dir):
        if isfile(join(model_dir, file)) and file != "model.npy":
            copyfile(join(model_dir, file), join(output_dir, file))

    print("Done, mapping tensors...")
    to_save, to_init = [], []
    for x in tf.trainable_variables():
        if x.name.endswith("/gru_parameters:0"):
            key = x.name[:-len("/gru_parameters:0")]
            indim, outdim = get_dims(x, dim)
            c = cudnn_rnn_ops.CudnnGRUSaveable(x, 1, outdim, indim, scope=key)
            for spec in c.specs:
                if spec.name.endswith("bias_cudnn 0") or \
                        spec.name.endswith("bias_cudnn 1"):
                    print('Unsupported spec: ' + spec.name)
                    continue
                if 'forward' in spec.name:
                    new_name = spec.name.replace('forward/rnn/multi_rnn_cell/cell_0/',
                                                 'bidirectional_rnn/fw/')
                else:
                    new_name = spec.name.replace('backward/rnn/multi_rnn_cell/cell_0/',
                                                 'bidirectional_rnn/bw/')
                v = tf.Variable(sess.run(spec.tensor), name=new_name)
                to_init.append(v)
                to_save.append(v)
        else:
            to_save.append(x)

    save_dir = join(output_dir, "save")
    if not exists(save_dir):
        mkdir(save_dir)

    # save:
    all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    vars_to_save = [x for x in all_vars if not x.name.startswith("bilm")]
    sess.run(tf.initialize_variables(to_init))
    saver = tf.train.Saver(vars_to_save)
    saver.save(sess, join(save_dir, 'checkpoint'),
               global_step=123456789, write_meta_graph=False)

    sess.close()
    tf.reset_default_graph()
    return cudnn_out
def main():
    parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    # parser.add_argument("model", type=int, help="Model directory")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    args = parser.parse_args()

    # Models path
    SQUAD_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad'
    SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad-shared-norm'
    TRIVIAQA_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-unfiltered-shared-norm'
    TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-web-shared-norm'
    models_directory = [
        SQUAD_MODEL_DIRECTORY_PATH,
        SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH,
        TRIVIAQA_MODEL_DIRECTORY_PATH,
        TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH
    ]

    print("Preprocessing...")

    # Load the model
    # model_dir = ModelDir(args.model)
    model_dir = ModelDir(models_directory[0])
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Read the documents
    documents = []
    for doc in args.documents:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    documents = [re.split("\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the model expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs()  # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect. This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 10 means to limit the span to size 10 or less
        best_spans, conf = model.get_prediction().get_best_span(10)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model to get the predictions
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(conf)  # We get output for each paragraph, select the most-confident one to print

    print("Best Paragraph: " + str(best_para))
    para_id = int(str(best_para))
    # print("Best Paragraph: \n" + (" ".join((paras[para_id].text)[0])))
    print("Best Paragraph: \n" + " ".join(context[para_id]))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1]))
    print("Confidence: " + str(conf[best_para]))
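# Example invocation of the script above (the script name is hypothetical -- the model path
# is hard-coded to models_directory[0], so only the question and document files are passed):
#   python run_multi_doc_qa.py "Who wrote Harry Potter?" doc1.txt doc2.txt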
def predict():
    json_data = {"success": False, "predictions": []}

    print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(
        "/home/antriv/conversation_ai/Transfer_Learning/ALLENAI_DocumentQA/document-qa/pretrained_models/models/triviaqa-unfiltered-shared-norm"
    )
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Load the question
    question = (flask.request.data).decode("utf-8")

    # Read the documents
    documents = []
    doclist = ["/home/antriv/data/The-Future-Computed.txt"]
    for doc in doclist:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    documents = [re.split("\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the model expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    #splitter = PreserveParagraphs()  # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=1000)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=1000)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect. This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model to get the predictions
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(conf)  # We get output for each paragraph, select the most-confident one to print

    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1]))
    print("Confidence: " + str(conf[best_para]))

    y_output = " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1])
    print(y_output)
    json_data["predictions"].append(str(y_output))

    # indicate that the request was a success
    json_data["success"] = True

    # return the data dictionary as a JSON response
    return flask.jsonify(json_data)
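# A minimal client sketch for the Flask route above (assumptions: the route is registered at
# /predict and the app listens on localhost:5000 -- neither is shown in this snippet). The
# route reads the raw request body as the question text, so the question is posted as-is.
import requests

resp = requests.post("http://localhost:5000/predict",
                     data="What does the book say about AI?".encode("utf-8"))
print(resp.json())  # e.g. {"success": true, "predictions": ["..."]}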
def convert(model_dir, output_dir, best_weights=False):
    print("Load model")
    md = ModelDir(model_dir)
    model = md.get_model()
    dim = model.embed_mapper.layers[1].n_units

    global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                  initializer=tf.constant_initializer(0), trainable=False)

    print("Setting up cudnn version")
    #global_step = tf.get_variable('global_step', shape=[], dtype='int32', trainable=False)
    sess = tf.Session()
    sess.run(global_step.assign(0))
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))

        print("Building graph")
        pred = model.get_prediction()

        test_questions = ParagraphAndQuestion(
            ["Harry", "Potter", "was", "written", "by", "JK"],
            ["Who", "wrote", "Harry", "Potter", "?"],
            None, "test_questions")

        print("Load vars")
        md.restore_checkpoint(sess)
        print("Restore finished")

        feed = model.encode([test_questions], False)
        cudnn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("Done, copying files...")
    if not exists(output_dir):
        mkdir(output_dir)
    for file in listdir(model_dir):
        if isfile(join(model_dir, file)) and file != "model.npy":
            copyfile(join(model_dir, file), join(output_dir, file))

    print("Done, mapping tensors...")
    to_save = []
    to_init = []
    for x in tf.trainable_variables():
        if x.name.endswith("/gru_parameters:0"):
            key = x.name[:-len("/gru_parameters:0")]
            fw_params = x
            if "map_embed" in x.name:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, 400)
            elif "chained-out" in x.name:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 4)
            else:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 2)
            params_saveable = cudnn_rnn_ops.RNNParamsSaveable(
                c, c.params_to_canonical, c.canonical_to_params, [fw_params], key)

            for spec in params_saveable.specs:
                if spec.name.endswith("bias_cudnn 0") or \
                        spec.name.endswith("bias_cudnn 1"):
                    # ??? What do these even do?
                    continue

                # Rewrite the cudnn variable name into the name the cpu-compatible
                # bidirectional_rnn cells expect
                name = spec.name.split("/")
                name.remove("cell_0")
                if "forward" in name:
                    ix = name.index("forward")
                    name.insert(ix + 2, "fw")
                else:
                    ix = name.index("backward")
                    name.insert(ix + 2, "bw")
                del name[ix]
                ix = name.index("multi_rnn_cell")
                name[ix] = "bidirectional_rnn"
                name = "/".join(name)

                v = tf.Variable(sess.run(spec.tensor), name=name)
                to_init.append(v)
                to_save.append(v)
        else:
            to_save.append(x)

    other = [x for x in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
             if x not in tf.trainable_variables()]
    print(other)
    sess.run(tf.initialize_variables(to_init))
    saver = tf.train.Saver(to_save + other)

    save_dir = join(output_dir, "save")
    if not exists(save_dir):
        mkdir(save_dir)

    saver.save(sess, join(save_dir, "checkpoint"), sess.run(global_step))

    sess.close()
    tf.reset_default_graph()

    print("Updating model...")
    model.embed_mapper.layers = [
        model.embed_mapper.layers[0],
        BiRecurrentMapper(CompatGruCellSpec(dim))
    ]
    model.match_encoder.layers = list(model.match_encoder.layers)
    other = model.match_encoder.layers[1].other
    other.layers = list(other.layers)
    other.layers[1] = BiRecurrentMapper(CompatGruCellSpec(dim))
    pred = model.predictor.predictor
    pred.first_layer = BiRecurrentMapper(CompatGruCellSpec(dim))
    pred.second_layer = BiRecurrentMapper(CompatGruCellSpec(dim))
    with open(join(output_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)

    print("Testing...")
    with open(join(output_dir, "model.pkl"), "rb") as f:
        model = pickle.load(f)

    sess = tf.Session()

    model.set_input_spec(
        ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
        ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))
    pred = model.get_prediction()

    print("Rebuilding")
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(save_dir))

    feed = model.encode([test_questions], False)
    cpu_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("These should be close:")
    print([np.allclose(a, b) for a, b in zip(cpu_out, cudnn_out)])
    print(cpu_out)
    print(cudnn_out)
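# A minimal usage sketch for convert() above (hypothetical paths -- point it at a trained
# cudnn model directory and an output directory for the cpu-compatible copy):
#   convert("models/triviaqa-web-shared-norm", "models-cpu/triviaqa-web-shared-norm")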
def main():
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    tokenizer = NltkAndPunctTokenizer()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")
    if OPTS.reload_vocab:
        loader = ResourceLoader()
    else:
        loader = CachingResourceLoader()

    print('Loading word vectors...')
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), set([',']),
                         word_vec_loader=loader, allow_update=True)

    print('Starting Tensorflow session...')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        prediction = model.get_prediction()
        # Take 0-th here because we know we only truncate to one paragraph
        start_logits_tf = prediction.start_logits[0]
        end_logits_tf = prediction.end_logits[0]
        none_logit_tf = prediction.none_logit[0]
        #best_spans_tf, conf_tf = prediction.get_best_span(MAX_SPAN_LENGTH)
    model_dir.restore_checkpoint(sess)

    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)

    app = bottle.Bottle()

    @app.route('/')
    def index():
        return bottle.template('index')

    @app.route('/post_query', method='post')
    def post_query():
        document_raw = bottle.request.forms.getunicode('document').strip()
        question_raw = bottle.request.forms.getunicode('question').strip()
        document = re.split("\s*\n\s*", document_raw)
        question = tokenizer.tokenize_paragraph_flat(question_raw)
        doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
        split_doc = splitter.split(doc_toks)
        context = selector.prune(question, split_doc)
        if model.preprocessor is not None:
            context = [model.preprocessor.encode_text(question, x) for x in context]
        else:
            context = [flatten_iterable(x.text) for x in context]
        vocab = set(question)
        for txt in context:
            vocab.update(txt)
        data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                for i, x in enumerate(context)]
        model.word_embed.update(loader, vocab)
        encoded = model.encode(data, is_train=False)
        start_logits, end_logits, none_logit = sess.run(
            [start_logits_tf, end_logits_tf, none_logit_tf], feed_dict=encoded)
        beam, p_na = logits_to_probs(
            document_raw, context[0], start_logits, end_logits, none_logit,
            beam_size=BEAM_SIZE)
        return bottle.template('results', document=document_raw, question=question_raw,
                               beam=beam, p_na=p_na)

    cur_dir = os.path.abspath(os.path.dirname(__file__))
    bottle.TEMPLATE_PATH.insert(0, os.path.join(cur_dir, 'views'))
    bottle.run(app, host=OPTS.hostname, port=OPTS.port, debug=OPTS.debug)
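# A minimal client sketch for the demo server above (assumption: the server is reachable at
# localhost:8080 -- the real host and port come from OPTS.hostname / OPTS.port). The
# /post_query route expects 'document' and 'question' form fields, as read in post_query().
import requests

resp = requests.post("http://localhost:8080/post_query",
                     data={"document": "Harry Potter was written by JK Rowling.",
                           "question": "Who wrote Harry Potter?"})
print(resp.text)  # rendered 'results' template with the answer beam and P(no-answer)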
def main():
    parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    parser.add_argument("model", help="Model directory")
    parser.add_argument("ja_filepath", help="File path to japanese questions")
    parser.add_argument("result_file", help="File path to predicted result json")
    args = parser.parse_args()
    print(args)

    print("Preprocessing...")
    paragraphs, questions = read_squad_style_database(args.ja_filepath)

    # Load the model
    model_dir = ModelDir(args.model)
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    predictions = {}
    predictions["conf"] = {}
    for qa in questions:
        print(qa["id"])
        title = qa["title"]
        para_idx = qa["para_idx"]
        context = paragraphs[title][para_idx]
        question = qa["question"]
        print(context)
        print(question)

        if model.preprocessor is not None:
            context = [model.preprocessor.encode_text(question, x) for x in context]

        print("Setting up model")
        voc = set(question)
        for txt in context:
            voc.update(txt)
        model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

        print("Build tf graph")
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        with sess.as_default():
            best_spans, conf = model.get_prediction().get_best_span(8)

        # Loads the saved weights
        model_dir.restore_checkpoint(sess)

        data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                for i, x in enumerate(context)]

        print("Starting run")
        encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
        best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions
        print(best_spans)
        # convert the numpy outputs to lists so the predictions are JSON-serializable
        predictions[qa["id"]] = best_spans.tolist()
        predictions["conf"][qa["id"]] = conf.tolist()

    print(predictions)
    result_f = open(args.result_file, "w")
    json.dump(predictions, result_f)
    exit()  # NOTE: the official evaluation below is skipped because of this exit()

    official_evaluator = OfficialEvaluator(args.ja_filepath, args.result_file)
    evaluation = official_evaluator.evaluate()
    print(evaluation)
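# Example invocation of the evaluation script above (the script name and file paths are
# hypothetical -- it takes a model directory, a SQuAD-style Japanese question file, and an
# output path for the predictions JSON):
#   python run_ja_squad_eval.py models/squad-elmo data/squad_ja_dev.json predictions.json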
    word = line.strip()
    if len(word) > 0:
        vocab.add(word)
print('done')

print('Loading Model...', end='')
# init tf session and weights
model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), vocab)
sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
with sess.as_default():
    best_spans, conf = model.get_prediction().get_best_span(16)
model_dir.restore_checkpoint(sess)
print('done')


# use docs: list<str> to find ans: str to question: str
def find_answer(documents, raw_question):
    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]
    global best_spans, conf
    documents = [re.split("\s*\n\s*", doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()
def main():
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    if OPTS.elmo:
        # Fix absolute path names from other codalab runs
        lm = model.lm_model
        if lm.lm_vocab_file.startswith('/0x'):
            lm.lm_vocab_file = os.sep.join(lm.lm_vocab_file.split(os.sep)[2:])
        if lm.options_file.startswith('/0x'):
            lm.options_file = os.sep.join(lm.options_file.split(os.sep)[2:])
        if lm.weight_file.startswith('/0x'):
            lm.weight_file = os.sep.join(lm.weight_file.split(os.sep)[2:])
        if lm.embed_weights_file and lm.embed_weights_file.startswith('/0x'):
            lm.embed_weights_file = os.sep.join(lm.embed_weights_file.split(os.sep)[2:])
        lm.embed_weights_file = None
    #if not isinstance(model, ParagraphQuestionModel):
    #    raise ValueError("This script is built to work for ParagraphQuestionModel models only")
    input_data, vocab = read_input_data(model)

    print('Loading word vectors...')
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), vocab)

    print('Starting Tensorflow session...')
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    with sess.as_default():
        prediction = model.get_prediction()
        # Take 0-th here because we know we only truncate to one paragraph
        start_logits_tf = prediction.start_logits[0]
        end_logits_tf = prediction.end_logits[0]
        none_logit_tf = prediction.none_logit[0]

    if OPTS.elmo:
        # See elmo/run_on_user_text.py
        all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        vars = [x for x in all_vars if x.name not in lm_var_names]
        model_dir.restore_checkpoint(sess, vars)
        sess.run(tf.variables_initializer([x for x in all_vars if x.name in lm_var_names]))
    else:
        model_dir.restore_checkpoint(sess)

    pred_obj = {}
    na_prob_obj = {}
    pred_always_ans_obj = {}
    analysis_obj = {}
    for context_raw, context_toks, ex in tqdm(input_data):
        encoded = model.encode(ex, is_train=False)
        start_logits, end_logits, none_logit = sess.run(
            [start_logits_tf, end_logits_tf, none_logit_tf], feed_dict=encoded)
        # beam, p_na = logits_to_probs(
        #     context_raw, context_toks, start_logits, end_logits, none_logit,
        #     beam_size=DEFAULT_BEAM_SIZE)
        beam, p_na = logits_to_probs(
            context_raw, context_toks, start_logits, end_logits, none_logit,
            beam_size=10)
        # print(beam[0][0])
        ans = beam[0][0]
        # start, end = beam[0][2], beam[0][3]
        non_empty_ans = [x[0] for x in beam if x[0]][0]
        qid = ex[0].question_id
        pred_obj[qid] = ans
        na_prob_obj[qid] = p_na
        pred_always_ans_obj[qid] = non_empty_ans
        analysis_obj[qid] = [{'answer': b[0], 'span': [b[2], b[3]], 'prob': b[1]} for b in beam]
        # print(analysis_obj[qid])

    with open(OPTS.output_file, 'w') as f:
        json.dump(pred_obj, f)
    if OPTS.na_prob_file:
        with open(OPTS.na_prob_file, 'w') as f:
            json.dump(na_prob_obj, f)
    if OPTS.always_answer_file:
        with open(OPTS.always_answer_file, 'w') as f:
            json.dump(pred_always_ans_obj, f)
    if OPTS.analysis_file:
        with open(OPTS.analysis_file, 'w') as f:
            json.dump(analysis_obj, f, indent=2)