Example #1
def main():
  print('Starting...')
  model_dir = ModelDir(OPTS.model)
  model = model_dir.get_model()
  if not isinstance(model, ParagraphQuestionModel):
    raise ValueError("This script is built to work for ParagraphQuestionModel models only")
  input_data, vocab = read_input_data(model)

  print('Loading word vectors...')
  model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), vocab)

  print('Starting Tensorflow session...')
  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  with sess.as_default():
    prediction = model.get_prediction()
    # Take 0-th here because we know we only truncate to one paragraph
    start_logits_tf = prediction.start_logits[0]
    end_logits_tf = prediction.end_logits[0]
    none_logit_tf = prediction.none_logit[0]
    context_rep_tf = model.context_rep[0]
    m1_tf = model.predictor.m1[0]
    m2_tf = model.predictor.m2[0]
  model_dir.restore_checkpoint(sess)

  with open(OPTS.output_file, 'w') as f:
    for doc_raw, q_raw, context, ex in tqdm(input_data):
      encoded = model.encode(ex, is_train=False)
      start_logits, end_logits, none_logit, context_rep, m1, m2 = sess.run(
          [start_logits_tf, end_logits_tf, none_logit_tf, context_rep_tf,
           m1_tf, m2_tf],
          feed_dict=encoded)
      beam, p_na = logits_to_probs(
          doc_raw, context[0], start_logits, end_logits, none_logit,
          beam_size=OPTS.beam_size)
      inputs = [context_rep, m1, m2]
      vec = np.concatenate([np.amax(x, axis=0) for x in inputs] +
                           [np.amin(x, axis=0) for x in inputs] +
                           [np.mean(x, axis=0) for x in inputs])
      #span_logits = np.add.outer(start_logits, end_logits)
      #all_logits = np.concatenate((np.array([none_logit]), span_logits.flatten()))
      #log_partition = scipy.special.logsumexp(all_logits)
      #vec = np.concatenate([
      #    np.amax(context_rep, axis=0),
      #    np.amin(context_rep, axis=0),
      #    np.mean(context_rep, axis=0),
      #    [np.amax(start_logits), scipy.special.logsumexp(start_logits),
      #     np.amax(end_logits), scipy.special.logsumexp(end_logits),
      #     none_logit, log_partition] 
      #])
      out_obj = {'paragraph': doc_raw, 'question': q_raw,
                 'beam': beam, 'p_na': p_na}
      if not OPTS.no_vec:
        out_obj['vec'] = vec.tolist()
      print(json.dumps(out_obj), file=f)
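# A quick self-contained check of the pooling used above: `vec` concatenates the
# per-dimension max, min, and mean over the token axis for each of the three
# representations, so its length is three times the summed hidden sizes. The
# shapes below are illustrative stand-ins, not taken from the original model.
import numpy as np
demo_inputs = [np.random.randn(50, 200), np.random.randn(50, 100), np.random.randn(50, 100)]
demo_vec = np.concatenate([np.amax(x, axis=0) for x in demo_inputs] +
                          [np.amin(x, axis=0) for x in demo_inputs] +
                          [np.mean(x, axis=0) for x in demo_inputs])
assert demo_vec.shape == (3 * (200 + 100 + 100),)  # max, min, mean blocks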
Example #2
    def getAnswer(self):
        #parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
        #parser.add_argument("model", help="Model directory")
        #parser.add_argument("question", help="Question to answer")
        #parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
        #args = parser.parse_args()

        #print("Preprocessing...")

        # Load the model
        model_dir = ModelDir(MODEL_DIR)
        model = model_dir.get_model()
        if not isinstance(model, ParagraphQuestionModel):
            raise ValueError(
                "This script is built to work for ParagraphQuestionModel models only"
            )

        conn = pyodbc.connect(DB_CONN)

        cursor = conn.cursor()
        #(23211,28690,33214,25638,25837,26454,28693,26137,31428,32087)
        query="select cast(filetext as varchar(max)) as filetext, name, type from dbo.UserworkspaceData where objectmasterid= "+\
               str(self.ObjectMasterId)+\
               " order by id asc"
        #query="select cast(filetext as varchar(max)) as filetext from kpl_tmp"
        documents = []
        document = ""
        name = ""
        filetype = 0
        for doc in cursor.execute(query):
            document = document + doc[0]
            name = doc[1]
            filetype = doc[2]
        #open("E:/kpl.txt","w+").write(document)
        documents.append(document)
        #documents.replace("\n\n","\n")
        #r.sub("",documents)
        #documents=" ".join(documents.split())
        #open("E:\kpl_test.txt","w+").write(document)
        #doc="D:\Document QnA\document-qa-master\Data\Drug_Delivery_Surveying_Global_Competitive_Landscape_BMI.txt"
        # =============================================================================
        #     if not isfile(doc):
        #         raise ValueError(doc + " does not exist")
        #     with open(doc, "r") as f:
        #         documents.append(f.read())
        # =============================================================================

        #print("Loaded %d documents" % len(documents))
        #temp=documents[0].split()
        # Split documents into lists of paragraphs
        #documents=[" ".join(temp[i:(i+400)]) for i in range(1,len(temp),400)]
        documents = [re.split(r"\s*\n\s*", doc) for doc in documents]
        # Tokenize the input; the model expects data tokenized using `NltkAndPunctTokenizer`
        # Note the model expects case-sensitive input
        tokenizer = NltkAndPunctTokenizer()
        question = tokenizer.tokenize_paragraph_flat(
            self.Question)  # List of words

        # Now list of document->paragraph->sentence->word
        documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                     for doc in documents]

        # Now group the document into paragraphs; this returns `ExtractedParagraph` objects
        # that additionally remember the start/end token of the paragraph within the source document
        splitter = MergeParagraphs(400)
        #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
        documents = [splitter.split(doc) for doc in documents]
        #print(str(len(documents))+" kpl") #kpl
        # Now select the top paragraphs using a `ParagraphFilter`
        if len(documents) == 1:
            # Use TF-IDF to select top paragraphs from the document
            selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
            context = selector.prune(question, documents[0])
        else:
            # Use a linear classifier to select top paragraphs among all the documents
            selector = ShallowOpenWebRanker(n_to_select=10)
            context = selector.prune(question, flatten_iterable(documents))

    #print("Select %d paragraph" % len(context))

        if model.preprocessor is not None:
            # Models are allowed to define an additional pre-processing step
            # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
            context = [
                model.preprocessor.encode_text(question, x) for x in context
            ]
        else:
            # Otherwise just use flattened text
            context = [flatten_iterable(x.text) for x in context]
        #x=open("E:\context.txt","a+")
        #[x.write(" ".join(cont)) for cont in context]
        #x.write("\n.......................................................\n")

        #print("Setting up model")
        # Tell the model the batch size (can be None) and vocab to expect. This will load the
        # needed word vectors and fix the batch size to use when building the graph / encoding the input
        voc = set(question)
        for txt in context:
            voc.update(txt)

        model.set_input_spec(self.nlp,
                             ParagraphAndQuestionSpec(batch_size=len(context)),
                             voc)
        # Now we build the actual tensorflow graph, `best_span` and `conf` are
        # tensors holding the predicted span (inclusive) and confidence scores for each
        # element in the input batch, confidence scores being the pre-softmax logit for the span
        #print("Build tf graph") #kpl
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
        # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
        with sess.as_default():
            # 8 means to limit the span to size 8 or less
            best_spans, conf = model.get_prediction().get_best_span(8)

        # Loads the saved weights
        model_dir.restore_checkpoint(sess)

        # Now the model is ready to run
        # The model takes input in the form of `ContextAndQuestion` objects, for example:
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]

        #print("Starting run")
        # The model is run in two steps: first it "encodes" a batch of paragraph/context pairs
        # into numpy arrays, then we use `sess` to run the actual model and get the predictions
        encoded = model.encode(
            data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
        best_spans, conf = sess.run(
            [best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

        best_para = np.argmax(
            conf
        )  # We get output for each paragraph, select the most-confident one to print

        #print("Best Paragraph: " + str(best_para))
        #print("Best span: " + str(best_spans[best_para]))
        #print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
        #print("Confidence: " + str(conf[best_para]))
        Answer = " ".join(context[best_para]
                          [best_spans[best_para][0]:best_spans[best_para][1] +
                           1])

        print("Confidence: " + str(conf[best_para]))
        print("Best Paragraph: " + str(best_para))
        print("Best span: " + str(best_spans[best_para]))
        print("Answer text: " + Answer)
        print(" ".join(context[best_para]))
        context[best_para][best_spans[best_para][
            0]] = r"<em>" + context[best_para][best_spans[best_para][0]]
        context[best_para][best_spans[best_para][1]] = context[best_para][
            best_spans[best_para][1]] + r"</em>"

        start = 0
        end = len(context[best_para])

        positions = [
            x for x, n in enumerate(context[best_para]
                                    [0:best_spans[best_para][0]]) if n == "."
        ]
        if len(positions) >= 2:
            start = positions[-2] + 1
        positions = [
            x
            for x, n in enumerate(context[best_para][best_spans[best_para][1] +
                                                     1:]) if n == "."
        ]
        if len(positions) > 1:
            end = best_spans[best_para][1] + 1 + positions[1]

        d = dict()
        if conf[best_para] > 10:
            d["answer"] = Answer
        else:
            d["answer"] = ""
        d["name"] = name
        d["filetype"] = filetype
        d["paragraph"] = re.sub(r' (?=\W)', '',
                                " ".join(context[best_para][start:end]))
        d["ObjectMasterId"] = self.ObjectMasterId

        return d
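# A self-contained sketch of the sentence-boundary trimming used above: the
# answer window is widened to nearby "." tokens before and after the span.
# Tokens and span below are toy values, not from the original.
tokens = ["A", ".", "B", "b", ".", "C", "answer", "span", "c", ".", "D", ".", "E"]
span = (6, 7)  # inclusive token indices of the answer
start, end = 0, len(tokens)
before = [i for i, t in enumerate(tokens[:span[0]]) if t == "."]
if len(before) >= 2:
    start = before[-2] + 1
after = [i for i, t in enumerate(tokens[span[1] + 1:]) if t == "."]
if len(after) > 1:
    end = span[1] + 1 + after[1]
print(" ".join(tokens[start:end]))  # -> B b . C answer span c . D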


#if __name__ == "__main__":
#    main()
Example #3
def main():
    parser = argparse.ArgumentParser(
        description="Run an ELMo model on user input")
    parser.add_argument("model", help="Model directory")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("context", help="Context to answer the question with")
    args = parser.parse_args()

    # Tokenize the input; the model expects data tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)
    context = tokenizer.tokenize_paragraph_flat(args.context)

    print("Loading model")
    model_dir = ModelDir(args.model)
    model = model_dir.get_model()
    if not isinstance(model, ElmoQaModel):
        raise ValueError(
            "This script is build to work for ElmoQaModel models only")

    # Important! This tells the language model not to use the pre-computed word vectors,
    # which are only applicable for the SQuAD dev/train sets.
    # Instead the language model will use its character-level CNN to compute
    # the word vectors dynamically.
    model.lm_model.embed_weights_file = None

    # Tell the model the batch size and vocab to expect, This will load the needed
    # word vectors and fix the batch size when building the graph / encoding the input
    print("Setting up model")
    voc = set(question)
    voc.update(context)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=1), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        # 17 means to limit the span to size 17 or less
        best_spans, conf = model.get_prediction().get_best_span(17)

    # Now restore the weights, this is a bit fiddly since we need to avoid restoring the
    # bilm weights, and instead load them from the pre-computed data
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars = [x for x in all_vars if x.name not in lm_var_names]
    model_dir.restore_checkpoint(sess, vars)

    # Run the initializer of the lm weights, which will load them from the lm directory
    sess.run(
        tf.variables_initializer(
            [x for x in all_vars if x.name in lm_var_names]))

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(context, question, None, "user-question1")]

    print("Starting run")
    # The model is run in two steps: first it "encodes" the paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model and get the predictions
    encoded = model.encode(
        data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf],
                                feed_dict=encoded)  # feed_dict -> predictions
    print("Best span: " + str(best_spans[0]))
    print("Answer text: " +
          " ".join(context[best_spans[0][0]:best_spans[0][1] + 1]))
    print("Confidence: " + str(conf[0]))
Example #4
def convert_saved_graph(model_dir, output_dir):
    print("Load model")
    md = ModelDir(model_dir)
    model = md.get_model()

    # remove the lm models word embeddings - cpu model will use Char-CNN
    model.lm_model.embed_weights_file = None
    dim = model.embed_mapper.layers[1].n_units

    print("Setting up cudnn version")
    sess = tf.Session()
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))
        print("Buiding graph")
        pred = model.get_prediction()

    test_questions = get_test_questions()

    print("Load vars:")
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars = [x for x in all_vars if x.name not in lm_var_names]
    md.restore_checkpoint(sess, vars)
    sess.run(
        tf.variables_initializer(
            [x for x in all_vars if x.name in lm_var_names]))

    feed = model.encode([test_questions], False)
    cudnn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("Done, copying files...")
    if not exists(output_dir):
        mkdir(output_dir)
    for file in listdir(model_dir):
        if isfile(join(model_dir, file)) and file != "model.npy":
            copyfile(join(model_dir, file), join(output_dir, file))

    print("Done, mapping tensors...")
    to_save, to_init = [], []
    for x in tf.trainable_variables():
        if x.name.endswith("/gru_parameters:0"):
            key = x.name[:-len("/gru_parameters:0")]
            indim, outdim = get_dims(x, dim)
            c = cudnn_rnn_ops.CudnnGRUSaveable(x, 1, outdim, indim, scope=key)
            for spec in c.specs:
                if spec.name.endswith("bias_cudnn 0") or \
                        spec.name.endswith("bias_cudnn 1"):
                    print('Unsupported spec: ' + spec.name)
                    continue
                if 'forward' in spec.name:
                    new_name = spec.name.replace(
                        'forward/rnn/multi_rnn_cell/cell_0/',
                        'bidirectional_rnn/fw/')
                else:
                    new_name = spec.name.replace(
                        'backward/rnn/multi_rnn_cell/cell_0/',
                        'bidirectional_rnn/bw/')
                v = tf.Variable(sess.run(spec.tensor), name=new_name)
                to_init.append(v)
                to_save.append(v)
        else:
            to_save.append(x)

    save_dir = join(output_dir, "save")
    if not exists(save_dir):
        mkdir(save_dir)

    # save:
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    vars_to_save = [x for x in all_vars if not x.name.startswith("bilm")]
    sess.run(tf.variables_initializer(to_init))
    saver = tf.train.Saver(vars_to_save)
    saver.save(
        sess,
        join(save_dir, 'checkpoint'),
        global_step=123456789,
        write_meta_graph=False,
    )

    sess.close()
    tf.reset_default_graph()
    return cudnn_out
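# A self-contained sketch of the cuDNN -> CPU variable renaming applied above;
# the example name is illustrative, not an actual checkpoint tensor.
def rename_spec(spec_name):
    if 'forward' in spec_name:
        return spec_name.replace('forward/rnn/multi_rnn_cell/cell_0/',
                                 'bidirectional_rnn/fw/')
    return spec_name.replace('backward/rnn/multi_rnn_cell/cell_0/',
                             'bidirectional_rnn/bw/')

print(rename_spec('map_embed/forward/rnn/multi_rnn_cell/cell_0/gru_cell/gates/kernel'))
# -> map_embed/bidirectional_rnn/fw/gru_cell/gates/kernel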
Example #5
def main():
    parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    # parser.add_argument("model", type=int, help="Model directory")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    args = parser.parse_args()

    # Models path
    SQUAD_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad'
    SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad-shared-norm'
    TRIVIAQA_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-unfiltered-shared-norm'
    TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-web-shared-norm'
    
    models_directory = [
        SQUAD_MODEL_DIRECTORY_PATH,
        SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH,
        TRIVIAQA_MODEL_DIRECTORY_PATH,
        TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH
    ]

    print("Preprocessing...")

    # Load the model
    # model_dir = ModelDir(args.model)
    model_dir = ModelDir(models_directory[0])
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Read the documents
    documents = []
    for doc in args.documents:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input; the model expects data tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Now group the document into paragraphs; this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]
        
    print("Setting up model")
    
    # Tell the model the batch size (can be None) and vocab to expect. This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 10 means to limit the span to size 10 or less
        best_spans, conf = model.get_prediction().get_best_span(10)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d"%i)
            for i, x in enumerate(context)]

    print("Starting run")
    # The model is run in two steps: first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model and get the predictions
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(conf)  # We get output for each paragraph, select the most-confident one to print
    print("Best Paragraph: " + str(best_para))
    para_id = int(best_para)
    # print("Best Paragraph: \n" + (" ".join((paras[para_id].text)[0])))
    print("Best Paragraph: \n" + " ".join(context[para_id]))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
    print("Confidence: " + str(conf[best_para]))
Example #6
def predict():
    json_data = {"success": False, "predictions": []}
    print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(
        "/home/antriv/conversation_ai/Transfer_Learning/ALLENAI_DocumentQA/document-qa/pretrained_models/models/triviaqa-unfiltered-shared-norm"
    )
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Load the question
    question = (flask.request.data).decode("utf-8")

    # Read the documents
    documents = []
    doclist = ["/home/antriv/data/The-Future-Computed.txt"]
    for doc in doclist:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input; the model expects data tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Now group the document into paragraphs; this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=1000)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=1000)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect. This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    print("Starting run")
    # The model is run in two steps: first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model and get the predictions
    encoded = model.encode(
        data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf],
                                feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(
        conf
    )  # We get output for each paragraph, select the most-confident one to print
    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " +
          " ".join(context[best_para]
                   [best_spans[best_para][0]:best_spans[best_para][1] + 1]))
    print("Confidence: " + str(conf[best_para]))
    y_output = " ".join(
        context[best_para][best_spans[best_para][0]:best_spans[best_para][1] +
                           1])
    print(y_output)
    json_data["predictions"].append(str(y_output))

    # Indicate that the request was a success
    json_data["success"] = True
    # Return the data dictionary as a JSON response
    return flask.jsonify(json_data)
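# A minimal client sketch for exercising this handler. The /predict route and
# port are assumptions (the route binding is not shown above); the handler
# reads the raw request body as the UTF-8 question.
import requests
resp = requests.post("http://localhost:5000/predict",
                     data="Who wrote The Future Computed?".encode("utf-8"))
print(resp.json())  # e.g. {"success": true, "predictions": ["..."]}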
Example #7
def convert(model_dir, output_dir, best_weights=False):
    print("Load model")
    md = ModelDir(model_dir)
    model = md.get_model()
    dim = model.embed_mapper.layers[1].n_units
    global_step = tf.get_variable('global_step',
                                  shape=[],
                                  dtype='int32',
                                  initializer=tf.constant_initializer(0),
                                  trainable=False)

    print("Setting up cudnn version")
    #global_step = tf.get_variable('global_step', shape=[], dtype='int32', trainable=False)
    sess = tf.Session()
    sess.run(global_step.assign(0))
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))

        print("Buiding graph")
        pred = model.get_prediction()

    test_questions = ParagraphAndQuestion(
        ["Harry", "Potter", "was", "written", "by", "JK"],
        ["Who", "wrote", "Harry", "Potter", "?"], None, "test_questions")

    print("Load vars")
    md.restore_checkpoint(sess)
    print("Restore finished")

    feed = model.encode([test_questions], False)
    cudnn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("Done, copying files...")
    if not exists(output_dir):
        mkdir(output_dir)
    for file in listdir(model_dir):
        if isfile(join(model_dir, file)) and file != "model.npy":
            copyfile(join(model_dir, file), join(output_dir, file))

    print("Done, mapping tensors...")
    to_save = []
    to_init = []
    for x in tf.trainable_variables():
        if x.name.endswith("/gru_parameters:0"):
            key = x.name[:-len("/gru_parameters:0")]
            fw_params = x
            if "map_embed" in x.name:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, 400)
            elif "chained-out" in x.name:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 4)
            else:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 2)
            params_saveable = cudnn_rnn_ops.RNNParamsSaveable(
                c, c.params_to_canonical, c.canonical_to_params, [fw_params],
                key)

            for spec in params_saveable.specs:
                if spec.name.endswith("bias_cudnn 0") or \
                        spec.name.endswith("bias_cudnn 1"):
                    # ??? What do these even do?
                    continue
                name = spec.name.split("/")
                name.remove("cell_0")
                if "forward" in name:
                    ix = name.index("forward")
                    name.insert(ix + 2, "fw")
                else:
                    ix = name.index("backward")
                    name.insert(ix + 2, "bw")
                del name[ix]

                ix = name.index("multi_rnn_cell")
                name[ix] = "bidirectional_rnn"
                name = "/".join(name)
                v = tf.Variable(sess.run(spec.tensor), name=name)
                to_init.append(v)
                to_save.append(v)

        else:
            to_save.append(x)

    other = [
        x for x in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if x not in tf.trainable_variables()
    ]
    print(other)
    sess.run(tf.variables_initializer(to_init))
    saver = tf.train.Saver(to_save + other)
    save_dir = join(output_dir, "save")
    if not exists(save_dir):
        mkdir(save_dir)

    saver.save(sess, join(save_dir, "checkpoint"), sess.run(global_step))

    sess.close()
    tf.reset_default_graph()

    print("Updating model...")
    model.embed_mapper.layers = [
        model.embed_mapper.layers[0],
        BiRecurrentMapper(CompatGruCellSpec(dim))
    ]
    model.match_encoder.layers = list(model.match_encoder.layers)
    other = model.match_encoder.layers[1].other
    other.layers = list(other.layers)
    other.layers[1] = BiRecurrentMapper(CompatGruCellSpec(dim))

    pred = model.predictor.predictor
    pred.first_layer = BiRecurrentMapper(CompatGruCellSpec(dim))
    pred.second_layer = BiRecurrentMapper(CompatGruCellSpec(dim))

    with open(join(output_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)

    print("Testing...")
    with open(join(output_dir, "model.pkl"), "rb") as f:
        model = pickle.load(f)

    sess = tf.Session()

    model.set_input_spec(
        ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
        ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))
    pred = model.get_prediction()

    print("Rebuilding")
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(save_dir))

    feed = model.encode([test_questions], False)
    cpu_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("These should be close:")
    print([np.allclose(a, b) for a, b in zip(cpu_out, cudnn_out)])
    print(cpu_out)
    print(cudnn_out)
Example #8
def main():
    print('Starting...')
    model_dir = ModelDir(OPTS.model)
    model = model_dir.get_model()
    tokenizer = NltkAndPunctTokenizer()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )
    if OPTS.reload_vocab:
        loader = ResourceLoader()
    else:
        loader = CachingResourceLoader()
    print('Loading word vectors...')
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None),
                         set([',']),
                         word_vec_loader=loader,
                         allow_update=True)
    print('Starting Tensorflow session...')
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        prediction = model.get_prediction()
        # Take 0-th here because we know we only truncate to one paragraph
        start_logits_tf = prediction.start_logits[0]
        end_logits_tf = prediction.end_logits[0]
        none_logit_tf = prediction.none_logit[0]
        #best_spans_tf, conf_tf = prediction.get_best_span(MAX_SPAN_LENGTH)
    model_dir.restore_checkpoint(sess)
    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
    app = bottle.Bottle()

    @app.route('/')
    def index():
        return bottle.template('index')

    @app.route('/post_query', method='post')
    def post_query():
        document_raw = bottle.request.forms.getunicode('document').strip()
        question_raw = bottle.request.forms.getunicode('question').strip()
        document = re.split(r"\s*\n\s*", document_raw)
        question = tokenizer.tokenize_paragraph_flat(question_raw)
        doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
        split_doc = splitter.split(doc_toks)
        context = selector.prune(question, split_doc)
        if model.preprocessor is not None:
            context = [
                model.preprocessor.encode_text(question, x) for x in context
            ]
        else:
            context = [flatten_iterable(x.text) for x in context]
        vocab = set(question)
        for txt in context:
            vocab.update(txt)
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]
        model.word_embed.update(loader, vocab)
        encoded = model.encode(data, is_train=False)
        start_logits, end_logits, none_logit = sess.run(
            [start_logits_tf, end_logits_tf, none_logit_tf], feed_dict=encoded)
        beam, p_na = logits_to_probs(document_raw,
                                     context[0],
                                     start_logits,
                                     end_logits,
                                     none_logit,
                                     beam_size=BEAM_SIZE)
        return bottle.template('results',
                               document=document_raw,
                               question=question_raw,
                               beam=beam,
                               p_na=p_na)

    cur_dir = os.path.abspath(os.path.dirname(__file__))
    bottle.TEMPLATE_PATH.insert(0, os.path.join(cur_dir, 'views'))
    bottle.run(app, host=OPTS.hostname, port=OPTS.port, debug=OPTS.debug)
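# A minimal client sketch for the /post_query endpoint above; the handler reads
# the 'document' and 'question' form fields. Host and port are assumptions
# (they come from OPTS at runtime).
import requests
resp = requests.post("http://localhost:8080/post_query",
                     data={"document": "Some passage of text to search.",
                           "question": "What is this passage about?"})
print(resp.text)  # rendered 'results' template with the answer beam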
Example #9
def main():
    parser = argparse.ArgumentParser(
        description="Run an ELMo model on user input")
    parser.add_argument("model", help="Model directory")
    parser.add_argument("ja_filepath", help="File path to japanese questions")
    parser.add_argument("result_file",
                        help="File path to predicted result json")
    args = parser.parse_args()
    print(args)

    print("Preprocessing...")

    paragraphs, questions = read_squad_style_database(args.ja_filepath)
    # Load the model
    model_dir = ModelDir(args.model)
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    predictions = {}
    predictions["conf"] = {}
    for qa in questions:
        print(qa["id"])

        title = qa["title"]
        para_idx = qa["para_idx"]

        context = paragraphs[title][para_idx]
        question = qa["question"]

        print(context)
        print(question)

        if model.preprocessor is not None:
            context = [
                model.preprocessor.encode_text(question, x) for x in context
            ]

        print("Setting up model")

        voc = set(question)
        for txt in context:
            voc.update(txt)
        model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)),
                             voc)

        print("Build tf graph")
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        with sess.as_default():
            best_spans, conf = model.get_prediction().get_best_span(8)

        # Loads the saved weights
        model_dir.restore_checkpoint(sess)

        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]

        print("Starting run")

        encoded = model.encode(
            data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
        best_spans, conf = sess.run(
            [best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions
        print(best_spans)
        predictions[qa["id"]] = best_spans
        predictions["conf"][qa["id"]] = conf
        print(predictions)

    with open(args.result_file, "w") as result_f:
        json.dump(predictions, result_f)
    official_evaluator = OfficialEvaluator(args.ja_filepath, args.result_file)
    evaluation = official_evaluator.evaluate()
    print(evaluation)
Example #10
# The start of this example is missing. The lines below reconstruct the assumed
# context: a model loaded as in the other examples and a vocabulary file read
# into `vocab`. MODEL_DIR and VOCAB_FILE are placeholder names, not from the
# original snippet.
model_dir = ModelDir(MODEL_DIR)
model = model_dir.get_model()
vocab = set()
with open(VOCAB_FILE) as f:
    for line in f:
        word = line.strip()
        if len(word) > 0:
            vocab.add(word)

print('done')

print('Loading Model...', end='')

# init tf session and weights
model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), vocab)

sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

with sess.as_default():
    best_spans, conf = model.get_prediction().get_best_span(16)
    model_dir.restore_checkpoint(sess)

print('done')


# use docs: list<str> to find ans: str to question: str
def find_answer(documents, raw_question):

    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]

    global best_spans, conf

    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()
Example #11
def main():
  print('Starting...')
  model_dir = ModelDir(OPTS.model)
  model = model_dir.get_model()
  if OPTS.elmo:
    # Fix absolute path names from other codalab runs
    lm = model.lm_model
    if lm.lm_vocab_file.startswith('/0x'):
      lm.lm_vocab_file = os.sep.join(lm.lm_vocab_file.split(os.sep)[2:])
    if lm.options_file.startswith('/0x'):
      lm.options_file = os.sep.join(lm.options_file.split(os.sep)[2:])
    if lm.weight_file.startswith('/0x'):
      lm.weight_file = os.sep.join(lm.weight_file.split(os.sep)[2:])
    if lm.embed_weights_file.startswith('/0x'):
      lm.embed_weights_file = os.sep.join(lm.embed_weights_file.split(os.sep)[2:])
    lm.embed_weights_file = None

  #if not isinstance(model, ParagraphQuestionModel):
  #  raise ValueError("This script is built to work for ParagraphQuestionModel models only")
  input_data, vocab = read_input_data(model)

  print('Loading word vectors...')
  model.set_input_spec(ParagraphAndQuestionSpec(batch_size=None), vocab)

  print('Starting Tensorflow session...')
  config = tf.ConfigProto(allow_soft_placement=True)
  config.gpu_options.allow_growth = True
  sess = tf.Session(config=config)
  with sess.as_default():
    prediction = model.get_prediction()
    # Take 0-th here because we know we only truncate to one paragraph
    start_logits_tf = prediction.start_logits[0]
    end_logits_tf = prediction.end_logits[0]
    none_logit_tf = prediction.none_logit[0]
  if OPTS.elmo:
    # See elmo/run_on_user_text.py
    all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars = [x for x in all_vars if x.name not in lm_var_names]
    model_dir.restore_checkpoint(sess, vars)
    sess.run(tf.variables_initializer([x for x in all_vars if x.name in lm_var_names]))
  else:
    model_dir.restore_checkpoint(sess)

  pred_obj = {}
  na_prob_obj = {}
  pred_always_ans_obj = {}
  analysis_obj = {}

  for context_raw, context_toks, ex in tqdm(input_data):
    encoded = model.encode(ex, is_train=False)
    start_logits, end_logits, none_logit = sess.run(
        [start_logits_tf, end_logits_tf, none_logit_tf],
        feed_dict=encoded)
    # beam, p_na = logits_to_probs(
    #     context_raw, context_toks, start_logits, end_logits, none_logit,
    #     beam_size=DEFAULT_BEAM_SIZE)
    beam, p_na = logits_to_probs(
        context_raw, context_toks, start_logits, end_logits, none_logit,
        beam_size=10)

    # print(beam[0][0])

    ans = beam[0][0]
    # start, end = beam[0][2],beam[0][3]
    non_empty_ans = next((x[0] for x in beam if x[0]), '')  # first non-empty answer; '' if the beam has none
    qid = ex[0].question_id

    pred_obj[qid] = ans
    na_prob_obj[qid] = p_na
    pred_always_ans_obj[qid] = non_empty_ans
    analysis_obj[qid] = [{'answer': b[0], 'span': [b[2], b[3]], 'prob': b[1]} for b in beam]
    # print(analysis_obj[qid])

  with open(OPTS.output_file, 'w') as f:
    json.dump(pred_obj, f)
  if OPTS.na_prob_file:
    with open(OPTS.na_prob_file, 'w') as f:
      json.dump(na_prob_obj, f)
  if OPTS.always_answer_file:
    with open(OPTS.always_answer_file, 'w') as f:
      json.dump(pred_always_ans_obj, f)
  if OPTS.analysis_file:
    with open(OPTS.analysis_file, 'w') as f:
      json.dump(analysis_obj, f, indent=2)
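# The analysis file written above maps each question id to its beam, one
# {'answer', 'span', 'prob'} record per candidate. A short sketch of reading it
# back ('analysis.json' stands in for whatever OPTS.analysis_file was):
import json
with open('analysis.json') as f:
    analysis = json.load(f)
for qid, beam in list(analysis.items())[:3]:
    best = beam[0]
    print(qid, best['answer'], best['span'], best['prob'])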