Python NltkAndPunctTokenizer.tokenize_paragraph_flatの例

プログラミング言語: Python

名前空間/パッケージ名: docqa.data_processing.text_utils

メソッド/関数: tokenize_paragraph_flat

hotexamples.comのコード掲載数: 10

Python NltkAndPunctTokenizer.tokenize_paragraph_flat - 10件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのdocqa.data_processing.text_utils.NltkAndPunctTokenizer.tokenize_paragraph_flatの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示非表示

NltkAndPunctTokenizer(30)

tokenize_paragraph_flat(10)

tokenize_paragraph(6)

tokenize_with_inverse(4)

convert_to_spans(2)

tokenize_sentence(2)

コード例 #1

ファイルを表示

def read_input_data(model):
  data = []
  vocab = set()
  tokenizer = NltkAndPunctTokenizer()
  splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
  selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
  with open(OPTS.input_file) as f:
    for i, line in enumerate(f):
      try:
        document_raw, question_raw = line.strip().split('\t')
      except ValueError as e:
        print(line.strip())
        print('Error at line %d' % i)
        raise e
      document = re.split("\s*\n\s*", document_raw)
      question = tokenizer.tokenize_paragraph_flat(question_raw)
      doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
      split_doc = splitter.split(doc_toks)
      context = selector.prune(question, split_doc)
      if model.preprocessor is not None:
        context = [model.preprocessor.encode_text(question, x) for x in context]
      else:
        context = [flatten_iterable(x.text) for x in context]
      vocab.update(question)
      for txt in context:
        vocab.update(txt)
      ex = [ParagraphAndQuestion(x, question, None, "user-question%d"%i)
            for i, x in enumerate(context)]
      data.append((document_raw, question_raw, context, ex))
  return data, vocab

コード例 #2

ファイルを表示

ファイル: test_machine_reader.py プロジェクト: trunghlt/cape-document-qa

class RandomMachineReaderModel(CapeMachineReaderModelInterface):
    def __init__(self, _):
        self.tokenizer = NltkAndPunctTokenizer()

    def tokenize(self, text):
        tokens = self.tokenizer.tokenize_paragraph_flat(text)
        spans = self.tokenizer.convert_to_spans(text, [tokens])[0]
        return tokens, spans

    def get_document_embedding(self, text):
        np.random.seed(
            int(hashlib.sha1(text.encode()).hexdigest(), 16) % 10**8)
        document_tokens, _ = self.tokenize(text)
        return np.random.random((len(document_tokens), 240))

    def get_logits(self, question, document_embedding):
        question_tokens, _ = self.tokenize(question)
        n_words = document_embedding.shape[0]
        qseed = int(hashlib.sha1(question.encode()).hexdigest(), 16) % 10**8
        dseed = int(np.sum(document_embedding) * 10**6) % 10**8
        np.random.seed(dseed + qseed)
        start_logits = np.random.random(n_words)
        off = np.random.randint(1, 5)
        end_logits = np.concatenate(
            [np.zeros(off) + np.min(start_logits), start_logits[off:]])
        return start_logits[:n_words], end_logits[:n_words]

コード例 #3

ファイルを表示

ファイル: qa_util.py プロジェクト: TxConvergentAdmin/bt-f18-nlp--utext

def find_answer(documents, raw_question):

    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]

    global best_spans, conf

    documents = [re.split("\s*\n\s*", doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()

    question = tokenizer.tokenize_paragraph_flat(raw_question)

    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    splitter = MergeParagraphs(400)

    documents = [splitter.split(doc) for doc in documents]

    if len(documents) == 1:
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    context = [flatten_iterable(x.text) for x in context]

    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    encoded = model.encode(data, is_train=False)

    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)

    best_para = np.argmax(confid)
    ans = " ".join(context[best_para][spans[best_para][0]:spans[best_para][1] +
                                      1])
    confidence = confid[best_para]

    return ans, confidence

コード例 #4

ファイルを表示

ファイル: DocumentQA.py プロジェクト: kapil1201/Machine-Learning-and-Python-Work

    def getAnswer(self):
        #parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
        #parser.add_argument("model", help="Model directory")
        #parser.add_argument("question", help="Question to answer")
        #parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
        #args = parser.parse_args()

        #print("Preprocessing...")

        # Load the model
        model_dir = ModelDir(MODEL_DIR)
        model = model_dir.get_model()
        if not isinstance(model, ParagraphQuestionModel):
            raise ValueError(
                "This script is built to work for ParagraphQuestionModel models only"
            )

        conn = pyodbc.connect(DB_CONN)

        cursor = conn.cursor()
        #(23211,28690,33214,25638,25837,26454,28693,26137,31428,32087)
        query="select cast(filetext as varchar(max)) as filetext, name, type from dbo.UserworkspaceData where objectmasterid= "+\
               str(self.ObjectMasterId)+\
               " order by id asc"
        #query="select cast(filetext as varchar(max)) as filetext from kpl_tmp"
        documents = []
        document = ""
        name = ""
        filetype = 0
        for doc in cursor.execute(query):
            document = document + doc[0]
            name = doc[1]
            filetype = doc[2]
        #open("E:/kpl.txt","w+").write(document)
        documents.append(document)
        #documents.replace("\n\n","\n")
        #r.sub("",documents)
        #documents=" ".join(documents.split())
        #open("E:\kpl_test.txt","w+").write(document)
        #doc="D:\Document QnA\document-qa-master\Data\Drug_Delivery_Surveying_Global_Competitive_Landscape_BMI.txt"
        # =============================================================================
        #     if not isfile(doc):
        #         raise ValueError(doc + " does not exist")
        #     with open(doc, "r") as f:
        #         documents.append(f.read())
        # =============================================================================

        #print("Loaded %d documents" % len(documents))
        #temp=documents[0].split()
        # Split documents into lists of paragraphs
        #documents=[" ".join(temp[i:(i+400)]) for i in range(1,len(temp),400)]
        documents = [re.split("\s*\n\s*", doc) for doc in documents]
        # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
        # Note the model expects case-sensitive input
        tokenizer = NltkAndPunctTokenizer()
        question = tokenizer.tokenize_paragraph_flat(
            self.Question)  # List of words

        # Now list of document->paragraph->sentence->word
        documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                     for doc in documents]

        # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
        # that additionally remember the start/end token of the paragraph within the source document
        splitter = MergeParagraphs(400)
        #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
        documents = [splitter.split(doc) for doc in documents]
        #print(str(len(documents))+" kpl") #kpl
        # Now select the top paragraphs using a `ParagraphFilter`
        if len(documents) == 1:
            # Use TF-IDF to select top paragraphs from the document
            selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
            context = selector.prune(question, documents[0])
        else:
            # Use a linear classifier to select top paragraphs among all the documents
            selector = ShallowOpenWebRanker(n_to_select=10)
            context = selector.prune(question, flatten_iterable(documents))

    #print("Select %d paragraph" % len(context))

        if model.preprocessor is not None:
            # Models are allowed to define an additional pre-processing step
            # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
            context = [
                model.preprocessor.encode_text(question, x) for x in context
            ]
        else:
            # Otherwise just use flattened text
            context = [flatten_iterable(x.text) for x in context]
        #x=open("E:\context.txt","a+")
        #[x.write(" ".join(cont)) for cont in context]
        #x.write("\n.......................................................\n")

        #print("Setting up model")
        # Tell the model the batch size (can be None) and vocab to expect, This will load the
        # needed word vectors and fix the batch size to use when building the graph / encoding the input
        voc = set(question)
        for txt in context:
            voc.update(txt)

        model.set_input_spec(self.nlp,
                             ParagraphAndQuestionSpec(batch_size=len(context)),
                             voc)
        # Now we build the actual tensorflow graph, `best_span` and `conf` are
        # tensors holding the predicted span (inclusive) and confidence scores for each
        # element in the input batch, confidence scores being the pre-softmax logit for the span
        #print("Build tf graph") #kpl
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        # We need to use sess.as_default when working with the cuNND stuff, since we need an active
        # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
        with sess.as_default():
            # 8 means to limit the span to size 8 or less
            best_spans, conf = model.get_prediction().get_best_span(8)

    # Loads the saved weights
        model_dir.restore_checkpoint(sess)

        # Now the model is ready to run
        # The model takes input in the form of `ContextAndQuestion` objects, for example:
        data = [
            ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)
        ]

        #print("Starting run")
        # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
        # into numpy arrays, then we use `sess` to run the actual model get the predictions
        encoded = model.encode(
            data, is_train=True)  # batch of `ContextAndQuestion` -> feed_dict
        best_spans, conf = sess.run(
            [best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

        best_para = np.argmax(
            conf
        )  # We get output for each paragraph, select the most-confident one to print

        #print("Best Paragraph: " + str(best_para))
        #print("Best span: " + str(best_spans[best_para]))
        #print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
        #print("Confidence: " + str(conf[best_para]))
        Answer = " ".join(context[best_para]
                          [best_spans[best_para][0]:best_spans[best_para][1] +
                           1])

        print("Confidence: " + str(conf[best_para]))
        print("Best Paragraph: " + str(best_para))
        print("Best span: " + str(best_spans[best_para]))
        print("Answer text: " + Answer)
        print(" ".join(context[best_para]))
        context[best_para][best_spans[best_para][
            0]] = r"<em>" + context[best_para][best_spans[best_para][0]]
        context[best_para][best_spans[best_para][1]] = context[best_para][
            best_spans[best_para][1]] + r"</em>"

        start = 0
        end = len(context[best_para])

        positions = [
            x for x, n in enumerate(context[best_para]
                                    [0:best_spans[best_para][0]]) if n == "."
        ]
        if len(positions) >= 2: start = positions[len(positions) - 2] + 1
        positions = [
            x
            for x, n in enumerate(context[best_para][best_spans[best_para][1] +
                                                     1:]) if n == "."
        ]
        if len(positions) > 1:
            end = best_spans[best_para][1] + 1 + positions[1]

        d = dict()
        if conf[best_para] > 10:
            d["answer"] = Answer
        else:
            d["answer"] = ""
        d["name"] = name
        d["filetype"] = filetype
        d["paragraph"] = re.sub(r' (?=\W)', '',
                                " ".join(context[best_para][start:end]))
        d["ObjectMasterId"] = self.ObjectMasterId

        return d


#if __name__ == "__main__":
#    main()

コード例 #5

ファイルを表示

ファイル: run_on_user_text.py プロジェクト: kapil1201/Machine-Learning-and-Python-Work

def main():
    parser = argparse.ArgumentParser(
        description="Run an ELMo model on user input")
    parser.add_argument("model", help="Model directory")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("context", help="Context to answer the question with")
    args = parser.parse_args()

    # Tokenize the input, the models expected data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)
    context = tokenizer.tokenize_paragraph_flat(args.context)

    print("Loading model")
    model_dir = ModelDir(args.model)
    model = model_dir.get_model()
    if not isinstance(model, ElmoQaModel):
        raise ValueError(
            "This script is build to work for ElmoQaModel models only")

    # Important! This tells the language model not to use the pre-computed word vectors,
    # which are only applicable for the SQuAD dev/train sets.
    # Instead the language model will use its character-level CNN to compute
    # the word vectors dynamically.
    model.lm_model.embed_weights_file = None

    # Tell the model the batch size and vocab to expect, This will load the needed
    # word vectors and fix the batch size when building the graph / encoding the input
    print("Setting up model")
    voc = set(question)
    voc.update(context)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=1), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    with sess.as_default():
        # 17 means to limit the span to size 17 or less
        best_spans, conf = model.get_prediction().get_best_span(17)

    # Now restore the weights, this is a bit fiddly since we need to avoid restoring the
    # bilm weights, and instead load them from the pre-computed data
    all_vars = tf.global_variables() + tf.get_collection(
        tf.GraphKeys.SAVEABLE_OBJECTS)
    lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    vars = [x for x in all_vars if x.name not in lm_var_names]
    model_dir.restore_checkpoint(sess, vars)

    # Run the initializer of the lm weights, which will load them from the lm directory
    sess.run(
        tf.variables_initializer(
            [x for x in all_vars if x.name in lm_var_names]))

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(context, question, None, "user-question1")]

    print("Starting run")
    # The model is run in two steps, first it "encodes" the paragraph/context pairs
    # into numpy arrays, then to use `sess` to run the actual model get the predictions
    encoded = model.encode(
        data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf],
                                feed_dict=encoded)  # feed_dict -> predictions
    print("Best span: " + str(best_spans[0]))
    print("Answer text: " +
          " ".join(context[best_spans[0][0]:best_spans[0][1] + 1]))
    print("Confidence: " + str(conf[0]))

コード例 #6

ファイルを表示

ファイル: cape_docqa_machine_reader.py プロジェクト: trunghlt/cape-document-qa

class CapeDocQAMachineReaderModel(CapeMachineReaderModelInterface):
    def __init__(self, machine_reader_config):
        self.tokenizer = NltkAndPunctTokenizer()
        self.config = machine_reader_config
        self.model = self._load_model()
        self.sess = tf.Session()
        self.start_logits, self.end_logits, self.context_rep = self._build_model(
        )
        self._initialize()

    def _load_model(self):
        with open(self.config.model_pickle_file, 'rb') as f:
            model = pickle.load(f)

        model.lm_model.weight_file = self.config.lm_weights_file
        model.lm_model.lm_vocab_file = self.config.vocab_file
        model.lm_model.embed_weights_file = self.config.lm_token_weights_file
        model.lm_model.options_file = self.config.lm_options_file
        return model

    def _build_model(self):
        vocab_to_init_with = {
            line.strip()
            for line in open(self.config.vocab_file, encoding="utf-8")
            if line.strip() not in vocab_to_ignore
        }
        self.model.word_embed.vec_name = self.config.word_vector_file
        with self.sess.as_default():
            self.model.set_input_spec(
                ParagraphAndQuestionSpec(None, None, None, 14),
                vocab_to_init_with,
                word_vec_loader=ResourceLoader(
                    load_vec_fn=lambda x, y: load_word_vectors(
                        x, y, is_path=True)))
            pred = self.model.get_production_predictions_for(
                {x: x
                 for x in self.model.get_placeholders()})
        return pred.start_logits, pred.end_logits, self.model.context_rep

    def _initialize(self):
        all_vars = tf.global_variables() + tf.get_collection(
            tf.GraphKeys.SAVEABLE_OBJECTS)
        lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]
        saver = tf.train.Saver(vars_to_restore)
        saver.restore(self.sess, self.config.checkpoint_file)
        self.sess.run(
            tf.variables_initializer(
                [x for x in all_vars if x.name in lm_var_names]))

    def tokenize(self, text):
        tokens = self.tokenizer.tokenize_paragraph_flat(text)
        spans = self.tokenizer.convert_to_spans(text, [tokens])[0]
        return tokens, spans

    def get_document_embedding(self, text):
        document_tokens, _ = self.tokenize(text)
        test_question = ParagraphAndQuestion(document_tokens,
                                             ['dummy', 'question'], None,
                                             "cape_question", 'cape_document')
        feed = self.model.encode([test_question], False, cached_doc=None)
        return self.sess.run(self.model.context_rep, feed_dict=feed)[0]

    def get_logits(self, question, document_embedding):
        question_tokens, _ = self.tokenize(question)
        n_words = document_embedding.shape[0]
        dummy_document = ['dummy'] * n_words
        test_question = ParagraphAndQuestion(dummy_document, question_tokens,
                                             None, "cape_question",
                                             'cape_document')
        feed = self.model.encode(
            [test_question],
            False,
            cached_doc=document_embedding[np.newaxis, :, :])
        start_logits, end_logits = self.sess.run(
            [self.start_logits, self.end_logits], feed_dict=feed)
        return start_logits[0][:n_words], end_logits[0][:n_words]

コード例 #7

ファイルを表示

ファイル: run_on_user_documents.py プロジェクト: vinaymundada27/document-qa

def main():
    parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    # parser.add_argument("model", type=int, help="Model directory")
    parser.add_argument("question", help="Question to answer")
    parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    args = parser.parse_args()

    # Models path
    SQUAD_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad'
    SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/squad-shared-norm'
    TRIVIAQA_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-unfiltered-shared-norm'
    TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH = 'docqa/models-cpu/triviaqa-web-shared-norm'
    
    models_directory = [
        SQUAD_MODEL_DIRECTORY_PATH,
        SQUAD_SHARED_NORM_MODEL_DIRECTORY_PATH,
        TRIVIAQA_MODEL_DIRECTORY_PATH,
        TRIVIAQA_SHARED_NORM_MODEL_DIRECTORY_PATH
    ]

    print("Preprocessing...")

    # Load the model
    # model_dir = ModelDir(args.model)
    model_dir = ModelDir(models_directory[0])
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Read the documents
    documents = []
    for doc in args.documents:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    documents = [re.split("\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(args.question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]
        
    print("Setting up model")
    
    # Tell the model the batch size (can be None) and vocab to expect, This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuNND stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(10)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d"%i)
            for i, x in enumerate(context)]

    print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model get the predictions
    encoded = model.encode(data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(conf)  # We get output for each paragraph, select the most-confident one to print
    print("Best Paragraph: " + str(best_para))
    para_id = int(str(best_para))
    # print("Best Paragraph: \n" + (" ".join((paras[para_id].text)[0])))
    print("Best Paragraph: \n" + " ".join(context[para_id]))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
    print("Confidence: " + str(conf[best_para]))

コード例 #8

ファイルを表示

class QaSystem(object):
    """
    End-to-end QA system, uses web-requests to get relevant documents and a model
    to scores candidate answer spans.
    """

    _split_regex = re.compile(
        "\s*\n\s*")  # split includes whitespace to avoid empty paragraphs

    def __init__(self,
                 wiki_cache: str,
                 paragraph_splitter: DocumentSplitter,
                 paragraph_selector: ParagraphFilter,
                 vocab: Union[str, Set[str]],
                 model: Union[ParagraphQuestionModel, ModelDir],
                 loader: ResourceLoader = ResourceLoader(),
                 bing_api_key=None,
                 tagme_api_key=None,
                 blacklist_trivia_sites: bool = False,
                 n_dl_threads: int = 5,
                 span_bound: int = 8,
                 tagme_threshold: Optional[float] = 0.2,
                 download_timeout: int = None,
                 n_web_docs=10):
        self.log = logging.getLogger('qa_system')
        self.tagme_threshold = tagme_threshold
        self.n_web_docs = n_web_docs
        self.blacklist_trivia_sites = blacklist_trivia_sites
        self.tagme_api_key = tagme_api_key

        if bing_api_key is not None:
            self.searcher = AsyncWebSearcher(bing_api_key)
            self.text_extractor = AsyncBoilerpipeCliExtractor(
                n_dl_threads, download_timeout)
        else:
            self.text_extractor = None
            self.searcher = None
        self.wiki_corpus = WikiCorpus(wiki_cache, keep_inverse_mapping=True)
        self.paragraph_splitter = paragraph_splitter
        self.paragraph_selector = paragraph_selector
        self.model_dir = model

        voc = None
        if vocab is not None:
            if isinstance(vocab, str):
                voc = set()
                with open(vocab, "r") as f:
                    for line in f:
                        voc.add(line.strip())
            else:
                voc = vocab
            self.log.info("Using preset vocab of size %d", len(voc))

        self.log.info("Setting up model...")
        if isinstance(model, ModelDir):
            self.model = model.get_model()
        else:
            self.model = model

        self.model.set_input_spec(ParagraphAndQuestionSpec(None), voc, loader)

        self.sess = tf.Session()
        with self.sess.as_default():
            pred = self.model.get_prediction()

        model.restore_checkpoint(self.sess)

        self.span_scores = pred.get_span_scores()
        self.span, self.score = pred.get_best_span(span_bound)
        self.tokenizer = NltkAndPunctTokenizer()
        self.sess.graph.finalize()

    async def answer_question(
            self, question: str) -> Tuple[np.ndarray, List[WebParagraph]]:
        """ Answer a question using web search """

        context = await self.get_question_context(question)
        question = self.tokenizer.tokenize_paragraph_flat(question)
        t0 = time.perf_counter()
        out = self._get_span_scores(question, context)
        self.log.info("Computing answer spans took %.5f seconds" %
                      (time.perf_counter() - t0))
        return out

    def answer_with_doc(self, question: str,
                        doc: str) -> Tuple[np.ndarray, List[WebParagraph]]:
        """ Answer a question using the given text as a document """

        self.log.info("Answering question \"%s\" with a given document" %
                      question)
        # Tokenize
        question = self.tokenizer.tokenize_paragraph_flat(question)
        context = [
            self.tokenizer.tokenize_with_inverse(x, False)
            for x in self._split_regex.split(doc)
        ]

        # Split into super-paragraphs
        context = self._split_document(context, "User", None)

        # Select top paragraphs
        context = self.paragraph_selector.prune(question, context)
        if len(context) == 0:
            raise ValueError("Unable to process documents")

        # Select the top answer span
        t0 = time.perf_counter()
        span_scores = self._get_span_scores(question, context)
        self.log.info("Computing answer spans took %.5f seconds" %
                      (time.perf_counter() - t0))
        return span_scores

    def _get_span_scores(self, question: List[str],
                         paragraphs: List[ParagraphWithInverse]):
        """
        Answer a question using the given paragraphs, returns both the span scores and the
        pre-processed paragraphs the span are valid for
        """
        if self.model.preprocessor is not None:
            prepped = []
            for para in paragraphs:
                if hasattr(para, "spans"):
                    spans = para.spans
                else:
                    spans = None

                text, _, inv = self.model.preprocessor.encode_paragraph(
                    [], para.text, para.start == 0,
                    np.zeros((0, 2), dtype=np.int32), spans)
                prepped.append(
                    WebParagraph([text], para.original_text, inv,
                                 para.paragraph_num, para.start, para.end,
                                 para.source_name, para.source_url))
            paragraphs = prepped

        qa_pairs = [
            ParagraphAndQuestion(c.get_context(), question, None, "")
            for c in paragraphs
        ]
        encoded = self.model.encode(qa_pairs, False)
        return self.sess.run(self.span_scores, encoded), paragraphs

    def _split_document(self, para: List[ParagraphWithInverse],
                        source_name: str, source_url: Optional[str]):
        tokenized_paragraphs = []
        on_token = 0
        for i, para in enumerate(self.paragraph_splitter.split_inverse(para)):
            n_tokens = para.n_tokens
            tokenized_paragraphs.append(
                WebParagraph(para.text, para.original_text, para.spans, i + 1,
                             on_token, on_token + n_tokens, source_name,
                             source_url))
            on_token += n_tokens
        return tokenized_paragraphs

    async def _tagme(self, question):
        payload = {
            "text": question,
            "long_text": 3,
            "lang": "en",
            "gcube-token": self.tagme_api_key
        }
        async with ClientSession() as sess:
            async with sess.get(url=TAGME_API, params=payload) as resp:
                data = await resp.json()

        return [
            ann_json for ann_json in data["annotations"] if "title" in ann_json
        ]

    async def get_question_context(self, question: str) -> List[WebParagraph]:
        """
        Find a set of paragraphs from the web that are relevant to the given question
        """
        tokenized_paragraphs = []
        if self.tagme_threshold is not None:
            self.log.info("Query tagme for %s", question)
            tags = await self._tagme(question)
            t0 = time.perf_counter()
            found = set()
            for tag in tags:
                if tag["rho"] >= self.tagme_threshold:
                    title = tag["title"]
                    if title in found:
                        continue
                    found.add(title)
                    doc = await self.wiki_corpus.get_wiki_article(title)
                    tokenized_paragraphs += self._split_document(
                        doc.paragraphs, "Wikipedia: " + doc.title, doc.url)
            if len(tokenized_paragraphs) > 0:
                self.log.info("Getting wiki docs took %.5f seconds" %
                              (time.perf_counter() - t0))

        if self.n_web_docs > 0:
            t0 = time.perf_counter()
            self.log.info("Running bing search for %s", question)
            search_results = await self.searcher.run_search(
                question, self.n_web_docs)
            t1 = time.perf_counter()
            self.log.info("Completed bing search, took %.5f seconds" %
                          (t1 - t0))
            t0 = t1
            url_to_result = {x["url"]: x for x in search_results}
            self.log.info("Extracting text for %d results",
                          len(search_results))
            text_docs = await self.text_extractor.get_text(
                [x["url"] for x in search_results])

            for doc in text_docs:
                if len(doc.text) == 0:
                    continue
                search_r = url_to_result[doc.url]
                if self.blacklist_trivia_sites:
                    lower = search_r["displayUrl"].lower()
                    if 'quiz' in lower or 'trivia' in lower or 'answer' in lower:
                        # heuristic to ignore trivia sites, recommend by Mandar
                        self.log.debug("Skipping trivia site: " + lower)
                        continue

                paras_text = self._split_regex.split(doc.text.strip())

                paras_tokenized = [
                    self.tokenizer.tokenize_with_inverse(x) for x in paras_text
                ]

                tokenized_paragraphs += self._split_document(
                    paras_tokenized, search_r["displayUrl"], doc.url)

            self.log.info("Completed extracting text, took %.5f seconds." %
                          (time.perf_counter() - t0))

        self.log.info("Have %d paragraphs", len(tokenized_paragraphs))
        if len(tokenized_paragraphs) == 0:
            return []
        question = self.tokenizer.tokenize_sentence(question)
        return self.paragraph_selector.prune(question, tokenized_paragraphs)

コード例 #9

ファイルを表示

def predict():
    json_data = {"success": False, "predictions": []}
    print("Preprocessing...")

    # Load the model
    model_dir = ModelDir(
        "/home/antriv/conversation_ai/Transfer_Learning/ALLENAI_DocumentQA/document-qa/pretrained_models/models/triviaqa-unfiltered-shared-norm"
    )
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )

    # Load the question
    question = (flask.request.data).decode("utf-8")

    # Read the documents
    documents = []
    doclist = ["/home/antriv/data/The-Future-Computed.txt"]
    for doc in doclist:
        if not isfile(doc):
            raise ValueError(doc + " does not exist")
        with open(doc, "r") as f:
            documents.append(f.read())
    print("Loaded %d documents" % len(documents))

    # Split documents into lists of paragraphs
    documents = [re.split("\s*\n\s*", doc) for doc in documents]

    # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(question)  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=1000)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=1000)
        context = selector.prune(question, flatten_iterable(documents))

    print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect, This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)

    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    print("Build tf graph")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuNND stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model get the predictions
    encoded = model.encode(
        data, is_train=False)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf],
                                feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(
        conf
    )  # We get output for each paragraph, select the most-confident one to print
    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " +
          " ".join(context[best_para]
                   [best_spans[best_para][0]:best_spans[best_para][1] + 1]))
    print("Confidence: " + str(conf[best_para]))
    y_output = " ".join(
        context[best_para][best_spans[best_para][0]:best_spans[best_para][1] +
                           1])
    print(y_output)
    json_data["predictions"].append(str(y_output))

    #indicate that the request was a success
    json_data["success"] = True
    #return the data dictionary as a JSON response
    return flask.jsonify(json_data)

コード例 #10

ファイルを表示

def main(Data: pd.DataFrame, nlp, model_dir, model):
    #parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    #parser.add_argument("model", help="Model directory")
    #parser.add_argument("question", help="Question to answer")
    #parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    #args = parser.parse_args()

    #print("Preprocessing...")

    # Load the model
    #model_dir = ModelDir(MODEL_DIR)
    #model = model_dir.get_model()
    print(model)
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError(
            "This script is built to work for ParagraphQuestionModel models only"
        )
    #print(model)
    # Read the documents
    documents = []
    documents.append(Data.at[0, 'Filetext'])
    """import pyodbc

    conn = pyodbc.connect("Driver={ODBC Driver 13 for SQL Server};"               
               "Server=192.168.100.15;"
               "Database=PharmaAce;"
               "UID=sa;"
               "PWD=admin@123;"
               "Trusted_Connection=no;")

    cursor=conn.cursor()
#(23211,28690,33214,25638,25837,26454,28693,26137,31428,32087)

    for doc in cursor.execute("select cast(filetext as varchar(max)) as filetext from kpl_tmp"):
        documents.append(doc[0])
        #doc="D:\Document QnA\document-qa-master\Data\Drug_Delivery_Surveying_Global_Competitive_Landscape_BMI.txt"   
    if not isfile(doc):
        raise ValueError(doc + " does not exist")
    with open(doc, "r") as f:
        documents.append(f.read())
    """
    #print("Loaded %d documents" % len(documents))
    #temp=documents[0].split()
    # Split documents into lists of paragraphs
    #documents=[" ".join(temp[i:(i+400)]) for i in range(1,len(temp),400)]
    documents = [re.split("\s*\n\s*", doc) for doc in documents]
    # Tokenize the input, the models expects data to be tokenized using `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(
        Data.at[0, 'Question'])  # List of words
    # Now list of document->paragraph->sentence->word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc]
                 for doc in documents]

    # Now group the document into paragraphs, this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    #splitter = PreserveParagraphs() # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]
    #print(str(len(documents))+" kpl") #kpl
    # Now select the top paragraphs using a `ParagraphFilter`
    print(len(documents))  #kpl
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    #print("Select %d paragraph" % len(context))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [
            model.preprocessor.encode_text(question, x) for x in context
        ]
    else:
        # Otherwise just use flattened text
        context = [flatten_iterable(x.text) for x in context]

    print("Setting up model")
    # Tell the model the batch size (can be None) and vocab to expect, This will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(nlp,
                         ParagraphAndQuestionSpec(batch_size=len(context)),
                         voc)
    # Now we build the actual tensorflow graph, `best_span` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    #print("Build tf graph") #kpl
    print("after set input spec")
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuNND stuff, since we need an active
    # session to figure out the # of parameters needed for each layer. The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Loads the saved weights
    model_dir.restore_checkpoint(sess)
    print("after loading weights")
    # Now the model is ready to run
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [
        ParagraphAndQuestion(x, question, None, "user-question%d" % i)
        for i, x in enumerate(context)
    ]

    #print("Starting run")
    # The model is run in two steps, first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model get the predictions
    encoded = model.encode(
        data, is_train=True)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf],
                                feed_dict=encoded)  # feed_dict -> predictions

    best_para = np.argmax(
        conf
    )  # We get output for each paragraph, select the most-confident one to print

    #print("Best Paragraph: " + str(best_para))
    #print("Best span: " + str(best_spans[best_para]))
    #print("Answer text: " + " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1]+1]))
    #print("Confidence: " + str(conf[best_para]))

    return " ".join(
        context[best_para][best_spans[best_para][0]:best_spans[best_para][1] +
                           1])


#if __name__ == "__main__":
#    main()