def read_input_data(model):
    data = []
    vocab = set()
    tokenizer = NltkAndPunctTokenizer()
    splitter = Truncate(400)  # NOTE: we truncate past 400 tokens
    selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
    with open(OPTS.input_file) as f:
        for i, line in enumerate(f):
            try:
                document_raw, question_raw = line.strip().split('\t')
            except ValueError as e:
                print(line.strip())
                print('Error at line %d' % i)
                raise e
            document = re.split(r"\s*\n\s*", document_raw)
            question = tokenizer.tokenize_paragraph_flat(question_raw)
            doc_toks = [tokenizer.tokenize_paragraph(p) for p in document]
            split_doc = splitter.split(doc_toks)
            context = selector.prune(question, split_doc)
            if model.preprocessor is not None:
                context = [model.preprocessor.encode_text(question, x)
                           for x in context]
            else:
                context = [flatten_iterable(x.text) for x in context]
            vocab.update(question)
            for txt in context:
                vocab.update(txt)
            ex = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
                  for i, x in enumerate(context)]
            data.append((document_raw, question_raw, context, ex))
    return data, vocab
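# Illustrative only (not from the original source): read_input_data above expects
# OPTS.input_file to hold one example per line, with the raw document and the question
# separated by a tab (see the split('\t') call). A hypothetical two-line input:
#
#   The quick brown fox jumps over the lazy dog.\tWhat does the fox jump over?
#   Paris is the capital of France.\tWhat is the capital of France?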
class RandomMachineReaderModel(CapeMachineReaderModelInterface):
    def __init__(self, _):
        self.tokenizer = NltkAndPunctTokenizer()

    def tokenize(self, text):
        tokens = self.tokenizer.tokenize_paragraph_flat(text)
        spans = self.tokenizer.convert_to_spans(text, [tokens])[0]
        return tokens, spans

    def get_document_embedding(self, text):
        # Seed on the document text so the same document always gets the same "embedding"
        np.random.seed(int(hashlib.sha1(text.encode()).hexdigest(), 16) % 10**8)
        document_tokens, _ = self.tokenize(text)
        return np.random.random((len(document_tokens), 240))

    def get_logits(self, question, document_embedding):
        question_tokens, _ = self.tokenize(question)
        n_words = document_embedding.shape[0]
        # Seed on both the question and the document embedding so logits are deterministic
        qseed = int(hashlib.sha1(question.encode()).hexdigest(), 16) % 10**8
        dseed = int(np.sum(document_embedding) * 10**6) % 10**8
        np.random.seed(dseed + qseed)
        start_logits = np.random.random(n_words)
        off = np.random.randint(1, 5)
        end_logits = np.concatenate(
            [np.zeros(off) + np.min(start_logits), start_logits[off:]])
        return start_logits[:n_words], end_logits[:n_words]
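# A minimal usage sketch (illustrative, not part of the original source). Because the
# random seeds are derived from the input text, repeated calls on the same document and
# question return the same values:
#
#   reader = RandomMachineReaderModel(None)
#   doc_emb = reader.get_document_embedding("Paris is the capital of France.")
#   start_logits, end_logits = reader.get_logits("What is the capital of France?", doc_emb)
#   # doc_emb.shape == (number_of_document_tokens, 240)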
def get_doc_rd_doc(docs: List[Document]) -> Dict[str, List[ParagraphWithInverse]]:
    tokenizer = NltkAndPunctTokenizer()
    conn = sqlite3.connect(DOCUMENT_READER_DB)
    c = conn.cursor()
    titles = [clean_title(doc.title) for doc in docs]
    for i, t in enumerate(titles):
        # Had to manually resolve this (due to changes in Wikipedia?)
        if t == "Sky (United Kingdom)":
            titles[i] = "Sky UK"
    title_to_doc_id = {t: doc.title for t, doc in zip(titles, docs)}

    c.execute("CREATE TEMPORARY TABLE squad_docs(id)")
    c.executemany("INSERT INTO squad_docs VALUES (?)", [(x,) for x in titles])
    c.execute("SELECT id, text FROM documents WHERE id IN squad_docs")

    documents = {}
    out = c.fetchall()
    conn.close()

    for title, text in out:
        paragraphs = []
        for para in text.split("\n"):
            para = para.strip()
            if len(para) > 0:
                paragraphs.append(tokenizer.tokenize_with_inverse(para))
        documents[title_to_doc_id[title]] = paragraphs
    return documents
def __init__(self, machine_reader_config):
    self.tokenizer = NltkAndPunctTokenizer()
    self.config = machine_reader_config
    self.model = self._load_model()
    self.sess = tf.Session()
    self.start_logits, self.end_logits, self.context_rep = self._build_model()
    self._initialize()
def test_split_inv(self):
    paras = [
        "One fish two fish. Red fish blue fish",
        "Just one sentence",
        "How will an overhead score? The satisfactory juice returns against an inviting protein. "
        "How can a rat expand? The subway fishes throughout a struggle. The guaranteed herd pictures an "
        "episode into the accustomed damned. The garbage reigns beside the component!",
    ]
    tok = NltkAndPunctTokenizer()
    tokenized = [tok.tokenize_with_inverse(x) for x in paras]
    inv_split = RandomSplitter().split_inverse(tokenized)
    for para in inv_split:
        self.assertTrue(flatten_iterable(para.text) ==
                        [para.original_text[s:e] for s, e in para.spans])
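# Illustrative note (not from the original source): the assertion above checks the
# inverse-mapping invariant -- slicing the original text with each (start, end) span
# must reproduce the flattened token sequence exactly. A small sketch of the same idea,
# assuming the paragraph returned by tokenize_with_inverse exposes .text and .spans:
#
#   text = "One fish two fish."
#   para = NltkAndPunctTokenizer().tokenize_with_inverse(text)
#   tokens = flatten_iterable(para.text)
#   assert tokens == [text[s:e] for s, e in para.spans]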
def build_web_corpus(n_processes, sets_to_build, source_dir, target_dir):
    sets_to_build_dict = {}
    if 'verified' in sets_to_build:
        sets_to_build_dict['verified'] = join(source_dir, "verified-web-dev.json")
    if 'dev' in sets_to_build:
        sets_to_build_dict['dev'] = join(source_dir, "web-dev.json")
    if 'train' in sets_to_build:
        sets_to_build_dict['train'] = join(source_dir, "web-train.json")
    if 'test' in sets_to_build:
        sets_to_build_dict['test'] = join(source_dir, "web-test-without-answers.json")
    build_dataset("web", NltkAndPunctTokenizer(), sets_to_build_dict,
                  FastNormalizedAnswerDetector(), n_processes, out_dir=target_dir)
def main(): parser = argparse.ArgumentParser("Preprocess SQuAD data") #basedir = join(expanduser("~"), "data", "squad") basedir = join(expanduser("~"), "azayats", "data", "squad") parser.add_argument("--train_file", default=join(basedir, "train-v1.1.json")) parser.add_argument("--dev_file", default=join(basedir, "dev-v1.1.json")) if not exists(config.CORPUS_DIR): mkdir(config.CORPUS_DIR) target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME) if exists(target_dir) and len(listdir(target_dir)) > 0: raise ValueError("Files already exist in " + target_dir) args = parser.parse_args() tokenzier = NltkAndPunctTokenizer() print("Parsing train...") train = list(parse_squad_data(args.train_file, "train", tokenzier)) print("Parsing dev...") dev = list(parse_squad_data(args.dev_file, "dev", tokenzier)) print("Saving...") SquadCorpus.make_corpus(train, dev) print("Done")
def main():
    # Namespace(directory='C:/Users/boidiyv/document-qa-master', dump=False, fake=False, verbose=False)
    parser = argparse.ArgumentParser("Preprocess SQuAD data")
    parser.add_argument('--document-qa/docqa/squad', type=Path)
    parser.add_argument("--train_file", default=config.SQUAD_TRAIN)
    parser.add_argument("--dev_file", default=config.SQUAD_DEV)
    # parser.add_argument("--document-qa-master", type=lambda p: Path(p).absolute(),
    #                     default=Path(__file__).absolute().parent / "document-qa-master",
    #                     help="Path to the data directory")

    if not exists(config.CORPUS_DIR):
        mkdir(config.CORPUS_DIR)

    target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME)
    if exists(target_dir) and len(listdir(target_dir)) > 0:
        raise ValueError("Files already exist in " + target_dir)

    args = parser.parse_args('')
    tokenizer = NltkAndPunctTokenizer()

    print("Parsing train...")
    train = list(parse_squad_data(args.train_file, "train", tokenizer))
    print(train)

    print("Parsing dev...")
    dev = list(parse_squad_data(args.dev_file, "dev", tokenizer))

    print("Saving...")
    SquadCorpus.make_corpus(train, dev)
    print("Done")
def main(): parser = argparse.ArgumentParser("Preprocess SQuAD data") parser.add_argument("--train_file", default=config.SQUAD_TRAIN) parser.add_argument("--dev_file", default=config.SQUAD_DEV) parser.add_argument("--weighted-questions", action='store_true') if not exists(config.CORPUS_DIR): mkdir(config.CORPUS_DIR) target_dir = join(config.CORPUS_DIR, SquadCorpus.NAME) if exists(target_dir) and len(listdir(target_dir)) > 0: raise ValueError("Files already exist in " + target_dir) args = parser.parse_known_args()[0] tokenizer = NltkAndPunctTokenizer() print("Parsing train...") train = list( parse_squad_data(args.train_file, "train", tokenizer, weighted_samples=args.weighted_questions)) print("Parsing dev...") dev = list(parse_squad_data(args.dev_file, "dev", tokenizer)) print("Saving...") SquadCorpus.make_corpus(train, dev) print("Done")
def main(): parse = argparse.ArgumentParser("Pre-tokenize the XQA evidence corpus") parse.add_argument("--corpus", choices=[ "en", "fr", "de", "ru", "pt", "zh", "pl", "uk", "ta" "en_trans_de", "en_trans_zh", "fr_trans_en", "de_trans_en", "ru_trans_en", "pt_trans_en", "zh_trans_en", "pl_trans_en", "uk_trans_en", "ta_trans_en" ], required=True) # This is slow, using more processes is recommended parse.add_argument("-n", "--n_processes", type=int, default=1, help="Number of processes to use") parse.add_argument("--wiki_only", action="store_true") args = parse.parse_args() output_dir = join(config.CORPUS_DIR, args.corpus, "evidence") source = join(config.CORPUS_NAME_TO_PATH[args.corpus], "evidence") if args.corpus == "en_trans_zh" or args.corpus == "zh": tokenizer = ChineseTokenizer() else: tokenizer = NltkAndPunctTokenizer() build_tokenized_corpus(source, tokenizer, output_dir, n_processes=args.n_processes, wiki_only=args.wiki_only)
def build_web_corpus(n_processes):
    build_dataset("web", NltkAndPunctTokenizer(),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-web-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "web-dev.json"),
                      train=join(TRIVIA_QA, "qa", "web-train.json"),
                      test=join(TRIVIA_QA, "qa", "web-test-without-answers.json")),
                  FastNormalizedAnswerDetector(), n_processes)
def build_wiki_corpus(n_processes):
    build_dataset("wiki", NltkAndPunctTokenizer(),
                  dict(
                      verified=join(TRIVIA_QA, "qa", "verified-wikipedia-dev.json"),
                      dev=join(TRIVIA_QA, "qa", "wikipedia-dev.json"),
                      train=join(TRIVIA_QA, "qa", "wikipedia-train.json"),
                  ),
                  FastNormalizedAnswerDetector(), n_processes)
def find_answer(documents, raw_question):
    raw_question = raw_question.lower()
    documents = [d.lower() for d in documents]
    global best_spans, conf

    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(raw_question)
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    splitter = MergeParagraphs(400)
    documents = [splitter.split(doc) for doc in documents]
    if len(documents) == 1:
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))
    context = [flatten_iterable(x.text) for x in context]

    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]
    encoded = model.encode(data, is_train=False)
    with sess.as_default():
        spans, confid = sess.run([best_spans, conf], feed_dict=encoded)
    best_para = np.argmax(confid)
    ans = " ".join(context[best_para][spans[best_para][0]:spans[best_para][1] + 1])
    confidence = confid[best_para]
    return ans, confidence
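# A minimal usage sketch (illustrative, not part of the original source). find_answer
# relies on module-level globals -- `model`, `sess`, `best_spans`, and `conf` -- being
# set up beforehand by the surrounding script. Assuming that has been done:
#
#   answer, confidence = find_answer(
#       ["Paris is the capital of France.\n\nIt is known for the Eiffel Tower."],
#       "What is the capital of France?")
#   print(answer, confidence)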
def main():
    from build_span_corpus import TriviaQaWebDataset
    from docqa.data_processing.text_utils import NltkAndPunctTokenizer

    dataset = TriviaQaWebDataset()
    qs = dataset.get_train()
    qs = np.random.RandomState(0).choice(qs, 1000, replace=False)
    evaluate_question_detector(qs, dataset.evidence,
                               NltkAndPunctTokenizer().tokenize_paragraph_flat,
                               FastNormalizedAnswerDetector())
def get_random_answer(self):
    time.sleep(1)
    para = NltkAndPunctTokenizer().tokenize_with_inverse(ipso)
    para1 = WebParagraph(para.text, ipso, para.spans,
                         0, 0, 0, "source1", "fake_url1")
    para2 = WebParagraph(para.text, ipso, np.array(para.spans),
                         0, 0, 0, "source2", "fake_url2")
    span_scores = np.random.normal(size=(2, len(para.spans), len(para.spans))) * 5
    return span_scores, [para1, para2]
def build_sample_corpus(n_processes):
    build_dataset("web-sample", NltkAndPunctTokenizer(),
                  dict(
                      dev=join(TRIVIA_QA, "qa", "web-dev.json"),
                      train=join(TRIVIA_QA, "qa", "web-train.json"),
                  ),
                  FastNormalizedAnswerDetector(), n_processes, sample=1000)
def read_input_data(model):
    data = []
    vocab = set()
    tokenizer = NltkAndPunctTokenizer()
    with open(OPTS.input_file) as f:
        json_data = json.load(f)
    for doc in json_data['data']:
        for paragraph in doc['paragraphs']:
            context = tokenizer.tokenize_with_inverse(paragraph['context'])
            if model.preprocessor is not None:
                context = model.preprocessor.encode_text(question, context)
            context = context.get_context()
            vocab.update(context)
            for qa in paragraph['qas']:
                question = tokenizer.tokenize_sentence(qa['question'])
                vocab.update(question)
                ex = [ParagraphAndQuestion(context, question, None, qa['id'])]
                data.append((paragraph['context'], context, ex))
    return data, sorted(list(vocab))
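# Illustrative only (not from the original source): this read_input_data walks the
# standard SQuAD v1.1 JSON layout, roughly:
#
#   {"data": [
#       {"title": "...",
#        "paragraphs": [
#            {"context": "Paris is the capital of France.",
#             "qas": [{"id": "q1", "question": "What is the capital of France?"}]}
#        ]}
#   ]}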
def __init__(self, corpus_name):
    self.corpus_name = corpus_name
    self.dir = join(CORPUS_DIR, self.corpus_name)
    self.tokenizer = NltkAndPunctTokenizer()
    self.detector = FastNormalizedAnswerDetector()
    self._train, self._raw_train = list(), None
    self._dev, self._raw_dev = list(), None
    self.missed_answer = 0
def main(): parse = argparse.ArgumentParser("Pre-tokenize the TriviaQA evidence corpus") parse.add_argument("-o", "--output_dir", type=str, default=join(config.CORPUS_DIR, "triviaqa", "evidence")) parse.add_argument("-s", "--source", type=str, default=join(config.TRIVIA_QA, "evidence")) # This is slow, using more processes is recommended parse.add_argument("-n", "--n_processes", type=int, default=8, help="Number of processes to use") parse.add_argument("--wiki_only", action="store_true") args = parse.parse_args() build_tokenized_corpus(args.source, NltkAndPunctTokenizer(), args.output_dir, n_processes=args.n_processes, wiki_only=args.wiki_only)
def build_unfiltered_corpus(n_processes):
    build_dataset("web-open", NltkAndPunctTokenizer(),
                  dict(
                      dev=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-dev.json"),
                      train=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-train.json"),
                      test=join(TRIVIA_QA_UNFILTERED, "unfiltered-web-test-without-answers.json")),
                  answer_detector=FastNormalizedAnswerDetector(),
                  n_process=n_processes)
def triviaqa_prepro(wiki_only, n_processes):
    print('Tokenizing {} corpus:'.format('wiki' if wiki_only else 'wiki and web'))
    build_tokenized_corpus(
        TRIVIA_QA_EVIDENCE,
        NltkAndPunctTokenizer(),
        PREPRO_EVIDENCE_DIR,
        n_processes=n_processes,
        wiki_only=wiki_only,
    )
    print('Preparing wiki corpus:')
    build_wiki_corpus(n_processes)
    if not wiki_only:
        print('Preparing web corpus:')
        build_web_corpus(n_processes)
def __init__(self, cache_dir=None, follow_redirects: bool = True,
             keep_inverse_mapping: bool = False, extract_lists: bool = False,
             tokenizer=NltkAndPunctTokenizer()):
    self.tokenizer = tokenizer
    self.extract_lists = extract_lists
    self.follow_redirects = follow_redirects
    self.cache_dir = cache_dir
    self.keep_inverse_mapping = keep_inverse_mapping
    if cache_dir is not None and not exists(self.cache_dir):
        mkdir(self.cache_dir)
def build_unfiltered_corpus(n_processes, sets_to_build, source_dir, target_dir):
    sets_to_build_dict = {}
    if 'dev' in sets_to_build:
        sets_to_build_dict['dev'] = join(source_dir, "unfiltered-web-dev.json")
    if 'train' in sets_to_build:
        sets_to_build_dict['train'] = join(source_dir, "unfiltered-web-train.json")
    if 'test' in sets_to_build:
        sets_to_build_dict['test'] = join(source_dir, "unfiltered-web-test-without-answers.json")
    build_dataset("web-open", NltkAndPunctTokenizer(), sets_to_build_dict,
                  answer_detector=FastNormalizedAnswerDetector(),
                  n_process=n_processes, out_dir=target_dir)
def __init__(self, cache_dir=None, follow_redirects: bool = True,
             keep_inverse_mapping: bool = False, extract_lists: bool = False,
             tokenizer=NltkAndPunctTokenizer()):
    """
    :param cache_dir: Optional, directory to cache the documents we download
    :param follow_redirects: Follow wiki re-directs
    :param keep_inverse_mapping: Keep track of the inverse mapping of tokens so
        the text can be "untokenized" accurately
    :param extract_lists: Include lists in the extracted articles
    :param tokenizer: Tokenizer to use to tokenize the documents
    """
    self.tokenizer = tokenizer
    self.extract_lists = extract_lists
    self.follow_redirects = follow_redirects
    self.cache_dir = cache_dir
    self.keep_inverse_mapping = keep_inverse_mapping
    if cache_dir is not None and not exists(self.cache_dir):
        mkdir(self.cache_dir)
def build_xqa_corpus(corpus_name, n_processes):
    if corpus_name.startswith("en"):
        files_dict = dict(
            train=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "train.json"),
            dev=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "dev.json"),
            test=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "test.json"))
    else:
        files_dict = dict(
            dev=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "dev.json"),
            test=join(CORPUS_NAME_TO_PATH[corpus_name], "qa", "test.json"))
    if corpus_name == "en_trans_zh" or corpus_name == "zh":
        tokenizer = ChineseTokenizer()
    else:
        tokenizer = NltkAndPunctTokenizer()
    build_dataset(corpus_name, tokenizer, files_dict,
                  FastNormalizedAnswerDetector(), n_processes)
def prepro_squad_fold(name, fold, squad_file_paths):
    tokenizer = NltkAndPunctTokenizer()
    dataset_evidence_dir = join(PREPRO_EVIDENCE_DIR, name)
    if not exists(dataset_evidence_dir):
        makedirs(dataset_evidence_dir)

    voc = set()
    squad_docs = [
        d for squad_file_path in squad_file_paths
        for d in parse_squad_data(squad_file_path, fold, tokenizer)
    ]
    questions = []
    file_map = {}
    for document in tqdm(squad_docs, desc=fold, ncols=80):
        for paragraph in document.paragraphs:
            for question in paragraph.questions:
                doc_id = question.question_id
                doc_savename = get_doc_savename(dataset_evidence_dir, doc_id)
                trivia_q = squad_q2triviaqa_q(question)
                with open(doc_savename + '.txt', 'w', encoding='utf8') as f:
                    f.write(dump_paragraph(paragraph))
                words = {w for sent in paragraph.text for w in sent}
                voc.update(words)
                file_map[doc_id] = doc_savename
                questions.append(trivia_q)

    questions_savename = get_questions_savename(name, fold)
    with open(questions_savename, "wb") as f:
        pickle.dump(questions, f)
    return voc, file_map
def getAnswer(self):
    # parser = argparse.ArgumentParser(description="Run an ELMo model on user input")
    # parser.add_argument("model", help="Model directory")
    # parser.add_argument("question", help="Question to answer")
    # parser.add_argument("documents", help="List of text documents to answer the question with", nargs='+')
    # args = parser.parse_args()

    # Load the model
    model_dir = ModelDir(MODEL_DIR)
    model = model_dir.get_model()
    if not isinstance(model, ParagraphQuestionModel):
        raise ValueError("This script is built to work for ParagraphQuestionModel models only")

    # Read the document text for this object from the database
    conn = pyodbc.connect(DB_CONN)
    cursor = conn.cursor()
    query = "select cast(filetext as varchar(max)) as filetext, name, type " \
            "from dbo.UserworkspaceData where objectmasterid= " + \
            str(self.ObjectMasterId) + \
            " order by id asc"

    documents = []
    document = ""
    name = ""
    filetype = 0
    for doc in cursor.execute(query):
        document = document + doc[0]
        name = doc[1]
        filetype = doc[2]
    documents.append(document)

    # Split each document into lists of paragraphs
    documents = [re.split(r"\s*\n\s*", doc) for doc in documents]

    # Tokenize the input; the model expects data tokenized with `NltkAndPunctTokenizer`
    # Note the model expects case-sensitive input
    tokenizer = NltkAndPunctTokenizer()
    question = tokenizer.tokenize_paragraph_flat(self.Question)  # List of words

    # Now list of document -> paragraph -> sentence -> word
    documents = [[tokenizer.tokenize_paragraph(p) for p in doc] for doc in documents]

    # Group the documents into paragraphs; this returns `ExtractedParagraph` objects
    # that additionally remember the start/end token of the paragraph within the source document
    splitter = MergeParagraphs(400)
    # splitter = PreserveParagraphs()  # Uncomment to use the natural paragraph grouping
    documents = [splitter.split(doc) for doc in documents]

    # Now select the top paragraphs using a `ParagraphFilter`
    if len(documents) == 1:
        # Use TF-IDF to select top paragraphs from the document
        selector = TopTfIdf(NltkPlusStopWords(True), n_to_select=5)
        context = selector.prune(question, documents[0])
    else:
        # Use a linear classifier to select top paragraphs among all the documents
        selector = ShallowOpenWebRanker(n_to_select=10)
        context = selector.prune(question, flatten_iterable(documents))

    if model.preprocessor is not None:
        # Models are allowed to define an additional pre-processing step.
        # This will turn the `ExtractedParagraph` objects back into simple lists of tokens
        context = [model.preprocessor.encode_text(question, x) for x in context]
    else:
        # Otherwise just use the flattened text
        context = [flatten_iterable(x.text) for x in context]

    # Tell the model the batch size (can be None) and vocab to expect; this will load the
    # needed word vectors and fix the batch size to use when building the graph / encoding the input
    voc = set(question)
    for txt in context:
        voc.update(txt)
    model.set_input_spec(self.nlp, ParagraphAndQuestionSpec(batch_size=len(context)), voc)

    # Now we build the actual tensorflow graph; `best_spans` and `conf` are
    # tensors holding the predicted span (inclusive) and confidence scores for each
    # element in the input batch, confidence scores being the pre-softmax logit for the span
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # We need to use sess.as_default when working with the cuDNN stuff, since we need an active
    # session to figure out the # of parameters needed for each layer.
    # The cpu-compatible models don't need this.
    with sess.as_default():
        # 8 means to limit the span to size 8 or less
        best_spans, conf = model.get_prediction().get_best_span(8)

    # Load the saved weights
    model_dir.restore_checkpoint(sess)

    # Now the model is ready to run.
    # The model takes input in the form of `ContextAndQuestion` objects, for example:
    data = [ParagraphAndQuestion(x, question, None, "user-question%d" % i)
            for i, x in enumerate(context)]

    # The model is run in two steps: first it "encodes" a batch of paragraph/context pairs
    # into numpy arrays, then we use `sess` to run the actual model and get the predictions
    encoded = model.encode(data, is_train=True)  # batch of `ContextAndQuestion` -> feed_dict
    best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded)  # feed_dict -> predictions

    # We get output for each paragraph; select the most-confident one
    best_para = np.argmax(conf)

    Answer = " ".join(context[best_para][best_spans[best_para][0]:best_spans[best_para][1] + 1])
    print("Confidence: " + str(conf[best_para]))
    print("Best Paragraph: " + str(best_para))
    print("Best span: " + str(best_spans[best_para]))
    print("Answer text: " + Answer)
    print(" ".join(context[best_para]))

    # Wrap the answer span in <em> tags within the selected paragraph
    context[best_para][best_spans[best_para][0]] = r"<em>" + context[best_para][best_spans[best_para][0]]
    context[best_para][best_spans[best_para][1]] = context[best_para][best_spans[best_para][1]] + r"</em>"

    # Trim the paragraph to roughly two sentences before and after the answer span
    start = 0
    end = len(context[best_para])
    positions = [x for x, n in enumerate(context[best_para][0:best_spans[best_para][0]]) if n == "."]
    if len(positions) >= 2:
        start = positions[len(positions) - 2] + 1
    positions = [x for x, n in enumerate(context[best_para][best_spans[best_para][1] + 1:]) if n == "."]
    if len(positions) > 1:
        end = best_spans[best_para][1] + 1 + positions[1]

    d = dict()
    if conf[best_para] > 10:
        d["answer"] = Answer
    else:
        d["answer"] = ""
    d["name"] = name
    d["filetype"] = filetype
    d["paragraph"] = re.sub(r' (?=\W)', '', " ".join(context[best_para][start:end]))
    d["ObjectMasterId"] = self.ObjectMasterId
    return d

# if __name__ == "__main__":
#     main()
def run():
    parser = argparse.ArgumentParser()
    parser.add_argument("squad_path", help="path to squad dev data file")
    parser.add_argument("output_path", help="path where evaluation json file will be written")
    parser.add_argument("--model-path", default="model", help="path to model directory")
    parser.add_argument("--n", type=int, default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=100)
    parser.add_argument("--ema", action="store_true")
    args = parser.parse_args()

    squad_path = args.squad_path
    output_path = args.output_path
    model_dir = ModelDir(args.model_path)
    nltk.data.path.append("nltk_data")

    print("Loading data")
    docs = parse_squad_data(squad_path, "", NltkAndPunctTokenizer(), False)
    pairs = split_docs(docs)
    dataset = ParagraphAndQuestionDataset(
        pairs, ClusteredBatcher(args.batch_size, ContextLenKey(), False, True))

    print("Done, init model")
    model = model_dir.get_model()
    loader = ResourceLoader(lambda a, b: load_word_vector_file(
        join(VEC_DIR, "glove.840B.300d.txt"), b))
    lm_model = model.lm_model
    basedir = join(LM_DIR, "squad-context-concat-skip")
    lm_model.lm_vocab_file = join(basedir, "squad_train_dev_all_unique_tokens.txt")
    lm_model.options_file = join(basedir, "options_squad_lm_2x4096_512_2048cnn_2xhighway_skip.json")
    lm_model.weight_file = join(basedir, "squad_context_concat_lm_2x4096_512_2048cnn_2xhighway_skip.hdf5")
    lm_model.embed_weights_file = None
    model.set_inputs([dataset], loader)

    print("Done, building graph")
    sess = tf.Session()
    with sess.as_default():
        pred = model.get_prediction()
        best_span = pred.get_best_span(17)[0]

    all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
    dont_restore_names = {x.name for x in all_vars if x.name.startswith("bilm")}
    print(sorted(dont_restore_names))
    vars = [x for x in all_vars if x.name not in dont_restore_names]

    print("Done, loading weights")
    checkpoint = model_dir.get_best_weights()
    if checkpoint is None:
        print("Loading most recent checkpoint")
        checkpoint = model_dir.get_latest_checkpoint()
    else:
        print("Loading best weights")
    saver = tf.train.Saver(vars)
    saver.restore(sess, checkpoint)

    if args.ema:
        ema = tf.train.ExponentialMovingAverage(0)
        saver = tf.train.Saver({ema.average_name(x): x for x in tf.trainable_variables()})
        saver.restore(sess, checkpoint)

    sess.run(tf.variables_initializer([x for x in all_vars if x.name in dont_restore_names]))

    print("Done, starting evaluation")
    out = {}
    for i, batch in enumerate(dataset.get_epoch()):
        if args.n is not None and i == args.n:
            break
        print("On batch: %d" % (i + 1))
        enc = model.encode(batch, False)
        spans = sess.run(best_span, feed_dict=enc)
        for (s, e), point in zip(spans, batch):
            out[point.question_id] = point.get_original_text(s, e)

    sess.close()
    print("Done, saving")
    with open(output_path, "w") as f:
        json.dump(out, f)
    print("Mission accomplished!")
class CapeDocQAMachineReaderModel(CapeMachineReaderModelInterface):
    def __init__(self, machine_reader_config):
        self.tokenizer = NltkAndPunctTokenizer()
        self.config = machine_reader_config
        self.model = self._load_model()
        self.sess = tf.Session()
        self.start_logits, self.end_logits, self.context_rep = self._build_model()
        self._initialize()

    def _load_model(self):
        with open(self.config.model_pickle_file, 'rb') as f:
            model = pickle.load(f)
        model.lm_model.weight_file = self.config.lm_weights_file
        model.lm_model.lm_vocab_file = self.config.vocab_file
        model.lm_model.embed_weights_file = self.config.lm_token_weights_file
        model.lm_model.options_file = self.config.lm_options_file
        return model

    def _build_model(self):
        vocab_to_init_with = {
            line.strip()
            for line in open(self.config.vocab_file, encoding="utf-8")
            if line.strip() not in vocab_to_ignore
        }
        self.model.word_embed.vec_name = self.config.word_vector_file
        with self.sess.as_default():
            self.model.set_input_spec(
                ParagraphAndQuestionSpec(None, None, None, 14),
                vocab_to_init_with,
                word_vec_loader=ResourceLoader(
                    load_vec_fn=lambda x, y: load_word_vectors(x, y, is_path=True)))
            pred = self.model.get_production_predictions_for(
                {x: x for x in self.model.get_placeholders()})
        return pred.start_logits, pred.end_logits, self.model.context_rep

    def _initialize(self):
        all_vars = tf.global_variables() + tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)
        lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")}
        vars_to_restore = [x for x in all_vars if x.name not in lm_var_names]
        saver = tf.train.Saver(vars_to_restore)
        saver.restore(self.sess, self.config.checkpoint_file)
        self.sess.run(tf.variables_initializer(
            [x for x in all_vars if x.name in lm_var_names]))

    def tokenize(self, text):
        tokens = self.tokenizer.tokenize_paragraph_flat(text)
        spans = self.tokenizer.convert_to_spans(text, [tokens])[0]
        return tokens, spans

    def get_document_embedding(self, text):
        document_tokens, _ = self.tokenize(text)
        test_question = ParagraphAndQuestion(document_tokens, ['dummy', 'question'],
                                             None, "cape_question", 'cape_document')
        feed = self.model.encode([test_question], False, cached_doc=None)
        return self.sess.run(self.model.context_rep, feed_dict=feed)[0]

    def get_logits(self, question, document_embedding):
        question_tokens, _ = self.tokenize(question)
        n_words = document_embedding.shape[0]
        dummy_document = ['dummy'] * n_words
        test_question = ParagraphAndQuestion(dummy_document, question_tokens,
                                             None, "cape_question", 'cape_document')
        feed = self.model.encode([test_question], False,
                                 cached_doc=document_embedding[np.newaxis, :, :])
        start_logits, end_logits = self.sess.run(
            [self.start_logits, self.end_logits], feed_dict=feed)
        return start_logits[0][:n_words], end_logits[0][:n_words]
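# A minimal usage sketch (illustrative, not part of the original source). The config
# object and the document/question strings are assumptions; the flow simply exercises
# the methods defined above, with a naive greedy span decode added for illustration:
#
#   reader = CapeDocQAMachineReaderModel(machine_reader_config)
#   doc_text = "Paris is the capital of France."
#   doc_emb = reader.get_document_embedding(doc_text)
#   start_logits, end_logits = reader.get_logits("What is the capital of France?", doc_emb)
#   start = int(np.argmax(start_logits))
#   end = start + int(np.argmax(end_logits[start:]))  # naive span decoding, not from the source
#   doc_tokens, _ = reader.tokenize(doc_text)
#   print(" ".join(doc_tokens[start:end + 1]))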
def main(): parser = argparse.ArgumentParser( description="Run an ELMo model on user input") parser.add_argument("model", help="Model directory") parser.add_argument("question", help="Question to answer") parser.add_argument("context", help="Context to answer the question with") args = parser.parse_args() # Tokenize the input, the models expected data to be tokenized using `NltkAndPunctTokenizer` # Note the model expects case-sensitive input tokenizer = NltkAndPunctTokenizer() question = tokenizer.tokenize_paragraph_flat(args.question) context = tokenizer.tokenize_paragraph_flat(args.context) print("Loading model") model_dir = ModelDir(args.model) model = model_dir.get_model() if not isinstance(model, ElmoQaModel): raise ValueError( "This script is build to work for ElmoQaModel models only") # Important! This tells the language model not to use the pre-computed word vectors, # which are only applicable for the SQuAD dev/train sets. # Instead the language model will use its character-level CNN to compute # the word vectors dynamically. model.lm_model.embed_weights_file = None # Tell the model the batch size and vocab to expect, This will load the needed # word vectors and fix the batch size when building the graph / encoding the input print("Setting up model") voc = set(question) voc.update(context) model.set_input_spec(ParagraphAndQuestionSpec(batch_size=1), voc) # Now we build the actual tensorflow graph, `best_span` and `conf` are # tensors holding the predicted span (inclusive) and confidence scores for each # element in the input batch print("Build tf graph") sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) with sess.as_default(): # 17 means to limit the span to size 17 or less best_spans, conf = model.get_prediction().get_best_span(17) # Now restore the weights, this is a bit fiddly since we need to avoid restoring the # bilm weights, and instead load them from the pre-computed data all_vars = tf.global_variables() + tf.get_collection( tf.GraphKeys.SAVEABLE_OBJECTS) lm_var_names = {x.name for x in all_vars if x.name.startswith("bilm")} vars = [x for x in all_vars if x.name not in lm_var_names] model_dir.restore_checkpoint(sess, vars) # Run the initializer of the lm weights, which will load them from the lm directory sess.run( tf.variables_initializer( [x for x in all_vars if x.name in lm_var_names])) # Now the model is ready to run # The model takes input in the form of `ContextAndQuestion` objects, for example: data = [ParagraphAndQuestion(context, question, None, "user-question1")] print("Starting run") # The model is run in two steps, first it "encodes" the paragraph/context pairs # into numpy arrays, then to use `sess` to run the actual model get the predictions encoded = model.encode( data, is_train=False) # batch of `ContextAndQuestion` -> feed_dict best_spans, conf = sess.run([best_spans, conf], feed_dict=encoded) # feed_dict -> predictions print("Best span: " + str(best_spans[0])) print("Answer text: " + " ".join(context[best_spans[0][0]:best_spans[0][1] + 1])) print("Confidence: " + str(conf[0]))