Example #1
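# Spec for a fixed batch size and a single context per sample; the question limit is the longest
# question and the context limit is the longest paragraph (gold or distractor) over all questions.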
def get_spec(self):
    batch_size = self.batcher.get_fixed_batch_size()
    num_contexts = 1
    max_q_words = max(len(q.question) for q in self.questions)
    max_c_words = max(max(c.num_tokens for c in (q.distractors + [q.paragraph])) for q in self.questions)
    return QuestionAndParagraphsSpec(batch_size=batch_size, max_num_contexts=num_contexts,
                                     max_num_question_words=max_q_words, max_num_context_words=max_c_words)
Example #2
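# Single-context spec where the context limit is the longest paragraph across all documents in
# title2doc, rather than only the paragraphs attached to the questions.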
def get_spec(self):
    batch_size = self.batcher.get_fixed_batch_size()
    num_contexts = 1
    max_q_words = max(len(q.question) for q in self.questions)
    max_c_words = max(max(c.num_tokens for c in d.paragraphs) for d in self.title2doc.values())
    return QuestionAndParagraphsSpec(batch_size=batch_size, max_num_contexts=num_contexts,
                                     max_num_question_words=max_q_words, max_num_context_words=max_c_words)
Example #3
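# Spec with one context slot; multiple_contexts_len presumably returns the combined token length
# of a question's contexts, so the context limit covers the longest such combination.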
def get_spec(self):
    batch_size = self.batcher.get_fixed_batch_size()
    num_contexts = 1
    max_q_words = max(len(q.question) for q in self.questions)
    max_c_words = max(multiple_contexts_len(q) for q in self.questions)
    return QuestionAndParagraphsSpec(batch_size=batch_size,
                                     max_num_contexts=num_contexts,
                                     max_num_question_words=max_q_words,
                                     max_num_context_words=max_c_words)
Example #4
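# Encodes all SQuAD documents with a single-context encoder: the spec leaves batch size and lengths
# unbounded (None), each title's paragraph list is trimmed to the highest paragraph id referenced by
# its questions, and one compressed .npz is saved per title.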
def encode_all_squad(encoder_model: str):
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    train = corpus.get_train()
    dev = corpus.get_dev()

    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()

    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=corpus.get_resource_loader())

    for questions, title2doc in [(train, corpus.train_title_to_document),
                                 (dev, corpus.dev_title_to_document)]:
        print(
            f"Starting encoding of {'train' if questions == train else 'dev'}")
        # eliminating distractors not from original squad
        title2max = {
            key: max(x.paragraph.par_id for x in group)
            for key, group in itertools.groupby(
                sorted(questions, key=lambda x: x.paragraph.doc_title),
                key=lambda x: x.paragraph.doc_title)
        }
        for title in title2max:
            title2doc[title].paragraphs = title2doc[
                title].paragraphs[:title2max[title] + 1]

        for title in tqdm(title2max):
            np.savez_compressed(
                get_filename(questions == train, title), **{
                    str(k): v
                    for k, v in encode_document(encoder,
                                                title2doc[title]).items()
                })
Example #5
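# Fetches the Wikipedia text of the relevant SQuAD titles from the DrQA document DB, tokenizes it
# with a worker pool, encodes every paragraph against a dummy question, and writes docs.json plus
# a single encodings.npz keyed by "<title>_<paragraph index>".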
def squad_build_drqa_doc_encodings(out_dir,
                                   encoder_model,
                                   num_workers,
                                   all_squad=False):
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    if all_squad:
        questions.extend(corpus.get_train())
    # docs = corpus.dev_title_to_document.values() if dev else corpus.train_title_to_document.values()
    relevant_titles = list(set([q.paragraph.doc_title for q in questions]))

    conn = sqlite3.connect(DRQA_DOC_DB)
    c = conn.cursor()
    titles = list(set([q.paragraph.doc_title for q in questions]))
    for i, t in enumerate(titles):
        # Had to manually resolve this (due to changes in Wikipedia?)
        if t == "Sky (United Kingdom)":
            titles[i] = "Sky UK"

    title_to_doc_id = {t1: t2 for t1, t2 in zip(titles, relevant_titles)}

    c.execute("CREATE TEMPORARY TABLE squad_docs(id)")
    c.executemany("INSERT INTO squad_docs VALUES (?)", [(x, ) for x in titles])

    c.execute("SELECT id, text FROM documents WHERE id IN squad_docs")

    out = c.fetchall()
    conn.close()

    out = [(title_to_doc_id[title], text) for title, text in out]

    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=ResourceLoader())

    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])

    documents = {}
    tokenized_documents = {}

    print("Tokenizing...")
    with tqdm(total=len(out)) as pbar:
        for doc, tok_doc in tqdm(
                workers.imap_unordered(get_document_paragraphs, out)):
            documents.update(doc)
            tokenized_documents.update(tok_doc)
            pbar.update()

    encodings = {}
    print("Encoding...")
    for title, paragraphs in tqdm(tokenized_documents.items()):
        dummy_question = "Hello Hello".split()
        model_paragraphs = [
            BinaryQuestionAndParagraphs(question=dummy_question,
                                        paragraphs=[x],
                                        label=1,
                                        num_distractors=0,
                                        question_id='dummy')
            for x in paragraphs
        ]
        encodings.update({
            f"{title}_{i}": rep
            for i, rep in enumerate(encoder.encode_paragraphs(
                model_paragraphs))
        })

    with open(join(out_dir, 'docs.json'), 'w') as f:
        json.dump(documents, f)
    np.savez_compressed(join(out_dir, 'encodings.npz'), **encodings)
Example #6
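# Builds a paragraph-retrieval evaluation file for SQuAD dev: questions are encoded and matched by
# kNN against the precomputed paragraph encodings, either within each question's own document
# (per_doc=True) or against all paragraphs, keeping the top-k unique paragraphs per question.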
def build_doc_eval_file(out_file,
                        encodings_dir,
                        encoder_model,
                        k,
                        per_doc=True):
    print("loading data...")
    corpus = SquadRelevanceCorpus()
    questions = corpus.get_dev()
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    voc = corpus.get_vocab()
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=corpus.get_resource_loader())

    par_encs = np.load(join(encodings_dir, 'encodings.npz'))
    with open(join(encodings_dir, 'docs.json'), 'r') as f:
        documents = json.load(f)

    questions_eval_format = []
    questions = sorted(questions, key=lambda x: x.paragraph.doc_title)
    if per_doc:
        title2par_encs = {}
        for p_name, rep in par_encs.items():
            title = '_'.join(p_name.split('_')[:-1])
            if title in title2par_encs:
                title2par_encs[title].update({p_name: rep})
            else:
                title2par_encs[title] = {p_name: rep}
        for title, doc_qs in tqdm(
                itertools.groupby(questions,
                                  key=lambda x: x.paragraph.doc_title)):
            doc_qs = list(doc_qs)
            q_encodings = encode_squad.encode_questions(encoder, doc_qs)
            par2ids = {}
            reps = []
            total_sentences = 0
            for p_name, rep in title2par_encs[title].items():
                par2ids[p_name] = list(
                    range(total_sentences, total_sentences + len(rep)))
                reps.append(rep)
                total_sentences += len(rep)
            id2par = {i: p for p, ids in par2ids.items() for i in ids}
            reps = np.concatenate(reps, axis=0)
            top_k = simple_numpy_knn(q_encodings, reps, k * 2)
            for idx, question in enumerate(doc_qs):
                seen = set()
                p_names = [
                    id2par[x] for x in top_k[idx]
                    if not (id2par[x] in seen or seen.add(id2par[x]))
                ][:k]
                questions_eval_format.append({
                    'qid': question.question_id,
                    'question': ' '.join(question.question),
                    'answers': list(question.answers),
                    'paragraphs': [
                        documents['_'.join(p_name.split('_')[:-1])][int(p_name.split('_')[-1])]
                        for p_name in p_names
                    ]
                })
    else:
        print("encoding questions")
        q_encodings = encode_squad.encode_questions(encoder, questions)
        par2ids = {}
        reps = []
        total_sentences = 0
        for p_name, rep in par_encs.items():
            par2ids[p_name] = list(
                range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2par = {i: p for p, ids in par2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        print("scoring")
        top_k = simple_numpy_knn(q_encodings, reps, k * 2)
        for idx, question in enumerate(questions):
            seen = set()
            p_names = [
                id2par[x] for x in top_k[idx]
                if not (id2par[x] in seen or seen.add(id2par[x]))
            ][:k]
            questions_eval_format.append({
                'qid': question.question_id,
                'question': ' '.join(question.question),
                'answers': list(question.answers),
                'paragraphs': [
                    documents['_'.join(p_name.split('_')[:-1])][int(p_name.split('_')[-1])]
                    for p_name in p_names
                ]
            })

    with open(out_file, 'w') as f:
        json.dump(questions_eval_format, f)
Example #7
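# Open-domain retrieval: for each question, the stored paragraph encodings of its pre-ranked
# 'top_titles' are loaded by a worker pool, scored against the question's search vector, and the
# k nearest paragraph texts are attached to the question under 'paragraphs'.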
def build_openqa_top_titles(out_file, questions_file, docs_file, encodings_dir,
                            encoder_model, k, use_ema: bool, checkpoint: str,
                            safety_mult: int, n_titles: int):
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(
                workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()

    workers.close()
    workers.join()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    all_titles = list(
        set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32,
                          initializer=init_encoding_handler,
                          initargs=[encodings_dir])
    # title2encs = {}
    # title2idx2par_name = {}
    # with tqdm(total=len(all_titles)) as pbar:
    #     for t2enc, t2id2p in tqdm(workers.imap_unordered(get_title_mappings_from_saver, all_titles)):
    #         title2encs.update(t2enc)
    #         title2idx2par_name.update(t2id2p)
    #         pbar.update()

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=1,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                           vocabulary=voc,
                                           spec=spec,
                                           loader=ResourceLoader(),
                                           use_char_inputs=False,
                                           use_ema=use_ema,
                                           checkpoint=checkpoint)

    print("Encoding questions...")
    q_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=True,
        show_progress=True)

    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions),
                              total=len(questions),
                              ncols=80):
        q_titles = question['top_titles']
        if n_titles is not None:
            q_titles = q_titles[:n_titles]
        title2encs = {}
        title2idx2par_name = {}
        for t2enc, t2id2p in workers.imap_unordered(
                get_title_mappings_from_saver, q_titles):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
        q_enc = q_encodings[idx]
        title2ids = {}
        reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in q_titles:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        reps = np.concatenate(reps, axis=0)
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), reps,
                                 k * safety_mult)[0]

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        seen = set()
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k]
        question['paragraphs'] = [
            parname_to_text(p_name) for p_name in p_names
        ]

    with open(out_file, 'w') as f:
        json.dump(questions, f)
Example #8
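# Iterative (two-hop) retrieval: the question's search vector selects k1 paragraphs, the question
# is reformulated with each of them (from encodings or from text), the reformulations retrieve a
# second set, and the resulting k2 paragraph pairs are stored under 'paragraph_pairs'.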
def build_openqa_iterative_top_titles(out_file, questions_file, docs_file,
                                      encodings_dir, encoder_model, k1, k2, n1,
                                      n2, evaluate: bool,
                                      reformulate_from_text: bool,
                                      use_ema: bool, checkpoint: str):
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    with open(docs_file, 'r') as f:
        documents = json.load(f)
    print(f'Done, took {time.time()-s} seconds.')

    if n1 is not None and n2 is not None:
        for q in questions:
            q['top_titles'] = q['top_titles'][:max(n1, n2)]

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(
                workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    all_titles = list(
        set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        return documents[par_title][par_num]

    print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(32,
                          initializer=init_encoding_handler,
                          initargs=[encodings_dir])
    title2encs = {}
    title2idx2par_name = {}
    with tqdm(total=len(all_titles)) as pbar:
        for t2enc, t2id2p in tqdm(
                workers.imap_unordered(get_title_mappings_from_saver,
                                       all_titles)):
            title2encs.update(t2enc)
            title2idx2par_name.update(t2id2p)
            pbar.update()
    title2par_name2idxs = {}
    for title, id2par in title2idx2par_name.items():
        par2idxs = {}
        for idx, parname in id2par.items():
            if parname in par2idxs:
                par2idxs[parname].append(idx)
            else:
                par2idxs[parname] = [idx]
        title2par_name2idxs[title] = {
            par: sorted(idxs)
            for par, idxs in par2idxs.items()
        }

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=2,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model,
                                            vocabulary=voc,
                                            spec=spec,
                                            loader=ResourceLoader(),
                                            use_char_inputs=False,
                                            use_ema=use_ema,
                                            checkpoint=checkpoint)

    print("Encoding questions...")
    q_original_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=False,
        show_progress=True)
    q_search_encodings = encoder.question_rep_to_search_vector(
        question_encodings=q_original_encodings)

    init()  # for initializing the tokenizer

    print("Calculating similarities...")
    for idx, question in tqdm(enumerate(questions), total=len(questions)):
        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n1]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        q_enc = q_search_encodings[idx]
        top_k = simple_numpy_knn(np.expand_dims(q_enc, 0), all_par_reps,
                                 k1 * 2)[0]

        def id_to_par_name(rep_id):
            return title2idx2par_name[id2title[rep_id]][
                rep_id - titles_offset_dict[id2title[rep_id]]]

        seen = set()
        p_names = [
            id_to_par_name(x) for x in top_k
            if not (id_to_par_name(x) in seen or seen.add(id_to_par_name(x)))
        ][:k1]
        iteration1_paragraphs = \
            [title2encs[par_name_to_title(pname)][title2par_name2idxs[par_name_to_title(pname)][pname], :]
             for pname in p_names]
        if not reformulate_from_text:
            reformulations = encoder.reformulate_questions(
                questions_rep=np.tile(q_original_encodings[idx],
                                      reps=(len(p_names), 1)),
                paragraphs_rep=iteration1_paragraphs,
                return_search_vectors=True)
        else:
            tok_q = tokenize(question['question']).words()
            par_texts = [
                tokenize(parname_to_text(pname)).words() for pname in p_names
            ]
            reformulations = encoder.reformulate_questions_from_texts(
                tokenized_questions=[tok_q for _ in range(len(par_texts))],
                tokenized_pars=par_texts,
                return_search_vectors=True)

        title2ids = {}
        all_par_reps = []
        total_sentences = 0
        titles_offset_dict = {}
        for title in question['top_titles'][:n2]:
            titles_offset_dict[title] = total_sentences
            rep = title2encs[title]
            title2ids[title] = list(
                range(total_sentences, total_sentences + len(rep)))
            all_par_reps.append(rep)
            total_sentences += len(rep)
        id2title = {i: title for title, ids in title2ids.items() for i in ids}
        all_par_reps = np.concatenate(all_par_reps, axis=0)

        top_k_second = numpy_global_knn(reformulations, all_par_reps, k2 * k1)
        seen = set()
        final_p_name_pairs = [
            (p_names[x1], id_to_par_name(x2)) for x1, x2 in top_k_second
            if not ((p_names[x1], id_to_par_name(x2)) in seen or seen.add(
                (p_names[x1], id_to_par_name(x2))))
        ][:k2]

        # important to note that in the iterative dataset the paragraphs of each question are in pairs
        question['paragraph_pairs'] = final_p_name_pairs

    with open(out_file, 'w') as f:
        json.dump(questions, f)

    if evaluate:
        eval_questions(questions)
Example #9
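# Encodes a document collection (from a docs file, a questions file's top titles, or the whole DB)
# in chunks: paragraphs are tokenized by a worker pool, sorted longest-first so OOMs surface early,
# encoded in separate long/short batches, and saved through DocumentEncodingHandler.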
def encode_from_file(docs_file,
                     questions_file,
                     encodings_dir,
                     encoder_model,
                     num_workers,
                     hotpot: bool,
                     long_batch: int,
                     short_batch: int,
                     use_chars: bool,
                     use_ema: bool,
                     checkpoint: str,
                     document_chunk_size=1000,
                     samples=None,
                     encode_all_db=False):
    """

    :param out_file: .npz file to dump the encodings
    :param docs_file: path to json file whose structure is [{title: list of paragraphs}, ...]
    :return:
    """
    doc_encs_handler = DocumentEncodingHandler(encodings_dir)
    # Setup worker pool
    workers = ProcessPool(num_workers, initializer=init, initargs=[])

    if docs_file is not None:
        with open(docs_file, 'r') as f:
            documents = json.load(f)
        documents = {
            k: v
            for k, v in documents.items()
            if k not in doc_encs_handler.titles2filenames
        }

        tokenized_documents = {}
        tupled_doc_list = [(title, pars) for title, pars in documents.items()]

        if samples is not None:
            print(f"sampling {samples} samples")
            tupled_doc_list = tupled_doc_list[:samples]

        print("Tokenizing from file...")
        with tqdm(total=len(tupled_doc_list), ncols=80) as pbar:
            for tok_doc in tqdm(
                    workers.imap_unordered(tokenize_document,
                                           tupled_doc_list)):
                tokenized_documents.update(tok_doc)
                pbar.update()
    else:
        if questions_file is not None:
            with open(questions_file, 'r') as f:
                questions = json.load(f)
            all_titles = list(
                set([title for q in questions for title in q['top_titles']]))
        else:
            print("encoding all DB!")
            all_titles = DocDB().get_doc_titles()

        if samples is not None:
            print(f"sampling {samples} samples")
            all_titles = all_titles[:samples]

        all_titles = [
            t for t in all_titles if t not in doc_encs_handler.titles2filenames
        ]
        tokenized_documents = {}

        print("Tokenizing from DB...")
        with tqdm(total=len(all_titles), ncols=80) as pbar:
            for tok_doc in tqdm(
                    workers.imap_unordered(tokenize_from_db, all_titles)):
                tokenized_documents.update(tok_doc)
                pbar.update()

    workers.close()
    workers.join()

    voc = set()
    for paragraphs in tokenized_documents.values():
        for par in paragraphs:
            voc.update(par)

    if not hotpot:
        spec = QuestionAndParagraphsSpec(batch_size=None,
                                         max_num_contexts=1,
                                         max_num_question_words=None,
                                         max_num_context_words=None)
        encoder = SentenceEncoderSingleContext(model_dir_path=encoder_model,
                                               vocabulary=voc,
                                               spec=spec,
                                               loader=ResourceLoader(),
                                               use_char_inputs=use_chars,
                                               use_ema=use_ema,
                                               checkpoint=checkpoint)
    else:
        spec = QuestionAndParagraphsSpec(batch_size=None,
                                         max_num_contexts=2,
                                         max_num_question_words=None,
                                         max_num_context_words=None)
        encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model,
                                                vocabulary=voc,
                                                spec=spec,
                                                loader=ResourceLoader(),
                                                use_char_inputs=use_chars,
                                                use_ema=use_ema,
                                                checkpoint=checkpoint)

    tokenized_documents_items = list(tokenized_documents.items())
    for tokenized_doc_chunk in tqdm([
            tokenized_documents_items[i:i + document_chunk_size] for i in
            range(0, len(tokenized_documents_items), document_chunk_size)
    ],
                                    ncols=80):
        flattened_pars_with_names = [(f"{title}_{i}", par)
                                     for title, pars in tokenized_doc_chunk
                                     for i, par in enumerate(pars)]

        # filtering out empty paragraphs (probably had some short string the tokenization removed)
        # important to notice that the filtered paragraphs will have no representation,
        # but they still exist in the numbering of paragraphs for consistency with the docs.
        flattened_pars_with_names = [(name, par)
                                     for name, par in flattened_pars_with_names
                                     if len(par) > 0]

        # sort such that longer paragraphs are first to identify OOMs early on
        flattened_pars_with_names = sorted(flattened_pars_with_names,
                                           key=lambda x: len(x[1]),
                                           reverse=True)
        long_paragraphs_ids = [
            i for i, name_par in enumerate(flattened_pars_with_names)
            if len(name_par[1]) >= 900
        ]
        short_paragraphs_ids = [
            i for i, name_par in enumerate(flattened_pars_with_names)
            if len(name_par[1]) < 900
        ]

        # print(f"Encoding {len(flattened_pars_with_names)} paragraphs...")
        name2enc = {}
        dummy_question = "Hello Hello".split()
        if not hotpot:
            model_paragraphs = [
                BinaryQuestionAndParagraphs(question=dummy_question,
                                            paragraphs=[x],
                                            label=1,
                                            num_distractors=0,
                                            question_id='dummy')
                for _, x in flattened_pars_with_names
            ]
        else:
            # todo allow precomputed sentence segments
            model_paragraphs = [
                IterativeQuestionAndParagraphs(question=dummy_question,
                                               paragraphs=[x, dummy_question],
                                               first_label=1,
                                               second_label=1,
                                               question_id='dummy',
                                               sentence_segments=None)
                for _, x in flattened_pars_with_names
            ]

        # print("Encoding long paragraphs...")
        long_pars = [model_paragraphs[i] for i in long_paragraphs_ids]
        name2enc.update({
            flattened_pars_with_names[long_paragraphs_ids[i]][0]: enc
            for i, enc in enumerate(
                encoder.encode_paragraphs(
                    long_pars, batch_size=long_batch, show_progress=True
                ) if not hotpot else encoder.encode_first_paragraphs(
                    long_pars, batch_size=long_batch, show_progress=True))
        })

        # print("Encoding short paragraphs...")
        short_pars = [model_paragraphs[i] for i in short_paragraphs_ids]
        name2enc.update({
            flattened_pars_with_names[short_paragraphs_ids[i]][0]: enc
            for i, enc in enumerate(
                encoder.encode_paragraphs(
                    short_pars, batch_size=short_batch, show_progress=True
                ) if not hotpot else encoder.encode_first_paragraphs(
                    short_pars, batch_size=short_batch, show_progress=True))
        })

        doc_encs_handler.save_multiple_documents(name2enc)
Example #10

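# Script fragment: loads a TF-IDF document ranker, the document DB, and Wikipedia word counts for
# the vocabulary, then an iterative sentence encoder and (below) a HotpotQA QA model with its
# evaluator, clustered batcher, and checkpoint selection.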
print("Loading TF-IDF...")
tfidf_ranker = TfidfDocRanker()
db = DocDB()

loader = ResourceLoader()
# loader = HotpotQuestions().get_resource_loader()
word_counts = load_counts(join(LOCAL_DATA_DIR, 'hotpot', 'wiki_word_counts.txt'))
title_counts = load_counts(join(LOCAL_DATA_DIR, 'hotpot', 'wiki_title_word_counts.txt'))
word_counts.update(title_counts)
voc = set(word_counts.keys())

print("Loading encoder...")

spec = QuestionAndParagraphsSpec(batch_size=None, max_num_contexts=2,
                                 max_num_question_words=None, max_num_context_words=None)
encoder = SentenceEncoderIterativeModel(model_dir_path=args.encoder_model, vocabulary=voc,
                                        spec=spec, loader=loader, use_char_inputs=False,
                                        use_ema=not args.no_ema,
                                        checkpoint=args.checkpoint)

print("Loading QA model...")
evaluators = [RecordHotpotQAPrediction(15, True, sp_prediction=True, disable_tqdm=True)]
batcher = ClusteredBatcher(64, multiple_contexts_len, truncate_batches=True)
qa_model_dir = ModelDir(args.qa_model)
checkpoint = None
if args.checkpoint == 'best':
    checkpoint = qa_model_dir.get_best_weights()
if checkpoint is not None:
    print("Using best weights")
else:
Example #11
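# Grid-search variant of the iterative retrieval: sweeps every (n1, n2, k1, k2) combination with
# iterative_retrieval, reusing the encoded questions, and writes one result file per combination,
# optionally evaluating each.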
def build_openqa_iterative_top_titles(
        base_dir, questions_file, docs_file, encodings_dir, encoder_model,
        k1_list: List[int], k2_list: List[int], n1_list: List[int],
        n2_list: List[int], evaluate: bool, reformulate_from_text: bool,
        use_ema: bool, checkpoint: str, safety_mult: int):
    print('Loading data...')
    s = time.time()
    with open(questions_file, 'r') as f:
        questions = json.load(f)
    if docs_file is not None:
        with open(docs_file, 'r') as f:
            documents = json.load(f)
    else:
        docs_db = DocDB()
    print(f'Done, took {time.time()-s} seconds.')

    if n1_list is not None and n2_list is not None:
        for q in questions:
            q['top_titles'] = q['top_titles'][:max(max(n1_list), max(n2_list))]

    # Setup worker pool
    workers = ProcessPool(16, initializer=init, initargs=[])

    qid2tokenized = {}
    tupled_questions = [(q['qid'], q['question']) for q in questions]
    print("Tokenizing questions...")
    with tqdm(total=len(tupled_questions)) as pbar:
        for tok_q in tqdm(
                workers.imap_unordered(tokenize_question, tupled_questions)):
            qid2tokenized.update(tok_q)
            pbar.update()

    voc = set()
    for question in qid2tokenized.values():
        voc.update(question)

    workers.close()
    workers.join()

    # all_titles = list(set([title for q in questions for title in q['top_titles']]))

    def parname_to_text(par_name):
        par_title = par_name_to_title(par_name)
        par_num = int(par_name.split('_')[-1])
        if docs_file is not None:
            return documents[par_title][par_num]
        return ' '.join(docs_db.get_doc_sentences(par_title))

    # print(f"Gathering documents...")
    # Setup worker pool
    workers = ProcessPool(16,
                          initializer=init_encoding_handler,
                          initargs=[encodings_dir])
    # title2encs = {}
    # title2idx2par_name = {}
    # with tqdm(total=len(all_titles)) as pbar:
    #     for t2enc, t2id2p in tqdm(workers.imap_unordered(get_title_mappings_from_saver, all_titles)):
    #         title2encs.update(t2enc)
    #         title2idx2par_name.update(t2id2p)
    #         pbar.update()
    # title2par_name2idxs = {}
    # for title, id2par in title2idx2par_name.items():
    #     par2idxs = {}
    #     for idx, parname in id2par.items():
    #         if parname in par2idxs:
    #             par2idxs[parname].append(idx)
    #         else:
    #             par2idxs[parname] = [idx]
    #     title2par_name2idxs[title] = {par: sorted(idxs) for par, idxs in par2idxs.items()}

    print("Loading encoder...")
    spec = QuestionAndParagraphsSpec(batch_size=None,
                                     max_num_contexts=2,
                                     max_num_question_words=None,
                                     max_num_context_words=None)
    encoder = SentenceEncoderIterativeModel(model_dir_path=encoder_model,
                                            vocabulary=voc,
                                            spec=spec,
                                            loader=ResourceLoader(),
                                            use_char_inputs=False,
                                            use_ema=use_ema,
                                            checkpoint=checkpoint)

    print("Encoding questions...")
    q_original_encodings = encoder.encode_text_questions(
        [qid2tokenized[q['qid']] for q in questions],
        return_search_vectors=False,
        show_progress=True)
    q_search_encodings = encoder.question_rep_to_search_vector(
        question_encodings=q_original_encodings)

    init()  # for initializing the tokenizer

    total_num = len(n1_list) * len(n2_list) * len(k1_list) * len(k2_list)
    print("Calculating similarities...")
    for n1, n2, k1, k2 in tqdm(itertools.product(n1_list, n2_list, k1_list,
                                                 k2_list),
                               total=total_num,
                               ncols=80):
        questions = iterative_retrieval(encoder, questions, qid2tokenized,
                                        q_search_encodings, workers,
                                        parname_to_text, reformulate_from_text,
                                        n1, n2, k1, k2, safety_mult)
        dir_path = os.path.join(base_dir, f"n2-{n2}", f"n1-{n1}")
        os.makedirs(dir_path, exist_ok=True)
        out_file = os.path.join(dir_path,
                                f"n1-{n1}_n2-{n2}_k1-{k1}_k2-{k2}.json")
        questions_copy = deepcopy(questions)
        for question in questions_copy:
            question.pop('top_titles')
        with open(out_file, 'w') as f:
            json.dump(questions_copy, f)

        if evaluate:
            eval_questions(questions_copy)