Example #1
0
def filter_user_questions(input_to_filter, faq_contents):
    '''
    Load the json file named by input_to_filter and keep only the examples
    whose related passage's last header appears in faq_contents.

    input_to_filter: str, filename of the json to filter
    faq_contents: set, all faq questions in a set.

    Returns a dict with the surviving 'examples' and the (unfiltered)
    'passages' from the input file.
    '''
    with open(input_to_filter, 'r', encoding='utf-8') as in_stream:
        input_data = json.load(in_stream)

    _, pid2passage, _ = get_passages_by_source(input_data, keep_ood=True)

    examples = get_examples(input_data, keep_ood=True)
    # An example survives the filter when the last header of the passage it
    # points to is one of the faq questions.
    filtered_example = [
        example for example in examples
        if get_passage_last_header(pid2passage[get_passage_id(example)])
        in faq_contents
    ]

    logger.info(
        'file {}: passage size {} / pre-filtering example size {} / post filtering examples size'
        ' {}'.format(input_to_filter, len(input_data['passages']),
                     len(examples), len(filtered_example)))

    return {'examples': filtered_example, 'passages': input_data['passages']}
Example #2
0
def generate_embeddings(ret_trainee, input_file, out_file):
    """Embed questions and in-distribution passage headers, then pickle them.

    ret_trainee: object whose .retriever performs the embedding.
    input_file: str, json file holding 'examples' and 'passages'.
    out_file: str, destination pickle file for the embeddings.

    Side effects: writes a pickle with keys 'question_embs',
    'passage_header_embs' and 'question_labels'; logs progress.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        json_data = json.load(f)

    source2passages, pid2passage, _ = get_passages_by_source(json_data)

    question_embs = []
    labels = []
    for example in tqdm(json_data["examples"]):
        related_passage = pid2passage[get_passage_id(example)]
        # Label each question by whether its passage is in-distribution.
        labels.append('id' if is_in_distribution(related_passage) else 'ood')
        question_embs.append(
            ret_trainee.retriever.embed_question(get_question(example)))

    passage_header_embs = []
    ood = 0
    for source, passages in source2passages.items():
        logger.info('embedding passages for source {}'.format(source))
        for passage in tqdm(passages):
            # Out-of-distribution passages are counted and skipped, not embedded.
            if not is_in_distribution(passage):
                ood += 1
                continue
            header = get_passage_last_header(passage, return_error_for_ood=True)
            passage_header_embs.append(
                ret_trainee.retriever.embed_paragraph(header))

    to_serialize = {"question_embs": question_embs,
                    "passage_header_embs": passage_header_embs,
                    "question_labels": labels}
    with open(out_file, "wb") as out_stream:
        pickle.dump(to_serialize, out_stream)
    logger.info(
        'saved {} question embeddings and {} passage header embeddings ({} skipped because '
        'out-of-distribution)'.format(
            len(question_embs), len(passage_header_embs), ood))
Example #3
0
def generate_embeddings(ret_trainee, input_file=None, out_file=None, json_data=None,
                        embed_passages=True):
    """Embed questions and (optionally) in-distribution passage headers.

    ret_trainee: object whose .retriever performs the embedding.
    input_file: str or None, json file to load the data from; takes
        precedence over json_data when both are given.
    out_file: str or None, pickle destination; no file is written when None.
    json_data: dict or None, already-loaded data used when input_file is None.
    embed_passages: bool, whether to embed passage headers at all.

    Returns the dict that would be pickled: question embeddings/texts/labels
    plus passage header embeddings/texts.
    Raises ValueError when neither input_file nor json_data is supplied.
    """
    if input_file:
        with open(input_file, "r", encoding="utf-8") as f:
            json_data = json.load(f)
    elif json_data is not None:
        # BUGFIX: this was a truthiness check (`elif json_data:`), so an
        # explicitly supplied but empty json_data dict was misreported as
        # "not specified". Identity check honors any provided value.
        pass
    else:
        raise ValueError("You should specify either the input file or the json_data")

    source2passages, pid2passage, _ = get_passages_by_source(json_data)

    question_embs = []
    question_texts = []
    labels = []
    if json_data.get("examples"):
        for example in tqdm(json_data["examples"]):
            pid = get_passage_id(example)
            passage = pid2passage[pid]
            # Label each question by whether its passage is in-distribution.
            labels.append('id' if is_in_distribution(passage) else 'ood')
            question = get_question(example)
            emb = ret_trainee.retriever.embed_question(question)
            question_embs.append(emb)
            question_texts.append(question)

    passage_header_embs = []
    ood = 0
    passage_texts = []
    if embed_passages:
        for source, passages in source2passages.items():
            logger.info('embedding passages for source {}'.format(source))
            for passage in tqdm(passages):
                if is_in_distribution(passage):
                    passage_text = get_passage_last_header(passage, return_error_for_ood=True)
                    emb = ret_trainee.retriever.embed_paragraph(
                        passage_text)
                    passage_header_embs.append(emb)
                    passage_texts.append(passage_text)
                else:
                    # Out-of-distribution passages are counted and skipped.
                    ood += 1

    to_serialize = {"question_embs": question_embs, "passage_header_embs": passage_header_embs,
                    "question_labels": labels, "passage_texts": passage_texts,
                    "question_texts": question_texts}
    if out_file:
        with open(out_file, "wb") as out_stream:
            pickle.dump(to_serialize, out_stream)
    logger.info(
        'generated {} question embeddings and {} passage header embeddings ({} skipped because '
        'out-of-distribution)'.format(
            len(question_embs), len(passage_header_embs), ood))

    return to_serialize
    def collect_answers(self, source2passages, out_file=None):
        """Embed each source's passage headers and cache them on the instance.

        source2passages: dict mapping a source name to its list of passages.
        out_file: optional path; when given, the (partial) mapping is saved
            with torch.save after every source that was embedded.

        Side effect: rebuilds self.source2embedded_passages from scratch;
        sources with no passages map to None.
        """
        self.source2embedded_passages = {}
        for source, passages in source2passages.items():
            logger.info("encoding source {}".format(source))
            if not passages:
                # Empty source: record an explicit None, nothing to embed.
                self.source2embedded_passages[source] = None
                continue
            headers = [get_passage_last_header(p) for p in passages]
            # NOTE(review): 'embed_paragrphs' looks misspelled but matches the
            # model API as called here — confirm against the model class
            # before renaming.
            embedded = self.model.embed_paragrphs(headers, progressbar=True)
            self.source2embedded_passages[source] = embedded
            if out_file:
                torch.save(self.source2embedded_passages, out_file)