Exemple #1
0
    def expand_from_preext_sent_rule(self):
        """Expand ``item['prioritized_docids']`` with documents linked from
        pre-extracted sentences.

        For every sentence previously retrieved for this item, fetch that
        sentence's hyperlinks from the fever DB and append each linked
        document id with a fixed priority of 1.0.

        Returns:
            self, to allow rule chaining.
        """
        if not hasattr(self, 'cursor'):
            self.cursor = fever_db.get_cursor()
        if not hasattr(self, 'preext_sent_dict'):
            # Lazily load the upstream sentence-retrieval results and index
            # them by item id for O(1) lookup.
            d_list = load_data(config.RESULT_PATH / \
                "sent_retri_nn/2018_07_17_16-34-19_r/dev_scale(0.1).jsonl")
            self.preext_sent_dict = {item['id']: item for item in d_list}
        item = self.item

        new_pdocids = copy(item['prioritized_docids'])
        sent_ids = self.preext_sent_dict[item['id']]['predicted_sentids']
        for sent_id in sent_ids:
            docid, sent_ind = sent_id.split('<SENT_LINE>')
            sent_ind = int(sent_ind)
            id_list, sent_list, sent_links = \
                fever_db.get_evidence(self.cursor,
                                      docid,
                                      sent_ind)
            sent_links = json.loads(sent_links)
            # Reshape the flat link list into pairs and keep the second
            # element of each pair (presumably the link target -- confirm
            # against the fever DB schema). The original code called
            # np.array twice in a row; once is enough.
            all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
            all_links = list(map(reverse_convert_brc, all_links))
            new_pdocids.extend([(id_link, 1.0) \
                for id_link in all_links])
        item['prioritized_docids'] = new_pdocids
        return self
Exemple #2
0
    def expand_from_preext_sent_rule(self):
        """Build sentence-level prioritized doc ids from scored sentences.

        For every scored sentence previously retrieved for this item, fetch
        its hyperlinks from the fever DB and record each linked document id
        with the sentence's score, both as a flat list
        (``prioritized_docids_sent``) and grouped by sentence id
        (``structured_docids_sent``).

        Returns:
            self, to allow rule chaining.
        """
        if not hasattr(self, 'cursor'):
            self.cursor = fever_db.get_cursor()
        if not hasattr(self, 'preext_sent_dict'):
            # Lazily load the upstream sentence-retrieval results and index
            # them by item id for O(1) lookup.
            d_list = read_jsonl(config.RESULT_PATH / \
                "sent_retri_nn/2018_07_17_16-34-19_r/train_scale(0.1).jsonl")
            self.preext_sent_dict = {item['id']: item for item in d_list}
        item = self.item

        new_pdocids = []
        structured_docids_sent = {}
        sent_ids = self.preext_sent_dict[item['id']]['scored_sentids']
        for sent_id, score, probability in sent_ids:
            docid, sent_ind = sent_id.split('<SENT_LINE>')
            sent_ind = int(sent_ind)
            id_list, sent_list, sent_links = \
                fever_db.get_evidence(self.cursor,
                                      docid,
                                      sent_ind)
            sent_links = json.loads(sent_links)
            # Reshape the flat link list into pairs and keep the second
            # element of each pair (presumably the link target -- confirm
            # against the fever DB schema). The original code called
            # np.array twice in a row; once is enough.
            all_links = np.array(sent_links).reshape(-1, 2)[:, 1]
            all_links = list(map(fever_db.reverse_convert_brc, all_links))
            # Wiki document ids use underscores instead of spaces.
            all_links = list(map(lambda x: x.replace(' ', '_'), all_links))
            prio_docids = [(id_link, score) for id_link in all_links]
            new_pdocids.extend(prio_docids)
            structured_docids_sent[sent_id] = prio_docids
        item['prioritized_docids_sent'] = new_pdocids
        item['structured_docids_sent'] = structured_docids_sent
        return self
def utest_check_sentence_lines():
    """Print statistics of candidate-sentence counts per retrieved item."""
    count_histogram = Counter()
    counts_per_item = []
    db_cursor = fever_db.get_cursor()
    d_list = load_data(
        "/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/train.jsonl"
    )
    for item in tqdm(d_list):
        gathered = []
        for doc_id in item['predicted_docids']:
            gathered.extend(fever_db.get_all_sent_by_doc_id(db_cursor, doc_id))

        n_sents = len(gathered)
        count_histogram.update([n_sents])
        counts_per_item.append(n_sents)

    print(len(counts_per_item))
    print('Mean:', np.mean(counts_per_item))
    print('Max:', np.max(counts_per_item))
    print('Min:', np.min(counts_per_item))
    print('Std:', np.std(counts_per_item))
    print(count_histogram)
Exemple #4
0
    def initialize(self):
        """Set up the DB cursor, data reader, and vocabulary for selection."""
        print('Data reader initialization ...')
        self.cursor = fever_db.get_cursor()

        # Indexers for plain token ids and ELMo character encodings.
        indexers = {
            'tokens': SingleIdTokenIndexer(namespace='tokens'),
            'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')
        }
        self.fever_data_reader = SSelectorReader(token_indexers=indexers,
                                                 lazy=cfg.lazy)

        vocab, weight_dict = load_vocab_embeddings(
            config.DATA_ROOT / 'vocab_cache' / 'nli_basic')

        # Register the selection labels, then remap 'hidden' to index -2.
        ns = 'selection_labels'
        for label in ('true', 'false', 'hidden'):
            vocab.add_token_to_namespace(label, namespace=ns)
        vocab.change_token_with_index_to_namespace('hidden', -2, namespace=ns)

        # Presumably forces the index->token mapping to be built; the return
        # value is intentionally discarded -- confirm in the vocab API.
        vocab.get_index_to_token_vocabulary(ns)

        self.vocab = vocab
        self.weight_dict = weight_dict
        self.initialized = True
Exemple #5
0
def adv_sample_v1_0(input_file, additional_file, tokenized=False):
    """Build flat NLI samples by pairing each claim with sampled evidence.

    :param input_file: path to (or already-loaded list of) the base jsonl data.
    :param additional_file: path to (or already-loaded list of) the additional
        per-item data used by the evidence sampler, keyed by item id.
    :param tokenized: if True, claims are assumed pre-tokenized and used
        as-is; otherwise they are tokenized here.
    :return: list of sample dicts with id/claim/evid/verifiable/label.
    """
    cursor = fever_db.get_cursor()
    try:
        d_list = load_data(input_file)

        if isinstance(additional_file, list):
            additional_d_list = additional_file
        else:
            additional_d_list = load_data(additional_file)
        additional_data_dict = {add_item['id']: add_item
                                for add_item in additional_d_list}

        sampled_data_list = []

        for item in tqdm(d_list):
            sampled_e_list, flags = sample_additional_data_for_item_v1_0(
                item, additional_data_dict)
            for i, (sampled_evidence,
                    flag) in enumerate(zip(sampled_e_list, flags)):
                # Build a fresh flat record; we intentionally do not deep-copy
                # the original item (error analysis may change this later).
                new_item = dict()
                evidence_text = evidence_list_to_text(cursor,
                                                      sampled_evidence,
                                                      contain_head=True,
                                                      id_tokenized=tokenized)

                # One sample per (item, evidence-set); '#i' keeps them traceable.
                new_item['id'] = str(item['id']) + '#' + str(i)

                if tokenized:
                    new_item['claim'] = item['claim']
                else:
                    new_item['claim'] = ' '.join(easy_tokenize(item['claim']))

                new_item['evid'] = evidence_text
                new_item['verifiable'] = item['verifiable']
                new_item['label'] = item['label']

                sampled_data_list.append(new_item)
    finally:
        # Release the DB cursor even if sampling raises (the original leaked
        # it on any exception).
        cursor.close()

    return sampled_data_list
Exemple #6
0
    def disambiguous_from_preext_sent_rule(self):
        """Disambiguation rule based on pre-extracted sentence results.

        NOTE(review): this rule looks unfinished -- the lookup below uses an
        empty-string key, and its result (``sent_ids``) is never used, so the
        method currently has no effect beyond lazily creating the cursor and
        the sentence dictionary. Confirm the intended key and behavior.

        Returns:
            self, to allow rule chaining.
        """
        if not hasattr(self, 'cursor'):
            self.cursor = fever_db.get_cursor()
        if not hasattr(self, 'preext_sent_dict'):
            # Lazily load upstream sentence-retrieval output, keyed by item id.
            d_list = read_jsonl(config.RESULT_PATH / \
                "sent_retri_nn/2018_07_17_16-34-19_r/train_sent.jsonl")
            self.preext_sent_dict = {item['id']: item for item in d_list}
        item = self.item

        if len(item['prioritized_docids']) > 60:
            # NOTE(review): '' is almost certainly a placeholder key -- bug?
            sent_ids = self.preext_sent_dict[item['id']]['']
        return self
Exemple #7
0
def select_sent_for_eval(input_file, additional_file, tokenized=False):
    """
    This method select sentences with upstream sentence retrieval.

    :param input_file: This should be the file with 5 sentences selected.
    :param additional_file: path to (or already-loaded list of) the upstream
        results containing 'predicted_sentids' per item, keyed by item id.
    :param tokenized: if True, claims are used as-is; otherwise tokenized.
    :return: the input items, each augmented with 'evid',
        'predicted_evidence' and 'predicted_sentids'.
    """
    cursor = fever_db.get_cursor()
    try:
        if isinstance(additional_file, list):
            additional_d_list = additional_file
        else:
            additional_d_list = load_data(additional_file)
        additional_data_dict = dict()

        for add_item in additional_d_list:
            additional_data_dict[add_item['id']] = add_item

        d_list = load_data(input_file)

        for item in tqdm(d_list):
            e_list = additional_data_dict[item['id']]['predicted_sentids']
            # Sanity-check that the two files describe the same items.
            assert additional_data_dict[item['id']]['label'] == item['label']
            assert additional_data_dict[item['id']]['id'] == item['id']
            assert additional_data_dict[
                item['id']]['verifiable'] == item['verifiable']

            pred_evidence_list = []
            for i, cur_e in enumerate(e_list):
                doc_id = cur_e.split(c_scorer.SENT_LINE)[0]
                ln = int(cur_e.split(
                    c_scorer.SENT_LINE)[1])  # Important changes Bugs: July 21
                pred_evidence_list.append((doc_id, ln))

            pred_evidence = check_sentences.Evidences(pred_evidence_list)

            evidence_text = evidence_list_to_text(cursor,
                                                  pred_evidence,
                                                  contain_head=True,
                                                  id_tokenized=tokenized)

            if tokenized:
                pass
            else:
                item['claim'] = ' '.join(easy_tokenize(item['claim']))

            item['evid'] = evidence_text
            item['predicted_evidence'] = convert_evidence2scoring_format(e_list)
            item['predicted_sentids'] = e_list
            # This change need to be saved.
            # item['predicted_label'] = additional_data_dict[item['id']]['label']
    finally:
        # Release the DB cursor even on failure (the original never closed it).
        cursor.close()

    return d_list
def if_idf_select_sentence():
    """Select top-5 sentences per item via online tf-idf over predicted docs,
    score the result, and save it as an intermediate jsonl file."""
    db_cursor = fever_db.get_cursor()
    loaded_path = "/Users/Eason/RA/FunEver/results/doc_retri/2018_07_04_21:56:49_r/dev.jsonl"
    d_list = load_data(loaded_path)

    # Ranker configuration is loop-invariant; build it once, not per item.
    Args = namedtuple('Args', 'ngram hash_size num_workers')
    args = Args(2, int(8192), 4)

    for item in tqdm(d_list):
        p_docids = item['predicted_docids']
        cleaned_claim = ' '.join(easy_tokenize(item['claim']))

        # Gather all candidate sentences (and their ids) from the
        # predicted documents.
        current_sent_list = []
        current_id_list = []
        for doc_id in p_docids:
            r_list, id_list = fever_db.get_all_sent_by_doc_id(
                db_cursor, doc_id)
            current_sent_list.extend(r_list)
            current_id_list.extend(id_list)

        ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram,
                                      current_sent_list)

        selected_index, selected_score = ranker.closest_docs(cleaned_claim,
                                                             k=5)

        # Convert '(-.-)'-joined ids into the canonical SENT_LINE format.
        selected_sent_id = []
        for ind in selected_index:
            current_selected = current_id_list[ind]
            doc_id, ln = current_selected.split('(-.-)')
            selected_sent_id.append(doc_id + c_scorer.SENT_LINE + ln)

        item['predicted_sentids'] = selected_sent_id

    # Done with the DB; release the cursor (the original leaked it).
    db_cursor.close()

    eval_mode = {'check_sent_id_correct': True, 'standard': False}
    print(c_scorer.fever_score(d_list, d_list, mode=eval_mode, verbose=False))

    out_fname = config.RESULT_PATH / "sent_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list,
                              out_filename=out_fname,
                              last_loaded_path=loaded_path)
Exemple #9
0
def tf_idf_rank(args, top_k=5):
    """Re-rank each item's prioritized documents with an online tf-idf ranker.

    On failure, partial results are dumped to a uniquely-named file before
    the exception is re-raised so earlier work is not lost.
    """
    dev_path = config.PRO_ROOT / \
               'results_old/doc_retri/docretri.basic.nopageview/dev.jsonl'

    cursor = get_cursor()
    d_list_test = read_jsonl(dev_path)

    for i, item in enumerate(spcl(d_list_test)):
        doc_texts = []
        candidate_ids = [docid for docid, _ in item['prioritized_docids']]

        try:
            for doc_id in candidate_ids:
                sentences, _ = get_all_sent_by_doc_id(cursor,
                                                      doc_id,
                                                      with_h_links=False)
                doc_texts.append(' '.join(sentences))

            ranker = OnlineTfidfDocRanker(args, args.hash_size, args.ngram,
                                          doc_texts)
        except Exception as e:
            # Dump what we have so far before propagating the error.
            if i - 1 >= 0:
                print(f'Early quit at {i-1} because of {e}')
                save_path = config.RESULT_PATH / \
                            'doc_retri/docretri.tfidfrank/' \
                            f'dev_quit_dump_{uuid4()}.json'
                DocRetrievalExperiment.dump_results(d_list_test[:i], save_path)
            raise e

        rank_ind, rank_score = ranker.closest_docs(
            ' '.join(item['claim_tokens']), k=100)

        # Default every candidate to 0, then overwrite with tf-idf scores
        # for the documents the ranker returned.
        id_score = {docid: 0 for docid in candidate_ids}
        for ri, rs in zip(rank_ind, rank_score):
            id_score[candidate_ids[ri]] = rs

        item['prioritized_docids'] = list(id_score.items())
        ordered = sorted(item['prioritized_docids'],
                         key=lambda x: (-x[1], x[0]))
        item['predicted_docids'] = list(set(k for k, _ in ordered[:top_k]))

    save_path = config.RESULT_PATH / 'doc_retri/docretri.tfidfrank/dev.json'
    DocRetrievalExperiment.dump_results(d_list_test, save_path)
Exemple #10
0
    def expand_from_doc_rule(self):
        """Current method: if 'prioritized_docids' is shorter than 5, then
        expand every found ID by extract the document, find some highly-scored
        (currently tf-idf score) sentences, find links in them and append those
        documents.

        Discussions on some variations
        ------------------------------
        1. Can use other types of sentence similarity score
        2. Can (kind of) combine sentence score into priority
        3. Match appears first can have higher score propagated

        """
        if not hasattr(self, 'cursor'):
            self.cursor = fever_db.get_cursor()
        item = self.item

        # NOTE(review): docstring says "shorter than 5" but the code expands
        # only when fewer than 2 docs were found -- confirm intended threshold.
        if len(item['prioritized_docids']) < 2:
            # print(f"Query tf-idf... because length={len(item['prioritized_docids'])}")
            new_pdocids = copy(item['prioritized_docids'])
            for docid, priority in item['prioritized_docids']:
                # print(f"Query tf-idf for {docid}")
                sent_list, id_list, sent_links = \
                    fever_db.get_all_sent_by_doc_id(self.cursor,
                                                    docid,
                                                    with_h_links=True)
                # indexes, scores = \
                #     self.sent_sim.preceding_sent_similarity(sent_list,
                #                                         item['claim'])
                # Score every sentence of the document against the claim.
                indexes, scores = \
                    self.sent_sim.tfidf_similarity(sent_list,
                                                        item['claim'])

                # Keep only sentences whose tf-idf similarity exceeds 3.0
                # (magic threshold -- presumably tuned by hand; confirm).
                high_tfidf_indexes = indexes[scores > 3.0]
                if len(high_tfidf_indexes) > 0:
                    # Collect the hyperlinks of the selected sentences,
                    # flatten, reshape into pairs and keep the second element
                    # of each pair (presumably the link target).
                    all_links = np.array(sent_links)[high_tfidf_indexes]
                    all_links = [ii for i in all_links for ii in i]  # flatten links
                    all_links = np.array(all_links)
                    all_links = all_links.reshape(-1, 2)[:, 1]
                    all_links = list(map(reverse_convert_brc, all_links))
                    # Linked docs inherit the priority of the source doc.
                    new_pdocids.extend([(id_link, 1.0*priority) \
                        for id_link in all_links])
            item['prioritized_docids'] = new_pdocids
        return self
Exemple #11
0
def fever_app(caller):
    """Build the full FEVER prediction pipeline and hand it to *caller*.

    Loads all retrieval/selection/NLI models once at startup, then defines
    ``predict_pipeline(claims)``, a closure over those models that runs the
    six-step pipeline over a batch of ``{"claim": ...}`` dicts and returns
    predicted labels plus evidence.

    :param caller: a callable (e.g. a web-app factory) that receives the
        prediction function and returns the application object.
    :return: whatever *caller* returns.
    """
    logger = logging.getLogger()
    # NOTE(review): 'allennlp' is not a standard top-level dictConfig key;
    # per-logger config normally belongs under 'loggers' -- confirm intended.
    dictConfig({
        'version': 1,
        'formatters': {'default': {
            'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
        }},
        'handlers': {'wsgi': {
            'class': 'logging.StreamHandler',
            'stream': 'ext://sys.stderr',
            'formatter': 'default'
        }},
        'root': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
        'allennlp': {
            'level': 'INFO',
            'handlers': ['wsgi']
        },
    })

    logger.info("Set up flask app")

    # Pipeline hyper-parameters (thresholds and top-k sizes).
    nn_doc_retri_threshold = 0.00001
    top_k = 100
    nn_doc_top_k = 10
    sent_prob_for_2doc = 0.1
    sent_topk_for_2doc = 5
    sentence_retri_1_scale_prob = 0.05
    sentence_retri_2_scale_prob = 0.9
    sent_retri_2_top_k = 1
    enhance_retri_1_scale_prob = -1

    def predict_pipeline(claims):
        """Run the six-step FEVER pipeline over a batch of claim dicts.

        :param claims: iterable of dicts with a 'claim' text field.
        :return: list of {'predicted_label', 'predicted_evidence'} dicts,
            one per input claim, in input order.
        """
        # Step 1: Tokenization
        logger.info('Step 1')
        logger.info('Start: ' + str(datetime.datetime.now().time()))

        tokenized_list = []
        for idx, claim in enumerate(claims):
            claim_tok = ' '.join(tok.tokenize(text_clean.normalize(claim["claim"])).words())
            item_tokenized = {'id': idx, 'claim': claim_tok}
            tokenized_list.append(item_tokenized)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 2: 1st Doc retrieval
        logger.info('Step 2')
        logger.info('Start: ' + str(datetime.datetime.now().time()))

        # Rule-based retrieval; keep the top_k docids ranked by
        # (priority desc, id asc). Items are mutated in place.
        for item in tokenized_list:
            item_doc_retrieval = item
            item_rb.first_only_rules(item_doc_retrieval)
            item_doc_retrieval['predicted_docids'] = list(
                set([k for k, v in sorted(item_doc_retrieval['prioritized_docids'],
                                          key=lambda x: (-x[1], x[0]))][:top_k]))

        doc_retrieval_list = tokenized_list
        item_remove_old_rule(doc_retrieval_list)
        item_resorting(doc_retrieval_list)

        # Neural doc retrieval refines/disambiguates the rule-based result.
        nn_doc_list = nn_doc_model.pipeline_function_list(doc_retrieval_list, doc_retrieval_model, vocab, cursor)
        enforce_disabuigation_into_retrieval_result_v2(nn_doc_list, doc_retrieval_list, prob_sh=nn_doc_retri_threshold)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 3: 1st Sentence selection
        logger.info('Step 3')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        # Ensemble of three sentence selectors; results are merged below.
        dev_sent_list_1_e0 = simple_nnmodel.pipeline_first_sent_selection_list(tokenized_list, doc_retrieval_list,
                                                                               sent_selector_model, vocab,
                                                                               top_k=nn_doc_top_k, cursor=cursor)
        dev_sent_list_1_e1 = simple_nnmodel.pipeline_first_sent_selection_list(tokenized_list, doc_retrieval_list,
                                                                               sent_selector_model_1, vocab,
                                                                               top_k=nn_doc_top_k, cursor=cursor)
        dev_sent_list_1_e2 = simple_nnmodel.pipeline_first_sent_selection_list(tokenized_list, doc_retrieval_list,
                                                                               sent_selector_model_2, vocab,
                                                                               top_k=nn_doc_top_k, cursor=cursor)
        dev_sent_list_1 = merge_sent_results([dev_sent_list_1_e0, dev_sent_list_1_e1, dev_sent_list_1_e2])
        # Two different probability thresholds: one feeds the 2nd doc
        # retrieval, the other the downstream sentence list.
        filtered_dev_instance_1_for_doc2 = simi_sampler.threshold_sampler_insure_unique_list(tokenized_list,
                                                                                             dev_sent_list_1,
                                                                                             sent_prob_for_2doc,
                                                                                             top_n=sent_topk_for_2doc)
        dev_sent_1_list = simi_sampler.threshold_sampler_insure_unique_list(doc_retrieval_list, dev_sent_list_1,
                                                                            sentence_retri_1_scale_prob,
                                                                            top_n=sent_topk_for_2doc)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 4: 2nd Doc retrieval
        logger.info('Step 4')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        item_rb.preext_sent_dict = {item['id']: item for item in filtered_dev_instance_1_for_doc2}

        # Second-round rules add 'aside' docids; the union of both ranked
        # sets becomes the final predicted docids.
        for item in dev_sent_1_list:
            item_rb.second_only_rules(item)
            pids = [it[0] for it in item['prioritized_docids']]
            item['prioritized_docids_aside'] = [it for it in item['prioritized_docids_aside'] if it[0] not in pids]
            porg = set([k for k, v in sorted(item['prioritized_docids'], key=lambda x: (-x[1], x[0]))][:top_k])
            paside = set([k for k, v in sorted(item['prioritized_docids_aside'], key=lambda x: (-x[1], x[0]))][:top_k])
            item['predicted_docids'] = list(porg | paside)
            item['predicted_docids_origin'] = list(porg)
            item['predicted_docids_aside'] = list(paside)

        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 5: 2nd Sentence selection
        logger.info('Step 5')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        dev_sent_list_2 = get_score_multihop_list(tokenized_list, dev_sent_1_list, sent_selector_2_model, vocab, cursor)
        logger.info('End: ' + str(datetime.datetime.now().time()))

        # Step 6: NLI
        logger.info('Step 6')
        logger.info('Start: ' + str(datetime.datetime.now().time()))
        sentence_retri_nli_scale_prob = 0.1
        sent_select_results_list_1 = simi_sampler.threshold_sampler_insure_unique_list(tokenized_list, dev_sent_list_1,
                                                                                       sentence_retri_nli_scale_prob,
                                                                                       top_n=5)
        nli_results = mesim_wn_simi_v1_2.pipeline_nli_run_list(tokenized_list,
                                                               sent_select_results_list_1,
                                                               [dev_sent_list_1, dev_sent_list_2],
                                                               nli_model, vocab, dev_fever_data_reader, cursor)
        delete_unused_evidence(nli_results)

        # Merge in second-round sentence evidence, then the enhancement
        # pass (threshold -1 effectively keeps everything), pruning unused
        # evidence after each merge.
        nli_results = simi_sampler.threshold_sampler_insure_unique_merge(nli_results, dev_sent_list_2,
                                                                         sentence_retri_2_scale_prob,
                                                                         top_n=5, add_n=sent_retri_2_top_k)
        delete_unused_evidence(nli_results)

        nli_results = simi_sampler.threshold_sampler_insure_unique_merge(nli_results, dev_sent_list_1,
                                                                         enhance_retri_1_scale_prob,
                                                                         top_n=100, add_n=100)
        delete_unused_evidence(nli_results)

        # Convert to the API's output format.
        predictions = []
        for final_item in nli_results:
            sentences = []
            for evidence in final_item['predicted_evidence']:
                sentences.append([evidence[0], evidence[1]])
            prediction = final_item['predicted_label'].upper()
            predictions.append({"predicted_label":prediction,"predicted_evidence":sentences})
        logger.info('End: ' + str(datetime.datetime.now().time()))
        return predictions

    # --- Shared resources captured by the predict_pipeline closure above ---
    cursor = fever_db.get_cursor()


    tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])
    item_rb = ItemRuleBuilderSpiral(tokenizer=tok, cursor=cursor)
    p_dict = wn_persistent_api.persistence_load()
    model_path_dict = {
        'sselector': config.DATA_ROOT / 'models/sent_selector',
        'sselector_1': config.DATA_ROOT / 'models/sent_selector_1',
        'sselector_2': config.DATA_ROOT / 'models/sent_selector_2',
        'nn_doc_selector': config.DATA_ROOT / 'models/nn_doc_selector',
        'no_doc_nli': config.DATA_ROOT / 'models/nli',
    }
    # Preload the NN models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")

    doc_retrieval_model = nn_doc_model.Model(weight=weight_dict['glove.840B.300d'],
                                             vocab_size=vocab.get_vocab_size('tokens'),
                                             embedding_dim=300, max_l=160, num_of_class=2)
    load_model(doc_retrieval_model, model_path_dict['nn_doc_selector'], device)

    # Three first-round sentence selectors used as an ensemble in Step 3.
    sent_selector_model = simple_nnmodel.Model(weight=weight_dict['glove.840B.300d'],
                                               vocab_size=vocab.get_vocab_size('tokens'),
                                               embedding_dim=300, max_l=300, num_of_class=2)
    load_model(sent_selector_model, model_path_dict['sselector'], device)
    sent_selector_model_1 = simple_nnmodel.Model(weight=weight_dict['glove.840B.300d'],
                                                 vocab_size=vocab.get_vocab_size('tokens'),
                                                 embedding_dim=300, max_l=300, num_of_class=2)
    load_model(sent_selector_model_1, model_path_dict['sselector_1'], device)
    sent_selector_model_2 = simple_nnmodel.Model(weight=weight_dict['glove.840B.300d'],
                                                 vocab_size=vocab.get_vocab_size('tokens'),
                                                 embedding_dim=300, max_l=300, num_of_class=2)
    load_model(sent_selector_model_2, model_path_dict['sselector_2'], device)

    # Second-round selector; NOTE(review): loads the 'sselector' weights,
    # not a dedicated second-round checkpoint -- confirm intended.
    sent_selector_2_model = simple_nnmodel.Model(weight=weight_dict['glove.840B.300d'],
                                                 vocab_size=vocab.get_vocab_size('tokens'),
                                                 embedding_dim=300, max_l=300, num_of_class=2)
    load_model(sent_selector_2_model, model_path_dict['sselector'], device)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }
    dev_fever_data_reader = WNSIMIReader(token_indexers=token_indexers, lazy=True, wn_p_dict=p_dict, max_l=420)
    nli_model = mesim_wn_simi_v1_2.Model(rnn_size_in=(1024 + 300 + dev_fever_data_reader.wn_feature_size,
                                                      1024 + 450 + dev_fever_data_reader.wn_feature_size),
                                         rnn_size_out=(450, 450),
                                         weight=weight_dict['glove.840B.300d'],
                                         vocab_size=vocab.get_vocab_size('tokens'),
                                         mlp_d=900, embedding_dim=300, max_l=400)

    load_model(nli_model, model_path_dict['no_doc_nli'], device)
    logger.info('Finished loading models.')
    return caller(predict_pipeline)
Exemple #12
0
def get_additional_list(tokenized_data_file,
                        additional_data_file,
                        item_key='prioritized_docids_aside',
                        top_k=6):
    """
    This method will select all the sentence from upstream doc retrieval and label the correct evident as true
    :param item_key: The item that specify the additional prioritized document ids.
    :param tokenized_data_file: Remember this is tokenized data with original format containing 'evidence'
    :param additional_data_file:    This is the data after document retrieval.
                                    This file need to contain *"predicted_docids"* field.
    :param top_k: number of highest-priority documents kept per item.
    :return: flat list of formatted sentence items, each with a
        'selection_id' and the claim as 'query'.
    """
    cursor = fever_db.get_cursor()
    try:
        d_list = load_jsonl(tokenized_data_file)

        additional_d_list = load_jsonl(additional_data_file)
        additional_data_dict = {int(add_item['id']): add_item
                                for add_item in additional_d_list}

        full_data_list = []

        for item in tqdm(d_list):
            doc_ids_p_list = additional_data_dict[int(item['id'])][item_key]
            # Keep the top_k doc ids ranked by (priority desc, id asc).
            doc_ids = list(
                set([
                    k
                    for k, v in sorted(doc_ids_p_list, key=lambda x: (-x[1], x[0]))
                ][:top_k]))

            # Prediction mode: no ground-truth evidence is injected.
            all_evidence_set = None
            r_list = []
            id_list = []

            for doc_id in doc_ids:
                cur_r_list, cur_id_list = fever_db.get_all_sent_by_doc_id(
                    cursor, doc_id, with_h_links=False)
                # Merge into the result lists, skipping duplicate ids.
                for i in range(len(cur_r_list)):
                    if cur_id_list[i] not in id_list:
                        r_list.append(cur_r_list[i])
                        id_list.append(cur_id_list[i])

            assert len(id_list) == len(set(id_list))  # check duplicate
            assert len(r_list) == len(id_list)

            zipped_s_id_list = list(zip(r_list, id_list))
            # Deterministic ordering by sentence id. NOTE(review): the key
            # takes the first two characters of the id string -- confirm
            # this matches the intended (doc_id, line) ordering.
            zipped_s_id_list = sorted(zipped_s_id_list,
                                      key=lambda x: (x[1][0], x[1][1]))

            all_sent_list = convert_to_formatted_sent(zipped_s_id_list,
                                                      all_evidence_set,
                                                      contain_head=True,
                                                      id_tokenized=True)
            cur_id = item['id']
            for i, sent_item in enumerate(all_sent_list):
                # selection_id is '[item_id<##>[doc_id]<SENT_LINE>[line_number]'
                sent_item['selection_id'] = str(cur_id) + "<##>" + str(
                    sent_item['sid'])
                sent_item['query'] = item['claim']
                full_data_list.append(sent_item)
    finally:
        # Release the DB cursor even on failure (the original never closed it).
        cursor.close()

    return full_data_list
def select_sent_with_prob_for_eval_list(input_file,
                                        additional_file,
                                        prob_dict_file,
                                        tokenized=False,
                                        pipeline=False,
                                        is_demo=False):
    """
    This method select sentences with upstream sentence retrieval.

    :param input_file: The loaded data list with 5 sentences selected per item.
    :param additional_file: Path or already-loaded list of upstream sentence
                            retrieval results; aligned with `input_file` by 'id'.
    :param prob_dict_file: Mapping (item_id, doc_id, line_num) ->
                           {'prob': float, 'claim': str}. Pairs missing from it
                           fall back to prob 0.5 (with a warning print).
    :param tokenized: If True, claims are assumed already tokenized and are
                      left untouched; otherwise they are re-tokenized.
    :param pipeline: If True, skip the label/verifiable consistency asserts
                     (pipeline mode has no gold labels).
    :param is_demo: Unused; kept for interface compatibility.
    :return: `input_file`, with 'evid', 'predicted_evidence' and
             'predicted_sentids' filled in on every item (modified in place).
    """
    cursor = fever_db.get_cursor()

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)

    additional_data_dict = dict()
    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    d_list = input_file

    for item in tqdm(d_list):
        e_list = additional_data_dict[item['id']]['predicted_sentids']
        if not pipeline:
            assert additional_data_dict[item['id']]['label'] == item['label']
            assert additional_data_dict[
                item['id']]['verifiable'] == item['verifiable']
        assert additional_data_dict[item['id']]['id'] == item['id']

        pred_evidence_list = []
        for cur_e in e_list:
            # Sentence ids look like '[doc_id]<SENT_LINE>[line_number]';
            # split once instead of twice.  (Important changes Bugs: July 21)
            parts = cur_e.split(c_scorer.SENT_LINE)
            pred_evidence_list.append((parts[0], int(parts[1])))

        pred_evidence = check_sentences.Evidences(pred_evidence_list)

        evidence_text_list = evidence_list_to_text_list(cursor,
                                                        pred_evidence,
                                                        contain_head=True,
                                                        id_tokenized=tokenized)

        evidences = sorted(pred_evidence, key=lambda x: (x[0], x[1]))
        item_id = int(item['id'])

        evidence_text_list_with_prob = []
        for text, (doc_id, ln) in zip(evidence_text_list, evidences):
            ssid = (item_id, doc_id, int(ln))
            if ssid not in prob_dict_file:
                print("Some sentence pair don't have 'prob'.")
                prob = 0.5
            else:
                prob = prob_dict_file[ssid]['prob']
                assert item['claim'] == prob_dict_file[ssid]['claim']

            evidence_text_list_with_prob.append((text, prob))

        if not tokenized:
            item['claim'] = ' '.join(easy_tokenize(item['claim']))

        item['evid'] = evidence_text_list_with_prob
        item['predicted_evidence'] = convert_evidence2scoring_format(e_list)
        item['predicted_sentids'] = e_list
        # This change need to be saved.

    # Close the DB cursor (the sibling sampling functions already do this;
    # previously it was leaked here).
    cursor.close()

    return d_list
def adv_simi_sample_with_prob_v1_1(input_file,
                                   additional_file,
                                   prob_dict_file,
                                   tokenized=False):
    """Build training samples with per-sentence probabilities attached.

    For every input item, draws additional evidence sets via
    `sample_additional_data_for_item_v1_1`, looks up each evidence
    sentence's probability in `prob_dict_file` (default 0.5 when absent),
    and emits one new sample per drawn evidence set.  Prints the number
    of missing-probability pairs before returning.
    """
    cursor = fever_db.get_cursor()
    d_list = load_data(input_file)

    if isinstance(additional_file, list):
        additional_d_list = additional_file
    else:
        additional_d_list = load_data(additional_file)

    additional_data_dict = {entry['id']: entry for entry in additional_d_list}

    sampled_data_list = []
    count = 0

    for item in tqdm(d_list):
        sampled_e_list, flags = sample_additional_data_for_item_v1_1(
            item, additional_data_dict)

        for sample_index, (sampled_evidence, flag) in enumerate(
                zip(sampled_e_list, flags)):
            evidence_text_list = evidence_list_to_text_list(
                cursor,
                sampled_evidence,
                contain_head=True,
                id_tokenized=tokenized)

            ordered_evidences = sorted(sampled_evidence,
                                       key=lambda ev: (ev[0], ev[1]))
            item_id = int(item['id'])

            evid_with_prob = []
            for text, (doc_id, ln) in zip(evidence_text_list,
                                          ordered_evidences):
                ssid = (int(item_id), doc_id, int(ln))
                if ssid in prob_dict_file:
                    prob = prob_dict_file[ssid]['prob']
                    assert item['claim'] == prob_dict_file[ssid]['claim']
                else:
                    count += 1
                    print("Some sentence pair don't have 'prob'.")
                    prob = 0.5
                evid_with_prob.append((text, prob))

            if tokenized:
                claim_text = item['claim']
            else:
                claim_text = ' '.join(easy_tokenize(item['claim']))

            new_item = {
                'id': str(item['id']) + '#' + str(sample_index),
                'claim': claim_text,
                'evid': evid_with_prob,
                'verifiable': item['verifiable'],
                'label': item['label'],
            }
            sampled_data_list.append(new_item)

    cursor.close()

    print(count)
    return sampled_data_list
# Example #15
# 0
    d_list = common.load_jsonl(
        "/home/easonnie/projects/FunEver/results/doc_retri_bls/docretri.basic.nopageview/dev.jsonl"
    )
    # d_list = common.load_jsonl("/Users/Eason/RA/FunEver/results/doc_retri_bls/docretri.pageview/dev.jsonl")

    # filtered_list = []
    # for item in d_list:
    #     if filter_contain_parenthese(item):
    # if filter_contain_parenthese_valid(item):
    #     filtered_list.append(item)

    # d_list = filtered_list
    pos_count = 0
    neg_count = 0

    cursor = fever_db.get_cursor()

    p_list, n_list = [], []
    # inference_list = []

    # train_list = sample_disamb_training(d_list, cursor, sample_ratio=1.0)
    # print("Length:", len(train_list))
    # for item in d_list:
    #     positive_list, negative_list = disabuigation_training_build(item, cursor, contain_first_sentence=True)
    #     p_list.extend(positive_list)
    #     n_list.extend(negative_list)

    # for item in d_list:
    #     inference_list.extend(inference_build(item, cursor, contain_first_sentence=False))
    # inference_list = sample_disamb_inference(d_list, cursor)
    train_list = sample_disamb_training_v0(d_list, cursor, only_found=False)
# Example #16
# 0
def train_fever_v1():
    """Train the doc-disambiguation sentence selector on FEVER data.

    Loads upstream document-retrieval results, resamples the training data
    at every epoch, trains with cross-entropy, and evaluates on dev both
    mid-epoch (every `mod` iterations) and at epoch end, checkpointing the
    model whenever the dev oracle score improves.
    """
    num_epoch = 10
    seed = 12
    batch_size = 128
    dev_batch_size = 128
    experiment_name = "simple_nn_doc"
    lazy = True
    torch.manual_seed(seed)
    contain_first_sentence = False
    pn_ratio = 1.0  # positive/negative sampling ratio used when resampling

    dev_upstream_file = config.RESULT_PATH / "doc_retri_bls/docretri.basic.nopageview/dev.jsonl"
    train_upstream_file = config.RESULT_PATH / "doc_retri_bls/docretri.basic.nopageview/train.jsonl"
    dev_data_list = common.load_jsonl(dev_upstream_file)

    # Prepare Data
    token_indexers = {
        'tokens': SingleIdTokenIndexer(namespace='tokens'),  # This is the raw tokens
        'elmo_chars': ELMoTokenCharactersIndexer(namespace='elmo_characters')  # This is the elmo_characters
    }

    train_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180)
    dev_fever_data_reader = SSelectorReader(token_indexers=token_indexers, lazy=lazy, max_l=180)

    cursor = fever_db.get_cursor()
    complete_upstream_dev_data = disamb.sample_disamb_inference(common.load_jsonl(dev_upstream_file), cursor,
                                                                contain_first_sentence=contain_first_sentence)
    print("Dev size:", len(complete_upstream_dev_data))
    dev_instances = dev_fever_data_reader.read(complete_upstream_dev_data)

    # Load Vocabulary
    biterator = BasicIterator(batch_size=batch_size)
    dev_biterator = BasicIterator(batch_size=dev_batch_size)

    vocab, weight_dict = load_vocab_embeddings(config.DATA_ROOT / "vocab_cache" / "nli_basic")
    # This is important: pin "hidden" onto index -2 so the two real classes
    # ("true"/"false") keep their low indices.
    vocab.add_token_to_namespace("true", namespace="selection_labels")
    vocab.add_token_to_namespace("false", namespace="selection_labels")
    vocab.add_token_to_namespace("hidden", namespace="selection_labels")
    vocab.change_token_with_index_to_namespace("hidden", -2, namespace='selection_labels')

    vocab.get_index_to_token_vocabulary('selection_labels')

    print(vocab.get_token_to_index_vocabulary('selection_labels'))
    print(vocab.get_vocab_size('tokens'))

    biterator.index_with(vocab)
    dev_biterator.index_with(vocab)

    # Build Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu", index=0)
    device_num = -1 if device.type == 'cpu' else 0

    model = Model(weight=weight_dict['glove.840B.300d'],
                  vocab_size=vocab.get_vocab_size('tokens'),
                  embedding_dim=300, max_l=160, num_of_class=2)

    model.display()
    model.to(device)

    # Create log dir and snapshot this script's source for reproducibility.
    file_path_prefix, date = save_tool.gen_file_prefix(f"{experiment_name}")
    script_name = os.path.basename(__file__)
    with open(os.path.join(file_path_prefix, script_name), 'w') as out_f, open(__file__, 'r') as it:
        out_f.write(it.read())
        out_f.flush()
    # Save source code end.

    best_dev = -1
    iteration = 0

    start_lr = 0.0002
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=start_lr)
    criterion = nn.CrossEntropyLoss()

    def _eval_and_maybe_save(tag):
        """Run dev evaluation, print scores and checkpoint on improvement.

        `tag` is inserted into the checkpoint filename: '' for mid-epoch
        evaluations, 'e' for end-of-epoch ones (preserves the original
        naming scheme).  Previously this ~30-line block was duplicated
        verbatim in both places.
        """
        nonlocal complete_upstream_dev_data, best_dev
        eval_iter = dev_biterator(dev_instances, shuffle=False, num_epochs=1, cuda_device=device_num)
        complete_upstream_dev_data = hidden_eval(model, eval_iter, complete_upstream_dev_data)

        disamb.enforce_disabuigation_into_retrieval_result_v0(complete_upstream_dev_data,
                                                              dev_data_list)
        oracle_score, pr, rec, f1 = c_scorer.fever_doc_only(dev_data_list, dev_data_list, max_evidence=5)

        print(f"Dev(raw_acc/pr/rec/f1):{oracle_score}/{pr}/{rec}/{f1}")
        print("Strict score:", oracle_score)
        print(f"Eval Tracking score:", f"{oracle_score}")

        if oracle_score > best_dev:
            best_dev = oracle_score
            save_path = os.path.join(
                file_path_prefix,
                f'i({iteration})_epoch({i_epoch})_{tag}'
                f'(tra_score:{oracle_score}|pr:{pr}|rec:{rec}|f1:{f1})'
            )
            torch.save(model.state_dict(), save_path)

    for i_epoch in range(num_epoch):
        print("Resampling...")
        # Resample each epoch so the model sees fresh negative examples.
        complete_upstream_train_data = disamb.sample_disamb_training_v0(common.load_jsonl(train_upstream_file),
                                                                        cursor, pn_ratio, contain_first_sentence)
        print("Sample Prob.:", pn_ratio)

        print("Sampled_length:", len(complete_upstream_train_data))
        sampled_train_instances = train_fever_data_reader.read(complete_upstream_train_data)

        train_iter = biterator(sampled_train_instances, shuffle=True, num_epochs=1, cuda_device=device_num)
        for i, batch in tqdm(enumerate(train_iter)):
            model.train()
            out = model(batch)
            y = batch['selection_label']

            loss = criterion(out, y)

            # No decay
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            iteration += 1

            # Evaluate less often in early epochs, more often later.
            mod = 1000 if i_epoch <= 5 else 500

            if iteration % mod == 0:
                _eval_and_maybe_save('')

        print("Epoch Evaluation...")
        _eval_and_maybe_save('e')
# Example #17
# 0
def get_full_list(tokenized_data_file,
                  additional_data_file,
                  pred=False,
                  top_k=None):
    """
    This method will select all the sentence from upstream doc retrieval and label the correct evident as true
    :param tokenized_data_file: Remember this is tokenized data with original format containing 'evidence'
    :param additional_data_file:    This is the data after document retrieval.
                                    This file need to contain *"predicted_docids"* field.
    :param pred: If True, do not seed the candidate pool with ground-truth
                 evidence (prediction mode has no gold evidence).
    :param top_k: If given, truncate upstream documents to the top_k best.
    :return: Flat list of formatted sentence items, each with
             'selection_id', 'query' and (when available) 'claim_label'.
    """
    cursor = fever_db.get_cursor()
    d_list = load_jsonl(tokenized_data_file)

    if not isinstance(additional_data_file, list):
        additional_d_list = load_jsonl(additional_data_file)
    else:
        additional_d_list = additional_data_file

    if top_k is not None:
        print("Upstream document number truncate to:", top_k)
        trucate_item(additional_d_list, top_k=top_k)

    additional_data_dict = dict()

    for add_item in additional_d_list:
        additional_data_dict[add_item['id']] = add_item

    full_data_list = []

    for item in tqdm(d_list):
        doc_ids = additional_data_dict[item['id']]["predicted_docids"]

        if not pred:
            if item['evidence'] is not None:
                e_list = utils.check_sentences.check_and_clean_evidence(item)
                all_evidence_set = set(
                    itertools.chain.from_iterable(
                        [evids.evidences_list for evids in e_list]))
            else:
                all_evidence_set = None
            r_list = []
            id_list = []

            if all_evidence_set is not None:
                for doc_id, ln in all_evidence_set:
                    _, text, _ = fever_db.get_evidence(cursor, doc_id, ln)
                    r_list.append(text)
                    id_list.append(doc_id + '(-.-)' + str(ln))

        else:  # If pred, then reset to not containing ground truth evidence.
            all_evidence_set = None
            r_list = []
            id_list = []

        # Merging to data list and removing duplicate.
        # Use a set for O(1) membership tests instead of scanning id_list
        # (the old list lookup made this loop accidentally O(n^2)).
        seen_ids = set(id_list)
        for doc_id in doc_ids:
            cur_r_list, cur_id_list = fever_db.get_all_sent_by_doc_id(
                cursor, doc_id, with_h_links=False)
            for sent_text, sent_id in zip(cur_r_list, cur_id_list):
                if sent_id not in seen_ids:
                    seen_ids.add(sent_id)
                    r_list.append(sent_text)
                    id_list.append(sent_id)

        assert len(id_list) == len(set(id_list))  # check duplicate
        assert len(r_list) == len(id_list)

        # Sort using id
        zipped_s_id_list = sorted(zip(r_list, id_list),
                                  key=lambda x: (x[1][0], x[1][1]))

        all_sent_list = convert_to_formatted_sent(zipped_s_id_list,
                                                  all_evidence_set,
                                                  contain_head=True,
                                                  id_tokenized=True)
        cur_id = item['id']
        for sent_item in all_sent_list:
            # selection_id is '[item_id]<##>[doc_id]<SENT_LINE>[line_number]'
            sent_item['selection_id'] = str(cur_id) + "<##>" + str(
                sent_item['sid'])
            sent_item['query'] = item['claim']

            if 'label' in item.keys():
                sent_item['claim_label'] = item['label']

            full_data_list.append(sent_item)

    cursor.close()

    return full_data_list