def used_func_for_fast_key_word_matching():
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")

    # Write this in a for loop to keep track of the progress
    for clean_name, keywords in tqdm(id_to_key_dict.items()):
        if not isinstance(keywords, list):
            raise AttributeError("Value of key {} should be a list".format(clean_name))
        for keyword in keywords:
            keyword_processor.add_keyword(keyword, clean_name)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # Save the results for evaluating
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_06_29_17:41:14_r/dev.jsonl'
    d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode,
    #                   error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode))

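# A minimal, self-contained sketch of the flashtext matching used above.
# `flashtext` is the actual library; the sample keywords and claim below are
# made-up illustrations, not repo data.
def _demo_flashtext_matching():
    from flashtext import KeywordProcessor

    kp = KeywordProcessor(case_sensitive=True)
    # Map surface keywords to canonical document ids, as the loop above does.
    kp.add_keyword('James Taylor', 'James_Taylor')
    kp.add_keyword('Hourglass', 'Hourglass_-LRB-James_Taylor_album-RRB-')
    # extract_keywords returns the mapped doc ids for every keyword hit.
    matches = kp.extract_keywords('Hourglass is an album by James Taylor.')
    print(matches)  # ['Hourglass_-LRB-James_Taylor_album-RRB-', 'James_Taylor']
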
def used_func_for_fast_key_word_matching_expanded_kw():
    """
    Added on July 1.
    :return:
    """
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl")
    id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=False)
    # exit(-2)

    # Write this in a for loop to keep track of the progress
    build_flashtext_processor_with_kw_dict(keyword_processor, id_to_key_dict)

    # Load data for predicting
    d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer(d_list, tok, keyword_p=keyword_processor)

    # Save the results for evaluating
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
    # d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode,
    #                   error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))

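# The builder called above is defined elsewhere in the repo (its name here is
# reconstructed from a truncated identifier). A hedged sketch of what it
# presumably does, mirroring the explicit loop in
# used_func_for_fast_key_word_matching:
def _build_flashtext_processor_sketch(keyword_processor, kw_dict):
    for clean_name, keywords in tqdm(kw_dict.items()):
        for keyword in keywords:
            # Each surface keyword maps back to its canonical doc id.
            keyword_processor.add_keyword(keyword, clean_name)
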
def create_instance(self):
    path_stanford_corenlp_full_2017_06_09 = \
        str(config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    print("Load tokenizer:", path_stanford_corenlp_full_2017_06_09)
    drqa_yixin.tokenizers.set_default(
        'corenlp_classpath', path_stanford_corenlp_full_2017_06_09)
    _tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])
    self.instance = _tok

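# Hypothetical companion sketch: create_instance reads like part of a lazily
# initialized singleton holder. The class name and get_instance below are
# assumptions for illustration; only create_instance appears in this excerpt.
class _TokenizerSingleton:
    def __init__(self):
        self.instance = None

    # Reuse create_instance above as the builder method.
    create_instance = create_instance

    def get_instance(self):
        # Build the CoreNLP tokenizer once; reuse it on later calls.
        if self.instance is None:
            self.create_instance()
        return self.instance
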
def used_func_for_building_normalized_key_word_index_for_docids():
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    did_list = get_all_doc_ids(str(config.FEVER_DB), max_ind=None)
    build_keyword_dict(did_list, tok, config.DATA_ROOT / "id_dict.jsonl")

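# Illustrative sketch of the doc-id normalization that id_dict.jsonl encodes.
# The exact rules live in build_keyword_dict; this only shows the standard
# FEVER conventions (underscores for spaces, -LRB-/-RRB- for parentheses):
def _doc_id_to_surface_form(doc_id):
    text = doc_id.replace('-LRB-', '(').replace('-RRB-', ')')
    return text.replace('_', ' ')

# _doc_id_to_surface_form('Hourglass_-LRB-James_Taylor_album-RRB-')
# -> 'Hourglass (James Taylor album)'
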
def tokenized_claim_list(in_list):
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])

    for item in tqdm(in_list):
        item['claim'] = ' '.join(easy_tokenize(item['claim'], tok))

    return in_list

def tokenized_claim(in_file, out_file):
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    print(path_stanford_corenlp_full_2017_06_09)
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])

    d_list = load_jsonl(in_file)
    for item in tqdm(d_list):
        item['claim'] = ' '.join(easy_tokenize(item['claim'], tok))

    save_jsonl(d_list, out_file)

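# Hedged usage sketch for tokenized_claim; the output path is an assumption,
# not one the repo necessarily uses:
#
#   tokenized_claim(config.FEVER_DEV_JSONL,
#                   config.DATA_ROOT / "tokenized_fever" / "dev.jsonl")
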
import json
import random

from utils import fever_db, check_sentences
import config
import drqa_yixin.tokenizers
from drqa_yixin.tokenizers import CoreNLPTokenizer
from tqdm import tqdm
from utils import c_scorer, text_clean
from utils import common

path_stanford_corenlp_full_2017_06_09 = str(
    config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
print(path_stanford_corenlp_full_2017_06_09)
drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                  path_stanford_corenlp_full_2017_06_09)
tok = CoreNLPTokenizer(annotators=['pos', 'lemma'])

random.seed(12)


def easy_tokenize(text):
    return tok.tokenize(text_clean.normalize(text)).words()


def load_data(file):
    d_list = []
    with open(file, encoding='utf-8', mode='r') as in_f:
        for line in in_f:
            item = json.loads(line.strip())
            d_list.append(item)
    return d_list

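# Hedged sketch of the jsonl writer that pairs with load_data above. The
# repo's own savers (save_jsonl, save_intermidiate_results) live in utils;
# this only illustrates the one-object-per-line format they consume/produce.
def _save_jsonl_sketch(d_list, out_file):
    with open(out_file, encoding='utf-8', mode='w') as out_f:
        for item in d_list:
            out_f.write(json.dumps(item) + '\n')
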
def initialize_tokenizer(self):
    snlp_path = str(config.PRO_ROOT /
                    'dep_packages/stanford-corenlp-full-2017-06-09/*')
    set_default('corenlp_classpath', snlp_path)
    return CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

def used_func_for_fast_key_word_matching_prioritized_kw():
    """
    Added on July 1.
    :return:
    """
    # Load tokenizer
    path_stanford_corenlp_full_2017_06_09 = str(
        config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
    drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                      path_stanford_corenlp_full_2017_06_09)
    tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])

    # doc_tokens, doc_lemmas = parse_doc_id('Hourglass_-LRB-James_Taylor_album-RRB-', tok)
    # print(doc_tokens)
    # print(doc_lemmas)
    # print(get_words_inside_parenthese(doc_tokens))
    # print(get_words_inside_parenthese(doc_lemmas))
    # claim_t = ['album']
    # claim_l = ['album']
    # print(check_inside_paretheses_overlap(doc_tokens, doc_lemmas, claim_t, claim_l))
    # exit(-1)

    keyword_processor = KeywordProcessor(case_sensitive=True)
    id_to_key_dict = load_keyword_dict(config.DATA_ROOT / "id_dict.jsonl", filtering=True)
    exact_match_rule_dict = set_priority(id_to_key_dict, priority=5.0)
    print(len(exact_match_rule_dict))

    noisy_key_dict = id_dict_key_word_expand(id_to_key_dict, create_new_key_word_dict=True)
    noisy_parenthese_rule_dict = set_priority(noisy_key_dict, priority=1.0)
    print("Noisy_Parenthese_Rule_Dict:", len(noisy_parenthese_rule_dict))
    # exit(-2)

    # Write this in a for loop to keep track of the progress
    build_flashtext_processor_with_prioritized_kw_dict(keyword_processor, exact_match_rule_dict)
    build_flashtext_processor_with_prioritized_kw_dict(keyword_processor, noisy_parenthese_rule_dict)

    # Load data for predicting
    d_list = load_data(config.FEVER_TRAIN_JSONL)
    # d_list = load_data(config.FEVER_DEV_JSONL)
    sample_answer_with_priority(d_list, tok, keyword_processor, top_k=5)

    # Save the results for evaluating
    out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "train.jsonl"
    # out_fname = config.RESULT_PATH / "doc_retri" / f"{utils.get_current_time_str()}_r" / "dev.jsonl"
    save_intermidiate_results(d_list, out_filename=out_fname)

    # Evaluating
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:20:54_r/dev.jsonl'
    # out_fname = '/Users/Eason/RA/FunEver/results/doc_retri/2018_07_01_17:08:06_r/dev.jsonl'
    # d_list = load_data(out_fname)
    eval_mode = {'check_doc_id_correct': True, 'standard': False}
    # print(fever_score(d_list, d_list, mode=eval_mode,
    #                   error_analysis_file=Path(out_fname).parent / "analysis.log"))
    print(fever_score(d_list, d_list, mode=eval_mode, verbose=False))

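# Hedged sketch of set_priority, which is defined elsewhere in the repo. The
# reconstruction assumes it simply attaches the given priority to every
# (keyword -> doc id) rule, so exact-match rules (5.0) can outrank the noisy
# parenthese rules (1.0) when candidates are ranked later:
def _set_priority_sketch(id_to_key_dict, priority):
    return {clean_name: [(kw, priority) for kw in keywords]
            for clean_name, keywords in id_to_key_dict.items()}
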
import utils.wiki_term_builder
from tqdm import tqdm
import json
from utils.fever_db import get_all_doc_ids, convert_brc
from utils.c_scorer import fever_score
from utils import text_clean
from pathlib import Path
import copy
import utils
import utils.common as common

import config
import drqa_yixin.tokenizers
from drqa_yixin.tokenizers import CoreNLPTokenizer

path_stanford_corenlp_full_2017_06_09 = str(
    config.PRO_ROOT / 'dep_packages/stanford-corenlp-full-2017-06-09/*')
drqa_yixin.tokenizers.set_default('corenlp_classpath',
                                  path_stanford_corenlp_full_2017_06_09)
global_tok = CoreNLPTokenizer(annotators=['pos', 'lemma', 'ner'])


def memodict(f):
    """ Memoization decorator for a function taking a single argument """
    class memodict(dict):
        def __missing__(self, key):
            ret = self[key] = f(key)
            return ret
    return memodict().__getitem__


def build_keyword_dict(did_list, tokenizer, out_filename):
    out_f = open(out_filename, encoding='utf-8', mode='w') \
        if out_filename is not None else None
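
# Usage example for the memodict decorator above (the helper name is an
# illustration): memoize the expensive CoreNLP call so each distinct string
# is tokenized only once.
@memodict
def _cached_tokenize(text):
    # Computed on the first call for a given string, then served from cache.
    return global_tok.tokenize(text).words()

# First call pays the CoreNLP cost; repeated calls return the cached list:
# _cached_tokenize('Hourglass is an album by James Taylor.')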