def read_ir_result(path, n_sentences=5):
    '''Read the results of sentence retrieval.'''
    short_evidences_counter = 0
    instances = read_jsonl(path)
    for instance in instances:
        if len(instance['predicted_sentences']) < n_sentences:
            short_evidences_counter += 1
        instance['predicted_sentences'] = instance[
            'predicted_sentences'][:n_sentences]  # keep only the top n sentences
    print('short_evidences: {} / {}'.format(short_evidences_counter,
                                            len(instances)))
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path('data/wiki-pages/'),
        doctitles=abs_path('data/preprocessed_data/doctitles'))
    titles = list()  # collect all retrieved titles
    for instance in instances:
        titles.extend([title for title, _ in instance['predicted_sentences']])
    t2l2s = load_doclines(titles, t2jnum)

    # attach the evidence sentences
    for instance in instances:
        instance['evidence'] = get_evidence_sentence_list(
            instance['predicted_sentences'], t2l2s)
    return instances
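# Usage sketch (assumption): read_ir_result() is pointed at a JSONL file
# produced by the sentence-retrieval step; the path below is only an example.
instances = read_ir_result('results/sent_ret/dev_sentences.jsonl',
                           n_sentences=5)
for instance in instances[:3]:
    print(instance['claim'])
    for sentence in instance['evidence']:
        print('  ', sentence)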
def run_sent_ret(config):
    train, dev = load_paper_dataset()
    with open('data/preprocessed_data/edocs.bin', 'rb') as rb:
        edocs = pickle.load(rb)
    with open(config['doc_ret_model'], 'rb') as rb:
        dmodel = pickle.load(rb)
    t2jnum = titles_to_jsonl_num()
    try:
        with open(config['sent_ret_model'], 'rb') as rb:
            model = pickle.load(rb)  # load model parameters
    except BaseException:
        try:
            selected = load_selected(config['sent_ret_line'])  # load sampled data
        except BaseException:
            docs = doc_ret(train, edocs, model=dmodel)
            selected = select_lines(docs, t2jnum, train,
                                    config['sent_ret_line'])
        model = sent_ret_model()
        X, y = model.process_train(selected, train)  # train the model
        model.fit(X, y)
        with open(config['sent_ret_model'], 'wb') as wb:
            pickle.dump(model, wb)
    docs = doc_ret(dev, edocs, model=dmodel)  # document retrieval
    lines = load_doc_lines(docs, t2jnum)
    evidence = sent_ret(dev, docs, lines, best=config['n_best'],
                        model=model)  # sentence retrieval
    line_hits(dev, evidence)  # evaluate the results
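# Sketch of the config dictionary run_sent_ret() expects; only the keys read
# above matter ('doc_ret_model', 'sent_ret_model', 'sent_ret_line', 'n_best'),
# and the paths are assumed examples.
config = {
    'doc_ret_model': './results/doc_ret/doc_ret_model.bin',
    'sent_ret_model': './results/sent_ret/sent_ret_model.bin',
    'sent_ret_line': './results/sent_ret/sent_ret_line',
    'n_best': 5,
}
run_sent_ret(config)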
def get_evidence(data=dict()):
    with open("data/edocs.bin", "rb") as rb:
        edocs = pickle.load(rb)
    with open("data/doc_ir_model.bin", "rb") as rb:
        dmodel = pickle.load(rb)
    t2jnum = titles_to_jsonl_num()
    with open("data/line_ir_model.bin", "rb") as rb:
        lmodel = pickle.load(rb)
    docs = doc_ir(data, edocs, model=dmodel)
    lines = load_doc_lines(docs, t2jnum)
    evidence = line_ir(data, docs, lines, model=lmodel)
    return docs, evidence
def evi_ret(data=dict(), n_docs=5, n_sents=5):
    with open('./data/preprocessed_data/edocs.bin', 'rb') as rb:
        edocs = pickle.load(rb)
    with open('./results/doc_ret/doc_ret_model.bin', 'rb') as rb:
        dmodel = pickle.load(rb)
    t2jnum = titles_to_jsonl_num()
    with open('./results/sent_ret/sent_ret_model.bin', 'rb') as rb:
        lmodel = pickle.load(rb)
    docs = doc_ret(data, edocs, model=dmodel, best=n_docs)
    lines = load_doc_lines(docs, t2jnum)
    evidence = sent_ret(data, docs, lines, model=lmodel, best=n_sents)
    return docs, evidence
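# Illustrative call of evi_ret(); the input is assumed to be a list of
# FEVER-style dicts with 'id' and 'claim', and the claim here is made up.
data = [{'id': 0, 'claim': 'Tokyo is the capital of Japan.'}]
docs, evidence = evi_ret(data, n_docs=5, n_sents=5)
# docs / evidence are expected to map each claim id to its retrieved
# page titles and (title, line) evidence sentences, respectively.
print(docs)
print(evidence)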
def convert(instances, prependlinum=False, prependtitle=False,
            use_ir_prediction=False):
    """Convert FEVER format to jack SNLI format.

    Args:
        instances: list of dictionaries in FEVER format
    Returns:
        instances: list of dictionaries in jack SNLI format
    """
    # get all titles and load t2l2s
    all_titles = list()
    # use "predicted_sentences" for NEI
    for instance in tqdm(instances, desc="process for NEI"):
        if instance["label"] == "NOT ENOUGH INFO":
            evidences = instance["predicted_sentences"]
            # assert evidences == [(title, linum), (title, linum), ...]
            # change its shape to the normal evidence format
            evidences = [[["dummy", "dummy", title, linum]]
                         for title, linum in evidences]
            instance["evidence"] = evidences

        if use_ir_prediction:
            titles = [title for title, _ in instance["predicted_sentences"]]
        else:
            titles = [
                title for evidence_set in instance["evidence"]
                for _, _, title, _ in evidence_set
            ]
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    converted_instances = list()
    for instance in tqdm(instances, desc="conversion"):
        converted_instances.extend(
            _convert_instance(
                instance, t2l2s,
                prependlinum=prependlinum,
                prependtitle=prependtitle,
                use_ir_prediction=use_ir_prediction))
    return converted_instances
def run_doc_ret(config):
    train, dev = load_paper_dataset()
    if os.path.exists(config['doc_ret_model']):
        with open(config['doc_ret_model'], 'rb') as rb:
            model = pickle.load(rb)
    else:
        if os.path.exists(config['doc_ret_docs']):
            selected = load_selected(config['doc_ret_docs'])
        else:
            selected = sample_docs(train, config['doc_ret_docs'])
        # build the model
        model = doc_ret_model()
        # preprocess the training data
        X, y = model.process_train(selected, train)
        # train the model
        model.fit(X, y)
        # save the trained model
        with open(config['doc_ret_model'], 'wb') as wb:
            pickle.dump(model, wb)
    if os.path.exists('data/preprocessed_data/edocs.bin'):
        with open('data/preprocessed_data/edocs.bin', 'rb') as rb:
            edocs = pickle.load(rb)
    else:
        t2jnum = titles_to_jsonl_num()
        edocs = title_edict(t2jnum)
        with open('data/preprocessed_data/edocs.bin', 'wb') as wb:
            pickle.dump(edocs, wb)
    print(len(model.f2v))
    # retrieve documents for the dev set with the trained model
    docs = doc_ret(dev, edocs, best=config['n_best'], model=model)
    # evaluate the retrieval results
    title_hits(dev, docs)
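# Sketch of the config dictionary run_doc_ret() expects; only the keys read
# above matter ('doc_ret_model', 'doc_ret_docs', 'n_best'), and the paths are
# assumed examples.
config = {
    'doc_ret_model': './results/doc_ret/doc_ret_model.bin',
    'doc_ret_docs': './results/doc_ret/doc_ret_docs',
    'n_best': 5,
}
run_doc_ret(config)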
def select_docs(train):
    samp_size = 25000
    tots = {"SUPPORTS": 0, "REFUTES": 0}
    sofar = {"SUPPORTS": 0, "REFUTES": 0}
    try:
        with open("data/edocs.bin", "rb") as rb:
            edocs = pickle.load(rb)
    except BaseException:
        t2jnum = titles_to_jsonl_num()
        edocs = title_edict(t2jnum)
        with open("data/edocs.bin", "wb") as wb:
            pickle.dump(edocs, wb)
    examples = Counter()
    id2titles = dict()
    for example in train:
        cid = example["id"]
        claim = example["claim"]
        l = example["label"]
        if l == 'NOT ENOUGH INFO':
            continue
        all_evidence = [e for eset in example["evidence"] for e in eset]
        docs = set()
        for ev in all_evidence:
            evid = ev[2]
            if evid is not None:
                docs.add(evid)
        t2phrases = find_titles_in_claim(example["claim"], edocs)
        id2titles[cid] = t2phrases
        flag = False
        for title in t2phrases:
            if title in docs:
                flag = True
        if flag:
            tots[l] += 1
    selected = dict()
    for example in tqdm(train):
        yn = 0
        cid = example["id"]
        l = example["label"]
        if l == 'NOT ENOUGH INFO':
            continue
        all_evidence = [e for eset in example["evidence"] for e in eset]
        docs = set()
        for ev in all_evidence:
            evid = ev[2]
            if evid is not None:
                docs.add(evid)
        # t2phrases = find_titles_in_claim(example["claim"], edocs)
        t2phrases = id2titles[cid]
        for title in t2phrases:
            if title in docs:
                yn = 1
        prob = (samp_size - sofar[l]) / (tots[l])
        if yn == 1 and random() < prob:
            titles = list(t2phrases.keys())
            shuffle(titles)
            flagy = False
            flagn = False
            for t in titles:
                if not flagy and t in docs:
                    ty = t
                    flagy = True
                if not flagn and t not in docs:
                    tn = t
                    flagn = True
                if flagy and flagn:
                    selected[cid] = dict()
                    for t, y_n in [(ty, 1), (tn, 0)]:
                        ps = t2phrases[t]
                        shuffle(ps)
                        p, s = ps[0]
                        selected[cid][y_n] = [t, p, s]
                    sofar[l] += 1
                    break
        if yn == 1:
            tots[l] -= 1
    with open("data/doc_ir_docs", "w") as w:
        for cid in selected:
            for yn in selected[cid]:
                [t, p, s] = selected[cid][yn]
                w.write(str(cid) + "\t" + str(yn) + "\t" + t + "\t" + p +
                        "\t" + str(s) + "\n")
    for l in sofar:
        print(l, sofar[l])
    return selected
        if cid in selected:
            claim = example["claim"]
            ctoks = word_tokenize(claim.lower())
            titles = list()
            for yn in selected[cid]:
                [title, phrase, start] = selected[cid][yn]
                titles.append(title)
            rdocs[example["id"]] = (titles, ctoks)
    try:
        t2tf = titles_to_tf()
        doctf = load_doc_tf(rdocs, t2tf)
    except BaseException:
        term_and_doc_freqs()
        t2tf = titles_to_tf()
        doctf = load_doc_tf(rdocs, t2tf)
    X, y = model.process_train(selected, train, doctf)
    model.fit(X, y)
    with open("data/doc_ir_model.bin", "wb") as wb:
        pickle.dump(model, wb)
    try:
        with open("data/edocs.bin", "rb") as rb:
            edocs = pickle.load(rb)
    except BaseException:
        t2jnum = titles_to_jsonl_num()
        edocs = title_edict(t2jnum)
        with open("data/edocs.bin", "wb") as wb:
            pickle.dump(edocs, wb)
    print(len(model.f2v))
    docs = doc_ir(dev, edocs, model=model)
    title_hits(dev, docs)
def convert(instances, prependlinum=False, prependtitle=False,
            use_ir_prediction=False, n_sentences=5,
            depparse_batch_size=32, num_samples=None):
    """Convert FEVER format to jack SNLI format.

    Args:
        instances: list of dictionaries in FEVER format
    Returns:
        instances: list of dictionaries in jack SNLI format
    """
    test = "label" not in instances[0]  # no gold labels means test data

    # get all titles and load t2l2s
    all_titles = list()
    # use "predicted_sentences" for NEI
    for instance in tqdm(instances, desc="process for NEI"):
        if ("label" not in instance) or (instance["label"] == "NOT ENOUGH INFO"):
            evidences = instance["predicted_sentences"][:n_sentences]
            # assert evidences == [(title, linum), (title, linum), ...]
            # change its shape to the normal evidence format
            evidences = [[["dummy", "dummy", title, linum]]
                         for title, linum in evidences]
            instance["evidence"] = evidences

        if use_ir_prediction:
            titles = [
                title
                for title, _ in instance["predicted_sentences"][:n_sentences]
            ]
        else:
            titles = [
                title for evidence_set in instance["evidence"]
                for _, _, title, _ in evidence_set
            ]
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    converted_instances = list()
    for instance in tqdm(instances, desc="conversion"):
        converted_instances.extend(
            _convert_instance(instance, t2l2s,
                              prependlinum=prependlinum,
                              prependtitle=prependtitle,
                              use_ir_prediction=use_ir_prediction,
                              n_sentences=n_sentences))

    print("evaluating dependency...", file=sys.stderr)
    dep_type_invalid_cnt = 0
    if num_samples is None:
        num_samples = len(converted_instances)
    for i in tqdm(range(0, num_samples, depparse_batch_size)):
        nlp_input = ""
        n_sent = 0
        for j in range(i, min(len(converted_instances), i + depparse_batch_size)):
            question = converted_instances[j]["sentence2"]
            support = converted_instances[j]["sentence1"]
            converted_instances[j]["q_tokenized"] = pattern.findall(question)
            converted_instances[j]["s_tokenized"] = pattern.findall(support)
            nlp_input += (" ".join(converted_instances[j]["q_tokenized"]) + "\n" +
                          " ".join(converted_instances[j]["s_tokenized"]) + "\n")
            n_sent += 2
        doc = nlp(nlp_input)
        assert len(doc.sentences) == n_sent
        for j in range(i, min(len(converted_instances), i + depparse_batch_size)):
            converted_instances[j]["q_tokenized"] = [
                t.text for t in doc.sentences[(j - i) * 2].tokens
            ]
            converted_instances[j]["s_tokenized"] = [
                t.text for t in doc.sentences[(j - i) * 2 + 1].tokens
            ]
            converted_instances[j]["q_dep_i"] = [None] * len(
                converted_instances[j]["q_tokenized"])
            converted_instances[j]["q_dep_j"] = [None] * len(
                converted_instances[j]["q_tokenized"])
            converted_instances[j]["q_dep_type"] = [None] * len(
                converted_instances[j]["q_tokenized"])
            converted_instances[j]["s_dep_i"] = [None] * len(
                converted_instances[j]["s_tokenized"])
            converted_instances[j]["s_dep_j"] = [None] * len(
                converted_instances[j]["s_tokenized"])
            converted_instances[j]["s_dep_type"] = [None] * len(
                converted_instances[j]["s_tokenized"])
            for idx, d in enumerate(doc.sentences[(j - i) * 2].dependencies):
                if type2id.unit2id(d[1]) is None:
                    dep_type_invalid_cnt += 1
                    continue
                if d[1] == 'root':
                    converted_instances[j]["q_dep_i"][idx] = int(d[2].index) - 1
                    converted_instances[j]["q_dep_j"][idx] = int(d[2].index) - 1
                    converted_instances[j]["q_dep_type"][idx] = type2id.unit2id(d[1])
                    continue
                converted_instances[j]["q_dep_i"][idx] = int(d[0].index) - 1
                converted_instances[j]["q_dep_j"][idx] = int(d[2].index) - 1
                converted_instances[j]["q_dep_type"][idx] = type2id.unit2id(d[1])
            for idx, d in enumerate(doc.sentences[(j - i) * 2 + 1].dependencies):
                if type2id.unit2id(d[1]) is None:
                    dep_type_invalid_cnt += 1
                    continue
                if d[1] == 'root':
                    converted_instances[j]["s_dep_i"][idx] = int(d[2].index) - 1
                    converted_instances[j]["s_dep_j"][idx] = int(d[2].index) - 1
                    converted_instances[j]["s_dep_type"][idx] = type2id.unit2id(d[1])
                    continue
                converted_instances[j]["s_dep_i"][idx] = int(d[0].index) - 1
                converted_instances[j]["s_dep_j"][idx] = int(d[2].index) - 1
                converted_instances[j]["s_dep_type"][idx] = type2id.unit2id(d[1])
    print('Number of invalid dependency types:', dep_type_invalid_cnt,
          file=sys.stderr)
    return converted_instances
def save_wrong_instances(actual_file, predicted_labels_file,
                         predicted_evidence_file, out_file):
    label_predictions = read_jsonl(predicted_labels_file)
    ev_predictions = read_jsonl(predicted_evidence_file)
    actual = read_jsonl(actual_file)

    all_titles = list()
    for ev_pred, act in zip(ev_predictions, actual):
        ev_titles = [title for title, _ in ev_pred["predicted_sentences"]]
        act_titles = [
            title for evidence_set in act["evidence"]
            for _, _, title, _ in evidence_set
        ]
        titles = ev_titles + act_titles
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    counter = 0
    observations = list()
    print("loading vocabulary list...")
    import pickle
    with open("vocab_list.db", "rb") as f:
        vocab = pickle.load(f)

    pos_counter = 0
    neg_counter = 0
    print("processing predictions...")
    for label_pred, ev_pred, act in tqdm(
            zip(label_predictions, ev_predictions, actual)):
        actual_label = act["label"]
        assert actual_label == label_pred["actual"]
        pred_label = label_pred["predicted"]
        if pred_label != actual_label:
            continue
        counter += 1
        actual_ev = act["evidence"]
        pred_labels = label_pred["prediction_list"]
        pred_ev = ev_pred["predicted_sentences"]
        pred_ev_sent = resolve_evidences(pred_ev, t2l2s, actual=False)
        claim = act["claim"]
        ev_contained = convert(compare_evidences(actual_ev, pred_ev))
        actual_ev_sent = resolve_evidences(actual_ev, t2l2s)
        assert not (actual_label != "NOT ENOUGH INFO"
                    and len(actual_ev_sent) != len(actual_ev))

        pred_sentence = " ".join(pred_ev_sent)
        ac_sentence = " ".join(sent for sentences in actual_ev_sent
                               for sent in sentences
                               if sent != "**Not Found**")
        unk_words = find_unk(pred_sentence + " " + ac_sentence, vocab)

        if pred_label == actual_label:
            pos_counter += 1
        else:
            neg_counter += 1

        # overwrite when label is NEI
        if actual_label == "NOT ENOUGH INFO":
            ev_contained = ["-" for e in ev_contained]

        # # skip for NEI or no correct evidence.
        # if ev_contained == ["X"] * 5 and ev_contained != ["-"] * 5:
        #     continue

        label_pred_ev = [
            "<{}> <{}> {}".format(label, contained, ev)
            for label, contained, ev in zip(
                shorten_labels(pred_labels), ev_contained, pred_ev)
        ]
        actual_ev = ev_pred["evidence"]
        observations.append({
            "id": act["id"],
            "claim": claim,
            "predicted_evidences": label_pred_ev,
            "predicted_sentences": pred_ev_sent,
            "predicted_label": pred_label,
            "actual_evidence": actual_ev,
            "actual_sentences": actual_ev_sent,
            "actual_label": actual_label,
            "unk_words": unk_words
        })

    random.shuffle(observations)
    save_jsonl_pretty_print(observations, out_file)
    print("pos_counter", pos_counter)
    print("neg_counter", neg_counter)
    print("wrong labels:", counter)
def sample_docs(train, save_file):
    '''Sample training examples for document retrieval.

    Returns:
        selected[cid][yn] = [title, phrase, start]
    '''
    samp_size = 25000
    tots = {'SUPPORTS': 0, 'REFUTES': 0}
    sofar = {'SUPPORTS': 0, 'REFUTES': 0}
    # load the document-title dictionary
    if os.path.exists('data/preprocessed_data/edocs.bin'):
        with open('data/preprocessed_data/edocs.bin', 'rb') as rb:
            edocs = pickle.load(rb)
    else:
        t2jnum = titles_to_jsonl_num()
        edocs = title_edict(t2jnum)
        with open('data/preprocessed_data/edocs.bin', 'wb') as wb:
            pickle.dump(edocs, wb)
    id2titles = dict()
    for example in train:
        cid = example['id']
        label = example['label']
        if label == 'NOT ENOUGH INFO':
            continue
        # build the set of documents referenced by the gold evidence
        all_evidence = [
            evi for evi_set in example['evidence'] for evi in evi_set
        ]
        docs = set()
        for evi in all_evidence:
            evi_doc = evi[2]
            if evi_doc is not None:
                docs.add(evi_doc)
        # map each title occurring in the claim to its matching phrases
        t2phrases = find_titles_in_claim(example['claim'], edocs)
        id2titles[cid] = t2phrases
        flag = False
        for title in t2phrases:
            if title in docs:
                flag = True
        # a title mentioned in the claim appears in the evidence documents,
        # i.e. the evidence is reachable through titles found in the claim
        if flag:
            tots[label] += 1
    selected = dict()
    # sample the training examples
    for example in tqdm(train):
        yn = 0  # sample type: 1 = positive, 0 = negative
        cid = example['id']
        label = example['label']
        if label == 'NOT ENOUGH INFO':
            continue
        all_evidence = [
            evi for evi_set in example['evidence'] for evi in evi_set
        ]
        docs = set()
        for evi in all_evidence:
            evi_doc = evi[2]
            if evi_doc is not None:
                docs.add(evi_doc)
        # look up the title dictionary directly by the example's cid
        t2phrases = id2titles[cid]
        for title in t2phrases:
            if title in docs:
                yn = 1
        prob = (samp_size - sofar[label]) / (tots[label])
        if yn == 1 and random() < prob:
            titles = list(t2phrases.keys())
            shuffle(titles)
            flagy = False
            flagn = False
            for t in titles:
                if not flagy and t in docs:
                    ty = t
                    flagy = True
                if not flagn and t not in docs:
                    tn = t
                    flagn = True
                if flagy and flagn:
                    selected[cid] = dict()
                    for t, y_n in [(ty, 1), (tn, 0)]:
                        ps = t2phrases[t]
                        shuffle(ps)
                        p, s = ps[0]
                        selected[cid][y_n] = [t, p, s]
                    sofar[label] += 1
                    break
        if yn == 1:
            tots[label] -= 1
    # write the sampled results to a file
    with open(save_file, 'w') as w:
        for cid in selected:
            for yn in selected[cid]:
                [t, p, s] = selected[cid][yn]
                w.write(str(cid) + '\t' + str(yn) + '\t' + t + '\t' + p +
                        '\t' + str(s) + '\n')
    for label in sofar:
        print(label, sofar[label])
    return selected