def read_ir_result(path, prependlinum=False, concatev=False):
    """
    Returns
    instances: list of dictionaries; adds instance['evidence'],
        a list of evidence sentences (list of str)
    """
    instances = read_jsonl(path)
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))

    # make list of titles
    titles = list()
    for instance in instances:
        titles.extend([title for title, _ in instance["predicted_sentences"]])

    # load title2line2sentences
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        if concatev:
            instance["evidence"] = [" ".join(
                get_evidence_sentence_list(
                    instance["predicted_sentences"],
                    t2l2s,
                    prependlinum=prependlinum))]
        else:
            instance["evidence"] = get_evidence_sentence_list(
                instance["predicted_sentences"],
                t2l2s,
                prependlinum=prependlinum)

    return instances

def main():
    # split the CRF training file into a (parts - small)/parts training
    # portion and a small/parts validation portion
    parts = 10
    small = 3
    train_path = util.abs_path('data/crf_train.tag')
    out_path_train = util.abs_path('data/%d_%d_train.tag' % (parts - small, parts))
    out_path_validation = util.abs_path('data/%d_%d_train.tag' % (small, parts))
    split(train_path, out_path_validation, out_path_train, parts=parts, small=small)

def read_ir_result(path, n_sentences=5):
    '''Read the results of sentence retrieval.'''
    short_evidences_counter = 0
    instances = read_jsonl(path)
    for instance in instances:
        if len(instance['predicted_sentences']) < n_sentences:
            short_evidences_counter += 1
        # keep only the first n_sentences sentences
        instance['predicted_sentences'] = instance['predicted_sentences'][:n_sentences]
    print('short_evidences: {} / {}'.format(short_evidences_counter, len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path('data/wiki-pages/'),
        doctitles=abs_path('data/preprocessed_data/doctitles'))

    # collect the titles of all predicted pages
    titles = list()
    for instance in instances:
        titles.extend([title for title, _ in instance['predicted_sentences']])
    t2l2s = load_doclines(titles, t2jnum)

    # attach the evidence sentences
    for instance in instances:
        instance['evidence'] = get_evidence_sentence_list(
            instance['predicted_sentences'], t2l2s)
    return instances

def load_paper_dataset():
    """Reads the FEVER train/dev sets used in the paper."""
    train_ds = load_fever_train(path=abs_path("data/train.jsonl"), howmany=9999999999)
    dev_ds = load_fever_train(path=abs_path("data/dev.jsonl"), howmany=9999999999)
    return train_ds, dev_ds

def load_paper_dataset(train=abs_path("data/train.jsonl"),
                       dev=abs_path("data/dev.jsonl"),
                       test=abs_path("data/test.jsonl")):
    """Reads the FEVER train/dev/test sets used in the paper."""
    train_ds = load_fever_train(path=train, howmany=9999999999)
    dev_ds = load_fever_train(path=dev, howmany=9999999999)
    test_ds = load_fever_train(path=test, howmany=9999999999)
    return train_ds, dev_ds, test_ds

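# Minimal usage sketch for load_paper_dataset(). The module name `fever_io`
# is an assumption (import from wherever this function is defined), the three
# JSONL files under data/ must already exist, and each returned item is
# expected to be a FEVER claim dictionary.
from fever_io import load_paper_dataset  # hypothetical module path

train, dev, test = load_paper_dataset()
print("train: {}  dev: {}  test: {} claims".format(len(train), len(dev), len(test)))
print(train[0]["claim"], "->", train[0].get("label"))
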
def read_ir_result(path, n_sentences=5, prependlinum=False, prependtitle=False,
                   concatev=False):
    """
    Returns
    instances: list of dictionaries; adds instance['evidence'],
        a list of evidence sentences (list of str)
    """
    short_evidences_counter = 0
    instances = read_jsonl(path)

    # only keep the first n_sentences predicted sentences
    for instance in instances:
        if len(instance["predicted_sentences"]) < n_sentences:
            short_evidences_counter += 1
        instance["predicted_sentences"] = instance["predicted_sentences"][:n_sentences]
    print("short_evidences: {} / {}".format(short_evidences_counter, len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))

    # make list of titles
    titles = list()
    for instance in instances:
        titles.extend([title for title, _ in instance["predicted_sentences"]])

    # load title2line2sentences
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        if concatev:
            instance["evidence"] = [" ".join(
                get_evidence_sentence_list(
                    instance["predicted_sentences"],
                    t2l2s,
                    prependlinum=prependlinum,
                    prependtitle=prependtitle))]
        else:
            instance["evidence"] = get_evidence_sentence_list(
                instance["predicted_sentences"],
                t2l2s,
                prependlinum=prependlinum,
                prependtitle=prependtitle)

    return instances

def convert(instances, prependlinum=False, prependtitle=False, use_ir_prediction=False):
    """convert FEVER format to jack SNLI format

    Arg
    instances: list of dictionaries in FEVER format

    Returns
    instances: list of dictionaries in jack SNLI format
    """
    # get all titles and load t2l2s
    all_titles = list()

    # use "predicted_sentences" for NEI
    for instance in tqdm(instances, desc="process for NEI"):
        if instance["label"] == "NOT ENOUGH INFO":
            evidences = instance["predicted_sentences"]
            # assert evidences == [(title, linum), (title, linum), ...]
            # change its shape to the normal evidence format
            evidences = [[["dummy", "dummy", title, linum]]
                         for title, linum in evidences]
            instance["evidence"] = evidences

        if use_ir_prediction:
            titles = [title for title, _ in instance["predicted_sentences"]]
        else:
            titles = [
                title for evidence_set in instance["evidence"]
                for _, _, title, _ in evidence_set
            ]
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    converted_instances = list()
    for instance in tqdm(instances, desc="conversion"):
        converted_instances.extend(
            _convert_instance(
                instance,
                t2l2s,
                prependlinum=prependlinum,
                prependtitle=prependtitle,
                use_ir_prediction=use_ir_prediction))
    return converted_instances

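# Sketch of what a converted instance looks like in the jack SNLI layout, as
# inferred from how these instances are consumed later in this codebase
# (evidence text as "sentence1", claim as "sentence2"); the exact keys emitted
# by _convert_instance, including the label field name, are assumptions.
example_converted_instance = {
    "sentence1": "Tokyo is the capital of Japan .",  # evidence sentence(s)
    "sentence2": "Tokyo is a capital city .",        # claim
    "gold_label": "entailment",                      # assumed label field name
}
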
def load_doclines(titles, t2jnum, filtering=True):
    """load all lines for provided titles

    Args
    titles: list of titles
    """
    if filtering:
        filtered_titles = [title for title in titles if title in t2jnum]
        print("mismatch: {} / {}".format(
            len(titles) - len(filtered_titles), len(titles)))
        titles = filtered_titles

    return load_doc_lines(
        {"dummy_id": [(title, "dummy_linum") for title in titles]},
        t2jnum,
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"))

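# Minimal usage sketch for load_doclines(), assuming the wiki dump and the
# doctitles index under data/ are in place and that this module is importable
# as `fever_io` (hypothetical name). The result is expected to map each page
# title to its numbered lines, roughly {title: {line_number: sentence}}.
from fever_io import abs_path, load_doclines, titles_to_jsonl_num  # hypothetical module path

t2jnum = titles_to_jsonl_num(
    wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
    doctitles=abs_path("data/doctitles"))
t2l2s = load_doclines(["Barack_Obama", "Tokyo"], t2jnum)
print(t2l2s.get("Tokyo", {}).get(0))
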
def _order(proto):
    module = os.path.basename(proto)
    module, _ = os.path.splitext(module)
    _dir = os.path.dirname(proto)
    root = root_rule(proto)
    imports = []
    for child in root.getChildren():
        if child.getType() == proto_parser.IMPORT_LITERAL:
            path, = child.getChildren()
            path = path.getText().strip('"')
            _module, _ = os.path.splitext(path)
            imports.append(_module)
            path = os.path.join(_dir, path)
            path = abs_path(path)
            # yield imported .proto files before the importing file
            for path, _module, _imports in _order(path):
                yield path, _module, _imports
    yield proto, module, imports

def main():
    selected_feature_dir = util.abs_path('pickle/2008test/selected_features/')
    matrix_dir = util.abs_path('pickle/2008test/matrix/')
    run(selected_feature_dir, matrix_dir)
    return None

def convert(instances,
            prependlinum=False,
            prependtitle=False,
            use_ir_prediction=False,
            n_sentences=5,
            depparse_batch_size=32,
            num_samples=None):
    """convert FEVER format to jack SNLI format

    Arg
    instances: list of dictionaries in FEVER format

    Returns
    instances: list of dictionaries in jack SNLI format
    """
    if not ("label" in instances[0]):
        test = True
    else:
        test = False

    # get all titles and load t2l2s
    all_titles = list()

    # use "predicted_sentences" for NEI
    for instance in tqdm(instances, desc="process for NEI"):
        if ("label" not in instance) or (instance["label"] == "NOT ENOUGH INFO"):
            evidences = instance["predicted_sentences"][:n_sentences]
            # assert evidences == [(title, linum), (title, linum), ...]
            # change its shape to the normal evidence format
            evidences = [[["dummy", "dummy", title, linum]]
                         for title, linum in evidences]
            instance["evidence"] = evidences

        if use_ir_prediction:
            titles = [
                title for title, _ in instance["predicted_sentences"][:n_sentences]
            ]
        else:
            titles = [
                title for evidence_set in instance["evidence"]
                for _, _, title, _ in evidence_set
            ]
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    converted_instances = list()
    for instance in tqdm(instances, desc="conversion"):
        converted_instances.extend(
            _convert_instance(
                instance,
                t2l2s,
                prependlinum=prependlinum,
                prependtitle=prependtitle,
                use_ir_prediction=use_ir_prediction,
                n_sentences=n_sentences))

    print("evaluating dependency...", file=sys.stderr)
    dep_type_invalid_cnt = 0
    if num_samples is None:
        num_samples = len(converted_instances)

    for i in tqdm(range(0, num_samples, depparse_batch_size)):
        # batch the claim / evidence pairs into a single parser call
        nlp_input = ""
        n_sent = 0
        for j in range(i, min(len(converted_instances), i + depparse_batch_size)):
            question = converted_instances[j]["sentence2"]
            support = converted_instances[j]["sentence1"]
            converted_instances[j]["q_tokenized"] = pattern.findall(question)
            converted_instances[j]["s_tokenized"] = pattern.findall(support)
            nlp_input += (" ".join(converted_instances[j]["q_tokenized"]) + "\n" +
                          " ".join(converted_instances[j]["s_tokenized"]) + "\n")
            n_sent += 2

        doc = nlp(nlp_input)
        assert len(doc.sentences) == n_sent

        for j in range(i, min(len(converted_instances), i + depparse_batch_size)):
            converted_instances[j]["q_tokenized"] = [
                t.text for t in doc.sentences[(j - i) * 2].tokens
            ]
            converted_instances[j]["s_tokenized"] = [
                t.text for t in doc.sentences[(j - i) * 2 + 1].tokens
            ]
            converted_instances[j]["q_dep_i"] = [None] * len(converted_instances[j]["q_tokenized"])
            converted_instances[j]["q_dep_j"] = [None] * len(converted_instances[j]["q_tokenized"])
            converted_instances[j]["q_dep_type"] = [None] * len(converted_instances[j]["q_tokenized"])
            converted_instances[j]["s_dep_i"] = [None] * len(converted_instances[j]["s_tokenized"])
            converted_instances[j]["s_dep_j"] = [None] * len(converted_instances[j]["s_tokenized"])
            converted_instances[j]["s_dep_type"] = [None] * len(converted_instances[j]["s_tokenized"])

            for idx, d in enumerate(doc.sentences[(j - i) * 2].dependencies):
                if type2id.unit2id(d[1]) is None:
                    dep_type_invalid_cnt += 1
                    continue
                if d[1] == 'root':
                    converted_instances[j]["q_dep_i"][idx] = int(d[2].index) - 1
                    converted_instances[j]["q_dep_j"][idx] = int(d[2].index) - 1
                    converted_instances[j]["q_dep_type"][idx] = type2id.unit2id(d[1])
                    continue
                converted_instances[j]["q_dep_i"][idx] = int(d[0].index) - 1
                converted_instances[j]["q_dep_j"][idx] = int(d[2].index) - 1
                converted_instances[j]["q_dep_type"][idx] = type2id.unit2id(d[1])

            for idx, d in enumerate(doc.sentences[(j - i) * 2 + 1].dependencies):
                if type2id.unit2id(d[1]) is None:
                    dep_type_invalid_cnt += 1
                    continue
                if d[1] == 'root':
                    converted_instances[j]["s_dep_i"][idx] = int(d[2].index) - 1
                    converted_instances[j]["s_dep_j"][idx] = int(d[2].index) - 1
                    converted_instances[j]["s_dep_type"][idx] = type2id.unit2id(d[1])
                    continue
                converted_instances[j]["s_dep_i"][idx] = int(d[0].index) - 1
                converted_instances[j]["s_dep_j"][idx] = int(d[2].index) - 1
                converted_instances[j]["s_dep_type"][idx] = type2id.unit2id(d[1])

    print('Number of invalid dependency type', dep_type_invalid_cnt, file=sys.stderr)
    return converted_instances

def main():
    matrix_dir = util.abs_path('pickle/2008test/matrix')
    cosine_dir = util.abs_path('pickle/2008test/cosine/')
    run(matrix_dir, cosine_dir)
    return None

def main():
    matrix_dir = util.abs_path('pickle/2008test/matrix/')
    svd_matrix_dir = util.abs_path('pickle/2008test/svd_matrix/')
    run(matrix_dir, svd_matrix_dir)
    return None

def main():
    path = util.abs_path('data/crf_train.tag')
    with open(path) as f:
        check_column_num(f)
    return None

def load_doclines(titles, t2jnum, filtering=True):
    '''Load all lines for the given titles.

    Args:
        titles: list of titles
    '''
    # drop titles that do not appear in t2jnum
    if filtering:
        filtered_titles = [title for title in titles if title in t2jnum]
        print('mismatch: {} / {}'.format(len(titles) - len(filtered_titles),
                                         len(titles)))
        titles = filtered_titles

    return load_doc_lines(
        {'dummy_id': [(title, 'dummy_linum') for title in titles]},
        t2jnum,
        wikipedia_dir=abs_path('data/wiki-pages/'))

def main():
    feature_dir = util.abs_path('pickle/2008test/features/')
    selected_feature_dir = util.abs_path('pickle/2008test/selected_features/')
    run(feature_dir, selected_feature_dir)
    return None

def setUp(self):
    config_path = util.abs_path('configure/2007test.NN.nltk.json')
    config = util.load_pickle(config_path, typ='json')
    self.flt = FeatureExtractor(config)

def main():
    category_dir = util.abs_path('pickle/2008test/louvain_category/')
    result_dir = util.abs_path('pickle/2008test/result/louvain')
    run(category_dir, result_dir)
    return None

def save_wrong_instances(actual_file, predicted_labels_file,
                         predicted_evidence_file, out_file):
    label_predictions = read_jsonl(predicted_labels_file)
    ev_predictions = read_jsonl(predicted_evidence_file)
    actual = read_jsonl(actual_file)

    all_titles = list()
    for ev_pred, act in zip(ev_predictions, actual):
        ev_titles = [title for title, _ in ev_pred["predicted_sentences"]]
        act_titles = [
            title for evidence_set in act["evidence"]
            for _, _, title, _ in evidence_set
        ]
        titles = ev_titles + act_titles
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    counter = 0
    observations = list()

    print("loading vocabulary list...")
    import pickle
    with open("vocab_list.db", "rb") as f:
        vocab = pickle.load(f)

    pos_counter = 0
    neg_counter = 0
    print("processing predictions...")
    for label_pred, ev_pred, act in tqdm(
            zip(label_predictions, ev_predictions, actual)):
        actual_label = act["label"]
        assert actual_label == label_pred["actual"]
        pred_label = label_pred["predicted"]
        if pred_label != actual_label:
            continue
        counter += 1

        actual_ev = act["evidence"]
        pred_labels = label_pred["prediction_list"]
        pred_ev = ev_pred["predicted_sentences"]
        pred_ev_sent = resolve_evidences(pred_ev, t2l2s, actual=False)
        claim = act["claim"]
        ev_contained = convert(compare_evidences(actual_ev, pred_ev))
        actual_ev_sent = resolve_evidences(actual_ev, t2l2s)
        assert not (actual_label != "NOT ENOUGH INFO"
                    and len(actual_ev_sent) != len(actual_ev))

        pred_sentence = " ".join(pred_ev_sent)
        ac_sentence = " ".join(sent for sentences in actual_ev_sent
                               for sent in sentences if sent != "**Not Found**")
        unk_words = find_unk(pred_sentence + " " + ac_sentence, vocab)

        if pred_label == actual_label:
            pos_counter += 1
        else:
            neg_counter += 1

        # overwrite when label is NEI
        if actual_label == "NOT ENOUGH INFO":
            ev_contained = ["-" for e in ev_contained]

        # # skip for NEI or no correct evidence.
        # if ev_contained == ["X"] * 5 and ev_contained != ["-"] * 5:
        #     continue

        label_pred_ev = [
            "<{}> <{}> {}".format(label, contained, ev)
            for label, contained, ev in zip(
                shorten_labels(pred_labels), ev_contained, pred_ev)
        ]
        actual_ev = ev_pred["evidence"]
        observations.append({
            "id": act["id"],
            "claim": claim,
            "predicted_evidences": label_pred_ev,
            "predicted_sentences": pred_ev_sent,
            "predicted_label": pred_label,
            "actual_evidence": actual_ev,
            "actual_sentences": actual_ev_sent,
            "actual_label": actual_label,
            "unk_words": unk_words
        })

    random.shuffle(observations)
    save_jsonl_pretty_print(observations, out_file)
    print("pos_counter", pos_counter)
    print("neg_counter", neg_counter)
    print("wrong labels:", counter)

results = list()
preds_length = list()
all_settings = list()
instances = read_ir_result(
    args.in_file, prependlinum=args.prependlinum, concatev=args.concatev)
for instance in instances:
    evidence_list = instance["evidence"]
    claim = instance["claim"]
    settings = [
        QASetting(question=claim, support=[evidence])
        for evidence in evidence_list
    ]
    all_settings.append(settings)

# pointer steps through all_settings in increments of args.batch_size
preds_list = list()
for pointer in tqdm(range(0, len(all_settings), args.batch_size)):
    batch_settings = all_settings[pointer: pointer + args.batch_size]
    n_settings = [len(settings_) for settings_ in batch_settings]
    preds_list.extend(reshape(dam_reader(flatten(batch_settings)), n_settings))

results = list()
for instance, preds in zip(instances, preds_list):
    prediction, scores, prediction_list = aggregate_preds(preds, args.only_use_topev)
    results.append({
        "actual": instance["label"],
        "predicted": convert_label(prediction, inverse=True),
        "scores": scores,
        "prediction_list": [convert_label(pred, inverse=True)
                            for pred in prediction_list]
    })

save_jsonl(results, abs_path(args.out_file))

def main():
    cosine_dir = util.abs_path('pickle/2008test/cosine/')
    category_dir = util.abs_path('pickle/2008test/category/')
    run(cosine_dir, category_dir)
    return None

def main():
    in_path = util.abs_path('data/ckip_test_utf16.tag')
    out_path = util.abs_path('data/crf_test.tag')
    lst = load_data(in_path)
    dump_as_crf_format(lst, out_path)
    return None

def main():
    in_path = util.abs_path('data/ckip_train_utf16.tag')
    out_path = util.abs_path('data/crf_train.tag')
    lst = load_data(in_path)
    dump_as_crf_format(lst, out_path)
    return None
