Example #1
def read_ir_result(path, prependlinum=False, concatev=False):
    """
    Returns
    instances: list of dictionary
    update instance['predicted_sentences'] with list of evidences (list of str)
    """
    instances = read_jsonl(path)
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    titles = list()

    # make list of titles
    for instance in instances:
        titles.extend([title for title, _ in instance["predicted_sentences"]])

    # load title2line2sentences
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        if concatev:
            instance["evidence"] = [" ".join(get_evidence_sentence_list(
                instance["predicted_sentences"], t2l2s, prependlinum=prependlinum))]
        else:
            instance["evidence"] = get_evidence_sentence_list(
                instance["predicted_sentences"], t2l2s, prependlinum=prependlinum)

    return instances
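A minimal usage sketch for read_ir_result above; the input path is a hypothetical example, and it assumes each instance carries a "claim" field as in the other snippets in this listing.

# Usage sketch (the path below is a hypothetical example).
ir_instances = read_ir_result("data/dev.sentences.jsonl", prependlinum=True)
for inst in ir_instances[:3]:
    # each instance now carries an "evidence" list of sentence strings
    print(inst["claim"], inst["evidence"][:2])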
def main():
    parts = 10
    small = 3
    train_path = util.abs_path('data/crf_train.tag')
    out_path_train = util.abs_path('data/%d_%d_train.tag' % (parts - small, parts))
    out_path_validation = util.abs_path('data/%d_%d_train.tag' % (small, parts))
    split(train_path, out_path_validation, out_path_train, parts=parts, small=small)
def read_ir_result(path, n_sentences=5):
    '''
    Read the sentence-retrieval results.
    '''
    short_evidences_counter = 0

    instances = read_jsonl(path)
    for instance in instances:
        if len(instance['predicted_sentences']) < n_sentences:
            short_evidences_counter += 1
        instance['predicted_sentences'] = instance[
            'predicted_sentences'][:n_sentences]  # keep only the first n sentences
    print('short_evidences: {} / {}'.format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path('data/wiki-pages/'),
        doctitles=abs_path('data/preprocessed_data/doctitles'))

    titles = list()
    # build the list of all titles
    for instance in instances:
        titles.extend([title for title, _ in instance['predicted_sentences']])

    t2l2s = load_doclines(titles, t2jnum)

    # attach evidence sentences
    for instance in instances:
        instance['evidence'] = get_evidence_sentence_list(
            instance['predicted_sentences'], t2l2s)

    return instances
Example #4
def load_paper_dataset():
    """Reads the Fever train/dev set used on the paper.
    """
    train_ds = load_fever_train(path=abs_path("data/train.jsonl"),
                                howmany=9999999999)
    dev_ds = load_fever_train(path=abs_path("data/dev.jsonl"),
                              howmany=9999999999)
    return train_ds, dev_ds
Example #5
def load_paper_dataset(train=abs_path("data/train.jsonl"),
                       dev=abs_path("data/dev.jsonl"),
                       test=abs_path("data/test.jsonl")):
    """Reads the Fever train/dev set used on the paper.
    """
    train_ds = load_fever_train(path=train, howmany=9999999999)
    dev_ds = load_fever_train(path=dev, howmany=9999999999)
    test_ds = load_fever_train(path=test, howmany=9999999999)
    return train_ds, dev_ds, test_ds
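A minimal usage sketch for load_paper_dataset; it only assumes the default data files exist and that the loader returns list-like datasets.

# Usage sketch: load the splits with the default paths.
train_ds, dev_ds, test_ds = load_paper_dataset()
print(len(train_ds), len(dev_ds), len(test_ds))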
def main():
    parts = 10
    small = 3
    train_path = util.abs_path('data/crf_train.tag')
    out_path_train = util.abs_path('data/%d_%d_train.tag' %
                                   (parts - small, parts))
    out_path_validation = util.abs_path('data/%d_%d_train.tag' %
                                        (small, parts))
    split(train_path,
          out_path_validation,
          out_path_train,
          parts=parts,
          small=small)
Example #7
def read_ir_result(path,
                   n_sentences=5,
                   prependlinum=False,
                   prependtitle=False,
                   concatev=False):
    """
    Returns
    instances: list of dictionary
    update instance['predicted_sentences'] with list of evidences (list of str)
    """
    short_evidences_counter = 0
    instances = read_jsonl(path)
    # only read n_sentences
    for instance in instances:
        if len(instance["predicted_sentences"]) < n_sentences:
            short_evidences_counter += 1
        instance["predicted_sentences"] = instance[
            "predicted_sentences"][:n_sentences]
    print("short_evidences: {} / {}".format(short_evidences_counter,
                                            len(instances)))

    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    titles = list()

    # make list of titles
    for instance in instances:
        titles.extend([title for title, _ in instance["predicted_sentences"]])

    # load title2line2sentences
    t2l2s = load_doclines(titles, t2jnum)

    for instance in instances:
        if concatev:
            instance["evidence"] = [
                " ".join(
                    get_evidence_sentence_list(instance["predicted_sentences"],
                                               t2l2s,
                                               prependlinum=prependlinum,
                                               prependtitle=prependtitle))
            ]
        else:
            instance["evidence"] = get_evidence_sentence_list(
                instance["predicted_sentences"],
                t2l2s,
                prependlinum=prependlinum,
                prependtitle=prependtitle)

    return instances
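A usage sketch for this variant; the input path is hypothetical. With concatev=True every instance ends up with a single concatenated evidence string.

# Usage sketch (the path is a hypothetical example).
ir_instances = read_ir_result("data/dev.sentences.jsonl",
                              n_sentences=5,
                              prependlinum=True,
                              prependtitle=True,
                              concatev=True)
assert all(len(inst["evidence"]) == 1 for inst in ir_instances)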
Example #8
def convert(instances, prependlinum=False, prependtitle=False, use_ir_prediction=False):
    """convert FEVER format to jack SNLI format
    Arg
    instances: list of dictionary of FEVER format

    Returns
    instances: list of dictionary of jack SNLI format
    """
    # get all titles and load t2l2s
    all_titles = list()

    # use "predicted_sentences" for NEI
    for instance in tqdm(instances, desc="process for NEI"):
        if instance["label"] == "NOT ENOUGH INFO":
            evidences = instance["predicted_sentences"]
            # assert evidences == [(title, linum), (title, linum), ...]

            # change its shape to the normal evidence format
            evidences = [[["dummy", "dummy", title, linum]]
                         for title, linum in evidences]
            instance["evidence"] = evidences

        if use_ir_prediction:
            titles = [title for title, _ in instance["predicted_sentences"]]
        else:
            titles = [
                title for evidence_set in instance["evidence"]
                for _, _, title, _ in evidence_set
            ]
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    converted_instances = list()
    for instance in tqdm(instances, desc="conversion"):
        converted_instances.extend(
            _convert_instance(instance,
                              t2l2s,
                              prependlinum=prependlinum,
                              prependtitle=prependtitle,
                              use_ir_prediction=use_ir_prediction))

    return converted_instances
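A usage sketch for convert; the input file is a hypothetical IR-output file whose instances carry the "label", "evidence", and "predicted_sentences" fields this function expects.

# Usage sketch (the input path is a hypothetical example).
fever_instances = read_jsonl(abs_path("data/dev.sentences.jsonl"))
snli_instances = convert(fever_instances,
                         prependtitle=True,
                         use_ir_prediction=True)
print(len(snli_instances), "converted instances")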
Example #9
def load_doclines(titles, t2jnum, filtering=True):
    """load all lines for provided titles
    Args
    titles: list of titles
    t2jnum: title-to-JSONL mapping, as returned by titles_to_jsonl_num
    """
    if filtering:
        filtered_titles = [title for title in titles if title in t2jnum]
        print("mismatch: {} / {}".format(
            len(titles) - len(filtered_titles), len(titles)))
        titles = filtered_titles

    return load_doc_lines(
        {"dummy_id": [(title, "dummy_linum") for title in titles]},
        t2jnum,
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"))
Example #10
def _order(proto):
    """Yield (path, module, imports) for `proto` and, recursively, for every
    proto it imports; imported files are yielded before the importing file."""
    module = os.path.basename(proto)
    module, _ = os.path.splitext(module)
    _dir = os.path.dirname(proto)
    root = root_rule(proto)
    imports = []
    # recurse into each import statement so that dependencies come out first
    for child in root.getChildren():
        if child.getType() == proto_parser.IMPORT_LITERAL:
            path, = child.getChildren()
            path = path.getText().strip('"')
            _module, _ = os.path.splitext(path)
            imports.append(_module)
            path = os.path.join(_dir, path)
            path = abs_path(path)
            for path, _module, _imports in _order(path):
                yield path, _module, _imports
    yield proto, module, imports
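A usage sketch for _order; the .proto path is hypothetical. Note that a proto imported from several places is yielded once per import, since the walk does not deduplicate.

# Usage sketch (the .proto path is a hypothetical example).
for path, module, imports in _order("protos/service.proto"):
    print(module, "->", imports)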
def main():
    selected_feature_dir = util.abs_path('pickle/2008test/selected_features/')
    matrix_dir = util.abs_path('pickle/2008test/matrix/')
    run(selected_feature_dir, matrix_dir)
    return None
Example #12
def convert(instances,
            prependlinum=False,
            prependtitle=False,
            use_ir_prediction=False,
            n_sentences=5,
            depparse_batch_size=32,
            num_samples=None):
    """convert FEVER format to jack SNLI format
    Arg
    instances: list of dictionary of FEVER format

    Returns
    instances: list of dictionary of jack SNLI format
    """

    if not ("label" in instances[0]):
        test = True
    else:
        test = False

    # get all titles and load t2l2s
    all_titles = list()

    # use "predicted_sentences" for NEI
    for instance in tqdm(instances, desc="process for NEI"):
        if ("label" not in instance) or (instance["label"]
                                         == "NOT ENOUGH INFO"):
            evidences = instance["predicted_sentences"][:n_sentences]
            # assert evidences == [(title, linum), (title, linum), ...]

            # change its shape to the normal evidence format
            evidences = [[["dummy", "dummy", title, linum]]
                         for title, linum in evidences]
            instance["evidence"] = evidences

        if use_ir_prediction:
            titles = [
                title
                for title, _ in instance["predicted_sentences"][:n_sentences]
            ]
        else:
            titles = [
                title for evidence_set in instance["evidence"]
                for _, _, title, _ in evidence_set
            ]
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    converted_instances = list()
    for instance in tqdm(instances, desc="conversion"):
        converted_instances.extend(
            _convert_instance(instance,
                              t2l2s,
                              prependlinum=prependlinum,
                              prependtitle=prependtitle,
                              use_ir_prediction=use_ir_prediction,
                              n_sentences=n_sentences))

    print("evaluating dependency...", file=sys.stderr)
    dep_type_invalid_cnt = 0

    if num_samples is None:
        num_samples = len(converted_instances)
    for i in tqdm(range(0, num_samples, depparse_batch_size)):
        nlp_input = ""
        n_sent = 0
        for j in range(i, min(len(converted_instances),
                              i + depparse_batch_size)):
            question = converted_instances[j]["sentence2"]
            support = converted_instances[j]["sentence1"]
            converted_instances[j]["q_tokenized"] = pattern.findall(question)
            converted_instances[j]["s_tokenized"] = pattern.findall(support)
            nlp_input += ((" ".join(converted_instances[j]["q_tokenized"])) + "\n" + \
                        " ".join(converted_instances[j]["s_tokenized"]) + "\n")
            n_sent += 2
        doc = nlp(nlp_input)
        assert len(doc.sentences) == n_sent
        for j in range(i, min(len(converted_instances),
                              i + depparse_batch_size)):
            converted_instances[j]["q_tokenized"] = [
                t.text for t in doc.sentences[(j - i) * 2].tokens
            ]
            converted_instances[j]["s_tokenized"] = [
                t.text for t in doc.sentences[(j - i) * 2 + 1].tokens
            ]
            converted_instances[j]["q_dep_i"] = [None] * (len(
                converted_instances[j]["q_tokenized"]))
            converted_instances[j]["q_dep_j"] = [None] * (len(
                converted_instances[j]["q_tokenized"]))
            converted_instances[j]["q_dep_type"] = [None] * (len(
                converted_instances[j]["q_tokenized"]))
            converted_instances[j]["s_dep_i"] = [None] * (len(
                converted_instances[j]["s_tokenized"]))
            converted_instances[j]["s_dep_j"] = [None] * (len(
                converted_instances[j]["s_tokenized"]))
            converted_instances[j]["s_dep_type"] = [None] * (len(
                converted_instances[j]["s_tokenized"]))

            for idx, d in enumerate(doc.sentences[(j - i) * 2].dependencies):
                if type2id.unit2id(d[1]) is None:
                    dep_type_invalid_cnt += 1
                    continue
                if d[1] == 'root':
                    converted_instances[j]["q_dep_i"][idx] = int(
                        d[2].index) - 1
                    converted_instances[j]["q_dep_j"][idx] = int(
                        d[2].index) - 1
                    converted_instances[j]["q_dep_type"][
                        idx] = type2id.unit2id(d[1])
                    continue
                converted_instances[j]["q_dep_i"][idx] = int(d[0].index) - 1
                converted_instances[j]["q_dep_j"][idx] = int(d[2].index) - 1
                converted_instances[j]["q_dep_type"][idx] = type2id.unit2id(
                    d[1])
            for idx, d in enumerate(doc.sentences[(j - i) * 2 +
                                                  1].dependencies):
                if type2id.unit2id(d[1]) is None:
                    dep_type_invalid_cnt += 1
                    continue
                if d[1] == 'root':
                    converted_instances[j]["s_dep_i"][idx] = int(
                        d[2].index) - 1
                    converted_instances[j]["s_dep_j"][idx] = int(
                        d[2].index) - 1
                    converted_instances[j]["s_dep_type"][
                        idx] = type2id.unit2id(d[1])
                    continue
                converted_instances[j]["s_dep_i"][idx] = int(d[0].index) - 1
                converted_instances[j]["s_dep_j"][idx] = int(d[2].index) - 1
                converted_instances[j]["s_dep_type"][idx] = type2id.unit2id(
                    d[1])

    print('Number of invalid dependency type',
          dep_type_invalid_cnt,
          file=sys.stderr)

    return converted_instances
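A usage sketch for this dependency-parsing variant of convert; the input path is hypothetical, and num_samples limits how many converted pairs are actually parsed.

# Usage sketch (the input path is a hypothetical example).
instances = read_jsonl(abs_path("data/dev.sentences.jsonl"))
converted = convert(instances,
                    use_ir_prediction=True,
                    n_sentences=5,
                    depparse_batch_size=32,
                    num_samples=64)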
def main():
    matrix_dir = util.abs_path('pickle/2008test/matrix')
    cosine_dir = util.abs_path('pickle/2008test/cosine/')
    run(matrix_dir, cosine_dir)
    return None
Example #14
def main():
    matrix_dir = util.abs_path('pickle/2008test/matrix/')
    svd_matrix_dir = util.abs_path('pickle/2008test/svd_matrix/')
    run(matrix_dir, svd_matrix_dir)
    return None
Example #15
def main():
    path = util.abs_path('data/crf_train.tag')
    with open(path) as f:
        check_column_num(f)
    return None
Example #16
def load_doclines(titles, t2jnum, filtering=True):
    '''
    Load all lines for the given titles.

    Args:
    titles: list of titles

    '''
    # filter out titles that are not in t2jnum
    if filtering:
        filtered_titles = [title for title in titles if title in t2jnum]
        print('mismatch: {} / {}'.format(len(titles) - len(filtered_titles), len(titles)))
        titles = filtered_titles

    return load_doc_lines(
        {'dummy_id': [(title, 'dummy_linum') for title in titles]},
        t2jnum,
        wikipedia_dir=abs_path('data/wiki-pages/'))
Example #17
def main():
    feature_dir = util.abs_path('pickle/2008test/features/')
    selected_feature_dir = util.abs_path('pickle/2008test/selected_features/')
    run(feature_dir, selected_feature_dir)
    return None
def setUp(self):
    config_path = util.abs_path('configure/2007test.NN.nltk.json')
    config = util.load_pickle(config_path, typ='json')
    self.flt = FeatureExtractor(config)
def main():
    category_dir = util.abs_path('pickle/2008test/louvain_category/')
    result_dir = util.abs_path('pickle/2008test/result/louvain')
    run(category_dir, result_dir)
    return None
Example #21
def save_wrong_instances(actual_file, predicted_labels_file,
                         predicted_evidence_file, out_file):
    label_predictions = read_jsonl(predicted_labels_file)
    ev_predictions = read_jsonl(predicted_evidence_file)
    actual = read_jsonl(actual_file)

    all_titles = list()
    for ev_pred, act in zip(ev_predictions, actual):
        ev_titles = [title for title, _ in ev_pred["predicted_sentences"]]
        act_titles = [
            title for evidence_set in act["evidence"]
            for _, _, title, _ in evidence_set
        ]
        titles = ev_titles + act_titles
        all_titles.extend(titles)

    print("loading wiki data...")
    t2jnum = titles_to_jsonl_num(
        wikipedia_dir=abs_path("data/wiki-pages/wiki-pages/"),
        doctitles=abs_path("data/doctitles"))
    t2l2s = load_doclines(all_titles, t2jnum)

    counter = 0
    observations = list()

    print("loading vocabulary list...")
    import pickle
    with open("vocab_list.db", "rb") as f:
        vocab = pickle.load(f)

    pos_counter = 0
    neg_counter = 0
    print("processing predictions...")
    for label_pred, ev_pred, act in tqdm(
            zip(label_predictions, ev_predictions, actual)):
        actual_label = act["label"]
        assert actual_label == label_pred["actual"]

        pred_label = label_pred["predicted"]
        if pred_label != actual_label:
            continue

        counter += 1
        actual_ev = act["evidence"]
        pred_labels = label_pred["prediction_list"]
        pred_ev = ev_pred["predicted_sentences"]
        pred_ev_sent = resolve_evidences(pred_ev, t2l2s, actual=False)

        claim = act["claim"]
        ev_contained = convert(compare_evidences(actual_ev, pred_ev))
        actual_ev_sent = resolve_evidences(actual_ev, t2l2s)
        assert not (actual_label != "NOT ENOUGH INFO"
                    and len(actual_ev_sent) != len(actual_ev))

        pred_sentence = " ".join(pred_ev_sent)
        ac_sentence = " ".join(sent for sentences in actual_ev_sent
                               for sent in sentences
                               if sent != "**Not Found**")
        unk_words = find_unk(pred_sentence + " " + ac_sentence, vocab)

        if pred_label == actual_label:
            pos_counter += 1
        else:
            neg_counter += 1

        # overwrite when label is NEI
        if actual_label == "NOT ENOUGH INFO":
            ev_contained = ["-" for e in ev_contained]

        # # skip for NEI or no correct evidence.
        # if ev_contained == ["X"] * 5 and ev_contained != ["-"] * 5:
        #     continue

        label_pred_ev = [
            "<{}> <{}> {}".format(label, contained, ev)
            for label, contained, ev in zip(
                shorten_labels(pred_labels), ev_contained, pred_ev)
        ]
        actual_ev = ev_pred["evidence"]

        observations.append({
            "id": act["id"],
            "claim": claim,
            "predicted_evidences": label_pred_ev,
            "predicted_sentences": pred_ev_sent,
            "predicted_label": pred_label,
            "actual_evidence": actual_ev,
            "actual_sentences": actual_ev_sent,
            "actual_label": actual_label,
            "unk_words": unk_words
        })

    random.shuffle(observations)
    save_jsonl_pretty_print(observations, out_file)
    print("pos_counter", pos_counter)
    print("neg_counter", neg_counter)
    print("wrong labels:", counter)
Example #22
    results = list()
    preds_length = list()
    all_settings = list()
    instances = read_ir_result(args.in_file, prependlinum=args.prependlinum, concatev=args.concatev)
    for instance in instances:
        evidence_list = instance["evidence"]
        claim = instance["claim"]
        settings = [QASetting(question=claim, support=[evidence]) for evidence in evidence_list]
        all_settings.append(settings)

    # pointer steps from 0 up to (but not including) len(all_settings) in increments of args.batch_size
    preds_list = list()
    for pointer in tqdm(range(0, len(all_settings), args.batch_size)):
        batch_settings = all_settings[pointer: pointer + args.batch_size]
        n_settings = [len(settings_) for settings_ in batch_settings]
        preds_list.extend(reshape(dam_reader(flatten(batch_settings)), n_settings))

    results = list()
    for instance, preds in zip(instances, preds_list):
        prediction, scores, prediction_list = aggregate_preds(preds, args.only_use_topev)
        results.append({
            "actual": instance["label"],
            "predicted": convert_label(prediction, inverse=True),
            "scores": scores,
            "prediction_list": [convert_label(pred, inverse=True)
                                for pred in prediction_list]
        })
    save_jsonl(results, abs_path(args.out_file))
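The batching loop above relies on flatten and reshape helpers that are not shown in this snippet; below is a minimal sketch of the behavior the code appears to assume (not the original implementation).

# Sketch of the assumed helpers (not the original code).
def flatten(list_of_lists):
    # [[a, b], [c]] -> [a, b, c]
    return [item for sublist in list_of_lists for item in sublist]

def reshape(flat_preds, lengths):
    # regroup a flat list of predictions into chunks of the given lengths
    out, start = [], 0
    for n in lengths:
        out.append(flat_preds[start:start + n])
        start += n
    return out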
Example #23
def main():
    cosine_dir = util.abs_path('pickle/2008test/cosine/')
    category_dir = util.abs_path('pickle/2008test/category/')
    run(cosine_dir, category_dir)
    return None
Example #24
def main():
    cosine_dir = util.abs_path('pickle/2008test/cosine/')
    category_dir = util.abs_path('pickle/2008test/category/')
    run(cosine_dir, category_dir)
    return None
def main():
    selected_feature_dir = util.abs_path('pickle/2008test/selected_features/')
    matrix_dir = util.abs_path('pickle/2008test/matrix/')
    run(selected_feature_dir, matrix_dir)
    return None
def main():
    category_dir = util.abs_path('pickle/2008test/louvain_category/')
    result_dir = util.abs_path('pickle/2008test/result/louvain')
    run(category_dir, result_dir)
    return None
Example #27
def main():
    in_path = util.abs_path('data/ckip_test_utf16.tag')
    out_path = util.abs_path('data/crf_test.tag')
    lst = load_data(in_path)
    dump_as_crf_format(lst, out_path)
    return None
Example #28
def main():
    matrix_dir = util.abs_path('pickle/2008test/matrix/')
    svd_matrix_dir = util.abs_path('pickle/2008test/svd_matrix/')
    run(matrix_dir, svd_matrix_dir)
    return None
def main():
    in_path = util.abs_path('data/ckip_train_utf16.tag')
    out_path = util.abs_path('data/crf_train.tag')
    lst = load_data(in_path)
    dump_as_crf_format(lst, out_path)
    return None
Example #30
def main():
    path = util.abs_path('data/crf_train.tag')
    with open(path) as f:
        check_column_num(f)
    return None