Example #1
def load_feature_by_data_set(data_set_path: str, feature_path: str,
                             max_sent_num: int):
    from common.dataset.reader import JSONLineReader
    import numpy as np
    import os
    import pickle

    # Load the precomputed sentence features and the key -> row-index map.
    with open(os.path.join(feature_path, 'feature.p'), 'rb') as f:
        features = pickle.load(f)
    with open(os.path.join(feature_path, 'data_idx_map.p'), 'rb') as f:
        data_idx_map = pickle.load(f)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    feature_dim = features.shape[1]
    padding = np.zeros([feature_dim], np.float32)
    claim_features = []
    evidence_features = []
    for line in lines:
        _id = line['id']
        key = _concat_sent(CLAIM, _id)
        claim_features.append(features[data_idx_map[key]])
        evidence_per_claim_features = []
        for sent in line['predicted_evidence']:
            page, line_num = sent[-2], sent[-1]
            key = _concat_sent(page, line_num)
            evidence_per_claim_features.append(features[data_idx_map[key]])
        # Truncate to max_sent_num evidence sentences, or pad with zero vectors.
        if len(evidence_per_claim_features) > max_sent_num:
            evidence_features.append(
                evidence_per_claim_features[:max_sent_num])
        else:
            for _ in range(max_sent_num - len(evidence_per_claim_features)):
                evidence_per_claim_features.append(padding)
            evidence_features.append(evidence_per_claim_features)
    return np.asarray(claim_features,
                      np.float32), np.asarray(evidence_features, np.float32)
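
A hedged usage sketch for the function above. The file locations and the sentence cap are invented for illustration; the only assumptions are that feature_path holds the pickled feature matrix (feature.p) and key-to-row map (data_idx_map.p), and that the data set is a JSONL file with 'id' and 'predicted_evidence' fields as used above.

# Hypothetical call; paths and max_sent_num are assumptions, not project defaults.
claim_feats, evidence_feats = load_feature_by_data_set(
    data_set_path='data/dev.sentences.jsonl',   # assumed JSONL location
    feature_path='data/features/sent_scores',   # assumed folder with feature.p / data_idx_map.p
    max_sent_num=5)
# claim_feats: (num_claims, feature_dim); evidence_feats: (num_claims, 5, feature_dim)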
Example #2
def number_feature(data_set_path: str, db_path: str, max_sent_num: int):
    from common.dataset.reader import JSONLineReader
    import numpy as np

    db = FeverDocDB(db_path)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    num_feat = np.zeros([len(lines), max_sent_num, 3], dtype=np.int32)
    for i, line in enumerate(lines):
        claim_text = line['claim']
        claim_tokens = tokenize(claim_text)
        # Collect every number that appears in the claim.
        all_nums = set()
        for token in claim_tokens:
            if is_token_numeric(token):
                all_nums.add(float(token))
        for j, evidence in enumerate(line['predicted_evidence']):
            if j >= max_sent_num:
                break
            page, line_num = evidence[-2], evidence[-1]
            all_evidence_nums = []
            evidence_text = evidence_num_to_text(db, page, line_num)
            evidence_tokens = tokenize(evidence_text)
            for token in evidence_tokens:
                if is_token_numeric(token):
                    all_evidence_nums.append(float(token))
            has_num = len(all_evidence_nums) > 0
            has_identical_num = any(n in all_nums for n in all_evidence_nums)
            has_different_num = any(n not in all_nums
                                    for n in all_evidence_nums)
            num_feat[i][j][0], num_feat[i][j][1], num_feat[i][j][2] = \
                _interprete_num_result(has_num, has_identical_num,
                                       has_different_num)
    return num_feat
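
number_feature leaves _interprete_num_result (like tokenize, is_token_numeric and evidence_num_to_text) to the surrounding module. A minimal sketch of that helper, consistent with the three int32 slots it fills above, could simply cast the flags to integers; the real implementation may encode them differently.

def _interprete_num_result(has_num, has_identical_num, has_different_num):
    # Sketch only: map the three boolean flags onto the three integer
    # feature slots written by number_feature above.
    return int(has_num), int(has_identical_num), int(has_different_num)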
Example #3
def generate_submission(_predictions, _ids, test_set_path, submission_path):
    """
    Generate a submission file for the FEVER shared task: http://fever.ai/task.html
    :param _predictions: predicted class indices
    :param _ids: claim ids aligned with _predictions
    :param test_set_path: path to the test set (JSONL)
    :param submission_path: path the submission file is written to (JSONL)
    :return: None
    """
    from common.dataset.reader import JSONLineReader
    from tqdm import tqdm
    import json
    import os
    _predictions_with_id = list(zip(_ids, _predictions))
    jlr = JSONLineReader()
    json_lines = jlr.read(test_set_path)
    os.makedirs(os.path.dirname(os.path.abspath(submission_path)), exist_ok=True)
    with open(submission_path, 'w') as f:
        for line in tqdm(json_lines):
            for i, evidence in enumerate(line['predicted_evidence']):
                line['predicted_evidence'][i][0] = normalize(evidence[0])
            _id = line['id']
            # Default to label index 2 if no prediction exists for this claim id.
            _pred_label = prediction_2_label(2)
            for _pid, _plabel in _predictions_with_id:
                if _pid == _id:
                    _pred_label = prediction_2_label(_plabel)
                    break
            obj = {"id": _id,"predicted_label": _pred_label,"predicted_evidence": line['predicted_evidence']}
            f.write(json.dumps(obj))
            f.write('\n')
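
The submission writer above relies on prediction_2_label to turn a class index into one of the three FEVER labels (SUPPORTS, REFUTES, NOT ENOUGH INFO). The helper is not shown on this page; the sketch below assumes a label order in which index 2, the fallback used above, maps to NOT ENOUGH INFO, which may differ from the project's actual mapping.

# Assumed label order; the project's real mapping may differ.
FEVER_LABELS = ['SUPPORTS', 'REFUTES', 'NOT ENOUGH INFO']

def prediction_2_label(prediction):
    return FEVER_LABELS[int(prediction)]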
Example #4
def main(db_file, k_wiki, in_file, out_file, add_claim=True, parallel=True):
    # tfidf_path = "data/index/fever-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz"
    method = Doc_Retrieval(database_path=db_file, add_claim=add_claim, k_wiki_results=k_wiki)
    processed = dict()
    path = os.getcwd()
    jlr = JSONLineReader()
    lines = jlr.read(os.path.join(path, in_file))
    if os.path.isfile(os.path.join(path, in_file + ".progress")):
        with open(os.path.join(path, in_file + ".progress"), 'rb') as f_progress:
            import pickle
            progress = pickle.load(f_progress)
            print(os.path.join(path, in_file + ".progress") + " exists. Loading it as the progress file.")
    else:
        progress = dict()

    try:
        with ThreadPool(processes=4 if parallel else None) as p:
            for line in tqdm(
                    get_map_function(parallel, p)(lambda l: process_line_with_progress(method, l, progress), lines),
                    total=len(lines)):
                processed[line['id']] = line
                progress[line['id']] = line
                # time.sleep(0.5)
        with open(os.path.join(path, out_file), "w+") as f2:
            for line in lines:
                f2.write(json.dumps(processed[line['id']]) + "\n")
    finally:
        # Always persist progress so an interrupted run can resume later.
        with open(os.path.join(path, in_file + ".progress"), 'wb') as f_progress:
            import pickle
            pickle.dump(progress, f_progress, pickle.HIGHEST_PROTOCOL)
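
get_map_function and process_line_with_progress are project helpers that are not part of this example. A plausible sketch of the former, matching the way it is called above (it must return a map-like callable, backed by the thread pool when parallel is set):

def get_map_function(parallel, p):
    # Sketch only: lazily map over the pool when running in parallel,
    # otherwise fall back to the builtin map.
    return p.imap_unordered if parallel else map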
Example #5
def generate_submission(_predictions, test_set_path, submission_path):
    """
    Generate a submission file for the FEVER shared task: http://fever.ai/task.html
    :param _predictions: predicted class indices, one per test-set line
    :param test_set_path: path to the test set (JSONL)
    :param submission_path: path the submission file is written to (JSONL)
    :return: None
    """
    jlr = JSONLineReader()
    json_lines = jlr.read(test_set_path)
    with open(submission_path, 'w') as f:
        for _prediction, line in tqdm(zip(_predictions, json_lines)):
            for i, evidence in enumerate(line['predicted_evidence']):
                line['predicted_evidence'][i][0] = normalize(evidence[0])
            obj = {
                "id": line['id'],
                "predicted_evidence": line['predicted_evidence'],
                "predicted_label": prediction_2_label(_prediction)
            }
            f.write(json.dumps(obj))
            f.write('\n')
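
A hedged example of calling this second variant; the prediction list and both paths are invented for illustration and assume one class index per test-set line, in file order.

# Hypothetical usage; real predictions come from the RTE model.
predictions = [0, 2, 1]
generate_submission(predictions,
                    test_set_path='data/fever/shared_task_test.jsonl',  # assumed path
                    submission_path='out/submission.jsonl')             # assumed path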
Example #6
    # ... (the opening of this helper was lost in extraction; the visible tail
    # truncates another field of _line and its scores to args.max_evidence) ...
    _line['scores'] = _line['scores'][:args.max_evidence]
    return _line


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='/path/to/input/file')
    parser.add_argument('output', help='/path/to/output/file')
    parser.add_argument('--max_evidence',
                        help='max num of evidences',
                        type=int,
                        default=5)
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("replace_noise_dataset")
    random.seed(55)
    jlr = JSONLineReader()
    lines = jlr.read(args.input)
    counter = 0
    with open(args.output, 'w') as f:
        for i, line in tqdm(enumerate(lines)):
            if line['label'] != 'NOT ENOUGH INFO' \
                    and not is_gold_evidence_predicted(line):
                counter += 1
                logger.info("line " + str(i + 1) + " should be filled")
                line = random_fill_gold_evidence(line)
            f.write(json.dumps(line) + '\n')
    logger.info(str(counter) + " samples filled with gold evidence")
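
is_gold_evidence_predicted and random_fill_gold_evidence are project helpers not shown here. The sketch below is only one plausible reading of the former's name, assuming FEVER scoring semantics (a claim is covered once a complete gold evidence group appears in the predictions) and the evidence layouts used in the earlier examples (page and line number at positions 2 and 3 of a gold annotation, and at the last two positions of a predicted sentence).

def is_gold_evidence_predicted(line):
    # Sketch only; the project's actual check may differ.
    predicted = {(sent[-2], sent[-1]) for sent in line['predicted_evidence']}
    return any(
        all((ev[2], ev[3]) in predicted for ev in group)
        for group in line['evidence'])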
Example #7
import argparse
import json
import os

from sklearn.model_selection import train_test_split

from common.dataset.reader import JSONLineReader

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', help='/path/to/jsonl/file', required=True)
    parser.add_argument('--out', help='/path/to/output/folder', default="data/rte_cor")
    parser.add_argument('--split-rate', type=float, help='split rate for test set', default=0.2)
    args = parser.parse_args()
    jlr = JSONLineReader()
    all_data = jlr.read(args.file)
    train, test = train_test_split(all_data, test_size=args.split_rate, random_state=55)
    os.makedirs(args.out, exist_ok=True)
    train_path = os.path.join(args.out, "train.jsonl")
    test_path = os.path.join(args.out, "test.jsonl")
    with open(train_path, "w") as train_file:
        for item in train:
            train_file.write(json.dumps(item) + "\n")
    with open(test_path, "w") as test_file:
        for item in test:
            test_file.write(json.dumps(item) + "\n")


'''
--file '/home/hanselowski/workspace/athene-fever/snopes-fever/data/snopes-data/snopes.claim.jsonl'
--out '/home/hanselowski/workspace/athene-fever/snopes-fever/data/snopes-data/data-out'
'''
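
Every example on this page goes through JSONLineReader from common.dataset.reader, whose implementation is not included. A minimal stand-in, assuming read() simply parses one JSON object per non-empty line:

import json

class JSONLineReader:
    # Minimal sketch; the project's reader may add error handling or encoding options.
    def read(self, path):
        with open(path, 'r') as f:
            return [json.loads(line) for line in f if line.strip()]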
Example #8
import argparse
import json

from tqdm import tqdm

from common.dataset.reader import JSONLineReader
# FeverDocDB, SimpleRandom and uninformative come from the project's own
# modules and are not shown in this example.

parser = argparse.ArgumentParser()
parser.add_argument('db_path', type=str, help='/path/to/fever.db')

args = parser.parse_args()

jlr = JSONLineReader()

docdb = FeverDocDB(args.db_path)

idx = docdb.get_non_empty_doc_ids()
idx = list(filter(lambda item: not uninformative(item), tqdm(idx)))

r = SimpleRandom.get_instance()

with open("data/fever/test.ns.rand.jsonl", "w+") as f:
    for line in jlr.read("data/fever-data/test.jsonl"):
        if line["label"] == "NOT ENOUGH INFO":

            for evidence_group in line['evidence']:
                for evidence in evidence_group:
                    evidence[2] = idx[r.next_rand(0, len(idx))]
                    evidence[3] = -1

        f.write(json.dumps(line) + "\n")

with open("data/fever/dev.ns.rand.jsonl", "w+") as f:
    for line in jlr.read("data/fever-data/dev.jsonl"):
        if line["label"] == "NOT ENOUGH INFO":
            for evidence_group in line['evidence']:
                for evidence in evidence_group:
                    evidence[2] = idx[r.next_rand(0, len(idx))]
Example #9
import argparse
import json
import os

from common.dataset.reader import JSONLineReader

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--submission',
                        help='/path/to/submission/file',
                        required=True)
    parser.add_argument('--data', help='/path/to/data/file', required=True)
    parser.add_argument('--output', help='/path/to/output/file', required=True)
    args = parser.parse_args()
    jlr = JSONLineReader()
    submission_lines = jlr.read(args.submission)
    data_lines = jlr.read(args.data)
    assert len(submission_lines) == len(
        data_lines), "lengths of submission and data set are different!"
    submission_dict = {}
    for line in submission_lines:
        submission_dict[line['id']] = line
    assert len(submission_dict) == len(
        submission_lines), "lines in submission are not unique!"
    sorted_lines = []
    for d in data_lines:
        sorted_lines.append(submission_dict[d['id']])
    assert len(sorted_lines) == len(
        data_lines), "some claims from data set are missing in submission!"
    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    with open(args.output, 'w') as f:
        for line in sorted_lines:
            f.write(json.dumps(line) + '\n')
Example #10
    if len(gold_sents) > 0:
        logger.warn(
            str(len(gold_sents)) +
            " gold sentences cannot be filled into prediction")
    return selected_sents


if __name__ == '__main__':
    LogHelper.setup()
    logger = LogHelper.get_logger('fill_gold_sentences')
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help='/path/to/input/file', required=True)
    parser.add_argument('--output', help='/path/to/output/file', required=True)
    parser.add_argument('--max-sent',
                        type=int,
                        help='Maximal number of sentences per claim',
                        default=10)
    args = parser.parse_args()
    jlr = JSONLineReader()
    data = jlr.read(args.input)
    with open(args.output, "w+") as output_file:
        for data in tqdm(data):
            if data['verifiable'] != 'NOT VERIFIABLE':
                pred_sents = data['predicted_sentences']
                gold_evidences = data['evidence']
                gold_sents = _sents_from_evidences(gold_evidences)
                filled_pred_sents = _fill_pred_sents_with_gold(
                    pred_sents, gold_sents, args.max_sent)
                data['predicted_sentences'] = filled_pred_sents
            output_file.write(json.dumps(data) + "\n")
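
_sents_from_evidences and _fill_pred_sents_with_gold are defined earlier in this script and are only partially visible above. A sketch of the former, assuming the usual FEVER gold-evidence layout [annotation_id, evidence_id, page, line_number]:

def _sents_from_evidences(evidences):
    # Sketch only: collect the (page, line_number) pair of every gold
    # evidence sentence across all annotation groups.
    sents = set()
    for evidence_group in evidences:
        for evidence in evidence_group:
            page, line_num = evidence[2], evidence[3]
            sents.add((page, line_num))
    return sents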