Example #1
def prediction_processing(dataset_path, predictions):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system.
    :param dataset_path:
    :param predictions:
    :return:
    """

    final_predictions = []
    jsr = JSONLineReader()

    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
        #
        # lines = lines[:100]

        for idx, line in enumerate(lines):
            if len(line['predicted_pages']) == 0:
                line['predicted_evidence'] = []
            else:
                line['predicted_evidence'] = [[
                    prediction[0], int(prediction[1])
                ] for prediction in predictions[idx]]
            # placeholder label; this step only fills in the predicted evidence
            line['predicted_label'] = "REFUTES"
            final_predictions.append(line)

    return final_predictions
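A minimal usage sketch for the function above, assuming a FEVER-style JSONL dev file and one list of (doc_id, sent_id) pairs per claim; the paths and page names below are placeholders:

import json

predictions = [[("Some_Wikipedia_Page", 3), ("Another_Page", 0)]]  # one list per claim
final = prediction_processing("data/fever/dev.p5.jsonl", predictions)
with open("predicted_evidence.jsonl", "w") as out:
    for entry in final:
        out.write(json.dumps(entry) + "\n")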
Example #2
def number_feature(data_set_path: str, db_path: str, max_sent_num: int):
    from common.dataset.reader import JSONLineReader
    import numpy as np
    db = FeverDocDB(db_path)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    num_feat = np.zeros([len(lines), max_sent_num, 3], dtype=np.int32)
    for i, line in enumerate(lines):
        claim_text = line['claim']
        claim_tokens = tokenize(claim_text)
        all_nums = set()
        for token in claim_tokens:
            if is_token_numeric(token):
                all_nums.add(float(token))
        for j, evidence in enumerate(line['predicted_evidence']):
            if j >= max_sent_num:
                break
            page, line_num = evidence[-2], evidence[-1]
            all_evidence_nums = []
            evidence_text = evidence_num_to_text(db, page, line_num)
            evidence_tokens = tokenize(evidence_text)
            for token in evidence_tokens:
                if is_token_numeric(token):
                    all_evidence_nums.append(float(token))
            has_num = len(all_evidence_nums) > 0
            has_identical_num = any(n in all_nums for n in all_evidence_nums)
            has_different_num = any(n not in all_nums
                                    for n in all_evidence_nums)
            num_feat[i][j][0], num_feat[i][j][1], num_feat[i][j][2] = \
                _interprete_num_result(has_num, has_identical_num,
                                       has_different_num)
    return num_feat
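_interprete_num_result is not shown here; a minimal sketch of such a helper, assuming it only maps the three boolean flags to 0/1 indicator features (the project's actual implementation may encode them differently):

def _interprete_num_result(has_num, has_identical_num, has_different_num):
    # Hypothetical helper: turn the three flags into integer indicators
    # matching the last dimension of num_feat.
    return int(has_num), int(has_identical_num), int(has_different_num)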
Example #3
    def sampling(self, datapath, num_sample=1):

        jlr = JSONLineReader()

        X = []
        count = 0
        with open(datapath, "r") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                count += 1
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                neg_sents = []
                claim = line['claim']

                pos_set = set()
                for evidence_set in line['evidence']:
                    pos_sent = self.get_whole_evidence(evidence_set, self.db)
                    if pos_sent in pos_set:
                        continue
                    pos_set.add(pos_sent)

                p_lines = []
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]

                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if (doc_line[1], doc_line[2]) not in evidence_set:
                        neg_sents.append(doc_line[0])

                num_sampling = num_sample
                if len(neg_sents) < num_sampling:
                    num_sampling = len(neg_sents)
                    # print(neg_sents)
                if num_sampling == 0:
                    continue
                else:
                    for pos_sent in pos_set:
                        samples = random.sample(neg_sents, num_sampling)
                        for sample in samples:
                            if not sample:
                                continue
                            X.append((claim, pos_sent, sample))
                            if count % 1000 == 0:
                                print(
                                    "claim:{} ,evidence :{} sample:{}".format(
                                        claim, pos_sent, sample))
        return X
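The (claim, positive, negative) triples returned above are the usual input to a pairwise ranking objective; a self-contained sketch of such a loss, assuming the model scores claim/sentence pairs (this is not necessarily the project's own loss):

import numpy as np

def pairwise_hinge_loss(pos_scores, neg_scores, margin=1.0):
    # Hypothetical objective: the positive sentence should outscore the
    # sampled negative sentence for the same claim by at least the margin.
    pos_scores = np.asarray(pos_scores, dtype=np.float32)
    neg_scores = np.asarray(neg_scores, dtype=np.float32)
    return float(np.maximum(0.0, margin - (pos_scores - neg_scores)).mean())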
Example #4
def generate_submission(_predictions, _ids, test_set_path, submission_path):
    """
    Generate submission file for shared task: http://fever.ai/task.html
    :param _predictions:
    :param _ids:
    :param test_set_path:
    :param submission_path:
    :return:
    """
    from common.dataset.reader import JSONLineReader
    from tqdm import tqdm
    import json
    _predictions_with_id = list(zip(_ids, _predictions))
    jlr = JSONLineReader()
    json_lines = jlr.read(test_set_path)
    os.makedirs(os.path.dirname(os.path.abspath(submission_path)), exist_ok=True)
    with open(submission_path, 'w') as f:
        for line in tqdm(json_lines):
            for i, evidence in enumerate(line['predicted_evidence']):
                line['predicted_evidence'][i][0] = normalize(evidence[0])
            _id = line['id']
            # default to the label for class index 2 when no prediction
            # matches this id
            _pred_label = prediction_2_label(2)
            for _pid, _plabel in _predictions_with_id:
                if _pid == _id:
                    _pred_label = prediction_2_label(_plabel)
                    break
            obj = {
                "id": _id,
                "predicted_label": _pred_label,
                "predicted_evidence": line['predicted_evidence']
            }
            f.write(json.dumps(obj))
            f.write('\n')
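The per-line scan over _predictions_with_id is quadratic in the number of claims; a sketch of the same lookup through a dictionary, which is equivalent as long as the ids are unique:

def build_label_lookup(_ids, _predictions, default_class=2):
    # Hypothetical helper: map claim id -> predicted class once, so each line
    # needs only a dictionary lookup instead of a scan over the zipped list.
    lookup = dict(zip(_ids, _predictions))
    return lambda claim_id: prediction_2_label(lookup.get(claim_id, default_class))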
Example #5
def main(db_file, k_wiki, in_file, out_file, add_claim=True, parallel=True):
    # Retrieve candidate wiki pages for every claim in in_file; progress is
    # checkpointed to "<in_file>.progress" so an interrupted run can resume.
    # tfidf_path = "data/index/fever-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz"
    method = Doc_Retrieval(database_path=db_file, add_claim=add_claim, k_wiki_results=k_wiki)
    processed = dict()
    path = os.getcwd()
    jlr = JSONLineReader()
    lines = jlr.read(os.path.join(path, in_file))
    if os.path.isfile(os.path.join(path, in_file + ".progress")):
        with open(os.path.join(path, in_file + ".progress"), 'rb') as f_progress:
            import pickle
            progress = pickle.load(f_progress)
            print(os.path.join(path, in_file + ".progress") + " exists. Load it as progress file.")
    else:
        progress = dict()

    try:
        with ThreadPool(processes=4 if parallel else None) as p:
            for line in tqdm(
                    get_map_function(parallel, p)(lambda l: process_line_with_progress(method, l, progress), lines),
                    total=len(lines)):
                processed[line['id']] = line
                progress[line['id']] = line
                # time.sleep(0.5)
        with open(os.path.join(path, out_file), "w+") as f2:
            for line in lines:
                f2.write(json.dumps(processed[line['id']]) + "\n")
    finally:
        with open(os.path.join(path, in_file + ".progress"), 'wb') as f_progress:
            import pickle
            pickle.dump(progress, f_progress, pickle.HIGHEST_PROTOCOL)
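A call sketch for the retrieval entry point above, assuming a FEVER SQLite dump and a JSONL claim file relative to the working directory; the paths and the k_wiki value are placeholders:

main("data/fever/fever.db", 7, "data/fever-data/dev.jsonl",
     "data/fever/dev.wiki.jsonl", add_claim=True, parallel=True)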
Example #6
def load_feature_by_data_set(data_set_path: str, feature_path: str,
                             max_sent_num: int):
    from common.dataset.reader import JSONLineReader
    import pickle
    import os
    import numpy as np
    with open(os.path.join(feature_path, 'feature.p'), 'rb') as f:
        features = pickle.load(f)
    with open(os.path.join(feature_path, 'data_idx_map.p'), 'rb') as f:
        data_idx_map = pickle.load(f)
    jlr = JSONLineReader()
    lines = jlr.read(data_set_path)
    feature_dim = features.shape[1]
    padding = np.zeros([feature_dim], np.float32)
    claim_features = []
    evidence_features = []
    for line in lines:
        _id = line['id']
        key = _concat_sent(CLAIM, _id)
        claim_features.append(features[data_idx_map[key]])
        evidence_per_claim_features = []
        for sent in line['predicted_evidence']:
            page, line_num = sent[-2], sent[-1]
            key = _concat_sent(page, line_num)
            evidence_per_claim_features.append(features[data_idx_map[key]])
        if len(evidence_per_claim_features) > max_sent_num:
            evidence_features.append(
                evidence_per_claim_features[:max_sent_num])
        else:
            for _ in range(max_sent_num - len(evidence_per_claim_features)):
                evidence_per_claim_features.append(padding)
            evidence_features.append(evidence_per_claim_features)
    return np.asarray(claim_features,
                      np.float32), np.asarray(evidence_features, np.float32)
Example #7
def train_dev_split(train_datapath, split_rate):

    with open(train_datapath, "r") as f:
        jlr = JSONLineReader()
        lines = jlr.process(f)
        random.shuffle(lines)

        dev_lines = lines[:int(len(lines) * split_rate)]
        train_lines = lines[int(len(lines) * split_rate):]
    return train_lines, dev_lines
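A usage sketch, assuming a JSONL training file; seeding the RNG first makes the shuffled split reproducible:

import random

random.seed(55)  # hypothetical seed, only for reproducibility
train_lines, dev_lines = train_dev_split("data/fever-data/train.jsonl",
                                         split_rate=0.1)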
Example #8
def load_words(embedding_file, train_datapath, test_path, db_filename,
               num_sample, sampled_path):

    words = set()

    def _insert(iterable):
        for w in iterable:
            w = Dictionary.normalize(w)
            if valid_words and w not in valid_words:
                continue
            words.add(w)

    valid_words = index_embedding_words(embedding_file)

    X_claim, X_sents, y = load_generate_samples(db_filename, train_datapath,
                                                num_sample, sampled_path)
    X_claim = set(X_claim)
    for claim in X_claim:
        # use distinct names so the accumulating words set used by _insert
        # is not rebound to a token list
        claim_tokens = nltk.word_tokenize(claim)
        _insert(claim_tokens)

    for sent in X_sents:
        sent_tokens = simple_tokenizer(sent)
        _insert(sent_tokens)

    with open(test_path, "r") as f:
        jlr = JSONLineReader()
        db = FeverDocDB(db_filename)

        lines = jlr.process(f)
        for line in lines:
            claim = line['claim']
            claim_tokens = nltk.word_tokenize(claim)
            _insert(claim_tokens)
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                doc_lines = [doc_line for doc_line in doc_lines if doc_line]
                for doc_line in doc_lines:
                    doc_tokens = simple_tokenizer(doc_line)
                    _insert(doc_tokens)
    return words
Example #9
def test_data_4_siamese(db_filename, dataset_path):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    X_claims = []
    X_sents = []
    all_sents_id = []

    with open(dataset_path, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]

        for line in tqdm(lines):
            claims = []
            sents = []
            sents_indexes = []
            p_lines = []
            claim = line['claim']
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                p_lines.extend(
                    zip(doc_lines, [page] * len(doc_lines),
                        range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                else:
                    claims.append(claim)
                    sents.append(doc_line[0])
                    sents_indexes.append((doc_line[1], doc_line[2]))
            X_claims.append(claims)
            X_sents.append(sents)
            all_sents_id.append(sents_indexes)
    # print(len(X_claims))
    # print(len(X_sents))
    # print(len(all_sents_id))
    # X_claims_indexes, X_sents_indexes = [], []
    # for idx, claims in enumerate(X_claims):
    #     claims_index, sents_index = data_transformer(claims, X_sents[idx], word_dict)
    #     X_claims_indexes.append(claims_index)
    #     X_sents_indexes.append(sents_index)

    return X_claims, X_sents, all_sents_id
Example #10
def test_data(db_path, dataset_path, type="ranking"):
    """
    generate dev examples to feed into the classifier
    :param db_path:
    :param dataset_path:
    :param type:
    :return:
    """

    with open(db_path) as f:
        db = json.load(f)
        jsr = JSONLineReader()
        inputs = []
        X_claim = []
        X_sents = []
        indexes = []
        with open(dataset_path, "r") as f:
            lines = jsr.process(f)

            for line in tqdm(lines):
                valid_lines = []
                claims = []
                sents_indexes = []
                claim = line['claim']

                doc = line['predicted_evidence']
                # doc = line['evidence']
                for doc_line in doc:
                    if not doc_line:
                        continue
                    else:
                        # print(doc_line[0])
                        if type == "cos":
                            sents_indexes.append(doc_line)
                            valid_lines.append(
                                get_whole_evidence([doc_line], db))
                            claims.append(claim)
                        elif type == "ranking":
                            sents_indexes.append((doc_line[0], doc_line[1]))
                            valid_lines.append(
                                (claim, get_whole_evidence([doc_line], db)))
                if type == "cos":
                    X_sents.append(valid_lines)
                    X_claim.append(claims)
                elif type == "ranking":
                    inputs.append(valid_lines)
                indexes.append(sents_indexes)
            if type == "cos":
                # only the "cos" branch fills X_claim/X_sents; zipping them
                # unconditionally would overwrite the "ranking" inputs
                inputs = list(zip(X_claim, X_sents))
            return inputs, indexes
Example #11
def test_data_loader(save_path, db_filename=None, data_path=None):
    if os.path.exists(save_path):
        with open(save_path, 'rb') as f:
            X = pickle.load(f)
            claims, list_sents, sents_indexes = zip(*X)
    else:
        with open(data_path, "rb") as f:
            jlr = JSONLineReader()
            lines = jlr.process(f)
        claims, list_sents, sents_indexes = test_processing(db_filename, lines)
        # materialize the zip so it can be pickled (a zip object cannot be
        # pickled in Python 3)
        X = list(zip(claims, list_sents, sents_indexes))
        with open(save_path, 'wb') as f:
            pickle.dump(X, f)
    return claims, list_sents, sents_indexes
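A usage sketch with placeholder paths: the first call processes the dev file and caches the result as a pickle, later calls with the same save_path just reload it:

claims, list_sents, sents_indexes = test_data_loader(
    "data/fever/dev.processed.p",
    db_filename="data/fever/fever.db",
    data_path="data/fever/dev.p5.jsonl")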
Example #12
    def sampling(self, datapath, num_sample=1):

        jlr = JSONLineReader()

        X = []
        count = 0
        with open(datapath, "r") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                count += 1
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                claim = line['claim']

                pos_set = set()
                pos_set_ref = []
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(self.get_whole_evidence([evidence_sentence], self.db))
                        pos_set_ref.append(evidence_sentence)

                neg_sents = []
                for neg_evidence in line['predicted_evidence']:
                    # if neg_evidence not in evidence_set: 
                    if neg_evidence not in pos_set_ref: 
                        neg_sents.append(self.get_whole_evidence([neg_evidence], self.db))

                num_sampling = num_sample     
                if len(neg_sents) < num_sampling:   
                    num_sampling = len(neg_sents)
                    # print(neg_sents)
                if num_sampling == 0:
                    continue
                else:
                    for pos_sent in pos_set:
                        samples = random.sample(neg_sents, num_sampling)
                        for sample in samples:
                            if not sample:
                                continue
                            X.append((claim, pos_sent, sample))
                            if count % 1000 == 0:
                                print("claim:{} ,evidence :{} sample:{}".format(claim, pos_sent, sample))
        return X
Example #13
def prediction_processing(dataset_path, predictions):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system.
    :param dataset_path:
    :param predictions:
    :return:
    """

    final_predictions = []
    jsr = JSONLineReader()

    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
        # capture the helper's result, otherwise an empty list is returned
        final_predictions = prediction_processing_no_reload(lines, predictions)

    return final_predictions
Example #14
def prediction_processing(dataset_path, predictions, db_filename):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system
    and collect the gold and predicted evidence text for error analysis.
    :param dataset_path:
    :param predictions:
    :param db_filename:
    :return:
    """

    final_predictions = []
    jsr = JSONLineReader()
    with open(db_filename) as f: 
        db = json.load(f)
        
    out_error_ana = []   
    with open(dataset_path, "r") as f:
        lines = jsr.process(f)

        cnt = 0
        for line in lines:
            
            pos_set_ref = line['evidence']
            if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                continue

            line['predicted_evidence'] = [[prediction[0], int(prediction[1])] for prediction in predictions[cnt]]
            cnt_gold = 0
            out_error_ana.append("Claim: "+str(cnt))
            out_error_ana.append(line['claim'])
            out_error_ana.append("Gold evidence:")
            for evidence_set in line['evidence']:
                for evidence_sentence in evidence_set:
                    out_error_ana.append(get_whole_evidence([evidence_sentence], db))
                    cnt_gold += 1
                    
            out_error_ana.append("Predicted evidence:")
            for evidence_set in line['predicted_evidence'][:cnt_gold]:
                out_error_ana.append(get_whole_evidence([evidence_set], db))
            out_error_ana.append("")

            line['predicted_label'] = "refutes"
            final_predictions.append(line)
            cnt += 1
            if cnt == len(predictions):
                break

    return final_predictions, out_error_ana
Example #15
    def sampling(self, datapath, num_sample=1):

        jlr = JSONLineReader()
        ret = []
        print("sampling for " + datapath)
        with open(datapath, "r") as f:
            lines = jlr.process(f)
            print(len(lines))
            with ThreadPool(processes=48) as p:
                for line in tqdm(p.imap(lambda x: self.handle(x, num_sample),
                                        lines),
                                 total=len(lines)):
                    if line is not None:
                        ret.extend(line)

        print("Done")

        return ret
Example #16
    def dev_processing(self, data_path):

        jlr = JSONLineReader()

        with open(data_path, "r") as f:
            lines = jlr.process(f)

            devs = []
            labels = []
            for line in tqdm(lines):

                dev = []
                label = []
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]
                for page, num in evidence_set:
                    pages.append(page)
                pages = set(pages)

                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((line['claim'], doc_line[0]))
                    if (doc_line[1], doc_line[2]) in evidence_set:
                        label.append(1)
                    else:
                        label.append(0)
                if len(dev) == 0 or len(label) == 0:
                    continue
                devs.append(dev)
                labels.append(label)
        return devs, labels
Example #17
def cos_train(db_filepath, dataset_path):
    """
    Use the cosine similarity score to rank (claim, sentence) pairs in the dev set;
    no training data is needed.
    :param db_filepath:
    :param dataset_path:
    :return:
    """

    with open(db_filepath) as f:
        db = json.load(f)
        jlr = JSONLineReader()

        X = []
        y = []
        with open(dataset_path, "r") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                if line['label'] == "NOT ENOUGH INFO":
                    continue

                #label, dev = [], []
                pos_set_ref = line['evidence']
                if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                    continue

                pos_set = set()
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(get_whole_evidence([evidence_sentence],
                                                       db))

                for evidence_sentence_ref in line['predicted_evidence']:
                    evidence_sentence = get_whole_evidence(
                        [evidence_sentence_ref], db)
                    X.append((line['claim'], evidence_sentence))
                    if evidence_sentence in pos_set:
                        y.append(1)
                    else:
                        y.append(0)

        return X, y
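The cosine score itself is computed elsewhere; a standalone sketch of ranking the (claim, sentence) pairs returned above with TF-IDF cosine similarity (scikit-learn based, not necessarily the project's scorer):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_scores(pairs):
    # Score each (claim, sentence) pair; a higher score means more similar.
    claims, sents = zip(*pairs)
    vectorizer = TfidfVectorizer().fit(list(claims) + list(sents))
    claim_vecs = vectorizer.transform(claims)
    sent_vecs = vectorizer.transform(sents)
    return [float(cosine_similarity(claim_vecs[i], sent_vecs[i])[0, 0])
            for i in range(len(pairs))]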
Example #18
    def data_processing_for_joint(self, data_path):
        from athene.rte.utils.data_reader import label_dict
        jlr = JSONLineReader()

        with open(data_path, "r") as f:
            lines = jlr.process(f)

            datas = []
            sent_labels = []
            claim_labels = []
            for line in tqdm(lines):
                claim_labels.append(label_dict.index(line['label']))
                data = []
                sent_label = []
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]
                for page, num in evidence_set:
                    pages.append(page)
                pages = set(pages)

                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    data.append((line['claim'], doc_line[0]))
                    if (doc_line[1], doc_line[2]) in evidence_set:
                        sent_label.append(1)
                    else:
                        sent_label.append(0)
                if len(data) == 0 or len(sent_label) == 0:
                    continue
                datas.append(data)
                sent_labels.append(sent_label)
        return datas, sent_labels, claim_labels
Example #19
def predict_processing(db_path, dataset_path):

    with open(db_path) as f:
        db = json.load(f)
        jlr = JSONLineReader()

        devs = []
        all_indexes = []

        with open(dataset_path, "rb") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                dev = []
                indexes = []
                claim = line['claim']

                ##########################
                pos_set_ref = line['evidence']
                if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                    continue

                pos_set = set()
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(get_whole_evidence([evidence_sentence],
                                                       db))

                for evidence_sentence_ref in line['predicted_evidence']:
                    evidence_sentence = get_whole_evidence(
                        [evidence_sentence_ref], db)
                    dev.append((line['claim'], evidence_sentence))
                    indexes.append(evidence_sentence_ref)
                ##########################

                if len(dev) == 0:
                    dev.append((claim, 'no evidence for this claim'))
                    indexes.append(('empty', 0))

                devs.append(dev)
                all_indexes.append(indexes)
        return devs, all_indexes
Example #20
def generate_submission(_predictions, test_set_path, submission_path):
    """
    Generate submission file for shared task: http://fever.ai/task.html
    :param _predictions:
    :param test_set_path:
    :param submission_path:
    :return:
    """
    jlr = JSONLineReader()
    json_lines = jlr.read(test_set_path)
    with open(submission_path, 'w') as f:
        for _prediction, line in tqdm(zip(_predictions, json_lines)):
            for i, evidence in enumerate(line['predicted_evidence']):
                line['predicted_evidence'][i][0] = normalize(evidence[0])
            obj = {
                "id": line['id'],
                "predicted_evidence": line['predicted_evidence'],
                "predicted_label": prediction_2_label(_prediction)
            }
            f.write(json.dumps(obj))
            f.write('\n')
Example #21
    def dev_processing(self, data_path):

        jlr = JSONLineReader()

        with open(data_path,"r") as f:
            lines = jlr.process(f)

            devs = []
            labels = []
            for line in tqdm(lines):

#                 if line['label'].upper() == "NOT ENOUGH INFO":
#                     continue
                
                label, dev = [], []
                pos_set_ref = line['evidence']
                if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                    continue
                
                pos_set = set()
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(self.get_whole_evidence([evidence_sentence], self.db))
                
                for evidence_sentence_ref in line['predicted_evidence']:
                    evidence_sentence = self.get_whole_evidence([evidence_sentence_ref], self.db)
                    dev.append((line['claim'], evidence_sentence))
                    
                    if evidence_sentence in pos_set:
                        label.append(1)
                    else:
                        label.append(0)
                        
                        
                if len(dev) == 0 or len(label) == 0:
                    continue
                devs.append(dev)
                labels.append(label)

        return devs,labels
Example #22
def generate_prediction_files(predictions, p_sents_indexes, data_path, final_prediction_path):
    """
    Transform the classifier's predictions into the list-of-dicts format expected by the scoring system.
    :param predictions:
    :param p_sents_indexes:
    :param data_path:
    :param final_prediction_path:
    :return:
    """
    jlr = JSONLineReader()


    final_predictions = []
    with open(data_path,"r") as f:
        lines = jlr.process(f)

        print(len(predictions))
        print(len(p_sents_indexes))
        print(len(lines))
        assert len(predictions) == len(p_sents_indexes) == len(lines)
        for idx,line in enumerate(lines):

            line['predicted_evidence'] = []
            line['predicted_label'] = 'refutes'
            predicted_sents = predictions[idx]
            sents_indexes = p_sents_indexes[idx]
            for i in range(len(sents_indexes)):
                if predicted_sents[i] == 1:
                    line['predicted_evidence'].append([sents_indexes[i][0],sents_indexes[i][1]])

            final_predictions.append(line)

    with open(final_prediction_path,"w") as f:

        for prediction in final_predictions:
            f.write(json.dumps(prediction)+'\n')

    return final_predictions
Example #23
    def __init__(self,
                 db: FeverDocDB,
                 wiki_tokenizer: Tokenizer = None,
                 claim_tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
        self._claim_tokenizer = claim_tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

        self.db = db

        self.formatter = FEVERSentenceFormatter(set(self.db.get_doc_ids()), FEVERLabelSchema())
        self.reader = JSONLineReader()
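An instantiation sketch; the enclosing class is not shown above, so the FEVERSentenceReader name and the database path are hypothetical:

db = FeverDocDB("data/fever/fever.db")
reader = FEVERSentenceReader(db)  # tokenizers and indexers fall back to the defaults above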
Example #24
    def predict_processing(self, datapath):

        jlr = JSONLineReader()

        devs = []
        all_indexes = []

        with open(datapath, "rb") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                dev = []
                indexes = []
                pages = set()
                # pages = line['predicted_pages']
                pages.update(page for page in line['predicted_pages'])
                # if len(pages) == 0:
                #     pages.add("Michael_Hutchence")
                claim = line['claim']
                p_lines = []
                # split each predicted page into its individual sentences
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    if not doc_lines:
                        continue
                    p_lines.extend(self.get_valid_texts(doc_lines, page))

                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((claim, doc_line[0]))
                    indexes.append((doc_line[1], doc_line[2]))
                # print(len(dev))
                if len(dev) == 0:
                    dev.append((claim, 'no evidence for this claim'))
                    indexes.append(('empty', 0))
                devs.append(dev)
                all_indexes.append(indexes)
        return devs, all_indexes
Example #25
def tfidf_test_processing(base_path, dbfilename, test_data_path,
                          test_store_path, pro_extract_sents_path,
                          h_max_length, s_max_length, iword_dict):
    dev_index_path = os.path.join(
        base_path, "data/train_data/dev.h_{}.s_{}.tfidf.indexes.p".format(
            h_max_length, s_max_length))
    devs, location_indexes = dev_data_loader(test_store_path, dbfilename,
                                             test_data_path)
    if os.path.exists(dev_index_path):
        with open(dev_index_path, "rb") as f:
            devs_indexes = pickle.load(f)
        # the filtered sentence indexes are not cached with devs_indexes, so
        # fall back to the loader's indexes here to avoid a NameError below
        new_location_indexes = location_indexes
    else:
        with open(pro_extract_sents_path, "r") as f:
            jlr = JSONLineReader()
            lines = jlr.process(f)

            inputs = []
            new_location_indexes = []
            for i, line in enumerate(lines):
                pro_extract_sents = []
                sent_index = []
                predict_sents = line['predicted_sentences']
                claim = line['claim']
                predict_sents_set = set([
                    (doc_id, sent_num) for doc_id, sent_num in predict_sents
                ])
                # print(predict_sents_set)
                for j, index in enumerate(location_indexes[i]):
                    if (index[0], index[1]) in predict_sents_set:
                        # print(devs[i][j])
                        # print(devs[i])
                        pro_extract_sents.append((claim, devs[i][j][1]))
                        sent_index.append((index[0], index[1]))
                inputs.append(pro_extract_sents)
                new_location_indexes.append(sent_index)
            devs_indexes = test_data_indexes(inputs, iword_dict, h_max_length,
                                             s_max_length)
    return devs_indexes, new_location_indexes
Example #26
def dev_processing(db_filename, datapath):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    devs = []
    all_indexes = []

    with open(datapath, "rb") as f:
        lines = jlr.process(f)

        for line in tqdm(lines):
            dev = []
            indexes = []
            pages = set()
            pages.update(page[0] for page in line['predicted_pages'])
            if len(pages) == 0:
                # arbitrary fallback page so the claim still gets candidate sentences
                pages.add("Michael_Hutchence")
            claim = line['claim']
            p_lines = []
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                p_lines.extend(get_valid_texts(doc_lines, page))

            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                dev.append((claim, doc_line[0]))
                indexes.append((doc_line[1], doc_line[2]))
            # print(len(dev))
            if len(dev) == 0:
                dev.append((claim, 'no evidence for this claim'))
                indexes.append(('empty', 0))
            devs.append(dev)
            all_indexes.append(indexes)
    return devs, all_indexes
Example #27
    def __init__(self,
                 db: Union[FeverDocDB, SnopesDocDB],
                 sentence_level=False,
                 wiki_tokenizer: Tokenizer = None,
                 claim_tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 filtering: str = None) -> None:
        self._sentence_level = sentence_level
        self._wiki_tokenizer = wiki_tokenizer or WordTokenizer()
        self._claim_tokenizer = claim_tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }

        self.db = db

        self.formatter = FEVERGoldFormatter(set(self.db.get_doc_ids()),
                                            FEVERLabelSchema(),
                                            filtering=filtering)
        self.reader = JSONLineReader()
Example #28
                        default='simple',
                        help=("String option specifying tokenizer type to use "
                              "(e.g. 'corenlp')"))

    parser.add_argument('--num-workers',
                        type=int,
                        default=None,
                        help='Number of CPU processes (for tokenizing, etc)')
    args = parser.parse_args()
    doc_freqs = None
    if args.use_precomputed:
        _, metadata = utils.load_sparse_csr(args.model)
        doc_freqs = metadata['doc_freqs'].squeeze()

    db = FeverDocDB("data/fever/fever.db")
    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

    with open(args.in_file, "r") as f, open(
            "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl".format(
                args.split, args.max_page, args.max_sent,
                "precomputed" if args.use_precomputed else "not_precomputed"),
            "w+") as out_file:
        lines = jlr.process(f)
        #lines = tf_idf_claims_batch(lines)

        for line in tqdm(lines):
            line = tf_idf_claim(line)
            out_file.write(json.dumps(line) + "\n")
Example #29
    _line['predicted_evidence'] = _line['predicted_evidence'][:args.max_evidence]
    _line['scores'] = _line['scores'][:args.max_evidence]
    return _line


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help='/path/to/input/file')
    parser.add_argument('output', help='/path/to/output/file')
    parser.add_argument('--max_evidence',
                        help='max num of evidences',
                        type=int,
                        default=5)
    args = parser.parse_args()
    LogHelper.setup()
    logger = LogHelper.get_logger("replace_noise_dataset")
    random.seed(55)
    jlr = JSONLineReader()
    lines = jlr.read(args.input)
    counter = 0
    with open(args.output, 'w') as f:
        for i, line in tqdm(enumerate(lines)):
            if (line['label'] != 'NOT ENOUGH INFO'
                    and not is_gold_evidence_predicted(line)):
                counter += 1
                logger.info("line " + str(i + 1) + " should be filled")
                line = random_fill_gold_evidence(line)
            f.write(json.dumps(line) + '\n')
    logger.info(str(counter) + " samples filled with gold evidence")
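is_gold_evidence_predicted and random_fill_gold_evidence are defined elsewhere in the project; a minimal sketch of what the first check could look like, assuming it asks whether any complete gold evidence set is already covered by the predicted (page, line) pairs:

def is_gold_evidence_predicted(line):
    # Hypothetical check; the project's own helper may differ.
    predicted = {(page, sent_id) for page, sent_id in line['predicted_evidence']}
    return any(
        all((ev[2], ev[3]) in predicted for ev in evidence_set)
        for evidence_set in line['evidence'])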
Example #30
            for sample in samples:
                evidence_pages.add(sample)

    elif len(evidence_pages) >= k:

        samples = random.sample(evidence_pages, k)
        evidence_pages = set(samples)
    return evidence_pages


path = os.getcwd()
path = re.sub("/src.*", "", path)
db = FeverDocDB(os.path.join(path, "data/fever/fever.db"))
doc_ids = db.get_doc_ids()
doc_ids = doc_ids[1:]
jlr = JSONLineReader()
# with open(os.path.join(path, "data/fever-data/train.jsonl"), "r") as f:
#     with open(os.path.join(path, 'data/fever/train.p5.jsonl'), "w") as f2:
#         lines = f.readlines()
#         for line in lines:
#             js = json.loads(line)
#             pages = sample_doc(js,doc_ids,k=5)
#             js['predicted_pages'] = list(pages)
#             f2.write(json.dumps(js)+"\n")

with open(os.path.join(path, "data/fever-data/dev.jsonl"), "r") as f:
    with open(os.path.join(path, "data/fever/dev.p5.jsonl"), "w") as f2:
        lines = f.readlines()
        for line in lines:
            js = json.loads(line)
            pages = sample_doc(js, doc_ids, k=5)