Example #1
def prediction_processing(dataset_path, predictions):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system.
    :param dataset_path:
    :param predictions:
    :return:
    """

    final_predictions = []
    jsr = JSONLineReader()

    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
        #
        # lines = lines[:100]

        for idx, line in enumerate(lines):
            if len(line['predicted_pages']) == 0:
                line['predicted_evidence'] = []
            else:
                line['predicted_evidence'] = [[
                    prediction[0], int(prediction[1])
                ] for prediction in predictions[idx]]
            line['predicted_label'] = "REFUTES"
            final_predictions.append(line)

    return final_predictions
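Every example on this page relies on a JSONLineReader whose process() method returns the parsed lines of a JSONL file. A minimal sketch of that assumed interface (the project's real class may differ):

import json

class JSONLineReader:
    """Minimal sketch: parse each non-empty line of a JSONL file into a dict."""

    def process(self, fp):
        return [json.loads(line) for line in fp if line.strip()]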
Example #2
    def sampling(self, datapath, num_sample=1):

        jlr = JSONLineReader()

        X = []
        count = 0
        with open(datapath, "r") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                count += 1
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                neg_sents = []
                claim = line['claim']

                pos_set = set()
                for evidence_set in line['evidence']:
                    pos_sent = self.get_whole_evidence(evidence_set, self.db)
                    if pos_sent in pos_set:
                        continue
                    pos_set.add(pos_sent)

                p_lines = []
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]

                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if (doc_line[1], doc_line[2]) not in evidence_set:
                        neg_sents.append(doc_line[0])

                num_sampling = num_sample
                if len(neg_sents) < num_sampling:
                    num_sampling = len(neg_sents)
                    # print(neg_sents)
                if num_sampling == 0:
                    continue
                else:
                    for pos_sent in pos_set:
                        samples = random.sample(neg_sents, num_sampling)
                        for sample in samples:
                            if not sample:
                                continue
                            X.append((claim, pos_sent, sample))
                            if count % 1000 == 0:
                                print(
                                    "claim:{} ,evidence :{} sample:{}".format(
                                        claim, pos_sent, sample))
        return X
def train_dev_split(train_datapath, split_rate):

    with open(train_datapath, "r") as f:
        jlr = JSONLineReader()
        lines = jlr.process(f)
        random.shuffle(lines)

        dev_lines = lines[:int(len(lines) * split_rate)]
        train_lines = lines[int(len(lines) * split_rate):]
    return train_lines, dev_lines
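A quick in-memory check of the split logic above, using synthetic lines and a 20% dev split (the file handling is unchanged):

import random

lines = [{"id": i} for i in range(10)]
random.shuffle(lines)
split_rate = 0.2
dev_lines = lines[:int(len(lines) * split_rate)]
train_lines = lines[int(len(lines) * split_rate):]
assert len(dev_lines) == 2 and len(train_lines) == 8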
Example #4
def load_words(embedding_file, train_datapath, test_path, db_filename,
               num_sample, sampled_path):

    words = set()

    def _insert(iterable):
        for w in iterable:
            w = Dictionary.normalize(w)
            if valid_words and w not in valid_words:
                continue
            words.add(w)

    valid_words = index_embedding_words(embedding_file)

    X_claim, X_sents, y = load_generate_samples(db_filename, train_datapath,
                                                num_sample, sampled_path)
    X_claim = set(X_claim)
    for claim in X_claim:
        # tokenize into a separate variable so the `words` vocabulary set is not shadowed
        tokens = nltk.word_tokenize(claim)
        _insert(tokens)

    for sent in X_sents:
        tokens = simple_tokenizer(sent)
        _insert(tokens)

    with open(test_path, "r") as f:
        jlr = JSONLineReader()
        db = FeverDocDB(db_filename)

        lines = jlr.process(f)
        for line in lines:
            claim = line['claim']
            tokens = nltk.word_tokenize(claim)
            _insert(tokens)
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                doc_lines = [doc_line for doc_line in doc_lines if doc_line]
                for doc_line in doc_lines:
                    tokens = simple_tokenizer(doc_line)
                    _insert(tokens)
    return words
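A hedged follow-up: one common way to turn the returned vocabulary set into an index map for an embedding layer. The padding and unknown tokens are assumptions for illustration, not taken from the source.

def build_word_dict(words):
    # reserve 0 for padding and 1 for out-of-vocabulary tokens (assumed convention)
    word_dict = {"<PAD>": 0, "<UNK>": 1}
    for w in sorted(words):
        word_dict[w] = len(word_dict)
    return word_dict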
Example #5
def test_data_4_siamese(db_filename, dataset_path):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    X_claims = []
    X_sents = []
    all_sents_id = []

    with open(dataset_path, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]

        for line in tqdm(lines):
            claims = []
            sents = []
            sents_indexes = []
            p_lines = []
            claim = line['claim']
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                p_lines.extend(
                    zip(doc_lines, [page] * len(doc_lines),
                        range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                else:
                    claims.append(claim)
                    sents.append(doc_line[0])
                    sents_indexes.append((doc_line[1], doc_line[2]))
            X_claims.append(claims)
            X_sents.append(sents)
            all_sents_id.append(sents_indexes)
    # print(len(X_claims))
    # print(len(X_sents))
    # print(len(all_sents_id))
    # X_claims_indexes, X_sents_indexes = [], []
    # for idx, claims in enumerate(X_claims):
    #     claims_index, sents_index = data_transformer(claims, X_sents[idx], word_dict)
    #     X_claims_indexes.append(claims_index)
    #     X_sents_indexes.append(sents_index)

    return X_claims, X_sents, all_sents_id
def test_data(db_path, dataset_path, type="ranking"):
    """
    generate dev examples to feed into the classifier
    :param db_path:
    :param dataset_path:
    :param type:
    :return:
    """

    with open(db_path) as f:
        db = json.load(f)
        jsr = JSONLineReader()
        inputs = []
        X_claim = []
        X_sents = []
        indexes = []
        with open(dataset_path, "r") as f:
            lines = jsr.process(f)

            for line in tqdm(lines):
                valid_lines = []
                claims = []
                sents_idnexes = []
                claim = line['claim']

                #for doc_line in p_lines:
                doc = line['predicted_evidence']
                # doc = line['evidence']
                for doc_line in doc:
                    if not doc_line:
                        continue
                    else:
                        # print(doc_line[0])
                        if type == "cos":
                            sents_idnexes.append(doc_line)
                            valid_lines.append(
                                get_whole_evidence([doc_line], db))
                            claims.append(claim)
                        elif type == "ranking":
                            sents_idnexes.append((doc_line[0], doc_line[1]))
                            valid_lines.append(
                                (claim, get_whole_evidence([doc_line], db)))
                if type == "cos":
                    X_sents.append(valid_lines)
                    X_claim.append(claims)
                elif type == "ranking":
                    inputs.append(valid_lines)
                indexes.append(sents_idnexes)
            if type == "cos":
                inputs = list(zip(X_claim, X_sents))
            return inputs, indexes
def test_data_loader(save_path, db_filename=None, data_path=None):
    if os.path.exists(save_path):
        with open(save_path, 'rb') as f:
            X = pickle.load(f)
            claims, list_sents, sents_indexes = zip(*X)
    else:
        with open(data_path, "rb") as f:
            jlr = JSONLineReader()
            lines = jlr.process(f)
        claims, list_sents, sents_indexes = test_processing(db_filename, lines)
        X = list(zip(claims, list_sents, sents_indexes))
        with open(save_path, 'wb') as f:
            pickle.dump(X, f)
    return claims, list_sents, sents_indexes
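The loader above is an instance of a compute-once-then-pickle pattern; a generic sketch of that pattern, with the builder callable as a hypothetical stand-in for test_processing:

import os
import pickle

def cached(save_path, builder):
    """Load a pickled result if it exists; otherwise build it, pickle it, and return it."""
    if os.path.exists(save_path):
        with open(save_path, "rb") as f:
            return pickle.load(f)
    data = builder()
    with open(save_path, "wb") as f:
        pickle.dump(data, f)
    return data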
    def sampling(self, datapath, num_sample=1):

        jlr = JSONLineReader()

        X = []
        count = 0
        with open(datapath, "r") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                count += 1
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                claim = line['claim']

                pos_set = set()
                pos_set_ref = []
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(self.get_whole_evidence([evidence_sentence], self.db))
                        pos_set_ref.append(evidence_sentence)

                neg_sents = []
                for neg_evidence in line['predicted_evidence']:
                    # if neg_evidence not in evidence_set: 
                    if neg_evidence not in pos_set_ref: 
                        neg_sents.append(self.get_whole_evidence([neg_evidence], self.db))

                num_sampling = num_sample     
                if len(neg_sents) < num_sampling:   
                    num_sampling = len(neg_sents)
                    # print(neg_sents)
                if num_sampling == 0:
                    continue
                else:
                    for pos_sent in pos_set:
                        samples = random.sample(neg_sents, num_sampling)
                        for sample in samples:
                            if not sample:
                                continue
                            X.append((claim, pos_sent, sample))
                            if count % 1000 == 0:
                                print("claim:{} ,evidence :{} sample:{}".format(claim, pos_sent, sample))
        return X
Example #9
def prediction_processing(dataset_path, predictions):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system.
    :param dataset_path:
    :param predictions:
    :return:
    """

    final_predictions = []
    jsr = JSONLineReader()

    with open(dataset_path, "r") as f:
        lines = jsr.process(f)
        # capture the helper's result; otherwise final_predictions stays empty
        final_predictions = prediction_processing_no_reload(lines, predictions)

    return final_predictions
def prediction_processing(dataset_path, predictions, db_filename):
    """
    Convert the predicted (doc_id, sent_id) pairs into the format expected by the scoring system.
    :param dataset_path:
    :param predictions:
    :return:
    """

    final_predictions = []
    jsr = JSONLineReader()
    with open(db_filename) as f: 
        db = json.load(f)
        
    out_error_ana = []   
    with open(dataset_path, "r") as f:
        lines = jsr.process(f)

        cnt = 0
        for line in lines:
            
            pos_set_ref = line['evidence']
            if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                continue

            line['predicted_evidence'] = [[prediction[0], int(prediction[1])] for prediction in predictions[cnt]]
            cnt_gold = 0
            out_error_ana.append("Claim: "+str(cnt))
            out_error_ana.append(line['claim'])
            out_error_ana.append("Gold evidence:")
            for evidence_set in line['evidence']:
                for evidence_sentence in evidence_set:
                    out_error_ana.append(get_whole_evidence([evidence_sentence], db))
                    cnt_gold += 1
                    
            out_error_ana.append("Predicted evidence:")
            for evidence_set in line['predicted_evidence'][:cnt_gold]:
                out_error_ana.append(get_whole_evidence([evidence_set], db))
            out_error_ana.append("")

            line['predicted_label'] = "refutes"
            final_predictions.append(line)
            cnt += 1
            if cnt == len(predictions):
                break

    return final_predictions, out_error_ana
Example #11
    def sampling(self, datapath, num_sample=1):

        jlr = JSONLineReader()
        ret = []
        print("sampling for " + datapath)
        with open(datapath, "r") as f:
            lines = jlr.process(f)
            print(len(lines))
            with ThreadPool(processes=48) as p:
                for line in tqdm(p.imap(lambda x: self.handle(x, num_sample),
                                        lines),
                                 total=len(lines)):
                    if line is not None:
                        ret.extend(line)

        print("Done")

        return ret
Example #12
    def dev_processing(self, data_path):

        jlr = JSONLineReader()

        with open(data_path, "r") as f:
            lines = jlr.process(f)

            devs = []
            labels = []
            for line in tqdm(lines):

                dev = []
                label = []
                if line['label'].upper() == "NOT ENOUGH INFO":
                    continue
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]
                for page, num in evidence_set:
                    pages.append(page)
                pages = set(pages)

                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((line['claim'], doc_line[0]))
                    if (doc_line[1], doc_line[2]) in evidence_set:
                        label.append(1)
                    else:
                        label.append(0)
                if len(dev) == 0 or len(label) == 0:
                    continue
                devs.append(dev)
                labels.append(label)
        return devs, labels
def cos_train(db_filepath, dataset_path):
    """
    Use the cosine similarity score to rank (claim, sentence) pairs in the dev set;
    no training data is needed.
    :param db_filepath:
    :param dataset_path:
    :return:
    """

    with open(db_filepath) as f:
        db = json.load(f)
        jlr = JSONLineReader()

        X = []
        y = []
        with open(dataset_path, "r") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                if line['label'] == "NOT ENOUGH INFO":
                    continue

                #label, dev = [], []
                pos_set_ref = line['evidence']
                if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                    continue

                pos_set = set()
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(get_whole_evidence([evidence_sentence],
                                                       db))

                for evidence_sentence_ref in line['predicted_evidence']:
                    evidence_sentence = get_whole_evidence(
                        [evidence_sentence_ref], db)
                    X.append((line['claim'], evidence_sentence))
                    if evidence_sentence in pos_set:
                        y.append(1)
                    else:
                        y.append(0)

        return X, y
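The docstring above implies a cosine-similarity scoring step that is not shown here; a hedged sketch of that step over TF-IDF vectors (scikit-learn is an assumption, not necessarily what the source uses):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_scores(pairs):
    """Score each (claim, sentence) pair by TF-IDF cosine similarity."""
    claims, sents = zip(*pairs)
    vectorizer = TfidfVectorizer().fit(list(claims) + list(sents))
    claim_vecs = vectorizer.transform(claims)
    sent_vecs = vectorizer.transform(sents)
    return [cosine_similarity(claim_vecs[i], sent_vecs[i])[0][0]
            for i in range(len(pairs))]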
Example #14
    def data_processing_for_joint(self, data_path):
        from athene.rte.utils.data_reader import label_dict
        jlr = JSONLineReader()

        with open(data_path, "r") as f:
            lines = jlr.process(f)

            datas = []
            sent_labels = []
            claim_labels = []
            for line in tqdm(lines):
                claim_labels.append(label_dict.index(line['label']))
                data = []
                sent_label = []
                evidence_set = set([(evidence[2], evidence[3])
                                    for evidences in line['evidence']
                                    for evidence in evidences])

                pages = [
                    page for page in line['predicted_pages']
                    if page is not None
                ]
                for page, num in evidence_set:
                    pages.append(page)
                pages = set(pages)

                p_lines = []
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    p_lines.extend(self.get_valid_texts(doc_lines, page))
                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    data.append((line['claim'], doc_line[0]))
                    if (doc_line[1], doc_line[2]) in evidence_set:
                        sent_label.append(1)
                    else:
                        sent_label.append(0)
                if len(data) == 0 or len(sent_label) == 0:
                    continue
                datas.append(data)
                sent_labels.append(sent_label)
        return datas, sent_labels, claim_labels
def predict_processing(db_path, dataset_path):

    with open(db_path) as f:
        db = json.load(f)
        jlr = JSONLineReader()

        devs = []
        all_indexes = []

        with open(dataset_path, "rb") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                dev = []
                indexes = []
                claim = line['claim']

                ##########################
                pos_set_ref = line['evidence']
                if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                    continue

                pos_set = set()
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(get_whole_evidence([evidence_sentence],
                                                       db))

                for evidence_sentence_ref in line['predicted_evidence']:
                    evidence_sentence = get_whole_evidence(
                        [evidence_sentence_ref], db)
                    dev.append((line['claim'], evidence_sentence))
                    indexes.append(evidence_sentence_ref)
                ##########################

                if len(dev) == 0:
                    dev.append((claim, 'no evidence for this claim'))
                    indexes.append(('empty', 0))

                devs.append(dev)
                all_indexes.append(indexes)
        return devs, all_indexes
    def dev_processing(self, data_path):

        jlr = JSONLineReader()

        with open(data_path,"r") as f:
            lines = jlr.process(f)

            devs = []
            labels = []
            for line in tqdm(lines):

#                 if line['label'].upper() == "NOT ENOUGH INFO":
#                     continue
                
                label, dev = [], []
                pos_set_ref = line['evidence']
                if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                    continue
                
                pos_set = set()
                for evidence_set in line['evidence']:
                    for evidence_sentence in evidence_set:
                        pos_set.add(self.get_whole_evidence([evidence_sentence], self.db))
                
                for evidence_sentence_ref in line['predicted_evidence']:
                    evidence_sentence = self.get_whole_evidence([evidence_sentence_ref], self.db)
                    dev.append((line['claim'], evidence_sentence))
                    
                    if evidence_sentence in pos_set:
                        label.append(1)
                    else:
                        label.append(0)

                if len(dev) == 0 or len(label) == 0:
                    continue
                devs.append(dev)
                labels.append(label)

        return devs, labels
Example #17
    def predict_processing(self, datapath):

        jlr = JSONLineReader()

        devs = []
        all_indexes = []

        with open(datapath, "rb") as f:
            lines = jlr.process(f)

            for line in tqdm(lines):
                dev = []
                indexes = []
                pages = set()
                # pages = line['predicted_pages']
                pages.update(page for page in line['predicted_pages'])
                # if len(pages) == 0:
                #     pages.add("Michael_Hutchence")
                claim = line['claim']
                p_lines = []
                #Separate sentences and add the predicted page
                for page in pages:
                    doc_lines = self.db.get_doc_lines(page)
                    if not doc_lines:
                        continue
                    p_lines.extend(self.get_valid_texts(doc_lines, page))

                for doc_line in p_lines:
                    if not doc_line[0]:
                        continue
                    dev.append((claim, doc_line[0]))
                    indexes.append((doc_line[1], doc_line[2]))
                # print(len(dev))
                if len(dev) == 0:
                    dev.append((claim, 'no evidence for this claim'))
                    indexes.append(('empty', 0))
                devs.append(dev)
                all_indexes.append(indexes)
        return devs, all_indexes
def generate_prediction_files(predictions, p_sents_indexes, data_path, final_prediction_path):
    """
    Transform the classifier's predictions into lists of dicts to feed into the scoring system.
    :param predictions:
    :param p_sents_indexes:
    :param data_path:
    :param final_prediction_path:
    :return:
    """
    jlr = JSONLineReader()


    final_predictions = []
    with open(data_path,"r") as f:
        lines = jlr.process(f)

        print(len(predictions))
        print(len(p_sents_indexes))
        print(len(lines))
        assert len(predictions) == len(p_sents_indexes) == len(lines)
        for idx, line in enumerate(lines):

            line['predicted_evidence'] = []
            line['predicted_label'] = 'refutes'
            predicted_sents = predictions[idx]
            sents_indexes = p_sents_indexes[idx]
            for i in range(len(sents_indexes)):
                if predicted_sents[i] == 1:
                    line['predicted_evidence'].append([sents_indexes[i][0], sents_indexes[i][1]])

            final_predictions.append(line)

    with open(final_prediction_path,"w") as f:

        for prediction in final_predictions:
            f.write(json.dumps(prediction)+'\n')

    return final_predictions
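The shape of one line written by the function above, as implied by its field assignments (page names are placeholders; any other fields come from the original line):

import json

example_line = {
    "claim": "...",
    "predicted_label": "refutes",
    "predicted_evidence": [["Some_Page", 3], ["Another_Page", 0]],  # placeholder page names
}
print(json.dumps(example_line))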
def tfidf_test_processing(base_path, dbfilename, test_data_path,
                          test_store_path, pro_extract_sents_path,
                          h_max_length, s_max_length, iword_dict):
    dev_index_path = os.path.join(
        base_path, "data/train_data/dev.h_{}.s_{}.tfidf.indexes.p".format(
            h_max_length, s_max_length))
    devs, location_indexes = dev_data_loader(test_store_path, dbfilename,
                                             test_data_path)
    if os.path.exists(dev_index_path):
        with open(dev_index_path, "rb") as f:
            devs_indexes = pickle.load(f)
        # reuse the loader's indexes so new_location_indexes is defined on this branch too
        new_location_indexes = location_indexes
    else:
        with open(pro_extract_sents_path, "r") as f:
            jlr = JSONLineReader()
            lines = jlr.process(f)

            inputs = []
            new_location_indexes = []
            for i, line in enumerate(lines):
                pro_extract_sents = []
                sent_index = []
                predict_sents = line['predicted_sentences']
                claim = line['claim']
                predict_sents_set = set([
                    (doc_id, sent_num) for doc_id, sent_num in predict_sents
                ])
                # print(predict_sents_set)
                for j, index in enumerate(location_indexes[i]):
                    if (index[0], index[1]) in predict_sents_set:
                        # print(devs[i][j])
                        # print(devs[i])
                        pro_extract_sents.append((claim, devs[i][j][1]))
                        sent_index.append((index[0], index[1]))
                inputs.append(pro_extract_sents)
                new_location_indexes.append(sent_index)
            devs_indexes = test_data_indexes(inputs, iword_dict, h_max_length,
                                             s_max_length)
    return devs_indexes, new_location_indexes
def dev_processing(db_filename, datapath):
    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    devs = []
    all_indexes = []

    with open(datapath, "rb") as f:
        lines = jlr.process(f)

        for line in tqdm(lines):
            dev = []
            indexes = []
            pages = set()
            pages.update(page[0] for page in line['predicted_pages'])
            if len(pages) == 0:
                pages.add("Michael_Hutchence")
            claim = line['claim']
            p_lines = []
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                p_lines.extend(get_valid_texts(doc_lines, page))

            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                dev.append((claim, doc_line[0]))
                indexes.append((doc_line[1], doc_line[2]))
            # print(len(dev))
            if len(dev) == 0:
                dev.append((claim, 'no evidence for this claim'))
                indexes.append(('empty', 0))
            devs.append(dev)
            all_indexes.append(indexes)
    return devs, all_indexes
Example #21
                        help=("String option specifying tokenizer type to use "
                              "(e.g. 'corenlp')"))

    parser.add_argument('--num-workers',
                        type=int,
                        default=None,
                        help='Number of CPU processes (for tokenizing, etc)')
    args = parser.parse_args()
    doc_freqs = None
    if args.use_precomputed:
        _, metadata = utils.load_sparse_csr(args.model)
        doc_freqs = metadata['doc_freqs'].squeeze()

    db = FeverDocDB("data/fever/fever.db")
    jlr = JSONLineReader()
    formatter = FEVERGoldFormatter(set(), FEVERLabelSchema())

    jlr = JSONLineReader()

    with open(args.in_file, "r") as f, open(
            "data/fever/{0}.sentences.{3}.p{1}.s{2}.jsonl".format(
                args.split, args.max_page, args.max_sent,
                "precomputed" if args.use_precomputed else "not_precomputed"),
            "w+") as out_file:
        lines = jlr.process(f)
        #lines = tf_idf_claims_batch(lines)

        for line in tqdm(lines):
            line = tf_idf_claim(line)
            out_file.write(json.dumps(line) + "\n")
def prepare_ranking(db_filename, datapath, k=10, num_sample=3):
    """

    :param db_filename:
    :param datapath:
    :param k:
    :param num_sample:
    :return:
    """

    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    X = []
    with open(datapath, "r") as f:
        lines = jlr.process(f)

        for line in tqdm(lines):
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            p_lines = []
            pos_sents = []
            neg_sents = []
            claim = line['claim']
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            sampled_sents_idx = [(id, number)
                                 for id, number in line['predicted_sentences']]
            sampled_sents_idx = [
                index for index in sampled_sents_idx
                if index not in evidence_set
            ]
            if k:
                sampled_sents_idx = sampled_sents_idx[:k]
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                p_lines.extend(
                    zip(doc_lines, [page] * len(doc_lines),
                        range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                elif (doc_line[1], doc_line[2]) in sampled_sents_idx:
                    neg_sents.append(doc_line[0])
                elif (doc_line[1], doc_line[2]) in evidence_set:
                    pos_sents.append(doc_line[0])
            # print(line)
            # print(sampled_sents_idx)
            # print(neg_sents)
            if len(sampled_sents_idx) < num_sample:
                continue
            for sent in pos_sents:
                neg_samples = random.sample(neg_sents, num_sample)
                triplet = (claim, sent, neg_samples)
                X.append(triplet)

    return X
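The triplets produced above are the usual input to a margin-based ranking objective; a hedged NumPy sketch of such a loss (the scoring model itself is not part of the source):

import numpy as np

def hinge_ranking_loss(pos_scores, neg_scores, margin=1.0):
    """Penalize negatives that score within `margin` of (or above) their positive."""
    pos_scores = np.asarray(pos_scores, dtype=float)
    neg_scores = np.asarray(neg_scores, dtype=float)
    return np.maximum(0.0, margin - pos_scores + neg_scores).mean()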
Example #23
def sample_ranking_train(db_filename, datapath, k=5, num_sample=2):
    """

    :param db_filename: path to the wiki-pages database
    :param datapath: path to the FEVER train set with predicted pages
    :param k: number of candidate sentences from which negative examples are selected
    :param num_sample: number of negative examples to sample
    :return: X_claim, X_sents: claim and sentence pairs; y: 1 if the sentence is in the evidence set, else 0
    """

    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    X_claim = []
    X_sents = []
    y = []
    count = 0

    with open(datapath, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]

        for line in tqdm(lines):
            num_sampling = num_sample
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            p_lines = []
            neg_sents = []
            claim = line['claim']
            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            sampled_sents_idx = [(id, number)
                                 for id, number in line['predicted_sentences']]
            sampled_sents_idx = sampled_sents_idx[0:k + 5]
            sampled_sents_idx = [
                index for index in sampled_sents_idx
                if index not in evidence_set
            ]
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                p_lines.extend(
                    zip(doc_lines, [page] * len(doc_lines),
                        range(len(doc_lines))))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                elif (doc_line[1], doc_line[2]) in sampled_sents_idx:
                    neg_sents.append(doc_line[0])
                elif (doc_line[1], doc_line[2]) in evidence_set:
                    X_claim.append(claim)
                    X_sents.append(doc_line[0])
                    y.append(1)

            if len(sampled_sents_idx) < num_sample:
                count += 1
                num_sampling = len(sampled_sents_idx)

            samples = random.sample(neg_sents, num_sampling)
            for neg_example in samples:
                X_claim.append(claim)
                X_sents.append(neg_example)
                y.append(0)
        print(count)

    return X_claim, X_sents, y
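A hedged sketch of how the returned (claim, sentence, label) data could feed a simple baseline classifier; TF-IDF plus logistic regression is an illustrative assumption, not the model used in the source:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

def train_baseline(X_claim, X_sents, y):
    # join each claim with its candidate sentence so one vectorizer sees both
    texts = [claim + " [SEP] " + sent for claim, sent in zip(X_claim, X_sents)]
    model = make_pipeline(TfidfVectorizer(), LogisticRegression(max_iter=1000))
    model.fit(texts, y)
    return model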
def test_data(db_path, dataset_path, type="ranking"):
    """
    generate dev examples to feed into the classifier
    :param db_path:
    :param dataset_path:
    :param type:
    :return:
    """

    db = FeverDocDB(db_path)
    jsr = JSONLineReader()

    inputs = []
    X_claim = []
    X_sents = []
    indexes = []

    with open(dataset_path, "r") as f:
        lines = jsr.process(f)

        for line in tqdm(lines):

            p_lines = []
            valid_lines = []
            claims = []
            sents_idnexes = []
            claim = line['claim']
            # X_claim.append([claim])
            predicted_pages = line['predicted_pages']
            for page in predicted_pages:
                # doc_lines = db.get_doc_lines(page[0])
                doc_lines = db.get_doc_lines(page[0])

                if not doc_lines:
                    # print(page)
                    continue
                doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else "" for doc_line in
                             doc_lines.split("\n")]
                p_lines.extend(zip(doc_lines, [page[0]] * len(doc_lines), range(len(doc_lines))))

            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                else:
                    # print(doc_line[0])
                    if type == "cos":
                        sents_idnexes.append((doc_line[1], doc_line[2]))
                        valid_lines.append(doc_line[0])
                        claims.append(claim)
                    elif type == "ranking":
                        sents_idnexes.append((doc_line[1], doc_line[2]))
                        valid_lines.append((claim, doc_line[0]))
            if type == "cos":
                X_sents.append(valid_lines)
                X_claim.append(claims)
            elif type == "ranking":
                inputs.append(valid_lines)
            indexes.append(sents_idnexes)
        if type == "cos":
            inputs = list(zip(X_claim, X_sents))

        return inputs, indexes
def sample_4_ranking(db_path, data_path, type="train", num_neg=3, seed=55):
    """
    Sample a set of negative sentences and combine them with the positive sentence to form the training data.
    :param db_path:
    :param data_path:
    :param type:
    :param num_neg:
    :param seed:
    :return:
    """

    random.seed(seed)
    if type == "train":
        claims, related_pages_sents, _, y = label_sents(db_path,
                                                        data_path,
                                                        type="train")

        train_triplets = []
        for i, claim in tqdm(enumerate(claims)):
            neg_sents = [j for j, label in enumerate(y[i]) if label != 1]
            for idx, label in enumerate(y[i]):
                if label == 1:
                    pos_sent = related_pages_sents[i][idx]
                    samples = random.sample(neg_sents, num_neg)
                    sampled_neg_sents = []
                    for index in samples:
                        sampled_neg_sents.append(related_pages_sents[i][index])
                    triplet = (claim, pos_sent, sampled_neg_sents)
                    train_triplets.append(triplet)
        return train_triplets

    elif type == "dev" or type == "test":
        """
        For the dev or test set, build (claim, sentence) pairs so each pair can be scored.
        """
        with open(db_path) as f:
            db = json.load(f)
            jsr = JSONLineReader()
            with open(data_path, "r") as f:
                lines = jsr.process(f)

                dev_examples = []
                pages_sents_indexes = []
                for line in tqdm(lines):

                    ##########################
                    pos_set_ref = line['evidence']
                    if len(pos_set_ref) == 0 or not pos_set_ref[0]:
                        continue

                    pos_set = set()
                    for evidence_set in line['evidence']:
                        for evidence_sentence in evidence_set:
                            pos_set.add(
                                get_whole_evidence([evidence_sentence], db))

                    for evidence_sentence_ref in line['predicted_evidence']:
                        evidence_sentence = get_whole_evidence(
                            [evidence_sentence_ref], db)
                        dev_examples.append((line['claim'], evidence_sentence))
                        pages_sents_indexes.append(evidence_sentence_ref)
                    ##########################


#                     p_lines = []
#                     feed_tuples = []
#                     sents_indexes = []
#                     claim = line['claim']
#                     for page in line['predicted_pages']:
#                         doc_lines = db.get_doc_lines(page)
#                         if not doc_lines:
#                             # print(page)
#                             continue
#                         doc_lines = [doc_line.split("\t")[1] if len(doc_line.split("\t")[1]) > 1 else "" for doc_line in
#                                      doc_lines.split("\n")]
#                         p_lines.extend(zip(doc_lines, [page] * len(doc_lines), range(len(doc_lines))))
#
#                     for doc_line in p_lines:
#                         if not doc_line[0]:
#                             continue
#                         else:
#                             # print(doc_line[0])
#                             sents_indexes.append((doc_line[1], doc_line[2]))
#                             feed_tuples.append((claim,doc_line[0]))
#
#                     dev_examples.append(feed_tuples)
#                     pages_sents_indexes.append(sents_indexes)

            return dev_examples, pages_sents_indexes
def label_sents(db_path, data_path, type="train"):
    """
    Label each sentence as 1 if it is in the evidence set and 0 otherwise, to build the training data.
    :param db_path:
    :param data_path:
    :param type:
    :return:
    """

    db = FeverDocDB(db_path)
    jsr = JSONLineReader()
    claims = []
    related_pages_sents = []
    pages_sents_indexes = []
    y = []
    with open(data_path, "r") as f:
        lines = jsr.process(f)
        count = 0
        for line in tqdm(lines):
            if line['label'] == "NOT ENOUGH INFO" and type == "train":
                continue
            p_lines = []
            valid_lines = []
            line_labels = []
            sents_idnexes = []
            claim = line['claim']
            evidences = line['evidence']
            evidence_set = set()
            pages_list = []
            for evidence in evidences:
                for sent in evidence:
                    evidence_set.add((sent[2], sent[3]))
                    pages_list.append(sent[2])
            # predicted_pages = line['predicted_pages']
            predicted_pages = [page[0] for page in line['predicted_pages']]
            predicted_pages = predicted_pages + pages_list
            predicted_pages = set(predicted_pages)
            if len(predicted_pages) > 5:
                count += 1
            claims.append(claim)
            for page in predicted_pages:
                doc_lines = db.get_doc_lines(page)
                if not doc_lines:
                    # print(page)
                    continue
                doc_lines = [
                    doc_line.split("\t")[1]
                    if len(doc_line.split("\t")[1]) > 1 else ""
                    for doc_line in doc_lines.split("\n")
                ]
                p_lines.extend(
                    zip(doc_lines, [page] * len(doc_lines),
                        range(len(doc_lines))))

            for doc_line in p_lines:
                # ignore empty sentences
                if not doc_line[0]:
                    continue
                else:
                    # print(doc_line[0])
                    sents_idnexes.append((doc_line[1], doc_line[2]))
                    valid_lines.append(doc_line[0])
                    is_added = False
                    for sent in evidence_set:
                        if sent[0] == doc_line[1] and sent[1] == doc_line[2]:
                            line_labels.append(1)
                            is_added = True
                            break
                    if not is_added:
                        line_labels.append(0)
            # print(len(p_lines))
            # print(len(line_labels))
            # print(len(valid_lines))
            assert len(line_labels) == len(valid_lines) == len(sents_idnexes)
            related_pages_sents.append(valid_lines)
            pages_sents_indexes.append(sents_idnexes)
            y.append(line_labels)
    print(count)
    return claims, related_pages_sents, pages_sents_indexes, y
def in_class_sampling(db_filename, datapath, num_sample=1, k=5):
    """

        :param db_filename: path stores wiki-pages database
        :param datapath: path stores fever predicted pages train set
        :param k: number of sentences where to select negative examples
        :param num_sample: number of negative examples to sample
        :return: X: claim and sentence pairs y: if the sentence in evidence set
        """

    db = FeverDocDB(db_filename)
    jlr = JSONLineReader()

    X = []
    count = 0

    count1 = 1
    with open(datapath, "r") as f:
        lines = jlr.process(f)
        # lines = lines[:1000]

        for line in tqdm(lines):
            pos_pairs = []
            count1 += 1
            num_sampling = num_sample
            if line['label'].upper() == "NOT ENOUGH INFO":
                continue
            p_lines = []
            neg_sents = []
            claim = line['claim']

            for evidence_set in line['evidence']:
                pos_sent = get_whole_evidence(evidence_set, db)
                print("claim:{} pos_sent:{}".format(claim, pos_sent))
                pos_pairs.append((claim, pos_sent))

            evidence_set = set([(evidence[2], evidence[3])
                                for evidences in line['evidence']
                                for evidence in evidences])
            sampled_sents_idx = [(id, number)
                                 for id, number in line['predicted_sentences']]
            sampled_sents_idx = sampled_sents_idx[0:k + 5]
            sampled_sents_idx = [
                index for index in sampled_sents_idx
                if index not in evidence_set
            ]
            pages = set()
            pages.update(evidence[0] for evidence in line['predicted_pages'])
            pages.update(evidence[0] for evidence in evidence_set)
            for page in pages:
                doc_lines = db.get_doc_lines(page)
                p_lines.extend(get_valid_texts(doc_lines, page))
            for doc_line in p_lines:
                if not doc_line[0]:
                    continue
                elif (doc_line[1], doc_line[2]) in sampled_sents_idx:
                    neg_sents.append(doc_line[0])
                # elif (doc_line[1], doc_line[2]) in evidence_set:
                #     if count1%10000==0:
                #         print("page_id:{},sent_num:{}".format(doc_line[1],doc_line[2]))
                #         print("evidence_set:{}".format(evidence_set))
                #     pos_pairs.append((claim,doc_line[0]))

            if len(sampled_sents_idx) < num_sample:
                num_sampling = len(neg_sents)
            if num_sampling == 0:
                count += 1
                continue
            else:
                for pair in pos_pairs:
                    samples = random.sample(neg_sents, num_sampling)
                    for sample in samples:
                        X.append((pair[0], pair[1], sample))
                        if count1 % 10000 == 0:
                            print("claim:{},pos:{},neg:{}".format(
                                claim, pair[1], sample))
        print(count)

    return X