def preprocess_claim_with_doc(claim_with_docs: tuple) -> list:
    claim_id = claim_with_docs[0]
    # remove any NOT_VERIFIABLE claims that were processed earlier
    if not claim_is_verifiable(claim_id, dataset=args.dataset):
        return []

    claim = get_claim(claim_id, dataset=args.dataset)
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    print('Preprocessing docs for claim [{}]: {}'.format(
        claim_id, evidence_map.keys()))

    preprocessed_pairs = []
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        for line_id in relevant_line_ids:
            # add the relevant claim/sentence pair...
            positive_line = wiki_page.lines[line_id]
            positive_input = transform_LR_input(claim, positive_line.text)
            preprocessed_pairs.append(
                (claim_id, page_id, line_id, positive_input, 1))

            # ...and, to keep it balanced, one irrelevant sample
            negative_line = get_irrelevant_line(wiki_page, relevant_line_ids)
            negative_input = transform_LR_input(claim, negative_line.text)
            preprocessed_pairs.append(
                (claim_id, page_id, negative_line.id, negative_input, 0))

    return preprocessed_pairs
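
The pair construction above relies on two helpers that are not shown in this example: transform_LR_input and get_irrelevant_line. A minimal sketch of how the negative-sampling helper could look, assuming line objects expose id and text as used above (an illustration, not the repository's implementation):

import random

def get_irrelevant_line(wiki_page, relevant_line_ids: list):
    # hypothetical sketch: pick a random non-empty line that is not gold evidence
    candidates = [line for line in wiki_page.lines
                  if line.id not in relevant_line_ids and line.text.strip()]
    # fall back to any line if the page consists only of evidence lines
    return random.choice(candidates) if candidates else random.choice(wiki_page.lines)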

def preprocess_claim(claim_row: pd.Series) -> list:
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    #label = 'entails' if label == 'SUPPORTS' else 'neutral'
    label = 1 if label == 'SUPPORTS' else 0
    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)
    evidence_sentences = []

    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        evidence_sentences.extend(
            [wiki_page.lines[line_id].text for line_id in relevant_line_ids])

    premise = ' '.join(evidence_sentences)
    premise = recreate_punctuation_in_doc_text(premise)
    hypothesis = recreate_punctuation_in_doc_text(claim)
    pair = {
        'label': label,
        'sentence1': premise,
        'sentence2': hypothesis,
        'claim_id': claim_id
    }
    preprocessed_pairs.append(pair)

    return preprocessed_pairs
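
Although the parameter is annotated as pd.Series, the unpacking via claim_row[1].values shows the function expects the (index, row) tuples yielded by DataFrame.iterrows(). A hedged usage sketch; the file path and column order are assumptions:

import pandas as pd

# claim_row[1].values must yield id, verifiable, label, claim, evidence
# in exactly this column order for the unpacking above to work
claims_df = pd.read_json('data/train.jsonl', lines=True)

nli_pairs = []
for claim_row in claims_df.iterrows():  # yields (index, Series) tuples
    nli_pairs.extend(preprocess_claim(claim_row))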
Example #3
def preprocess_claim(claim_row: pd.Series) -> list:
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0
    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)

    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        for line_id in relevant_line_ids:
            line = wiki_page.lines[line_id]
            line_text = line.text
            line_contains_references = 1 if line.anchors else 0
            line_position_absolute = line.id
            line_position_relative = line.id / len(wiki_page.lines)
            num_evidence_docs_for_claim = len(evidence_map.keys())
            num_lines_for_evidence = len(relevant_line_ids)
            num_coordination_terms_claim = get_num_coordination_terms(
                line_text,
                preprocess_claim_text(claim).split())
            num_coordination_terms_title = get_num_coordination_terms(
                line_text, preprocess_doc_title(page_id))
            nn_input = transform_NN_input(
                claim, line_text, line_contains_references,
                line_position_absolute, line_position_relative,
                num_evidence_docs_for_claim, num_lines_for_evidence,
                num_coordination_terms_claim, num_coordination_terms_title)
            preprocessed_pairs.append(
                (claim_id, page_id, line_id, nn_input, output))

    return preprocessed_pairs
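
get_num_coordination_terms is referenced but not defined in these examples; note that it receives a token list for the claim but the raw preprocess_doc_title result for the title. One plausible reading, stated purely as an assumption, is a count of overlapping terms:

def get_num_coordination_terms(line_text: str, terms) -> int:
    # hypothetical sketch: number of distinct terms shared by the evidence line
    # and the claim tokens (or title); accepts either a token list or a string
    if isinstance(terms, str):
        terms = terms.split()
    return len(set(line_text.lower().split()) & {term.lower() for term in terms})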
Example #4
def preprocess_claim(claim_row: pd.Series) -> list:
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0
    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)

    evidence_sentences = []
    num_evidence_docs_for_claim = len(evidence_map.keys())
    num_references = 0
    num_evidence_items = 0
    num_evidence_words = 0
    num_coordination_terms_evidence_claim = 0
    num_coordination_terms_titles_claim = 0
    evidence_sentence_positions = []

    # concat evidence (can be from multiple wiki_pages and/or lines)
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        evidence_sentences.extend(
            [wiki_page.lines[line_id].text for line_id in relevant_line_ids])

        # count metrics and extract features
        for line_id in relevant_line_ids:
            line = wiki_page.lines[line_id]
            line_text = line.text
            num_evidence_words += len(line_text.split())
            num_references += len(line.anchors)
            num_evidence_items += 1
            evidence_sentence_positions.append(line.id)
            num_coordination_terms_evidence_claim += get_num_coordination_terms(
                line_text,
                preprocess_claim_text(claim).split())
            num_coordination_terms_titles_claim += get_num_coordination_terms(
                line_text, preprocess_doc_title(page_id))

    combined_evidence = ' '.join(evidence_sentences)
    avg_sentence_position = np.mean(evidence_sentence_positions)

    nn_input = transform_NN_input(claim, combined_evidence,
                                  num_evidence_docs_for_claim, num_references,
                                  num_evidence_items,
                                  num_coordination_terms_evidence_claim,
                                  num_coordination_terms_titles_claim,
                                  avg_sentence_position, num_evidence_words)
    preprocessed_pairs.append((claim_id, nn_input, output))

    return preprocessed_pairs
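
This variant collapses all evidence for a claim into a single training instance, and it relies on numpy for the average sentence position, so the snippet assumes import numpy as np. A tiny worked illustration with made-up line positions:

import numpy as np

# e.g. gold evidence lines sitting at positions 0, 2 and 7 of their pages
evidence_sentence_positions = [0, 2, 7]
avg_sentence_position = np.mean(evidence_sentence_positions)  # 3.0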
Example #5
def display_or_store_result(claim: str,
                            claim_id: int,
                            result_docs: list,
                            dir_path: str,
                            display_only: bool = False):
    if display_only:
        print(colored('Results for claim "{}":'.format(claim), 'yellow'))
        for doc in result_docs:
            page_id = doc[0]
            wiki_page = retrieve_wiki_page(page_id)
            print(wiki_page)
    else:
        #result_path = '{}{}.jsonl'.format(path, claim_id)
        #write_list_to_jsonl(result_path, result_docs)
        print(
            colored(
                'Storing results for claim "{}"\n{}:'.format(
                    claim, result_docs), 'yellow'))
        write_list_to_oneline_csv(dir_path, claim_id, result_docs)

def preprocess_claim(claim_row: pd.Series) -> list:
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0
    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)

    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        for line_id in relevant_line_ids:
            line_text = wiki_page.lines[line_id].text
            nn_input = transform_NN_input(claim, line_text)
            preprocessed_pairs.append(
                (claim_id, page_id, line_id, nn_input, output))

    return preprocessed_pairs
Example #7
def preprocess_claim_with_doc(claim_with_docs: tuple) -> list:
    partial_result = []
    claim_id = claim_with_docs[0]
    # remove any NOT_VERIFIABLE claims that were processed earlier
    if not claim_is_verifiable(claim_id, dataset=args.dataset):
        return []

    claim = get_claim(claim_id, dataset=args.dataset)
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)

    retrieved_doc_ids = set(claim_with_docs[1])
    evidence_doc_ids = evidence_map.keys()
    docs_for_training = retrieved_doc_ids.union(evidence_doc_ids)
    print('Preprocessing docs for claim [{}]: {}'.format(
        claim_id, docs_for_training))

    docs = [retrieve_wiki_page(doc_id) for doc_id in docs_for_training]
    for doc in docs:
        partial_result.extend(
            preprocess_doc(claim_id, claim, doc, evidence_map))

    return partial_result
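
preprocess_doc is called here but defined elsewhere in the repository. A hypothetical sketch of what it might do, reusing transform_LR_input from the first example and labelling each page line by whether it is gold evidence; attribute names such as doc.id are assumptions:

def preprocess_doc(claim_id, claim, doc, evidence_map) -> list:
    # Hypothetical sketch, not the repository's implementation: every non-empty
    # line of the page becomes a sample, positive if it is gold evidence for the
    # claim. doc.id is assumed to expose the page_id used in evidence_map.
    relevant_line_ids = evidence_map.get(doc.id, [])
    samples = []
    for line in doc.lines:
        if not line.text.strip():
            continue
        label = 1 if line.id in relevant_line_ids else 0
        samples.append((claim_id, doc.id, line.id,
                        transform_LR_input(claim, line.text), label))
    return samples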
Example #8
# where the actual retrieval takes place

import argparse
import time

from dataaccess.access_wiki_page import retrieve_wiki_page

parser = argparse.ArgumentParser()
parser.add_argument("--id",
                    help="ID of a document to retrieve for test purposes",
                    required=True)
parser.add_argument(
    "--complete",
    help="print complete doc text instead of ID + text preview",
    action="store_true")
args = parser.parse_args()

if __name__ == '__main__':
    if args.id:
        start_time = time.time()
        wiki_document = retrieve_wiki_page(args.id)
        print('Retrieved document "{}" after {:.5f} seconds'.format(
            args.id,
            time.time() - start_time))
        if args.complete:
            print(wiki_document.text)
        else:
            print(wiki_document)
    else:
        print('Please add ID to retrieve')
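
A usage sketch for this small CLI helper; the module name and document ID below are placeholders:

# Usage (module name and document ID are placeholders):
#   python -m retrieval.retrieve_document --id 1986_FIFA_World_Cup
#   python -m retrieval.retrieve_document --id 1986_FIFA_World_Cup --complete

Note that because --id is declared with required=True, argparse itself rejects a missing ID, so the final fallback branch is effectively unreachable.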