Example #1
def preprocess_claim(claim_row: tuple) -> list:
    # claim_row is an (index, Series) pair, e.g. as yielded by
    # DataFrame.iterrows(), hence the claim_row[1] access
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0
    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)

    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        for line_id in relevant_line_ids:
            line = wiki_page.lines[line_id]
            line_text = line.text
            line_contains_references = 1 if line.anchors else 0
            line_position_absolute = line.id
            line_position_relative = line.id / len(wiki_page.lines)
            num_evidence_docs_for_claim = len(evidence_map)
            num_lines_for_evidence = len(relevant_line_ids)
            num_coordination_terms_claim = get_num_coordination_terms(
                line_text,
                preprocess_claim_text(claim).split())
            num_coordination_terms_title = get_num_coordination_terms(
                line_text, preprocess_doc_title(page_id))
            nn_input = transform_NN_input(
                claim, line_text, line_contains_references,
                line_position_absolute, line_position_relative,
                num_evidence_docs_for_claim, num_lines_for_evidence,
                num_coordination_terms_claim, num_coordination_terms_title)
            preprocessed_pairs.append(
                (claim_id, page_id, line_id, nn_input, output))

    return preprocessed_pairs
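
A minimal driver for this example, assuming preprocess_claim receives the (index, Series) pairs yielded by DataFrame.iterrows() (the claim_row[1].values access above suggests exactly that); the file name and column layout are hypothetical:

import pandas as pd

claims_df = pd.read_csv('train_claims.csv')  # hypothetical file and columns
preprocessed = []
for claim_row in claims_df.iterrows():  # yields (index, Series) pairs
    preprocessed.extend(preprocess_claim(claim_row))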
Example #2
def transform_LR_input(claim_text: str, line_text: str, debug: bool = False):
    # remove punctuation that would otherwise become part of tokens
    preprocessed_claim = preprocess_claim_text(claim_text)
    # remove artifacts like -LRB- etc.
    preprocessed_line = preprocess_doc_text(line_text)

    claim_vector = transform_sentence_to_vector(preprocessed_claim, debug)
    line_vector = transform_sentence_to_vector(preprocessed_line, debug)

    return get_vector_difference(claim_vector, line_vector)
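
get_vector_difference is not shown on this page; a plausible sketch, assuming both sentence vectors are numpy arrays from the same embedding space and the helper returns their elementwise difference:

import numpy as np

def get_vector_difference(claim_vector: np.ndarray,
                          line_vector: np.ndarray) -> np.ndarray:
    # assumption: equal-length vectors, so the elementwise
    # difference is well-defined
    return claim_vector - line_vector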
Example #3
def transform_NN_input(claim_text: str, line_text: str):
    # remove punctuation that would otherwise become part of tokens
    preprocessed_claim = preprocess_claim_text(claim_text)
    # remove artifacts like -LRB- etc.
    preprocessed_line = preprocess_doc_text(line_text)

    claim_vector = transform_sentence_to_vector(preprocessed_claim, args.debug)
    line_vector = transform_sentence_to_vector(preprocessed_line, args.debug)
    combined_claim_line_vector = get_vector_difference(claim_vector, line_vector)

    return combined_claim_line_vector
Example #4
def preprocess_claim(claim_row: tuple) -> list:
    # claim_row is an (index, Series) pair, e.g. as yielded by
    # DataFrame.iterrows(), hence the claim_row[1] access
    claim_id, verifiable, label, claim, evidence = claim_row[1].values
    if verifiable != 'VERIFIABLE':
        return []
    print('Preprocessing docs for claim [{}]'.format(claim_id))

    # output will be the same for all evidence items belonging to this claim
    output = 1 if label == 'SUPPORTS' else 0
    preprocessed_pairs = []
    evidence_map = get_evidence_page_line_map(claim_id, args.dataset)

    evidence_sentences = []
    num_evidence_docs_for_claim = len(evidence_map)
    num_references = 0
    num_evidence_items = 0
    num_evidence_words = 0
    num_coordination_terms_evidence_claim = 0
    num_coordination_terms_titles_claim = 0
    evidence_sentence_positions = []

    # concat evidence (can be from multiple wiki_pages and/or lines)
    for page_id, relevant_line_ids in evidence_map.items():
        wiki_page = retrieve_wiki_page(page_id)
        evidence_sentences.extend(
            [wiki_page.lines[line_id].text for line_id in relevant_line_ids])

        # count metrics and extract features
        for line_id in relevant_line_ids:
            line = wiki_page.lines[line_id]
            line_text = line.text
            num_evidence_words += len(line_text.split())
            num_references += len(line.anchors)
            num_evidence_items += 1
            evidence_sentence_positions.append(line.id)
            num_coordination_terms_evidence_claim += get_num_coordination_terms(
                line_text,
                preprocess_claim_text(claim).split())
            num_coordination_terms_titles_claim += get_num_coordination_terms(
                line_text, preprocess_doc_title(page_id))

    combined_evidence = ' '.join(evidence_sentences)
    avg_sentence_position = np.mean(evidence_sentence_positions)

    nn_input = transform_NN_input(claim, combined_evidence,
                                  num_evidence_docs_for_claim, num_references,
                                  num_evidence_items,
                                  num_coordination_terms_evidence_claim,
                                  num_coordination_terms_titles_claim,
                                  avg_sentence_position, num_evidence_words)
    preprocessed_pairs.append((claim_id, nn_input, output))

    return preprocessed_pairs
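
Because each claim is handled independently, this variant can be run in parallel over a claims DataFrame; a sketch assuming the same iterrows()-style input as in Example #1 (the Pool usage and file name are illustrative):

from multiprocessing import Pool

import pandas as pd

claims_df = pd.read_csv('train_claims.csv')  # hypothetical file and columns
with Pool() as pool:
    # one (claim_id, input, output) triple per verifiable claim
    results = pool.map(preprocess_claim, claims_df.iterrows())
training_rows = [pair for pairs in results for pair in pairs]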
Example #5
def retrieve_documents_for_claim(claim: str, claim_id: int):
    print(
        colored('Retrieving documents for claim [{}]: "{}"'.format(
            claim_id, claim),
                attrs=['bold']))
    preprocessed_claim = preprocess_claim_text(claim)
    claim_terms = process_normalise_tokenise_filter(preprocessed_claim)

    # only consider docs that appear in the index for at least one claim term
    doc_candidates = get_candidate_documents_for_claim(claim_terms,
                                                       mode='raw_count')

    scoring_function = get_query_likelihood_score_no_smoothing
    if args.smoothing == 'laplace':
        scoring_function = get_query_likelihood_score_laplace_smoothing
    elif args.smoothing == 'laplace_lindstone':
        scoring_function = get_query_likelihood_score_laplace_lindstone_smoothing
    elif args.smoothing == 'jelinek_mercer':
        scoring_function = get_query_likelihood_score_jelinek_mercer_smoothing
    elif args.smoothing == 'dirichlet':
        scoring_function = get_query_likelihood_score_dirichlet_smoothing

    # query likelihood scores for each claim-doc combination
    docs_with_query_likelihood_scores = [
        scoring_function(claim_terms, doc_with_terms)
        for doc_with_terms in doc_candidates.items()
    ]

    # if all candidate documents score zero, the top-k selection is
    # effectively random, so it can be preferable to return no results at all
    if args.remove_zero_likelihood:
        docs_with_query_likelihood_scores = list(
            filter(lambda x: x[1] != 0, docs_with_query_likelihood_scores))

    # sort by query likelihood and limit to top results
    docs_with_query_likelihood_scores.sort(key=itemgetter(1), reverse=True)
    result_docs = docs_with_query_likelihood_scores[:DOCS_TO_RETRIEVE_PER_CLAIM]

    result_directory = '{}{}/'.format(RETRIEVED_PROBABILISTIC_DIRECTORY,
                                      args.smoothing or 'no_smoothing')
    display_or_store_result(claim, claim_id, result_docs, result_directory,
                            args.print)
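
The smoothing variants themselves are not shown on this page; as an illustration, a Laplace-smoothed query-likelihood scorer compatible with the calls above might look as follows. The (doc_id, term_counts) shape is inferred from doc_candidates.items() and the itemgetter(1) sort, and the vocabulary-size constant is an assumption:

VOCABULARY_SIZE = 100_000  # assumed corpus-wide constant

def get_query_likelihood_score_laplace_smoothing(claim_terms, doc_with_terms):
    doc_id, term_counts = doc_with_terms  # term -> raw count within the doc
    doc_length = sum(term_counts.values())
    # P(claim | doc) as a product of add-one-smoothed term probabilities
    score = 1.0
    for term in claim_terms:
        score *= (term_counts.get(term, 0) + 1) / (doc_length + VOCABULARY_SIZE)
    return doc_id, score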
Example #6
def retrieve_documents_for_claim(claim: str, claim_id: int):
    print(colored('Retrieving documents for claim [{}]: "{}"'.format(claim_id, claim), attrs=['bold']))
    preprocessed_claim = preprocess_claim_text(claim)
    claim_terms = process_normalise_tokenise_filter(preprocessed_claim)
    claim_vector = get_tfidf_vector_for_claim(claim_terms)
    claim_norm = get_tfidf_vector_norm(claim_terms, args.variant)

    # only consider docs that appear in the index for at least one claim term
    doc_candidates = get_candidate_documents_for_claim(claim_terms)

    # similarity scores for each claim-doc combination; scoring_function is
    # assumed to be selected elsewhere, e.g. depending on args.variant
    docs_with_similarity_scores = [
        scoring_function(claim_terms, claim_vector, claim_norm, doc_with_terms)
        for doc_with_terms in doc_candidates.items()]

    # sort by similarity and limit to top results
    docs_with_similarity_scores.sort(key=itemgetter(1), reverse=True)
    result_docs = docs_with_similarity_scores[:DOCS_TO_RETRIEVE_PER_CLAIM]

    display_or_store_result(claim, claim_id, result_docs, RETRIEVED_TFIDF_DIRECTORY, args.print)
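
The tf-idf scorer is likewise external to this page; a sketch of a cosine-similarity scoring function matching the call signature above, assuming sparse term-to-weight dicts on both sides (the function name is illustrative):

import math

def get_cosine_similarity_score(claim_terms, claim_vector, claim_norm,
                                doc_with_terms):
    # claim_terms is unused here; kept for signature compatibility
    doc_id, doc_vector = doc_with_terms  # term -> tf-idf weight
    # dot product over the (sparse) claim terms only
    dot = sum(weight * doc_vector.get(term, 0.0)
              for term, weight in claim_vector.items())
    doc_norm = math.sqrt(sum(w * w for w in doc_vector.values()))
    if claim_norm == 0 or doc_norm == 0:
        return doc_id, 0.0
    return doc_id, dot / (claim_norm * doc_norm)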
Example #7
def transform_NN_input(
        claim_text: str, line_text: str, line_contains_references: int,
        line_position_absolute: int, line_position_relative: float,
        num_evidence_docs_for_claim: int, num_lines_for_evidence: int,
        num_coordination_terms_claim: int, num_coordination_terms_title: int):
    # remove punctuation that would otherwise become part of tokens
    preprocessed_claim = preprocess_claim_text(claim_text)
    # remove artifacts like -LRB- etc.
    preprocessed_line = preprocess_doc_text(line_text)

    claim_vector = transform_sentence_to_vector(preprocessed_claim, args.debug)
    line_vector = transform_sentence_to_vector(preprocessed_line, args.debug)
    combined_claim_line_vector = get_vector_difference(claim_vector,
                                                       line_vector)
    additional_features = np.array(
        (line_contains_references, line_position_absolute,
         line_position_relative, num_evidence_docs_for_claim,
         num_lines_for_evidence, num_coordination_terms_claim,
         num_coordination_terms_title))

    return np.concatenate((combined_claim_line_vector, additional_features))
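
The returned vector is a fixed-length sentence-difference vector plus seven scalar features; a minimal sketch of a feed-forward classifier that could consume it (the 300-dimensional embedding and the layer sizes are assumptions, not the author's actual model):

from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

EMBEDDING_DIM = 300            # assumed sentence-vector size
INPUT_DIM = EMBEDDING_DIM + 7  # plus the seven scalar features above

model = Sequential([
    Dense(64, activation='relu', input_shape=(INPUT_DIM,)),
    Dense(1, activation='sigmoid'),  # SUPPORTS (1) vs. not (0)
])
model.compile(optimizer='adam', loss='binary_crossentropy')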