def create_and_export_queries():
    """
    Generate a query for each of the first training claims and export them.

    Truth tuples are extracted from the claimant followed by the claim text
    and handed to the query generator together with the claim. The claim id,
    the original claim text and the generated query are written out through
    ``save_results``; no search is executed here.
    """
    max_claims = 100
    query_generator = get_query_generator()
    tuple_extractor = get_truth_tuple_extractor()

    # One (id, claim text, query) row per processed claim
    rows = []
    for position, claim in enumerate(train_data_generator()):
        if position == max_claims:
            break
        claim_id = claim.id
        claim_text = claim.claim
        truth_tuples = tuple_extractor.extract(claim.claimant + " " + claim.claim)
        generated = query_generator.get_query(claim, truth_tuples=truth_tuples)
        rows.append((claim_id, claim_text, generated))

    # Transpose the rows into the column layout expected by the export
    columns = {
        "id": [row[0] for row in rows],
        "original": [row[1] for row in rows],
        "queries": [row[2] for row in rows],
    }
    save_results(pd.DataFrame(data=columns), "basic_query_generator", "queries")
# Example #2
# 0
def compare_query_results_with_train_data(
        train_data_path="/Users/frankjia/Desktop/LeadersPrize/train/train.json",
        num_examples=1000):
    """
    Compare the resulting URLs from searched articles with the ground truth articles given in training data

    For every claim a query is generated and executed, and the URLs returned by
    the search client are intersected with the URLs of the claim's related
    articles. Two statistics are printed at the end: how many claims had at
    least one shared URL, and how many shared URLs were found relative to the
    total number of training articles.

    :param train_data_path: Path handed to ``train_data_generator``. Defaults
        to the location that used to be hard-coded in this function.
    :param num_examples: Limit on the number of examples so this runs faster.
    """
    bqg = get_query_generator()
    client = get_search_client()

    ids = []
    training_urls = []
    shared_urls_for_claim = []

    # TODO: Finish this off
    for idx, claim in train_data_generator(train_data_path):
        if idx == num_examples:
            break
        print(idx)
        # Execute query
        q = bqg.get_query(claim)
        res = client.search(q)
        # Get URLs from the result
        searched_urls = [r.url for r in res.results]
        # Get URLs from training data
        training_article_urls = [item.url for item in claim.related_articles]
        # Get shared items (as a set, so each shared URL is counted once)
        shared_urls = list(set(training_article_urls).intersection(searched_urls))

        ids.append(claim.id)
        training_urls.append(training_article_urls)
        shared_urls_for_claim.append(shared_urls)

    # Get stats. Both denominators can be zero (no claims yielded, or claims
    # without related articles); report 0.0 instead of raising
    # ZeroDivisionError after all the searching has already been done.
    num_claims_with_shared_articles = sum(1 for item in shared_urls_for_claim if item)
    frac_shared = float(num_claims_with_shared_articles) / len(ids) if ids else 0.0
    print(f"{num_claims_with_shared_articles} claims searched with shared articles out of {len(ids)}: {frac_shared}")

    num_shared_articles = sum(len(item) for item in shared_urls_for_claim)
    total_train_articles = sum(len(item) for item in training_urls)
    frac_articles = float(num_shared_articles) / total_train_articles if total_train_articles else 0.0
    print(f"{num_shared_articles} shared articles found in {total_train_articles} total: {frac_articles}")
def main():
    """
    Record the training articles for which HTML preprocessing yields no text.

    Walks the claims produced by ``train_data_generator`` whose index falls in
    ``PROCESS_RANGE``, loads every related article and runs it through the HTML
    preprocessor. The filepath of each article whose processed ``text`` is
    empty is printed and appended, one per line, to
    ``../output/html_processor/html_unprocessable_files.txt``.
    """
    import os  # local import: only needed to make sure the output directory exists

    html_preprocessor = get_html_preprocessor()
    error_files = []
    for idx, claim in train_data_generator(TRAIN_DATA_PATH + "train.json"):
        if idx < PROCESS_RANGE.start:
            continue
        elif idx >= PROCESS_RANGE.stop:
            break
        # Check every training article attached to the claim
        for related_article in claim.related_articles:
            article_html = get_train_article(TRAIN_DATA_PATH, related_article.filepath)
            if not html_preprocessor.process(article_html).text:
                print(related_article.filepath)
                error_files.append(related_article.filepath)

    output_path = '../output/html_processor/html_unprocessable_files.txt'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'a') as f:
        # The file is opened in append mode, so every entry must end with a
        # newline; otherwise the last path of one run and the first path of
        # the next run end up fused on the same line.
        f.writelines(filepath + '\n' for filepath in error_files)
def execute_queries_export_urls():
    """
    Run the generated query for each claim and export the URLs that come back.

    Every search result is also written to disk as an HTML file, inside a
    directory named after the timestamp of this run. The exported table holds,
    per claim, the id, the original claim text and a single string listing the
    score and URL of each result.
    """
    max_claims = 30  # Keep the run short
    query_generator = get_query_generator()
    search_client = get_search_client()
    run_timestamp = get_timestamp()

    claim_ids = []
    claim_texts = []
    url_summaries = []

    for position, claim in enumerate(train_data_generator()):
        if position == max_claims:
            break

        claim_ids.append(claim.id)
        claim_texts.append(claim.claim)
        response = search_client.search(query_generator.get_query(claim))

        summary_parts = []
        for rank, hit in enumerate(response.results):
            # Score and URL for the exported summary
            summary_parts.append(f"{hit.score}: {hit.url} | ")
            # Persist the returned content next to the export
            html_path = f"output/basic_query_generator/query_html_{run_timestamp}/{claim.id}_{rank}.html"
            os.makedirs(os.path.dirname(html_path), exist_ok=True)
            with open(html_path, 'a') as html_file:
                html_file.write(hit.content)
        url_summaries.append("".join(summary_parts))

    table = pd.DataFrame(data={
        "id": claim_ids,
        "original": claim_texts,
        "results": url_summaries
    })
    save_results(table,
                 "basic_query_generator",
                 "query_to_url",
                 time_str=run_timestamp)
def main():
    """
    Rank the sentences of retrieved articles by relevance to each claim.

    For every claim a query is generated and searched, the returned articles
    are run through the HTML and text preprocessors, and the resulting
    sentences are scored against the claim with the scorer selected by
    ``RELEVANCE_TYPE``. The sentences, sorted by descending relevance and
    truncated, are exported together with the claim, its label and the query.
    When the search reports an error, the error text is exported in place of
    the sentences.
    """
    # Services
    query_generator = get_query_generator()
    search_client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    # Pick the scorer matching the configured relevance type
    if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER:
        relevance_scorer = get_infersent_relevance_scorer()
    else:
        relevance_scorer = get_word2vec_relevance_scorer()

    # Output columns
    ids = []
    original_claims = []
    processed_claims = []
    queries = []
    processed_sentences = []
    true_labels = []

    for position, claim in enumerate(train_data_generator()):
        if position == NUM_EXAMPLES:
            break
        print(position)

        query = query_generator.get_query(claim)
        search_response = search_client.search(query)
        # Preprocessed claim
        claim_sentences = text_preprocessor.process(claim.claim).bert_sentences
        processed_claim = ' '.join(claim_sentences)

        ids.append(claim.id)
        original_claims.append(claim.claim)
        true_labels.append(claim.label)
        queries.append(query)
        processed_claims.append(processed_claim)

        # A failed search still gets a row, carrying the error message
        if search_response.error:
            processed_sentences.append(f"Error: {search_response.error}")
            continue

        # Gather the sentences of every returned article into one list
        sentences = []
        for article in search_response.results:
            article_text = html_preprocessor.process(article.content).text
            sentences.extend(text_preprocessor.process(article_text).bert_sentences)

        # Score the sentences with the function matching the scorer type
        compute_relevances = (get_infersent_relevances
                              if RELEVANCE_TYPE == INFERSENT_RELEVANCE_SCORER
                              else get_word2vec_relevances)
        relevances = compute_relevances(claim.claim, sentences,
                                        relevance_scorer)

        # Pair each score with its sentence, most relevant first
        ranked = sorted(zip(relevances, sentences),
                        key=lambda pair: pair[0],
                        reverse=True)

        # Build the export string, stopping once it has grown past the
        # size limit (basic truncation to limit file size)
        pieces = []
        accumulated_length = 0
        for relevance, sentence in ranked:
            if accumulated_length > 10000:
                break
            piece = f"|SEP| {relevance}: {sentence} \n"
            pieces.append(piece)
            accumulated_length += len(piece)

        processed_sentences.append("".join(pieces))

    # Export the result, with relevance scores and the processed text
    columns = {
        "id": ids,
        "label": true_labels,
        "original": original_claims,
        "query": queries,
        "processed_claim": processed_claims,
        "processed_sentences": processed_sentences
    }
    save_results(pd.DataFrame(data=columns), "sentence_relevance_scorer",
                 f"claim_to_{RELEVANCE_TYPE}_relevance")
# Example #6
# 0
def test_document_relevance_scorer(
        train_data_path="/Users/frankjia/Desktop/LeadersPrize/train/train.json",
        num_examples=1000):
    """
    This runs the pipeline up to retrieving articles and ranking them by relevance.
    Determines, if there are shared articles between what is retrieved and what is given as related_articles,
    the ranking of the related article.

    Prints the number of claims searched, the number of claims that had at
    least one shared article, and the mean (over those claims) of the average
    position of the shared articles in the relevance-sorted results.

    :param train_data_path: Path handed to ``train_data_generator``. Defaults
        to the location that used to be hard-coded in this function.
    :param num_examples: Limit on the number of examples so this runs faster.
    """
    bqg = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    text_preprocessor = get_text_preprocessor()
    article_relevance_scorer = get_lsa_relevance_scorer()

    total_searched = 0
    average_rankings = []

    for idx, claim in train_data_generator(train_data_path):
        if idx == num_examples:
            break
        print(idx)
        # Execute query
        q = bqg.get_query(claim)
        searched_articles = client.search(q).results

        # Process articles from raw HTML to parsed text
        pipeline_articles: List[PipelineArticle] = []
        for raw_article in searched_articles:
            # Skip missing results and results without content
            if raw_article and raw_article.content:
                pipeline_article = PipelineArticle(raw_article)
                # Extract data from HTML
                html_process_result = html_preprocessor.process(
                    raw_article.content)
                pipeline_article.html_attributes = html_process_result.html_atts
                pipeline_article.raw_body_text = html_process_result.text
                pipeline_articles.append(pipeline_article)

        # Get Article Relevance. The first preprocessed sentence is used as
        # the claim text; the raw claim + claimant is the fallback when
        # preprocessing returns no sentences.
        preprocessed_claim_sentences = text_preprocessor.process(
            claim.claim + " " + claim.claimant).sentences
        preprocessed_claim = claim.claim + " " + claim.claimant
        if preprocessed_claim_sentences:
            preprocessed_claim = preprocessed_claim_sentences[0]
        pipeline_article_texts: List[str] = [
            p.raw_body_text for p in pipeline_articles
        ]
        article_relevances = article_relevance_scorer.analyze(
            preprocessed_claim, pipeline_article_texts)
        for article_relevance, pipeline_article in zip(article_relevances,
                                                       pipeline_articles):
            # Sometimes we get nan from numpy operations
            pipeline_article.relevance = article_relevance if math.isfinite(
                article_relevance) else 0

        # Based on article relevance, only consider the top relevances
        pipeline_articles.sort(key=lambda article: article.relevance,
                               reverse=True)

        sorted_urls = [article.url for article in pipeline_articles]
        claim_urls = [article.url for article in claim.related_articles]
        common_urls = list(set(sorted_urls).intersection(claim_urls))

        total_searched += 1
        if common_urls:
            # Determine index of shared url in the sorted urls
            index_sum = sum(sorted_urls.index(url) for url in common_urls)
            average_rankings.append(float(index_sum) / len(common_urls))

    print("RESULTS")
    print(total_searched)
    print(len(average_rankings))
    if average_rankings:
        print(float(sum(average_rankings)) / len(average_rankings))
    else:
        # No claim shared a URL with its related articles: the mean is
        # undefined, and dividing by zero here would throw away the run.
        print("No shared articles found; average ranking is undefined")
# Example #7
# 0
def main(train_data_path="/Users/frankjia/Desktop/LeadersPrize/train/train.json"):
    """
    For each claim, run query generator and rank the results by relevance using the LSA document relevance scorer

    Every returned article is also written to ``output/<claim id>/`` for
    future checking. The export holds, per claim, the id, the claim text and a
    string listing the relevance and URL of each article, most relevant first.

    :param train_data_path: Path handed to ``train_data_generator``. Defaults
        to the location that used to be hard-coded in this function.
    """
    import math  # local import: needed for the NaN-safe sort key below

    # Services
    query_generator = get_query_generator()
    client = get_search_client()
    html_preprocessor = get_html_preprocessor()
    relevance_scorer = get_lsa_relevance_scorer()

    # Outputs
    ids = []
    original_claims = []
    ranked_articles_for_claims = []

    def _sortable_relevance(url_and_relevance):
        # NaN compares false with everything, which leaves the sort order
        # undefined; rank non-finite scores as 0 instead. The raw value is
        # still what gets exported.
        relevance = url_and_relevance[1]
        return relevance if math.isfinite(relevance) else 0

    for idx, claim in train_data_generator(train_data_path):
        if idx == NUM_EXAMPLES:
            break
        print(idx)

        query = query_generator.get_query(claim)
        search_res = client.search(query)

        ids.append(claim.id)
        original_claims.append(claim.claim)

        # Create master list of article texts
        article_texts = []
        article_urls = []
        for article in search_res.results:
            # Write the article for future checking
            url_filename = re.sub(
                r"([/:.])+", "_",
                article.url)  # Create a save-friendly filename
            filepath = f"output/{claim.id}/{url_filename}.html"
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            # The path is the same on every run, so overwrite rather than
            # append: appending would stack duplicate copies of the page in
            # the same file each time the script is re-run.
            with open(filepath, 'w') as f:
                f.write(article.content)

            # Process the articles
            html_processed_text = html_preprocessor.process(
                article.content).text
            article_urls.append(article.url)
            article_texts.append(html_processed_text)

        # Both claim and article_texts are unpreprocessed - LSA class currently does the preprocessing
        relevances = relevance_scorer.analyze(claim.claim, article_texts)
        print(relevances)
        articles_with_relevances = sorted(zip(article_urls, relevances),
                                          key=_sortable_relevance,
                                          reverse=True)

        # Create an export string with the URL and the relevance:
        article_rank_result = "".join(
            f"( {rel}: {url} )" for url, rel in articles_with_relevances)
        ranked_articles_for_claims.append(article_rank_result)

    # Export the result, with relevance scores and the processed text
    export_df = pd.DataFrame(
        data={
            "id": ids,
            "claim": original_claims,
            "ranked_articles": ranked_articles_for_claims
        })
    save_results(export_df, "document_relevance_scorer",
                 "claim_to_ranked_articles")
def test_pipeline(process_range: range, config: Dict, train_data_path: str):
    """
    Run the full pipeline over a slice of the training data and export the results.

    Claims whose index lies inside ``process_range`` are read from
    ``trial_combined_data_long.json`` under ``train_data_path``. When the
    config disables article retrieval, each claim is given its training
    articles as mock search results instead. The predictions are evaluated
    with ``eval_predictions`` and ``eval_datacup`` and saved via
    ``save_results``.

    :param process_range: Range of claim indices to process.
    :param config: Pipeline configuration, also passed to ``LeadersPrizePipeline``.
    :param train_data_path: Prefix of the data file and of the article filepaths.
    """
    selected_claims: List[LeadersPrizeClaim] = []
    selected_claim_dicts: List[dict] = []
    use_training_articles = not config.get(PipelineConfigKeys.RETRIEVE_ARTICLES, True)
    if use_training_articles:
        print(
            "Reading articles from training data. Will not call search client")

    data_file = train_data_path + "trial_combined_data_long.json"
    for idx, claim_dict, claim in train_data_generator(data_file):
        if idx < process_range.start:
            continue
        if idx >= process_range.stop:
            break
        # Attach the training articles when the search client is bypassed
        if use_training_articles:
            claim.mock_search_results = [
                SearchQueryResult(content=get_train_article(
                    train_data_path, related.filepath),
                                  url=related.url)
                for related in claim.related_articles
            ]
        selected_claims.append(claim)
        selected_claim_dicts.append(claim_dict)

    started_at = datetime.now()

    # Build the pipeline and run the prediction
    pipeline = LeadersPrizePipeline(config)
    results = pipeline.predict(selected_claims)

    print(f"{len(results)} processed in {datetime.now() - started_at}")

    # Collect the export columns
    claims = []
    labels = []
    reasoner_inputs = []
    pred_labels = []
    supporting_article_urls = []
    explanations = []
    for result in results:
        claims.append(result.preprocessed_claim)
        labels.append(result.original_claim.label)

        # Only the first 10 sentences go into the exported reasoner input
        sentence_parts = []
        for position, sentence in enumerate(result.sentences_for_transformer):
            if position == 10:
                break
            sentence_parts.append(" " + sentence.preprocessed_text)
        reasoner_inputs.append("".join(sentence_parts))

        pred_labels.append(result.submission_label)
        url_list = ", ".join(result.submission_article_urls.values())
        supporting_article_urls.append(url_list)
        explanations.append(result.submission_explanation)

    columns = {
        "claim": claims,
        "label": labels,
        "reasoner_input": reasoner_inputs,
        "predicted": pred_labels,
        "article_urls": supporting_article_urls,
        "explanation": explanations
    }
    results_df = pd.DataFrame(data=columns)

    # Accuracies first, then the datacup score
    eval_predictions(labels, pred_labels)
    eval_datacup(selected_claim_dicts, results)

    save_results(results_df, "pipeline_test", "full_pipeline")