Example #1
0
def test_mining_database(monkeypatch, capsys, fake_sqlalchemy_engine, mining_schema_df):
    """Drive the mining widget first without, then with, an article saver.

    Without a saver the widget must report that there is nothing to mine.
    With a saver holding two articles it must display three objects: the
    schema table, an HTML warning and the table of extractions.
    """
    server_url = "http://test"
    mining_schema_df = mining_schema_df.drop_duplicates(ignore_index=True)

    # Register the mocked POST endpoints of the mining server.
    mocked_endpoints = (
        ("database", request_callback, "text/csv"),
        ("help", request_callback_help, "application/json"),
    )
    for route, handler, mime_type in mocked_endpoints:
        responses.add_callback(
            responses.POST,
            f"{server_url}/{route}",
            callback=handler,
            content_type=mime_type,
        )

    schema = MiningSchema()
    schema.add_from_df(mining_schema_df)

    # --- Phase 1: widget created without an article saver -----------------
    mining_widget = MiningWidget(mining_server_url=server_url, mining_schema=schema)
    assert pd.DataFrame().equals(mining_widget.get_extracted_table())

    bot = MiningWidgetBot(mining_widget, capsys, monkeypatch)
    bot.set_value("input_text", "HELLO")
    bot.click("mine_articles")

    # The expected request counts below depend on the exact sequence of
    # widget constructions and clicks, so that sequence must not change.
    assert len(responses.calls) == 1
    assert "No article saver was provided. Nothing to mine." in bot.stdout_cached

    # --- Phase 2: widget created with a saver holding two articles --------
    saver = ArticleSaver(fake_sqlalchemy_engine)
    for article_id in range(2):
        saver.add_article(article_id=article_id)

    mining_widget = MiningWidget(
        mining_server_url=server_url,
        mining_schema=schema,
        article_saver=saver,
    )

    bot = MiningWidgetBot(mining_widget, capsys, monkeypatch)
    bot.set_value("input_text", "HELLO")
    bot.click("mine_articles")

    assert len(responses.calls) == 3
    assert "Collecting saved items..." in bot.stdout_cached
    assert isinstance(mining_widget.get_extracted_table(), pd.DataFrame)

    shown = bot.display_cached
    assert len(shown) == 3  # 1 schema + 1 warning + 1 table_extractions

    shown_schema = shown[0]
    shown_extractions = shown[2]
    assert isinstance(shown_schema, pd.DataFrame)
    assert isinstance(shown_extractions, pd.DataFrame)

    assert shown_schema.equals(mining_schema_df)
    assert isinstance(shown[1], HTML)
    # `table_extractions` is not defined in this function; it is expected to
    # come from the enclosing module.
    assert shown_extractions.equals(table_extractions)
Example #2
0
    def test_summaries(self, fake_sqlalchemy_engine, tmpdir):
        """Check the chosen-texts table and summary table of an ArticleSaver.

        The saver is filled with the first paragraph of every article found
        in the fake database, plus the whole of the last article visited.
        """
        saver = ArticleSaver(connection=fake_sqlalchemy_engine)

        # Discover which article ids and paragraph positions exist in the
        # fake database and use them to populate the saver, imitating what
        # the widget would normally produce.
        articles_df = pd.read_sql(
            "SELECT article_id FROM articles", fake_sqlalchemy_engine
        )
        article_ids = articles_df["article_id"].to_list()

        paragraphs_by_article = {}
        for article_id in set(article_ids):
            paragraph_query = f"""
            SELECT paragraph_pos_in_article
            FROM sentences
            WHERE article_id = {article_id}
            """
            paragraphs_df = pd.read_sql(paragraph_query, fake_sqlalchemy_engine)
            all_positions = paragraphs_df["paragraph_pos_in_article"].to_list()
            unique_positions = list(set(all_positions))
            paragraphs_by_article[article_id] = unique_positions

            # Every article contributes only its first listed paragraph.
            saver.add_paragraph(article_id, unique_positions[0])

        # `article_id` and `all_positions` still refer to the last article
        # visited by the loop; that article is additionally saved in full.
        saver.add_article(article_id)
        n_paragraphs_full_article = len(set(all_positions))

        # Retrieval of the chosen texts.
        chosen_texts = saver.get_chosen_texts()
        assert isinstance(chosen_texts, pd.DataFrame)

        expected_columns = [
            "article_id",
            "section_name",
            "paragraph_pos_in_article",
            "text",
        ]
        assert chosen_texts.columns.to_list() == expected_columns

        # NOTE(review): the "- 1" presumably compensates for the paragraph of
        # the last article that was added both individually and as part of
        # the full article -- confirm against ArticleSaver.get_chosen_texts.
        expected_n_rows = len(paragraphs_by_article) + n_paragraphs_full_article - 1
        assert len(chosen_texts) == expected_n_rows

        # A second call must yield a table of the same length.
        chosen_texts_again = saver.get_chosen_texts()
        assert len(chosen_texts) == len(chosen_texts_again)

        # Summary table.
        summary = saver.summary_table()
        assert isinstance(summary, pd.DataFrame)
Example #3
0
def test_inclusion_text(fake_sqlalchemy_engine, monkeypatch, capsys, tmpdir):
    """Check that the inclusion-text field filters the displayed results.

    An empty inclusion text must leave results on display, whereas a text
    that matches no sentence must leave nothing on display.
    """
    search_url = activate_responses(fake_sqlalchemy_engine)

    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    saver = ArticleSaver(fake_sqlalchemy_engine)
    search_widget = SearchWidget(
        bbs_search_url=search_url,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=saver,
        results_per_page=10,
    )
    bot = SearchWidgetBot(search_widget, capsys, monkeypatch)

    # No filter: something is displayed.
    bot.set_value("inclusion_text", "")
    bot.click("investigate_button")
    shown_without_filter = bot.display_cached
    assert shown_without_filter

    # Filter that cannot match: nothing is displayed.
    bot.set_value("inclusion_text", "THIS TEXT DOES NOT EXIST IN ANY SENTENCE")
    bot.click("investigate_button")
    shown_with_filter = bot.display_cached
    assert not shown_with_filter
Example #4
0
def test_report_article_saver(fake_sqlalchemy_engine, monkeypatch, capsys, tmpdir):
    """Verify that a report is produced from the article saver state.

    After a search that saves whole articles, clicking the articles button
    inside a temporary working directory must print a progress message and
    leave exactly one ``.html`` file in that directory.
    """
    report_dir = Path(tmpdir)
    search_url = activate_responses(fake_sqlalchemy_engine)

    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    saver = ArticleSaver(fake_sqlalchemy_engine)
    search_widget = SearchWidget(
        bbs_search_url=search_url,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=saver,
    )
    bot = SearchWidgetBot(search_widget, capsys, monkeypatch)

    bot.set_value("top_results", 2)
    bot.set_value("default_value_article_saver", _Save.ARTICLE)
    bot.click("investigate_button")

    # Read the cached output only to discard it (clears standard output).
    _ = bot.stdout_cached

    with cd_temp(report_dir):
        bot.click("articles_button")

    assert "Creating the saved results report... " in bot.stdout_cached

    html_reports = [path for path in report_dir.iterdir() if path.suffix == ".html"]
    assert len(html_reports) == 1
Example #5
0
def test_article_saver_global(
    fake_sqlalchemy_engine, monkeypatch, capsys, saving_mode
):
    """Check that the default saving mode is reflected in state and checkboxes.

    After a search with ``saving_mode`` as the default, the article saver
    state and the value of every displayed checkbox must agree with it.
    """
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    k = 10
    search_url = activate_responses(fake_sqlalchemy_engine)

    saver = ArticleSaver(fake_sqlalchemy_engine)
    widget = SearchWidget(
        bbs_search_url=search_url,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=saver,
        results_per_page=k,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    bot.set_value("top_results", k)
    bot.set_value("default_value_article_saver", saving_mode)
    bot.click("investigate_button")

    shown = bot.display_cached
    assert len(shown) == k * bot.n_displays_per_result

    # --- Saver state -------------------------------------------------------
    # State entries are indexed as (article id, paragraph id); a paragraph
    # id of -1 is what the ARTICLE mode is expected to store.
    if saving_mode == _Save.NOTHING:
        assert not widget.article_saver.state

    elif saving_mode == _Save.PARAGRAPH:
        saved = widget.article_saver.state
        assert 0 < len(saved) <= k
        assert not any(entry[1] == -1 for entry in saved)

    elif saving_mode == _Save.ARTICLE:
        saved = widget.article_saver.state
        assert 0 < len(saved) <= k
        assert all(entry[1] == -1 for entry in saved)

    else:
        raise ValueError(f"Unrecognized saving mode: {saving_mode}")

    # --- Displayed objects -------------------------------------------------
    # Maps each checkbox description to the mode under which it is ticked.
    mode_by_description = {
        "Extract the paragraph": _Save.PARAGRAPH,
        "Extract the entire article": _Save.ARTICLE,
    }

    for position, shown_obj in enumerate(shown):
        if isinstance(shown_obj, ipywidgets.Checkbox):
            if shown_obj.description not in mode_by_description:
                raise ValueError(f"Unrecognized checkbox, {position}")

            ticking_mode = mode_by_description[shown_obj.description]
            assert shown_obj.value == (saving_mode == ticking_mode)
            continue

        # Anything that is not a checkbox has to be an HTML object.
        if not isinstance(shown_obj, HTML):
            raise TypeError(f"Unrecognized type: {type(shown_obj)}")
Example #6
0
def test_article_saver_gets_updated(
    fake_sqlalchemy_engine, monkeypatch, capsys, saving_mode
):
    """Ticking a paragraph or article checkbox must update the ArticleSaver.

    The search itself runs with nothing saved by default; afterwards the
    checkbox matching ``saving_mode`` of one chosen result is ticked by hand
    and the saver state is inspected.
    """
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    k = 10
    result_to_take = 3

    search_url = activate_responses(fake_sqlalchemy_engine)

    saver = ArticleSaver(fake_sqlalchemy_engine)
    widget = SearchWidget(
        bbs_search_url=search_url,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=saver,
        results_per_page=k,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    bot.set_value("top_results", k)
    bot.set_value("default_value_article_saver", _Save.NOTHING)
    bot.click("investigate_button")

    shown = bot.display_cached

    assert len(shown) == k * bot.n_displays_per_result
    assert bot.get_value("default_value_article_saver") == _Save.NOTHING

    # Pick the display objects that belong to the chosen result; only the
    # two checkboxes in the middle are needed.
    per_result = bot.n_displays_per_result
    first = result_to_take * per_result
    _, chb_paragraph, chb_article, _ = shown[first:first + per_result]

    # Tick the checkbox that corresponds to the requested mode.
    if saving_mode == _Save.NOTHING:
        assert not widget.article_saver.state

    elif saving_mode == _Save.PARAGRAPH:
        chb_paragraph.value = True

        # NOTE(review): original remark here says "actual len is 0".
        assert len(widget.article_saver.state) == 1
        assert next(iter(widget.article_saver.state))[1] != -1

    elif saving_mode == _Save.ARTICLE:
        chb_article.value = True

        assert len(widget.article_saver.state) == 1
        # NOTE(review): original remark here says "actual value 4".
        assert next(iter(widget.article_saver.state))[1] == -1

    else:
        raise ValueError(f"Unrecognized saving mode: {saving_mode}")
Example #7
0
def test_errors(fake_sqlalchemy_engine, monkeypatch, capsys):
    """Constructing the widget with an unusable search URL must raise."""
    unusable_url = "fake_address"

    # NOTE(review): catching bare ``Exception`` is very broad; narrow it once
    # the concrete exception type raised by SearchWidget is confirmed.
    with pytest.raises(Exception):
        saver = ArticleSaver(fake_sqlalchemy_engine)
        SearchWidget(
            bbs_search_url=unusable_url,
            bbs_mysql_engine=fake_sqlalchemy_engine,
            article_saver=saver,
            results_per_page=3,
        )
Example #8
0
def test_paging(
    fake_sqlalchemy_engine, monkeypatch, capsys, query_text, k, results_per_page
):
    """Check that every page displays the expected number of results.

    The first page must show ``min(results_per_page, k)`` results; each
    forward click must then show the next chunk until ``k`` results have
    been displayed in total.
    """
    search_url = activate_responses(fake_sqlalchemy_engine)

    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    saver = ArticleSaver(connection=fake_sqlalchemy_engine)
    widget = SearchWidget(
        bbs_search_url=search_url,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=saver,
        results_per_page=results_per_page,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    # Before any search: only the hint is printed and nothing is displayed.
    initial_hint = 'Click on "Search Literature!" button to display some results.'
    assert initial_hint in bot.stdout_cached
    assert not bot.display_cached

    bot.set_value("top_results", k)
    bot.set_value("granularity", "sentences")
    bot.set_value("query_text", query_text)
    bot.click("investigate_button")

    first_page_size = min(results_per_page, k)
    assert len(bot.display_cached) == first_page_size * bot.n_displays_per_result

    # Walk through the remaining pages.
    remaining = k - first_page_size
    while remaining > 0:
        bot.click("page_forward")
        page_size = min(results_per_page, remaining)

        assert len(bot.display_cached) == page_size * bot.n_displays_per_result

        remaining -= page_size
Example #9
0
def get_search_widget_bot(
    fake_sqlalchemy_engine, monkeypatch, capsys, checkpoint_path=None
):
    """Build a ``SearchWidgetBot`` around a freshly created ``SearchWidget``.

    The search endpoint is mocked through ``activate_responses`` and the
    help endpoint through ``request_callback_help``. ``checkpoint_path`` is
    passed to the widget unchanged.
    """
    search_url = activate_responses(fake_sqlalchemy_engine)

    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    saver = ArticleSaver(fake_sqlalchemy_engine)
    search_widget = SearchWidget(
        bbs_search_url=search_url,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=saver,
        checkpoint_path=checkpoint_path,
    )

    return SearchWidgetBot(search_widget, capsys, monkeypatch)
Example #10
0
    def test_adding_removing(self):
        """Add, query and remove articles and paragraphs on an ArticleSaver.

        The ids are supplied as NumPy values on purpose: the saver state is
        then checked to hold plain Python ``int`` ids.
        """
        saver = ArticleSaver(connection=None)

        whole_articles = np.array([101, 102, 103])
        single_paragraphs = np.array([(101, 0), (103, 2), (103, 5)])

        # Fill the saver: first whole articles, then individual paragraphs.
        for art_id in whole_articles:
            saver.add_article(art_id)
        for art_id, par_id in single_paragraphs:
            saver.add_paragraph(art_id, par_id)

        # Everything that was added must be reported as present.
        for art_id in whole_articles:
            assert saver.has_article(art_id)
        for art_id, par_id in single_paragraphs:
            assert saver.has_paragraph(art_id, par_id)

        # The state must hold exact ``int`` ids (not NumPy integer types).
        for art_id, par_id in saver.state:
            assert type(art_id) is int
            assert type(par_id) is int

        # Remove one article and one paragraph.
        removed_article = whole_articles[0]
        removed_paragraph = single_paragraphs[2]

        saver.remove_article(removed_article)
        assert not saver.has_article(removed_article)

        saver.remove_paragraph(*removed_paragraph)
        assert not saver.has_paragraph(*removed_paragraph)

        # Removing a paragraph that doesn't exist is called without checks.
        saver.remove_paragraph("fake_article", 12345)

        # After wiping the saver nothing may be left.
        saver.remove_all()
        for art_id in whole_articles:
            assert not saver.has_article(art_id)
        for art_id, par_id in single_paragraphs:
            assert not saver.has_paragraph(art_id, par_id)
Example #11
0
def test_correct_results_order(fake_sqlalchemy_engine, monkeypatch, capsys):
    """Check that the most relevant sentence is shown as the first result.

    The query embedding is mocked as ``[0, 1]``. Every precomputed sentence
    embedding points along the diagonal except the one of sentence 7, which
    is set almost parallel to the query; with a single requested result the
    text of that sentence must appear in the last displayed object.
    """
    model_name = "SBioBERT"
    target_sentence_id = 7

    n_sentences = fake_sqlalchemy_engine.execute(
        "SELECT COUNT(*) FROM sentences"
    ).fetchone()[0]

    text_query = f"SELECT text FROM sentences WHERE sentence_id = {target_sentence_id}"
    target_text = fake_sqlalchemy_engine.execute(text_query).fetchone()[0]

    # Mocked embedding model: every query is embedded as [0, 1] (90 degrees).
    query_encoder = Mock()
    query_encoder.embed.return_value = np.array([0, 1])
    embedding_models = {model_name: query_encoder}

    # All sentences start on the unit diagonal (45 degrees).
    diagonal = torch.ones((n_sentences, 2)).to(dtype=torch.float32) / 2 ** (1 / 2)
    precomputed_embeddings = {model_name: diagonal}

    # Overwrite the target sentence with a unit vector close to the query
    # direction (~90 degrees). Sentence ids start at 1, hence the "- 1".
    norm = (0.1 ** 2 + 0.9 ** 2) ** (1 / 2)
    near_query = torch.tensor([0.1, 0.9]) / norm
    precomputed_embeddings[model_name][target_sentence_id - 1, :] = near_query

    sentence_ids = np.arange(1, n_sentences + 1)

    searcher = SearchEngine(
        embedding_models,
        precomputed_embeddings,
        sentence_ids,
        connection=fake_sqlalchemy_engine,
    )

    # Mock the search endpoint (backed by the searcher) and the help endpoint.
    responses.add_callback(
        responses.POST,
        "http://test",
        callback=partial(request_callback, searcher=searcher),
        content_type="application/json",
    )
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    k = 1
    saver = ArticleSaver(fake_sqlalchemy_engine)
    widget = SearchWidget(
        bbs_search_url="http://test",
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=saver,
        results_per_page=k,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    bot.set_value("top_results", k)
    bot.set_value("print_paragraph", False)

    bot.set_value("sent_embedder", model_name)
    bot.click("investigate_button")

    shown = bot.display_cached
    assert len(shown) == k * bot.n_displays_per_result

    expected_snippet = textwrap.fill(target_text, width=80)
    assert expected_snippet in shown[-1].data