def test_mining_database(monkeypatch, capsys, fake_sqlalchemy_engine, mining_schema_df):
    mining_schema_df = mining_schema_df.drop_duplicates(ignore_index=True)

    responses.add_callback(
        responses.POST,
        "http://test/database",
        callback=request_callback,
        content_type="text/csv",
    )
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    mining_schema = MiningSchema()
    mining_schema.add_from_df(mining_schema_df)
    mining_widget = MiningWidget(
        mining_server_url="http://test",
        mining_schema=mining_schema,
    )

    empty_dataframe = pd.DataFrame()
    assert empty_dataframe.equals(mining_widget.get_extracted_table())

    bot = MiningWidgetBot(mining_widget, capsys, monkeypatch)
    bot.set_value("input_text", "HELLO")
    bot.click("mine_articles")

    assert len(responses.calls) == 1
    assert "No article saver was provided. Nothing to mine." in bot.stdout_cached

    article_saver = ArticleSaver(fake_sqlalchemy_engine)
    for i in range(2):
        article_saver.add_article(article_id=i)

    mining_widget = MiningWidget(
        mining_server_url="http://test",
        mining_schema=mining_schema,
        article_saver=article_saver,
    )

    bot = MiningWidgetBot(mining_widget, capsys, monkeypatch)
    bot.set_value("input_text", "HELLO")
    bot.click("mine_articles")

    assert len(responses.calls) == 3
    assert "Collecting saved items..." in bot.stdout_cached
    assert isinstance(mining_widget.get_extracted_table(), pd.DataFrame)

    display_objs = bot.display_cached
    assert len(display_objs) == 3  # 1 schema + 1 warning + 1 table_extractions
    assert isinstance(display_objs[0], pd.DataFrame)
    assert isinstance(display_objs[2], pd.DataFrame)

    assert display_objs[0].equals(mining_schema_df)
    assert isinstance(display_objs[1], HTML)
    assert display_objs[2].equals(table_extractions)
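
# `request_callback`, `request_callback_help`, and `table_extractions` are
# module-level helpers defined elsewhere in this file. As a hedged sketch of
# the contract such a callback must satisfy (the `responses` library expects
# callbacks to return a `(status, headers, body)` tuple), a hypothetical CSV
# callback could look like this; the payload is made up for illustration only:
def _example_csv_callback(request):
    """Return a made-up mining CSV payload in the `responses` callback format."""
    body = "entity,entity_type\nglucose,CHEMICAL\n"  # fabricated example rows
    return 200, {"Content-Type": "text/csv"}, body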
def test_summaries(self, fake_sqlalchemy_engine, tmpdir):
    """Test that the ArticleSaver retrieves chosen texts and summaries correctly."""
    article_saver = ArticleSaver(connection=fake_sqlalchemy_engine)

    # Collect the possible article_id / paragraph_pos_in_article pairs from the
    # fake database and build the article_saver state that the widget would
    # normally produce.
    sql_query = "SELECT article_id FROM articles"
    article_ids = pd.read_sql(sql_query, fake_sqlalchemy_engine)["article_id"].to_list()

    all_articles_paragraphs_id = {}
    for article_id in set(article_ids):
        sql_query = f"""
            SELECT paragraph_pos_in_article
            FROM sentences
            WHERE article_id = {article_id}
        """
        all_paragraph_pos_in_article = pd.read_sql(
            sql_query, fake_sqlalchemy_engine
        )["paragraph_pos_in_article"].to_list()
        all_articles_paragraphs_id[article_id] = list(set(all_paragraph_pos_in_article))

        # For every article extract only the first of its paragraphs
        paragraph_pos_in_article = all_articles_paragraphs_id[article_id][0]
        article_saver.add_paragraph(article_id, paragraph_pos_in_article)

    # For the last article extract all of its paragraphs
    article_saver.add_article(article_id)
    n_paragraphs_full_article = len(set(all_paragraph_pos_in_article))

    # Check that retrieving the chosen texts works
    df_chosen_texts = article_saver.get_chosen_texts()
    assert isinstance(df_chosen_texts, pd.DataFrame)
    assert df_chosen_texts.columns.to_list() == [
        "article_id",
        "section_name",
        "paragraph_pos_in_article",
        "text",
    ]
    assert (
        len(df_chosen_texts)
        == len(all_articles_paragraphs_id) + n_paragraphs_full_article - 1
    )

    # Cached chosen texts
    df_chosen_texts_cached = article_saver.get_chosen_texts()
    assert len(df_chosen_texts) == len(df_chosen_texts_cached)

    # Check the summary table
    summary_table = article_saver.summary_table()
    assert isinstance(summary_table, pd.DataFrame)
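
# The expected row count above decomposes as: one saved paragraph per article,
# plus every paragraph of the fully saved last article, minus one because the
# last article's first paragraph would otherwise be counted twice. A quick
# arithmetic sketch with hypothetical sizes (3 articles, 5 paragraphs in the
# last one): 2 single paragraphs + 5 full-article paragraphs = 3 + 5 - 1 = 7.
def _example_chosen_texts_count(n_articles=3, n_paragraphs_last=5):
    # single paragraphs of the other articles + all paragraphs of the last one
    return (n_articles - 1) + n_paragraphs_last  # == n_articles + n_paragraphs_last - 1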
def test_inclusion_text(fake_sqlalchemy_engine, monkeypatch, capsys, tmpdir):
    """Check that the inclusion text filters the displayed results."""
    http_address = activate_responses(fake_sqlalchemy_engine)
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )
    widget = SearchWidget(
        bbs_search_url=http_address,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=ArticleSaver(fake_sqlalchemy_engine),
        results_per_page=10,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    # An empty inclusion text keeps all results
    bot.set_value("inclusion_text", "")
    bot.click("investigate_button")
    assert bot.display_cached

    # An inclusion text matching no sentence filters everything out
    bot.set_value("inclusion_text", "THIS TEXT DOES NOT EXIST IN ANY SENTENCE")
    bot.click("investigate_button")
    assert not bot.display_cached
def test_report_article_saver(fake_sqlalchemy_engine, monkeypatch, capsys, tmpdir):
    """Make sure that creating a report from the article saver state works."""
    tmpdir = Path(tmpdir)
    http_address = activate_responses(fake_sqlalchemy_engine)
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )
    widget = SearchWidget(
        bbs_search_url=http_address,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=ArticleSaver(fake_sqlalchemy_engine),
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    bot.set_value("top_results", 2)
    bot.set_value("default_value_article_saver", _Save.ARTICLE)
    bot.click("investigate_button")

    bot.stdout_cached  # clear the standard output

    with cd_temp(tmpdir):
        bot.click("articles_button")

    assert "Creating the saved results report... " in bot.stdout_cached
    assert len([f for f in tmpdir.iterdir() if f.suffix == ".html"]) == 1
def test_article_saver_global(fake_sqlalchemy_engine, monkeypatch, capsys, saving_mode):
    """Make sure that default saving buttons result in correct checkboxes."""
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )
    k = 10
    http_address = activate_responses(fake_sqlalchemy_engine)
    widget = SearchWidget(
        bbs_search_url=http_address,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=ArticleSaver(fake_sqlalchemy_engine),
        results_per_page=k,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    bot.set_value("top_results", k)
    bot.set_value("default_value_article_saver", saving_mode)
    bot.click("investigate_button")

    captured_display_objects = bot.display_cached
    assert len(captured_display_objects) == k * bot.n_displays_per_result

    if saving_mode == _Save.NOTHING:
        assert not widget.article_saver.state
    elif saving_mode == _Save.PARAGRAPH:
        assert 0 < len(widget.article_saver.state) <= k
        assert all(x[1] != -1 for x in widget.article_saver.state)
    elif saving_mode == _Save.ARTICLE:
        assert 0 < len(widget.article_saver.state) <= k
        assert all(x[1] == -1 for x in widget.article_saver.state)
    else:
        raise ValueError(f"Unrecognized saving mode: {saving_mode}")

    for i, display_obj in enumerate(captured_display_objects):
        if isinstance(display_obj, ipywidgets.Checkbox):
            if display_obj.description == "Extract the paragraph":
                assert display_obj.value == (saving_mode == _Save.PARAGRAPH)
            elif display_obj.description == "Extract the entire article":
                assert display_obj.value == (saving_mode == _Save.ARTICLE)
            else:
                raise ValueError(f"Unrecognized checkbox, {i}")
        elif isinstance(display_obj, HTML):
            pass
        else:
            raise TypeError(f"Unrecognized type: {type(display_obj)}")
def test_article_saver_gets_updated(
    fake_sqlalchemy_engine, monkeypatch, capsys, saving_mode
):
    """Clicking a paragraph or article checkbox modifies the ArticleSaver state."""
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )
    k = 10
    result_to_take = 3
    http_address = activate_responses(fake_sqlalchemy_engine)
    widget = SearchWidget(
        bbs_search_url=http_address,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=ArticleSaver(fake_sqlalchemy_engine),
        results_per_page=k,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    bot.set_value("top_results", k)
    bot.set_value("default_value_article_saver", _Save.NOTHING)
    bot.click("investigate_button")

    captured_display_objects = bot.display_cached
    assert len(captured_display_objects) == k * bot.n_displays_per_result
    assert bot.get_value("default_value_article_saver") == _Save.NOTHING

    start = result_to_take * bot.n_displays_per_result
    end = (result_to_take + 1) * bot.n_displays_per_result
    meta, chb_paragraph, chb_article, out = captured_display_objects[start:end]

    # Check the checkbox
    if saving_mode == _Save.NOTHING:
        assert not widget.article_saver.state
    elif saving_mode == _Save.PARAGRAPH:
        chb_paragraph.value = True
        assert len(widget.article_saver.state) == 1
        assert list(widget.article_saver.state)[0][1] != -1
    elif saving_mode == _Save.ARTICLE:
        chb_article.value = True
        assert len(widget.article_saver.state) == 1
        assert list(widget.article_saver.state)[0][1] == -1
    else:
        raise ValueError(f"Unrecognized saving mode: {saving_mode}")
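
# The four-way unpacking above encodes an assumption that each search result
# renders as (metadata HTML, paragraph checkbox, article checkbox, text
# output), i.e. that `bot.n_displays_per_result == 4`. A small worked example
# of the slice arithmetic, with hypothetical numbers mirroring the test:
def _example_result_slice(result_to_take=3, n_displays_per_result=4):
    start = result_to_take * n_displays_per_result  # 12
    end = (result_to_take + 1) * n_displays_per_result  # 16
    return start, end  # -> (12, 16): the fourth result's display block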
def test_errors(fake_sqlalchemy_engine, monkeypatch, capsys):
    """Check that the widget raises an error when the BBS search server is not working."""
    with pytest.raises(Exception):
        SearchWidget(
            bbs_search_url="fake_address",
            bbs_mysql_engine=fake_sqlalchemy_engine,
            article_saver=ArticleSaver(fake_sqlalchemy_engine),
            results_per_page=3,
        )
def test_paging(
    fake_sqlalchemy_engine, monkeypatch, capsys, query_text, k, results_per_page
):
    """Test that paging displays the right number of results."""
    http_address = activate_responses(fake_sqlalchemy_engine)
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )
    widget = SearchWidget(
        bbs_search_url=http_address,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=ArticleSaver(connection=fake_sqlalchemy_engine),
        results_per_page=results_per_page,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    # Initial state
    assert (
        'Click on "Search Literature!" button to display some results.'
        in bot.stdout_cached
    )
    assert not bot.display_cached

    bot.set_value("top_results", k)
    bot.set_value("granularity", "sentences")
    bot.set_value("query_text", query_text)
    bot.click("investigate_button")

    assert (
        len(bot.display_cached) == min(results_per_page, k) * bot.n_displays_per_result
    )
    results_left = k - min(results_per_page, k)

    # Make sure paging works
    while results_left > 0:
        bot.click("page_forward")
        displayed_results = min(results_per_page, results_left)
        assert len(bot.display_cached) == displayed_results * bot.n_displays_per_result
        results_left -= displayed_results
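
# The paging loop above expects each page to hold min(results_per_page,
# results_left) results until the total of k is exhausted. A self-contained
# sketch of that page-size sequence, with hypothetical values k=25 and
# results_per_page=10 (the first page comes from the investigate click):
def _example_page_sizes(k=25, results_per_page=10):
    sizes, left = [], k
    while left > 0:
        sizes.append(min(results_per_page, left))
        left -= sizes[-1]
    return sizes  # -> [10, 10, 5]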
def get_search_widget_bot(
    fake_sqlalchemy_engine, monkeypatch, capsys, checkpoint_path=None
):
    """Construct a SearchWidgetBot wired to a mocked BBS search server."""
    http_address = activate_responses(fake_sqlalchemy_engine)
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )
    widget = SearchWidget(
        bbs_search_url=http_address,
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=ArticleSaver(fake_sqlalchemy_engine),
        checkpoint_path=checkpoint_path,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    return bot
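
# A typical use of the factory above in a test body might look like this
# (hypothetical test, shown only to illustrate the intended call pattern):
def _example_usage_of_bot_factory(fake_sqlalchemy_engine, monkeypatch, capsys):
    bot = get_search_widget_bot(fake_sqlalchemy_engine, monkeypatch, capsys)
    bot.set_value("top_results", 5)
    bot.click("investigate_button")
    assert bot.display_cached  # some results were rendered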
def test_adding_removing(self):
    article_saver = ArticleSaver(connection=None)

    full_articles = np.array([101, 102, 103])
    just_paragraphs = np.array([(101, 0), (103, 2), (103, 5)])

    # Adding items
    for article_id in full_articles:
        article_saver.add_article(article_id)
    for article_id, paragraph_id in just_paragraphs:
        article_saver.add_paragraph(article_id, paragraph_id)

    # Checking if items were saved
    for article_id in full_articles:
        assert article_saver.has_article(article_id)
    for article_id, paragraph_id in just_paragraphs:
        assert article_saver.has_paragraph(article_id, paragraph_id)

    # Test type of IDs in article saver state
    for article_id, paragraph_id in article_saver.state:
        assert type(article_id) == int
        assert type(paragraph_id) == int

    # Removing items
    article_to_remove = full_articles[0]
    paragraph_to_remove = just_paragraphs[2]

    article_saver.remove_article(article_to_remove)
    assert not article_saver.has_article(article_to_remove)

    article_saver.remove_paragraph(*paragraph_to_remove)
    assert not article_saver.has_paragraph(*paragraph_to_remove)

    article_saver.remove_paragraph("fake_article", 12345)  # doesn't exist

    # Removing all items
    article_saver.remove_all()
    for article_id in full_articles:
        assert not article_saver.has_article(article_id)
    for article_id, paragraph_id in just_paragraphs:
        assert not article_saver.has_paragraph(article_id, paragraph_id)
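
# A minimal sketch of the semantics the test above exercises, assuming the
# state convention used throughout these tests: a set of
# (article_id, paragraph_pos_in_article) tuples, with -1 marking a whole
# article. The int() casts explain why the type checks above pass even for
# numpy integers. Hypothetical stand-in, not the real bbsearch ArticleSaver:
class _ArticleSaverSketch:
    def __init__(self):
        self.state = set()

    def add_article(self, article_id):
        self.state.add((int(article_id), -1))

    def add_paragraph(self, article_id, paragraph_pos):
        self.state.add((int(article_id), int(paragraph_pos)))

    def has_article(self, article_id):
        return (int(article_id), -1) in self.state

    def remove_paragraph(self, article_id, paragraph_pos):
        self.state.discard((article_id, paragraph_pos))  # no-op if absent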
def test_correct_results_order(fake_sqlalchemy_engine, monkeypatch, capsys):
    """Check that the most relevant sentence is the first result."""
    n_sentences = fake_sqlalchemy_engine.execute(
        "SELECT COUNT(*) FROM sentences"
    ).fetchone()[0]

    most_relevant_sbiobert_id = 7
    query_sbiobert = (
        f"SELECT text FROM sentences WHERE sentence_id = {most_relevant_sbiobert_id}"
    )
    most_relevant_sbiobert_text = fake_sqlalchemy_engine.execute(
        query_sbiobert
    ).fetchone()[0]

    embedding_model_sbiobert = Mock()
    embedding_model_sbiobert.embed.return_value = np.array([0, 1])  # 90 degrees

    embedding_models = {
        "SBioBERT": embedding_model_sbiobert,
    }
    precomputed_embeddings = {
        "SBioBERT": torch.ones((n_sentences, 2)).to(dtype=torch.float32)
        / 2 ** (1 / 2),  # 45 degrees
    }
    norm = (0.1 ** 2 + 0.9 ** 2) ** (1 / 2)
    precomputed_embeddings["SBioBERT"][most_relevant_sbiobert_id - 1, :] = (
        torch.tensor([0.1, 0.9]) / norm
    )  # ~90 degrees
    indices = np.arange(1, n_sentences + 1)

    searcher = SearchEngine(
        embedding_models,
        precomputed_embeddings,
        indices,
        connection=fake_sqlalchemy_engine,
    )

    responses.add_callback(
        responses.POST,
        "http://test",
        callback=partial(request_callback, searcher=searcher),
        content_type="application/json",
    )
    responses.add_callback(
        responses.POST,
        "http://test/help",
        callback=request_callback_help,
        content_type="application/json",
    )

    k = 1
    widget = SearchWidget(
        bbs_search_url="http://test",
        bbs_mysql_engine=fake_sqlalchemy_engine,
        article_saver=ArticleSaver(fake_sqlalchemy_engine),
        results_per_page=k,
    )
    bot = SearchWidgetBot(widget, capsys, monkeypatch)

    bot.set_value("top_results", k)
    bot.set_value("print_paragraph", False)
    bot.set_value("sent_embedder", "SBioBERT")
    bot.click("investigate_button")

    captured_display_objects = bot.display_cached

    assert len(captured_display_objects) == k * bot.n_displays_per_result
    assert (
        textwrap.fill(most_relevant_sbiobert_text, width=80)
        in captured_display_objects[-1].data
    )
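
# The geometry above works because the ranking is driven by cosine similarity:
# the mocked query embedding [0, 1] sits at 90 degrees, the background rows
# [1, 1]/sqrt(2) at 45 degrees, and the planted row [0.1, 0.9]/norm close to
# 90 degrees, so the planted sentence scores highest. A quick numeric check of
# that reasoning (plain numpy arithmetic, no assumptions about SearchEngine):
def _example_cosine_ranking():
    def cos(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

    query = np.array([0.0, 1.0])  # the mocked query embedding
    filler = np.array([1.0, 1.0]) / np.sqrt(2)  # the 45-degree background rows
    planted = np.array([0.1, 0.9]) / np.hypot(0.1, 0.9)  # the ~90-degree row
    assert cos(query, planted) > cos(query, filler)  # ~0.994 vs ~0.707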