def test_zero_min_similarity(self): """Since sparse matrices exclude zero elements, this test ensures that zero similarity matches are returned when min_similarity <= 0. A bug related to this was first pointed out by @nbcvijanovic""" simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] s_dup = simple_example.whatever_series_1 matches = match_strings(s_master, s_dup, min_similarity=0) pd.testing.assert_frame_equal( simple_example.expected_result_with_zeroes, matches)
def test_zero_min_similarity_small_max_n_matches(self): """This test ensures that a warning is issued when n_max_matches is suspected to be too small while min_similarity <= 0 and include_zeroes is True""" simple_example = SimpleExample() s_master = simple_example.customers_df['Customer Name'] s_dup = simple_example.two_strings with self.assertRaises(Exception): _ = match_strings(s_master, s_dup, max_n_matches=1, min_similarity=0)
def test_match_list_diagonal(self): """This test ensures that all self-joins are present""" # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; # for small datasets setting max_n_matches=1 reproduces the bug simple_example = SimpleExample() df = simple_example.customers_df['Customer Name'] matches = match_strings(df, max_n_matches=1) num_self_joins = len( matches[matches['left_index'] == matches['right_index']]) num_strings = len(df) self.assertEqual(num_self_joins, num_strings)
def test_match_list_diagonal(self): """test fails whenever _matches_list's number of self-joins is not equal to the number of strings""" # This bug is difficult to reproduce -- I mostly encounter it while working with very large datasets; # for small datasets setting max_n_matches=1 reproduces the bug simple_example = SimpleExample() df = simple_example.customers_df['Customer Name'] matches = match_strings(df, max_n_matches=1) num_self_joins = len( matches[matches['left_index'] == matches['right_index']]) num_strings = len(df) self.assertNotEqual(num_self_joins, num_strings)
def test_match_strings(self, mock_StringGouper): """mocks StringGrouper to test if the high-level function match_strings utilizes it as expected""" mock_StringGrouper_instance = mock_StringGouper.return_value mock_StringGrouper_instance.fit.return_value = mock_StringGrouper_instance mock_StringGrouper_instance.get_matches.return_value = 'whatever' test_series_1 = None test_series_id_1 = None df = match_strings(test_series_1, master_id=test_series_id_1) mock_StringGrouper_instance.fit.assert_called_once() mock_StringGrouper_instance.get_matches.assert_called_once() self.assertEqual(df, 'whatever')