コード例 #1
0
 def test_zero_min_similarity(self):
     """Check that zero-similarity matches are returned when min_similarity=0.

     Sparse matrices exclude zero elements, so this case needs explicit
     coverage.  A bug related to this was first pointed out by @nbcvijanovic.
     """
     example = SimpleExample()
     master = example.customers_df['Customer Name']
     duplicates = example.whatever_series_1

     actual = match_strings(master, duplicates, min_similarity=0)

     # The fixture carries the expected frame, zero-similarity rows included.
     expected = example.expected_result_with_zeroes
     pd.testing.assert_frame_equal(expected, actual)
コード例 #2
0
 def test_zero_min_similarity_small_max_n_matches(self):
     """Check that match_strings raises when max_n_matches=1 and min_similarity=0.

     NOTE(review): presumably the error signals that max_n_matches is too
     small to hold the zero-similarity matches -- confirm against
     match_strings.
     """
     example = SimpleExample()
     master = example.customers_df['Customer Name']
     duplicates = example.two_strings
     options = dict(max_n_matches=1, min_similarity=0)

     with self.assertRaises(Exception):
         match_strings(master, duplicates, **options)
コード例 #3
0
 def test_match_list_diagonal(self):
     """Check that every string is matched with itself (all self-joins present)."""
     # This bug is difficult to reproduce -- it mostly shows up with very
     # large datasets; for small datasets max_n_matches=1 reproduces it.
     example = SimpleExample()
     names = example.customers_df['Customer Name']

     result = match_strings(names, max_n_matches=1)

     # A self-join is a row whose left and right indices coincide.
     on_diagonal = result['left_index'] == result['right_index']
     self_joins = result[on_diagonal]
     self.assertEqual(len(self_joins), len(names))
コード例 #4
0
 def test_match_list_diagonal(self):
     """Check that the number of self-joins differs from the number of strings.

     NOTE(review): the assertion is assertNotEqual, so this test passes only
     while some self-joins are missing from the result.  Confirm that
     pinning this behaviour (rather than asserting equality) is intended.
     """
     # This bug is difficult to reproduce -- it mostly shows up with very
     # large datasets; for small datasets max_n_matches=1 reproduces it.
     example = SimpleExample()
     names = example.customers_df['Customer Name']

     result = match_strings(names, max_n_matches=1)

     # A self-join is a row whose left and right indices coincide.
     on_diagonal = result['left_index'] == result['right_index']
     self_joins = result[on_diagonal]
     self.assertNotEqual(len(self_joins), len(names))
コード例 #5
0
    def test_match_strings(self, mock_StringGouper):
        """Verify match_strings drives a mocked StringGrouper as expected.

        The mocked instance returns itself from ``fit`` and a sentinel from
        ``get_matches``; match_strings must call each exactly once and hand
        the sentinel back to the caller.
        """
        sentinel = 'whatever'
        grouper = mock_StringGouper.return_value
        grouper.get_matches.return_value = sentinel
        # fit() returns the instance itself so calls can be chained on it.
        grouper.fit.return_value = grouper

        # The inputs are irrelevant here because StringGrouper is mocked.
        result = match_strings(None, master_id=None)

        grouper.fit.assert_called_once()
        grouper.get_matches.assert_called_once()
        self.assertEqual(result, sentinel)