Ejemplo n.º 1
0
 def test_get_groups_raises_exception(self):
     """get_groups() on a StringGrouper that was never fit must raise StringGrouperNotFitException."""
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     # fit() is deliberately never called on this instance.
     unfitted_grouper = StringGrouper(master, duplicates)
     with self.assertRaises(StringGrouperNotFitException):
         unfitted_grouper.get_groups()
Ejemplo n.º 2
0
 def test_get_groups_single_df(self):
     """Grouping a single series yields a pd.Series of the same length.

     Each entry of the expected series is the string standing in for its
     group: 'foooob' is expected to collapse onto 'foooo', while 'bar' and
     'baz' stay as they are.
     """
     strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     actual = StringGrouper(strings).fit().get_groups()
     pd.testing.assert_series_equal(expected, actual)
Ejemplo n.º 3
0
 def test_get_groups_two_df_no_match(self):
     """With master and duplicates series, the result has one entry per duplicate.

     A duplicate with no counterpart in the master series ('dooz' here) is
     expected to come back unchanged.
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
     expected = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooo'])
     fitted_grouper = StringGrouper(master, duplicates).fit()
     actual = fitted_grouper.get_groups()
     pd.testing.assert_series_equal(expected, actual)
Ejemplo n.º 4
0
 def test_get_groups_two_df_same_similarity(self):
     """With master and duplicates series, the result has one entry per duplicate.

     The master series contains 'foooo' twice, so two master entries tie on
     similarity; per the original test contract the first one is chosen.
     """
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     fitted_grouper = StringGrouper(master, duplicates).fit()
     actual = fitted_grouper.get_groups()
     pd.testing.assert_series_equal(expected, actual)
Ejemplo n.º 5
0
 def test_get_groups_two_df(self):
     """With master and duplicates series, the result has one entry per duplicate.

     Each entry of the expected pd.Series is the master string that matches
     the duplicate with the highest similarity ('foooob' -> 'foooo').
     """
     test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
     test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     sg = StringGrouper(test_series_1, test_series_2)
     sg = sg.fit()
     result = sg.get_groups()
     expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     # Fixed: the name was previously split as "expect ed_result", a SyntaxError.
     pd.testing.assert_series_equal(expected_result, result)
 def test_get_groups_1_string_series_1_id_series(self):
     """Grouping one string series with a master_id series yields a DataFrame.

     The expected frame has one row per input string, pairing the id of the
     group's representative with the representative string itself
     ('foooob' / 'A3' is expected to map onto 'foooo' / 'A0').
     """
     strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     string_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     fitted_grouper = StringGrouper(strings, master_id=string_ids).fit()
     actual = fitted_grouper.get_groups()
     expected_ids = ['A0', 'A1', 'A2', 'A0']
     expected_strings = ['foooo', 'bar', 'baz', 'foooo']
     expected = pd.DataFrame(list(zip(expected_ids, expected_strings)))
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_groups_4_df_no_match(self):
     """Master/duplicates strings plus their id series yield a DataFrame with one row per duplicate.

     A duplicate with no counterpart in the master series ('dooz' / 'B1') is
     expected to come back with its own id and string.
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4'])
     fitted_grouper = StringGrouper(
         master, duplicates, master_id=master_ids, duplicates_id=duplicate_ids
     ).fit()
     actual = fitted_grouper.get_groups()
     expected_ids = ['A0', 'B1', 'A1', 'A2', 'A0']
     expected_strings = ['foooo', 'dooz', 'bar', 'baz', 'foooo']
     expected = pd.DataFrame(list(zip(expected_ids, expected_strings)))
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_groups_4_df_same_similarity(self):
     """Master/duplicates strings plus their id series yield a DataFrame with one row per duplicate.

     The master series contains 'foooo' twice (ids 'A0' and 'A3'), so two
     master entries tie on similarity; the first one ('A0') is expected.
     """
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
     fitted_grouper = StringGrouper(
         master, duplicates, master_id=master_ids, duplicates_id=duplicate_ids
     ).fit()
     actual = fitted_grouper.get_groups()
     expected_ids = ['A0', 'A1', 'A2', 'A0']
     expected_strings = ['foooo', 'bar', 'baz', 'foooo']
     expected = pd.DataFrame(list(zip(expected_ids, expected_strings)))
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_groups_2_string_series_2_id_series(self):
     """Master/duplicates strings plus their id series yield a DataFrame with one row per duplicate.

     Each expected row pairs the id and string of the master entry that
     matches the duplicate with the highest similarity
     ('foooob' -> 'A0' / 'foooo').
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
     fitted_grouper = StringGrouper(
         master, duplicates, master_id=master_ids, duplicates_id=duplicate_ids
     ).fit()
     actual = fitted_grouper.get_groups()
     expected_ids = ['A0', 'A1', 'A2', 'A0']
     expected_strings = ['foooo', 'bar', 'baz', 'foooo']
     expected = pd.DataFrame(list(zip(expected_ids, expected_strings)))
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(self):
     """Numerically indexed master/duplicates series yield a DataFrame with one row per duplicate.

     The expected frame is indexed like the duplicates series and holds the
     index and string of the most similar master entry. 'baz' does not occur
     in the master series; with replace_na=True the expected row falls back
     to the duplicate's own index (102) and string.
     """
     master = pd.Series(['foooo', 'bar', 'foooo'], index=[0, 1, 2])
     duplicates = pd.Series(
         ['foooo', 'bar', 'baz', 'foooob'],
         index=[100, 101, 102, 103],
     )
     fitted_grouper = StringGrouper(master, duplicates, replace_na=True).fit()
     actual = fitted_grouper.get_groups()
     # Built after get_groups() because it reads the duplicates' index.
     expected_rows = list(zip([0, 1, 102, 0], ['foooo', 'bar', 'baz', 'foooo']))
     expected = pd.DataFrame(
         expected_rows,
         columns=['most_similar_index', 'most_similar_master'],
         index=duplicates.index,
     )
     pd.testing.assert_frame_equal(expected, actual)
    def test_prior_matches_added(self):
        """Adding a new match must also update matches that already existed.

        After two add_match() calls linking the three sample names together,
        every row is expected to resolve to one and the same group string.
        """
        product_names = [
            'microsoftoffice 365 home',
            'microsoftoffice 365 pers',
            'microsoft office',
        ]
        frame = pd.DataFrame(product_names, columns=['name'])

        grouper = StringGrouper(frame['name'], ignore_index=True).fit()
        grouper = grouper.add_match('microsoft office', 'microsoftoffice 365 home')
        grouper = grouper.add_match('microsoftoffice 365 pers', 'microsoft office')

        frame['deduped'] = grouper.get_groups()
        # Exactly one distinct value means all names share a single "master" string.
        distinct_groups = frame['deduped'].unique()
        self.assertEqual(1, len(distinct_groups))