def test_get_matches_1_series_1_id_series(self):
     """Check get_matches() for one string series with an accompanying ID series.

     The master series holds 'foo' at positions 0 and 3, so the expected
     frame pairs each 'foo' with both occurrences, and pairs 'bar' and 'baz'
     only with themselves, every row at similarity 1.0.

     NOTE(review): another method with this exact name is defined further
     down in this file; if both belong to the same class the later one
     replaces this one and this test never runs -- confirm which is wanted.
     """
     master = pd.Series(['foo', 'bar', 'baz', 'foo'])
     master_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     sg = StringGrouper(master, master_id=master_ids)
     sg = sg.fit()
     expected_df = pd.DataFrame({
         'left_index': [0, 0, 1, 2, 3, 3],
         'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'left_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
         'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
         'right_id': ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'],
         'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'right_index': [0, 3, 1, 2, 0, 3],
     })
     # Cast by replacing the column rather than `expected_df.loc[:, col] = ...`:
     # since pandas 2.0 the .loc form writes into the existing float64 column
     # in place, which silently discards the cast to the configured dtype.
     expected_df = expected_df.astype(
         {'similarity': sg._config.tfidf_matrix_dtype})
     pd.testing.assert_frame_equal(expected_df, sg.get_matches())
 def test_get_groups_single_df(self):
     """Check get_groups() when only a master series is supplied.

     The result is compared against a series of the same length as the
     input in which 'foooob' has been replaced by 'foooo'.
     """
     strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     grouper = StringGrouper(strings).fit()
     actual = grouper.get_groups()
     expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     pd.testing.assert_series_equal(expected, actual)
 def test_get_matches_single(self):
     """Check get_matches() for a single series containing 'foo' twice.

     Each 'foo' is expected to appear paired with both 'foo' entries, and
     'bar' / 'baz' once each, all with similarity 1.0.
     """
     strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
     grouper = StringGrouper(strings).fit()
     expected = pd.DataFrame({
         'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
     })
     actual = grouper.get_matches()
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_groups_two_df_no_match(self):
     """Check get_groups() for master + duplicates when one dupe has no match.

     The expected series has the length of the duplicates series; 'dooz',
     which is absent from the master series, is expected back unchanged.
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     dupes = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
     grouper = StringGrouper(master, dupes).fit()
     actual = grouper.get_groups()
     expected = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooo'])
     pd.testing.assert_series_equal(expected, actual)
 def test_get_groups_two_df_same_similarity(self):
     """Check get_groups() when the master series holds the same string twice.

     The master series contains 'foooo' at two positions; the expected
     series has the length of the duplicates series and maps 'foooob' to
     'foooo'.
     """
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     dupes = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     grouper = StringGrouper(master, dupes).fit()
     actual = grouper.get_groups()
     expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     pd.testing.assert_series_equal(expected, actual)
 def test_get_groups_two_df(self):
     """Check get_groups() for a master series and a duplicates series.

     The expected series has the length of the duplicates series and holds,
     for each duplicate, the master string it is matched to ('foooob' is
     expected to map to 'foooo').
     """
     test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
     test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     sg = StringGrouper(test_series_1, test_series_2)
     sg = sg.fit()
     result = sg.get_groups()
     expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     # The original line read `expect ed_result` (stray space), which is a
     # SyntaxError and kept the whole module from being imported.
     pd.testing.assert_series_equal(expected_result, result)
 def test_get_groups_1_string_series_1_id_series(self):
     """Check get_groups() for one string series plus its ID series.

     The expected frame has one row per input string, each row holding the
     (id, string) pair of the group representative; 'foooob' (A3) is
     expected to be grouped under 'foooo' (A0).
     """
     strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     grouper = StringGrouper(strings, master_id=ids).fit()
     actual = grouper.get_groups()
     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows)
     pd.testing.assert_frame_equal(expected, actual)
 def test_build_matches_list(self):
     """Check the internal `_matches_list` built by fit() for two series.

     Only positions 0 ('foo') and 1 ('bar') are shared between the two
     series, so two rows with similarity 1.0 are expected.
     """
     master = pd.Series(['foo', 'bar', 'baz'])
     dupes = pd.Series(['foo', 'bar', 'bop'])
     grouper = StringGrouper(master, dupes).fit()
     expected = pd.DataFrame({
         'master_side': [0, 1],
         'dupe_side': [0, 1],
         'similarity': [1.0, 1.0],
     })
     pd.testing.assert_frame_equal(expected, grouper._matches_list)
 def test_get_groups_4_df_no_match(self):
     """Check get_groups() with strings and IDs on both sides and one unmatched dupe.

     The expected frame has one row per duplicate. 'dooz' (B1) does not
     occur in the master series and is expected back with its own ID and
     string; every other row carries a master ID and master string.
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     dupes = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2'])
     dupe_ids = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4'])
     grouper = StringGrouper(
         master, dupes, master_id=master_ids, duplicates_id=dupe_ids
     ).fit()
     actual = grouper.get_groups()
     expected_rows = [
         ('A0', 'foooo'),
         ('B1', 'dooz'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows)
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_groups_4_df_same_similarity(self):
     """Check get_groups() with IDs when the master series repeats a string.

     'foooo' appears in the master series under both A0 and A3; the
     expected frame has one row per duplicate and uses A0 for every
     'foooo' row.
     """
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     dupes = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     dupe_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
     grouper = StringGrouper(
         master, dupes, master_id=master_ids, duplicates_id=dupe_ids
     ).fit()
     actual = grouper.get_groups()
     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows)
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_groups_2_string_series_2_id_series(self):
     """Check get_groups() for two string series, each with an ID series.

     The expected frame has one row per duplicate, holding the ID and
     string of the master entry it is matched to; 'foooob' (B3) is expected
     to map to 'foooo' (A0).
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     dupes = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2'])
     dupe_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
     grouper = StringGrouper(
         master, dupes, master_id=master_ids, duplicates_id=dupe_ids
     ).fit()
     actual = grouper.get_groups()
     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows)
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_matches_1_series_1_id_series(self):
     """Check get_matches() for one string series plus its ID series.

     The expected frame lists ID and string for both sides of every match
     together with a similarity of 1.0.

     NOTE(review): a method with this exact name is also defined earlier in
     this file with a different expected layout; if both belong to the same
     class this definition replaces the earlier one -- confirm which is
     wanted.
     """
     strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
     ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     grouper = StringGrouper(strings, master_id=ids).fit()
     expected = pd.DataFrame({
         'left_side_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
         'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'right_side_id': ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'],
         'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
         'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
     })
     actual = grouper.get_matches()
     pd.testing.assert_frame_equal(expected, actual)
 def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(
         self):
     """Check get_groups() with numeric indexes, replace_na=True and an unmatched dupe.

     The expected frame is indexed like the duplicates series and has the
     columns 'most_similar_index' and 'most_similar_master'. 'baz' is not
     in the master series, so its row is expected to carry its own index
     (102) and string.
     """
     master = pd.Series(['foooo', 'bar', 'foooo'], index=[0, 1, 2])
     dupes = pd.Series(['foooo', 'bar', 'baz', 'foooob'],
                       index=[100, 101, 102, 103])
     grouper = StringGrouper(master, dupes, replace_na=True).fit()
     actual = grouper.get_groups()
     expected_rows = [
         (0, 'foooo'),
         (1, 'bar'),
         (102, 'baz'),
         (0, 'foooo'),
     ]
     expected = pd.DataFrame(
         expected_rows,
         columns=['most_similar_index', 'most_similar_master'],
         index=dupes.index)
     pd.testing.assert_frame_equal(expected, actual)
    def test_prior_matches_added(self):
        """Check that manually added matches collapse all strings into one group.

        Two add_match() calls link the three product names together; after
        that, get_groups() is expected to yield a single distinct value.
        """
        names = [
            'microsoftoffice 365 home',
            'microsoftoffice 365 pers',
            'microsoft office',
        ]
        df = pd.DataFrame(names, columns=['name'])

        grouper = StringGrouper(df['name'], ignore_index=True).fit()
        grouper = grouper.add_match('microsoft office',
                                    'microsoftoffice 365 home')
        grouper = grouper.add_match('microsoftoffice 365 pers',
                                    'microsoft office')

        df['deduped'] = grouper.get_groups()
        # Every row should now point at the same "master" string.
        distinct_masters = df['deduped'].unique()
        self.assertEqual(1, len(distinct_masters))
 def test_case_insensitive_build_matches_list(self):
     """Check that `_matches_list` pairs strings that differ only in letter case.

     'foo'/'FOO' and 'BAR'/'bar' sit at positions 0 and 1 of the two series
     and are expected to match with similarity 1.0; 'baz'/'bop' are not
     expected in the result.
     """
     test_series_1 = pd.Series(['foo', 'BAR', 'baz'])
     test_series_2 = pd.Series(['FOO', 'bar', 'bop'])
     sg = StringGrouper(test_series_1, test_series_2)
     sg = sg.fit()
     expected_df = pd.DataFrame({
         'master_side': [0, 1],
         'dupe_side': [0, 1],
         'similarity': [1.0, 1.0],
     })
     # Cast by replacing the column rather than `expected_df.loc[:, col] = ...`:
     # since pandas 2.0 the .loc form writes into the existing float64 column
     # in place, which silently discards the cast to the configured dtype.
     expected_df = expected_df.astype(
         {'similarity': sg._config.tfidf_matrix_dtype})
     pd.testing.assert_frame_equal(expected_df, sg._matches_list)