def test_get_matches_1_series_1_id_series(self):
    """Check get_matches() for one string series accompanied by an ID series.

    The two 'foo' entries (positions 0 and 3) are expected to pair with
    themselves and with each other, so six rows are expected in total. Each
    row carries the index, string and ID of both sides plus a similarity
    of 1.0.
    """
    # NOTE(review): another method with this exact name is defined later in
    # this file; if both live in the same class the later one shadows this
    # one -- confirm which version is intended to run.
    test_series_1 = pd.Series(['foo', 'bar', 'baz', 'foo'])
    test_series_id_1 = pd.Series(['A0', 'A1', 'A2', 'A3'])
    sg = StringGrouper(test_series_1, master_id=test_series_id_1)
    sg = sg.fit()

    # Column order matters: assert_frame_equal compares columns positionally.
    expected_df = pd.DataFrame({
        'left_index': [0, 0, 1, 2, 3, 3],
        'left_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'left_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
        'similarity': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
        'right_id': ['A0', 'A3', 'A1', 'A2', 'A0', 'A3'],
        'right_side': ['foo', 'foo', 'bar', 'baz', 'foo', 'foo'],
        'right_index': [0, 3, 1, 2, 0, 3],
    })
    # Replace the column outright instead of assigning through
    # ``.loc[:, 'similarity']``: on pandas >= 2.0 the ``.loc`` form writes
    # into the existing float64 array, which would silently discard the
    # dtype requested here.
    expected_df['similarity'] = expected_df['similarity'].astype(
        sg._config.tfidf_matrix_dtype)

    pd.testing.assert_frame_equal(expected_df, sg.get_matches())
def test_get_groups_single_df(self):
    """Check get_groups() on a single series.

    The result is expected to be a Series of the same length as the input
    in which 'foooob' is replaced by its group representative 'foooo'.
    """
    strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    grouper = StringGrouper(strings).fit()

    actual = grouper.get_groups()

    expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    pd.testing.assert_series_equal(expected, actual)
def test_get_matches_single(self):
    """Check get_matches() on a single series containing a repeated string.

    Six rows are expected, each with identical left and right strings and
    a similarity of 1.0.
    """
    strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
    grouper = StringGrouper(strings).fit()

    # Both sides of every expected pair hold the same string.
    paired_strings = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
    expected = pd.DataFrame({
        'left_side': list(paired_strings),
        'right_side': list(paired_strings),
        'similarity': [1.0] * 6,
    })

    pd.testing.assert_frame_equal(expected, grouper.get_matches())
def test_get_groups_two_df_no_match(self):
    """Check get_groups() when a duplicate has no counterpart in the master.

    The result is expected to have one entry per duplicate; 'dooz', which
    does not appear in the master series, is expected to be returned
    unchanged.
    """
    master = pd.Series(['foooo', 'bar', 'baz'])
    duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
    grouper = StringGrouper(master, duplicates).fit()

    actual = grouper.get_groups()

    expected = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooo'])
    pd.testing.assert_series_equal(expected, actual)
def test_get_groups_two_df_same_similarity(self):
    """Check get_groups() when the master holds the same string twice.

    The master series contains 'foooo' at two positions; the result is
    expected to have one entry per duplicate, with 'foooob' mapped to
    'foooo'.
    """
    master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    grouper = StringGrouper(master, duplicates).fit()

    actual = grouper.get_groups()

    expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    pd.testing.assert_series_equal(expected, actual)
def test_get_groups_two_df(self):
    """Check get_groups() for a master series and a duplicates series.

    The result is expected to be a Series with one entry per duplicate,
    holding the master string matched to that duplicate: 'foooob' is
    expected to map to 'foooo'.
    """
    test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
    test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    sg = StringGrouper(test_series_1, test_series_2)
    sg = sg.fit()

    result = sg.get_groups()

    expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    # The original compared against the garbled token ``expect ed_result``,
    # which is a syntax error; the intended name is ``expected_result``.
    pd.testing.assert_series_equal(expected_result, result)
def test_get_groups_1_string_series_1_id_series(self):
    """Check get_groups() on one string series accompanied by an ID series.

    The result is expected to be a two-column frame of (ID, string) rows
    with the same length as the input, in which 'foooob' is grouped under
    'foooo' / 'A0'.
    """
    strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    string_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    grouper = StringGrouper(strings, master_id=string_ids).fit()

    actual = grouper.get_groups()

    expected_ids = ['A0', 'A1', 'A2', 'A0']
    expected_strings = ['foooo', 'bar', 'baz', 'foooo']
    expected = pd.DataFrame(list(zip(expected_ids, expected_strings)))
    pd.testing.assert_frame_equal(expected, actual)
def test_build_matches_list(self):
    """Check the match list built by fit() for two series.

    'foo' and 'bar' appear at positions 0 and 1 of both series, so two
    rows (0, 0) and (1, 1) with similarity 1.0 are expected in
    ``_matches_list``.
    """
    master_strings = pd.Series(['foo', 'bar', 'baz'])
    dupe_strings = pd.Series(['foo', 'bar', 'bop'])
    grouper = StringGrouper(master_strings, dupe_strings).fit()

    expected = pd.DataFrame({
        'master_side': [0, 1],
        'dupe_side': [0, 1],
        'similarity': [1.0, 1.0],
    })

    pd.testing.assert_frame_equal(expected, grouper._matches_list)
def test_get_groups_4_df_no_match(self):
    """Check get_groups() with IDs when a duplicate has no master match.

    The result is expected to hold one (ID, string) row per duplicate;
    'dooz' has no counterpart in the master series and is expected to
    keep its own ID 'B1' and string.
    """
    master = pd.Series(['foooo', 'bar', 'baz'])
    duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4'])
    grouper = StringGrouper(
        master,
        duplicates,
        master_id=master_ids,
        duplicates_id=duplicate_ids,
    ).fit()

    actual = grouper.get_groups()

    expected_ids = ['A0', 'B1', 'A1', 'A2', 'A0']
    expected_strings = ['foooo', 'dooz', 'bar', 'baz', 'foooo']
    expected = pd.DataFrame(list(zip(expected_ids, expected_strings)))
    pd.testing.assert_frame_equal(expected, actual)
def test_get_groups_4_df_same_similarity(self):
    """Check get_groups() with IDs when the master repeats a string.

    'foooo' appears in the master under both 'A0' and 'A3'; the expected
    frame uses the first of them, 'A0', for every duplicate grouped under
    'foooo'.
    """
    master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
    grouper = StringGrouper(
        master,
        duplicates,
        master_id=master_ids,
        duplicates_id=duplicate_ids,
    ).fit()

    actual = grouper.get_groups()

    expected_ids = ['A0', 'A1', 'A2', 'A0']
    expected_strings = ['foooo', 'bar', 'baz', 'foooo']
    expected = pd.DataFrame(list(zip(expected_ids, expected_strings)))
    pd.testing.assert_frame_equal(expected, actual)
def test_get_groups_2_string_series_2_id_series(self):
    """Check get_groups() for master/duplicate series that both carry IDs.

    The result is expected to hold one (master ID, master string) row per
    duplicate; 'foooob' is expected to map to 'A0' / 'foooo'.
    """
    master = pd.Series(['foooo', 'bar', 'baz'])
    duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
    master_ids = pd.Series(['A0', 'A1', 'A2'])
    duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
    grouper = StringGrouper(
        master,
        duplicates,
        master_id=master_ids,
        duplicates_id=duplicate_ids,
    ).fit()

    actual = grouper.get_groups()

    expected_ids = ['A0', 'A1', 'A2', 'A0']
    expected_strings = ['foooo', 'bar', 'baz', 'foooo']
    expected = pd.DataFrame(list(zip(expected_ids, expected_strings)))
    pd.testing.assert_frame_equal(expected, actual)
def test_get_matches_1_series_1_id_series(self):
    """Check get_matches() for one string series accompanied by an ID series.

    Six rows are expected, each listing the ID and string of the left and
    right side of a pair followed by a similarity of 1.0.
    """
    # NOTE(review): a method with this exact name is also defined earlier in
    # this file; if both live in the same class this definition shadows the
    # earlier one -- confirm which version is intended to run.
    strings = pd.Series(['foo', 'bar', 'baz', 'foo'])
    string_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
    grouper = StringGrouper(strings, master_id=string_ids).fit()

    # Both sides of every expected pair hold the same string.
    paired_strings = ['foo', 'foo', 'bar', 'baz', 'foo', 'foo']
    # Column order matters: assert_frame_equal compares columns positionally.
    expected = pd.DataFrame({
        'left_side_id': ['A0', 'A0', 'A1', 'A2', 'A3', 'A3'],
        'left_side': list(paired_strings),
        'right_side_id': ['A3', 'A0', 'A1', 'A2', 'A3', 'A0'],
        'right_side': list(paired_strings),
        'similarity': [1.0] * 6,
    })

    pd.testing.assert_frame_equal(expected, grouper.get_matches())
def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(
        self):
    """Check get_groups() with numeric indexes and ``replace_na=True``.

    The expected frame is indexed like the duplicates series and has the
    columns 'most_similar_index' and 'most_similar_master'. 'baz' has no
    counterpart in the master series, and its expected row carries its
    own index (102) and string.
    """
    master = pd.Series(['foooo', 'bar', 'foooo'], index=[0, 1, 2])
    duplicates = pd.Series(
        ['foooo', 'bar', 'baz', 'foooob'],
        index=[100, 101, 102, 103],
    )
    grouper = StringGrouper(master, duplicates, replace_na=True).fit()

    actual = grouper.get_groups()

    expected_rows = list(
        zip([0, 1, 102, 0], ['foooo', 'bar', 'baz', 'foooo']))
    expected = pd.DataFrame(
        expected_rows,
        columns=['most_similar_index', 'most_similar_master'],
        index=duplicates.index,
    )
    pd.testing.assert_frame_equal(expected, actual)
def test_prior_matches_added(self):
    """Adding a match is expected to update pre-existing matches as well.

    After two add_match() calls linking the three sample names, every row
    is expected to be grouped under a single master string.
    """
    names = [
        'microsoftoffice 365 home',
        'microsoftoffice 365 pers',
        'microsoft office',
    ]
    df = pd.DataFrame(names, columns=['name'])
    grouper = StringGrouper(df['name'], ignore_index=True).fit()

    grouper = grouper.add_match('microsoft office', 'microsoftoffice 365 home')
    grouper = grouper.add_match('microsoftoffice 365 pers', 'microsoft office')
    df['deduped'] = grouper.get_groups()

    # Exactly one distinct master string should remain across all rows.
    distinct_masters = df.deduped.unique()
    self.assertEqual(1, len(distinct_masters))
def test_case_insensitive_build_matches_list(self):
    """Check the match list built by fit() for series differing only in case.

    'foo'/'FOO' and 'BAR'/'bar' sit at positions 0 and 1 of the two
    series, so two rows (0, 0) and (1, 1) with similarity 1.0 are expected
    in ``_matches_list``.
    """
    test_series_1 = pd.Series(['foo', 'BAR', 'baz'])
    test_series_2 = pd.Series(['FOO', 'bar', 'bop'])
    sg = StringGrouper(test_series_1, test_series_2)
    sg = sg.fit()

    expected_df = pd.DataFrame({
        'master_side': [0, 1],
        'dupe_side': [0, 1],
        'similarity': [1.0, 1.0],
    })
    # Replace the column outright instead of assigning through
    # ``.loc[:, 'similarity']``: on pandas >= 2.0 the ``.loc`` form writes
    # into the existing float64 array, which would silently discard the
    # dtype requested here.
    expected_df['similarity'] = expected_df['similarity'].astype(
        sg._config.tfidf_matrix_dtype)

    pd.testing.assert_frame_equal(expected_df, sg._matches_list)