Ejemplos de StringGrouper.get_groups en Python

Lenguaje de programación: Python

Namespace/Package Name: string_grouper.string_grouper

Clase / Tipo: StringGrouper

Método / Función: get_groups

Ejemplos en hotexamples.com: 11

Python StringGrouper.get_groups - 11 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de string_grouper.string_grouper.StringGrouper.get_groups extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Métodos usados con frecuencia

Mostrar Ocultar

StringGrouper(18)

fit(15)

get_groups(11)

get_matches(8)

add_match(4)

n_grams(4)

_get_tf_idf_matrices(3)

_build_matches(1)

_clean_groups(1)

_get_non_matches_list(1)

remove_match(1)

Ejemplo n.º 1

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: qinyufm/string_grouper

 def test_get_groups_raises_exception(self):
     """get_groups() on a grouper that was never fit must raise.

     The expected error type is StringGrouperNotFitException.
     """
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     unfitted = StringGrouper(master, duplicates)
     # fit() is deliberately skipped so that get_groups() has nothing to use.
     with self.assertRaises(StringGrouperNotFitException):
         unfitted.get_groups()

Ejemplo n.º 2

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: qinyufm/string_grouper

 def test_get_groups_single_df(self):
     """Grouping a single Series yields a Series of the same length.

     The near-duplicate 'foooob' is expected to be replaced by 'foooo';
     the other strings are expected to come back unchanged.
     """
     strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     grouper = StringGrouper(strings).fit()
     actual = grouper.get_groups()
     expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     pd.testing.assert_series_equal(expected, actual)

Ejemplo n.º 3

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: qinyufm/string_grouper

 def test_get_groups_two_df_no_match(self):
     """With two Series, the result has one entry per duplicate string.

     'dooz' has no counterpart in the master Series, so it is expected to be
     returned as-is; 'foooob' is expected to map to the master 'foooo'.
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
     grouper = StringGrouper(master, duplicates).fit()
     actual = grouper.get_groups()
     expected = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooo'])
     pd.testing.assert_series_equal(expected, actual)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: qinyufm/string_grouper

 def test_get_groups_two_df_same_similarity(self):
     """With two Series, the result has one entry per duplicate string.

     The master Series contains 'foooo' twice, so 'foooob' has two equally
     similar candidates; the test expects it to resolve to 'foooo'.
     """
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     grouper = StringGrouper(master, duplicates).fit()
     actual = grouper.get_groups()
     expected = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     pd.testing.assert_series_equal(expected, actual)

Ejemplo n.º 5

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: qinyufm/string_grouper

 def test_get_groups_two_df(self):
     """With two Series, the result has one entry per duplicate string.

     Each duplicate is expected to be replaced by the master string it is
     most similar to: here 'foooob' should map to the master 'foooo', while
     the exact matches map to themselves.
     """
     test_series_1 = pd.Series(['foooo', 'bar', 'baz'])
     test_series_2 = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     sg = StringGrouper(test_series_1, test_series_2)
     sg = sg.fit()
     result = sg.get_groups()
     expected_result = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     # The identifier was previously split as "expect ed_result", which is a
     # SyntaxError; it must be the single name bound on the line above.
     pd.testing.assert_series_equal(expected_result, result)

Ejemplo n.º 6

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: mbotezatu/string_grouper

 def test_get_groups_1_string_series_1_id_series(self):
     """Grouping one string Series with an ID Series yields a DataFrame.

     The expected frame has one row per input string, pairing the group's
     ID with the group's string; 'foooob' (A3) is expected to fall into the
     group of 'foooo' (A0).
     """
     strings = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     string_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     grouper = StringGrouper(strings, master_id=string_ids).fit()
     actual = grouper.get_groups()
     # Rows are (group id, group string); column labels are left at the
     # pandas defaults, exactly as the comparison requires.
     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows)
     pd.testing.assert_frame_equal(expected, actual)

Ejemplo n.º 7

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: mbotezatu/string_grouper

 def test_get_groups_4_df_no_match(self):
     """Two string Series plus two ID Series yield one row per duplicate.

     'dooz' (B1) has no counterpart in the master Series, so its own ID and
     string are expected back; every other duplicate is expected to carry
     the ID and string of its master match.
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     duplicates = pd.Series(['foooo', 'dooz', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3', 'B4'])
     grouper = StringGrouper(
         master,
         duplicates,
         master_id=master_ids,
         duplicates_id=duplicate_ids,
     ).fit()
     actual = grouper.get_groups()
     # Rows are (id, string) with default pandas column labels.
     expected_rows = [
         ('A0', 'foooo'),
         ('B1', 'dooz'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows)
     pd.testing.assert_frame_equal(expected, actual)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: mbotezatu/string_grouper

 def test_get_groups_4_df_same_similarity(self):
     """Two string Series plus two ID Series yield one row per duplicate.

     The master Series holds 'foooo' twice (A0 and A3). For the equally
     similar candidates the first one is expected to win, so both 'foooo'
     and 'foooob' should resolve to A0.
     """
     master = pd.Series(['foooo', 'bar', 'baz', 'foooo'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2', 'A3'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
     grouper = StringGrouper(
         master,
         duplicates,
         master_id=master_ids,
         duplicates_id=duplicate_ids,
     ).fit()
     actual = grouper.get_groups()
     # Rows are (id, string) with default pandas column labels.
     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows)
     pd.testing.assert_frame_equal(expected, actual)

Ejemplo n.º 9

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: mbotezatu/string_grouper

 def test_get_groups_2_string_series_2_id_series(self):
     """Two string Series plus two ID Series yield one row per duplicate.

     Each row is expected to hold the ID and string of the master entry the
     duplicate matches best; 'foooob' should resolve to ('A0', 'foooo').
     """
     master = pd.Series(['foooo', 'bar', 'baz'])
     duplicates = pd.Series(['foooo', 'bar', 'baz', 'foooob'])
     master_ids = pd.Series(['A0', 'A1', 'A2'])
     duplicate_ids = pd.Series(['B0', 'B1', 'B2', 'B3'])
     grouper = StringGrouper(
         master,
         duplicates,
         master_id=master_ids,
         duplicates_id=duplicate_ids,
     ).fit()
     actual = grouper.get_groups()
     # Rows are (id, string) with default pandas column labels.
     expected_rows = [
         ('A0', 'foooo'),
         ('A1', 'bar'),
         ('A2', 'baz'),
         ('A0', 'foooo'),
     ]
     expected = pd.DataFrame(expected_rows)
     pd.testing.assert_frame_equal(expected, actual)

Ejemplo n.º 10

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: justasojourner/string_grouper

 def test_get_groups_2_string_series_with_numeric_indexes_and_missing_master_value(self):
     """Numeric indexes, replace_na=True, and a duplicate with no master match.

     The expected DataFrame is indexed like the duplicates Series and has
     the columns 'most_similar_index' and 'most_similar_master'. 'baz'
     (index 102) has no counterpart in the master Series, so its own index
     and string are expected back; 'foooob' should resolve to master row 0.
     """
     master = pd.Series(['foooo', 'bar', 'foooo'], index=[0, 1, 2])
     duplicates = pd.Series(
         ['foooo', 'bar', 'baz', 'foooob'],
         index=[100, 101, 102, 103],
     )
     grouper = StringGrouper(master, duplicates, replace_na=True).fit()
     actual = grouper.get_groups()
     # Rows are (most_similar_index, most_similar_master).
     expected_rows = [
         (0, 'foooo'),
         (1, 'bar'),
         (102, 'baz'),
         (0, 'foooo'),
     ]
     expected = pd.DataFrame(
         expected_rows,
         columns=['most_similar_index', 'most_similar_master'],
         index=duplicates.index,
     )
     pd.testing.assert_frame_equal(expected, actual)

Ejemplo n.º 11

0

Mostrar archivo

Archivo: test_string_grouper.py Proyecto: justasojourner/string_grouper

    def test_prior_matches_added(self):
        """Adding a match must also update matches that already existed.

        After two add_match() calls linking all three names, get_groups()
        is expected to assign a single common value to every row.
        """
        names = [
            'microsoftoffice 365 home',
            'microsoftoffice 365 pers',
            'microsoft office',
        ]
        frame = pd.DataFrame(names, columns=['name'])

        grouper = (
            StringGrouper(frame['name'], ignore_index=True)
            .fit()
            .add_match('microsoft office', 'microsoftoffice 365 home')
            .add_match('microsoftoffice 365 pers', 'microsoft office')
        )
        frame['deduped'] = grouper.get_groups()

        # Exactly one distinct value means all rows share one group.
        distinct_groups = frame['deduped'].unique()
        self.assertEqual(1, len(distinct_groups))