def test_get_name_from_dataframes(self):
    """Check company names extracted from several dataframes at once.

    Two frames keyed by column "A" are passed together with their file
    names; the expected result pairs each file name with the lower-cased,
    whitespace-collapsed company name, in frame order.
    """
    file_names = ["example1.xlsx", "example2.xlsx"]
    first_file, second_file = file_names

    first_frame = pd.DataFrame(data={
        "A": [" kek, \n LLC\n\t\r ", "lol singapore, llc"],
        "\n\t\r\nB\n": ["b", "b"],
    })
    second_frame = pd.DataFrame(data={
        "A": [" lel, \n LLC\n\t\r ", "kok london, llc"],
        "\n\t gasf\n": ["b", "b"],
    })

    expected_names = [
        CompanyNameWithFileName(first_file, "kek, llc"),
        CompanyNameWithFileName(first_file, "lol singapore, llc"),
        CompanyNameWithFileName(second_file, "lel, llc"),
        CompanyNameWithFileName(second_file, "kok london, llc"),
    ]

    preprocessor = DataFramePreprocessor("A")
    actual_names = preprocessor.get_company_names_from_dataframes(
        [first_frame, second_frame], file_names)

    assert expected_names == actual_names
def test_mapper_to_df():
    """Check the dataframe produced from a CompanyMapper.

    The expected frame uses CompanyMapper.COLUMN_NAMES as its columns and
    lists the rows in the order asserted below.
    """
    mapping = {
        CompanyNameWithFileName("a", "b"): 1,
        CompanyNameWithFileName("c", "z"): 0,
        CompanyNameWithFileName("c", "b"): 1,
        CompanyNameWithFileName("c", "ze"): 2,
    }
    mapper = CompanyMapper(mapping)

    expected_frame = DataFrame(
        data=[
            ["a", "b", 1],
            ["c", "b", 1],
            ["c", "z", 0],
            ["c", "ze", 2],
        ],
        columns=CompanyMapper.COLUMN_NAMES,
    )

    actual_frame = CompanyMapper.create_dataframe_from_mapper(mapper)

    assert actual_frame.equals(expected_frame)
def test_df_to_mapper():
    """Check the mapper built from a dataframe with untidy headers and names.

    The input carries an extra column, mixed-case/whitespace-padded headers
    and padded company names; the expected mapping uses cleaned-up names.
    """
    header = ["dasd", "file_name", "Company_name\t", "group_Id"]
    records = [
        ["dasd", "a", "B ", 1],
        ["dasd", "c", "B \n", 1],
        ["dasd", "c", "z ", 0],
        ["dasd", "c", " ze", 2],
    ]
    source_frame = DataFrame(data=records, columns=header)

    mapper = CompanyMapper.create_mapper_from_dataframe(source_frame)
    actual_mapping = mapper.name_to_group

    expected_mapping = {
        CompanyNameWithFileName("a", "b"): 1,
        CompanyNameWithFileName("c", "b"): 1,
        CompanyNameWithFileName("c", "z"): 0,
        CompanyNameWithFileName("c", "ze"): 2,
    }
    assert actual_mapping == expected_mapping
def test_get_dict_to_others():
    """Check the per-file mapping to company names from other files.

    Every key returned by the clusterizer must map (as a set) to the
    expected set of names for that file.
    """
    clusterizator = JacardDistanceClusterization(names_1)

    comp_b_0 = CompanyNameWithFileName("b", "comp_b_0, llc")
    comp_b_1 = CompanyNameWithFileName("b", "comp_b_1, llc")
    expected_by_file = {
        "a": {
            comp_b_0,
            comp_b_1,
            CompanyNameWithFileName("c", "comp_c_0, llc"),
        },
        "b": {CompanyNameWithFileName("c", "comp_c_0, llc")},
        "c": set(),
    }

    actual_by_file = (
        clusterizator.get_dict_file_name_to_other_company_names())

    # Only keys present in the actual result are checked.
    for file_name, other_names in actual_by_file.items():
        assert set(other_names) == expected_by_file[file_name]
def test_get_name_from_dataframe(self):
    """Check company names extracted from a single dataframe.

    Column "A" holds the raw names; the expected result pairs the file
    name with each lower-cased, whitespace-collapsed company name.
    """
    file_name = "example.xlsx"
    source_frame = pd.DataFrame(data={
        "A": [" kek, \n LLC\n\t\r ", "lol singapore, llc"],
        "\n\t\r\nB\n": ["b", "b"],
    })

    expected_names = [
        CompanyNameWithFileName(file_name, "kek, llc"),
        CompanyNameWithFileName(file_name, "lol singapore, llc"),
    ]

    preprocessor = DataFramePreprocessor("A")
    actual_names = preprocessor.get_company_names_from_dataframe(
        source_frame, file_name)

    assert expected_names == actual_names
def get_company_names_from_dataframe(self, dataframe: pd.DataFrame,
                                     file_name: str):
    """Return the company names found in ``dataframe`` tagged with ``file_name``.

    A deep copy of the frame is handed to the column-name standardization
    and validation helpers, so they never operate on the caller's object.
    Each value of the key column is passed through ``Utils.normalize_string``
    and wrapped in a ``CompanyNameWithFileName``.

    :param dataframe: frame to read the company names from
    :param file_name: file name attached to every returned company name
    :return: list of ``CompanyNameWithFileName`` in column order
    """
    working_copy = dataframe.copy(deep=True)

    self._standartize_columns_names(working_copy)
    self._check_key_existence(working_copy)
    self._check_names_nonempty(working_copy)

    return [
        CompanyNameWithFileName(file_name, Utils.normalize_string(raw_name))
        for raw_name in working_copy[self.__key_name]
    ]
def create_mapper_from_dataframe(dataframe: DataFrame):
    """Build a ``CompanyMapper`` from a dataframe of (file, company, group) rows.

    Column headers are normalized with ``Utils.normalize_string`` before
    being matched against ``CompanyMapper.COLUMN_NAMES``; columns outside
    that list are ignored. The company-name cell of every row is normalized
    the same way, while the file-name and group cells are used as they are.
    If the same (file name, company name) pair occurs more than once, the
    last row wins.

    The caller's dataframe is left untouched: the header normalization is
    applied to a shallow copy instead of being assigned to the argument.

    :param dataframe: frame containing at least the columns listed in
        ``CompanyMapper.COLUMN_NAMES`` (after header normalization)
    :return: ``CompanyMapper`` wrapping the name -> group dictionary
    :raises AssertionError: if any required column is missing
    """
    # Relabel a shallow copy: only the column axis changes, so there is no
    # need to duplicate the data, and the caller keeps its original headers.
    normalized = dataframe.copy(deep=False)
    normalized.columns = Index(
        [Utils.normalize_string(column) for column in dataframe.columns])

    if not set(CompanyMapper.COLUMN_NAMES).issubset(set(normalized.columns)):
        raise AssertionError("necessary columns do not exist")

    file_column, name_column, group_column = CompanyMapper.COLUMN_NAMES[:3]
    relevant = normalized[CompanyMapper.COLUMN_NAMES]

    name_to_group: Dict[CompanyNameWithFileName, int] = {}
    for _, row in relevant.iterrows():
        name = CompanyNameWithFileName(
            row[file_column], Utils.normalize_string(row[name_column]))
        name_to_group[name] = row[group_column]
    return CompanyMapper(name_to_group)
def test_get_group_to_names():
    """Check the inversion of a name -> group mapping into group -> names.

    The returned collections are converted to sets before comparison, so
    the order of names inside a group is not part of the assertion.
    """
    name_to_group = {
        CompanyNameWithFileName("a", "b"): 1,
        CompanyNameWithFileName("c", "b"): 1,
        CompanyNameWithFileName("c", "z"): 0,
        CompanyNameWithFileName("c", "ze"): 2,
    }

    grouped = CompanyMapper.get_group_to_names(name_to_group)
    actual = {group: set(names) for group, names in grouped.items()}

    expected = {
        0: {CompanyNameWithFileName("c", "z")},
        1: {
            CompanyNameWithFileName("a", "b"),
            CompanyNameWithFileName("c", "b"),
        },
        2: {CompanyNameWithFileName("c", "ze")},
    }
    assert actual == expected
def test_get_indexes_of_common_companies():
    """Check the index lists returned for companies shared between files.

    Three files ("d", "a", "c") are supplied as series of company names;
    the assertion pins the exact nested list returned by the mapper.
    """
    mapper = CompanyMapper({
        CompanyNameWithFileName("a", "b"): 1,
        CompanyNameWithFileName("c", "b"): 1,
        CompanyNameWithFileName("c", "z"): 0,
        CompanyNameWithFileName("c", "ze"): 2,
        CompanyNameWithFileName("d", "ze zE"): 3,
        CompanyNameWithFileName("d", "b"): 1,
        CompanyNameWithFileName("d", "z"): 0,
    })

    series_by_file = {
        "d": Series(("ze ze", "z", "b")),
        "a": Series(("b", )),
        "c": Series(("ze", "b", "z")),
    }

    expected_indexes = [[None, 2, 1], [0, 1, 2]]
    actual_indexes = mapper.get_indexes_of_common_companies(series_by_file)

    assert actual_indexes == expected_indexes
from companies_union.clusterization.jacard_distance_clusterization import JacardDistanceClusterization from companies_union.company_name import CompanyNameWithFileName import pytest names_1 = [ CompanyNameWithFileName("a", "comp_a_0, llc"), CompanyNameWithFileName("b", "comp_b_0, llc"), CompanyNameWithFileName("a", "comp_a_1, llc"), CompanyNameWithFileName("b", "comp_b_1, llc"), CompanyNameWithFileName("c", "comp_c_0, llc"), ] expected_groups_1 = [0, 2, 1, 3, 4] names_2 = [ CompanyNameWithFileName("a", "a b c d"), CompanyNameWithFileName("b", "a"), CompanyNameWithFileName("a", "a b c e f, llc"), CompanyNameWithFileName("b", "a b c e, llc"), CompanyNameWithFileName("c", "a b c d, llc"), ] expected_groups_2 = [0, 2, 1, 1, 0] names_3 = [ CompanyNameWithFileName("a", "a b c d"), CompanyNameWithFileName("b", "a b d"), CompanyNameWithFileName("b", "a b c"), CompanyNameWithFileName("a", "a b c e f, llc"), CompanyNameWithFileName("b", "a b c e, llc"),
def test_equals(self):
    """Names differing only in case and whitespace must compare equal."""
    tidy = CompanyNameWithFileName("das", "a b, llc")
    messy = CompanyNameWithFileName("das", "a B, \n lLc \n")
    assert tidy == messy
def test_distances(self):
    """The distance between the two sample names must be at least 1."""
    left = CompanyNameWithFileName("a", "a b c e f, llc")
    right = CompanyNameWithFileName("a", "a b c d, llc")
    measured = left.distance(right)
    assert measured >= 1
def test_jacard_distances(self):
    """The Jaccard distance between "a e" and "a b c d e" must equal 0.6."""
    short_name = CompanyNameWithFileName("das", "a e")
    long_name = CompanyNameWithFileName("das", "a b c d e")
    measured = short_name.jacard_distance(long_name)
    assert measured == 0.6
def test_tokens(self):
    """The tokens of "a b, llc" must be exactly {"a", "b"}."""
    company = CompanyNameWithFileName("das", "a b, llc")
    assert company.tokens == {"a", "b"}
} expected = { 0: {CompanyNameWithFileName("c", "z")}, 1: {CompanyNameWithFileName("a", "b"), CompanyNameWithFileName("c", "b")}, 2: {CompanyNameWithFileName("c", "ze")} } actual = CompanyMapper.get_group_to_names(name_to_group) for group in actual.keys(): actual[group] = set(actual[group]) assert actual == expected name_to_group = { CompanyNameWithFileName("a", "b"): 1, CompanyNameWithFileName("c", "b"): 1, CompanyNameWithFileName("c", "z"): 0, CompanyNameWithFileName("c", "ze"): 2, CompanyNameWithFileName("d", "ze zE"): 3, CompanyNameWithFileName("d", "b"): 1 } @pytest.mark.parametrize("file_name,series,expected", [("a", Series(("b", )), []), ("c", Series(("ze", "b", "z")), [0, 2]), ("d", Series(("b", "ze ze")), [1])]) def test_get_indexes_of_unique_companies(file_name, series, expected): mapper = CompanyMapper(name_to_group) actual = mapper.get_indexes_of_unique_companies(series, file_name)