def run(excel_sheet_path):
    """Run three similarity passes over one sheet and print each result's shape.

    A holder is built from the Excel file at ``excel_sheet_path``; then
    ``run_levenshtein``, ``run_jaro_winkler`` and ``run_cosine`` are invoked
    in that order. Nothing is returned.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
    """
    pairs = PairHolder.get_holder_from_excel(excel_sheet_path)

    # NOTE(review): the leading number is presumably each metric's threshold
    # and the list the columns being compared -- confirm against PairHolder.
    levenshtein_hits = pairs.run_levenshtein(
        1, ["NAME1"], "NAME1", data_limit=500
    )
    jaro_winkler_hits = pairs.run_jaro_winkler(
        0.15, ["NAME1", "STRAS"], "NAME1", data_limit=500
    )
    cosine_hits = pairs.run_cosine(
        0.9,
        ["NAME1", "STRAS", "ORT01", "PSTLZ", "LAND1"],
        "_KNA1.KUNNR",
        data_limit=None,
    )

    # All three runs finish before anything is printed.
    for hits in (levenshtein_hits, jaro_winkler_hits, cosine_hits):
        print(hits.shape)
def run_same_countries(excel_sheet_path, result_path, precision):
    """Run a cosine pass per distinct ``LAND1`` value and save each to CSV.

    For every value returned by ``get_unique_column_values("LAND1")`` the
    holder's ``run_cosine`` is called with that value as ``filter_value``,
    and the result is saved to ``result_path + <value> + ".csv"``.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
        result_path: String prefix prepended to each output file name.
        precision: First positional argument forwarded to ``run_cosine``.

    Returns:
        Always ``True``.
    """
    pairs = PairHolder.get_holder_from_excel(excel_sheet_path)

    for land in pairs.get_unique_column_values("LAND1"):
        matches = pairs.run_cosine(
            precision,
            ["NAME1", "STRAS"],
            "NAME1",
            data_limit=None,
            filter_field="LAND1",
            filter_value=land,
        )
        # Plain concatenation: result_path acts as a prefix, not a directory.
        target = result_path + land + ".csv"
        pairs.save_to_csv(matches, target)

    return True
import unittest

import pandas as pd

from duplicateDetector.pairholder import PairHolder
from duplicateDetector import ROOT_PATH  # not referenced by the tests below

# NOTE(review): the workbook is loaded once, at import time, from a
# machine-specific absolute path; importing this module on another machine
# will presumably fail before any test runs -- confirm and consider a fixture.
excel_sheet_path = "/Users/Moritz/Downloads/Customers.xlsx"
holder = PairHolder.get_holder_from_excel(excel_sheet_path)


class TestSum(unittest.TestCase):
    """Checks on the results of the shared holder's similarity runs.

    Every test filters on ``LAND1 == "DE"`` and compares the ``NAME1`` and
    ``STRAS`` columns, passing ``"_KNA1.KUNNR"`` as the third argument.
    """

    def _check_pair_frame(self, frame):
        """Assert ``frame`` is a non-empty DataFrame with the three pair columns."""
        self.assertIsInstance(frame, pd.DataFrame)
        self.assertGreater(len(frame), 0)
        for column in ("left_side", "right_side", "similarity"):
            self.assertIn(column, frame.columns)
        self.assertEqual(len(frame.columns), 3)

    def test_run_levenshtein(self):
        """Levenshtein run (first argument 4, ``data_limit=3000``)."""
        frame = holder.run_levenshtein(
            4,
            ["NAME1", "STRAS"],
            "_KNA1.KUNNR",
            data_limit=3000,
            filter_field="LAND1",
            filter_value="DE",
        )
        self._check_pair_frame(frame)

    def test_run_jaro_winkler(self):
        """Jaro-Winkler run (first argument 0.4, ``data_limit=1000``)."""
        frame = holder.run_jaro_winkler(
            0.4,
            ["NAME1", "STRAS"],
            "_KNA1.KUNNR",
            data_limit=1000,
            filter_field="LAND1",
            filter_value="DE",
        )
        self._check_pair_frame(frame)

    def test_run_cosine(self):
        """Cosine run (first argument 0.7); only the return type is checked."""
        frame = holder.run_cosine(
            0.7,
            ["NAME1", "STRAS"],
            "_KNA1.KUNNR",
            filter_field="LAND1",
            filter_value="DE",
        )
        self.assertIsInstance(frame, pd.DataFrame)
def plot_cosine_de(excel_sheet_path):
    """Call the holder's ``plot_cosine`` for rows where ``LAND1`` is ``"DE"``.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
    """
    pairs = PairHolder.get_holder_from_excel(excel_sheet_path)
    compared_fields = ["NAME1", "STRAS", "ORT01"]
    pairs.plot_cosine(
        0.8,
        compared_fields,
        "NAME1",
        filter_field="LAND1",
        filter_value="DE",
    )
def run_filter_cluster(excel_sheet_path):
    """Call ``run_cluster`` for rows where ``LAND1`` is ``"DE"`` and print the result.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
    """
    pairs = PairHolder.get_holder_from_excel(excel_sheet_path)

    # Keyword arguments are kept in the order of the original call.
    cluster_options = {
        "data_limit": None,
        "min_cluster_size": 3,
        "filter_field": "LAND1",
        "filter_value": "DE",
        "plot": True,
    }
    clusters = pairs.run_cluster(
        0.8, ["NAME1", "STRAS"], "NAME1", **cluster_options
    )
    print(clusters)
def cluster_cosine(excel_sheet_path):
    """Call the holder's ``cluster_cosine`` for ``LAND1 == "DE"`` and print the result.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
    """
    pairs = PairHolder.get_holder_from_excel(excel_sheet_path)
    print(
        pairs.cluster_cosine(
            0.8,
            ["NAME1", "STRAS", "ORT01"],
            "NAME1",
            filter_field="LAND1",
            filter_value="DE",
        )
    )
def run_all_countries(excel_sheet_path, result_path):
    """Run one cosine pass with no country filter and save the result.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
        result_path: Destination passed unchanged to ``save_to_csv``.
    """
    pairs = PairHolder.get_holder_from_excel(excel_sheet_path)
    compared_fields = ["NAME1", "STRAS", "ORT01", "PSTLZ", "LAND1"]
    matches = pairs.run_cosine(
        0.8, compared_fields, "_KNA1.KUNNR", data_limit=None
    )
    pairs.save_to_csv(matches, result_path)