def run(excel_sheet_path):
    """Run three similarity comparisons and print the shape of each result.

    A ``PairHolder`` is built from ``excel_sheet_path``; ``run_levenshtein``,
    ``run_jaro_winkler`` and ``run_cosine`` are then called in that order.
    The first two are called with ``data_limit=500``, the cosine run with
    ``data_limit=None``.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
    """
    pair_holder = PairHolder.get_holder_from_excel(excel_sheet_path)
    # Tuple elements are evaluated left to right, so all three runs finish
    # (in this order) before anything is printed.
    outcomes = (
        pair_holder.run_levenshtein(1, ["NAME1"], "NAME1", data_limit=500),
        pair_holder.run_jaro_winkler(
            0.15, ["NAME1", "STRAS"], "NAME1", data_limit=500
        ),
        pair_holder.run_cosine(
            0.9,
            ["NAME1", "STRAS", "ORT01", "PSTLZ", "LAND1"],
            "_KNA1.KUNNR",
            data_limit=None,
        ),
    )
    for outcome in outcomes:
        print(outcome.shape)
def run_same_countries(excel_sheet_path, result_path, precision):
    """Run a cosine comparison per ``LAND1`` value and save each result as CSV.

    For every value returned by ``get_unique_column_values("LAND1")`` the
    cosine run is filtered to that value and its result is written to
    ``result_path + <value> + ".csv"``.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
        result_path: Prefix prepended to ``<value>.csv`` for each output file.
        precision: Passed through as the first argument of ``run_cosine``.

    Returns:
        Always ``True``.
    """
    pair_holder = PairHolder.get_holder_from_excel(excel_sheet_path)
    compared_fields = ["NAME1", "STRAS"]
    for land in pair_holder.get_unique_column_values("LAND1"):
        per_country = pair_holder.run_cosine(
            precision,
            compared_fields,
            "NAME1",
            data_limit=None,
            filter_field="LAND1",
            filter_value=land,
        )
        # Built after the run so a bad value fails at the same point as before.
        target_file = result_path + land + ".csv"
        pair_holder.save_to_csv(per_country, target_file)
    return True
# Example no. 3
# 0
from duplicateDetector.pairholder import PairHolder
from duplicateDetector import ROOT_PATH
import pandas as pd
import unittest

# Absolute, machine-specific path to the workbook the tests below read.
excel_sheet_path = "/Users/Moritz/Downloads/Customers.xlsx"
# Built once at import time and shared by every test method in TestSum.
holder = PairHolder.get_holder_from_excel(excel_sheet_path)

class TestSum(unittest.TestCase):
    """Checks on the results of the module-level ``holder``'s run_* methods.

    Every test filters on ``LAND1 == "DE"``.
    """

    # Column names the Levenshtein and Jaro-Winkler results must contain.
    _PAIR_COLUMNS = ("left_side", "right_side", "similarity")

    def _check_pair_frame(self, frame):
        """Assert ``frame`` is a non-empty DataFrame with the three pair columns."""
        self.assertIsInstance(frame, pd.DataFrame)
        self.assertGreater(len(frame), 0)
        for column in self._PAIR_COLUMNS:
            self.assertIn(column, frame.columns)
        self.assertEqual(len(frame.columns), 3)

    def test_run_levenshtein(self):
        """``run_levenshtein`` yields a non-empty three-column frame."""
        frame = holder.run_levenshtein(
            4,
            ["NAME1", "STRAS"],
            "_KNA1.KUNNR",
            data_limit=3000,
            filter_field="LAND1",
            filter_value="DE",
        )
        self._check_pair_frame(frame)

    def test_run_jaro_winkler(self):
        """``run_jaro_winkler`` yields a non-empty three-column frame."""
        frame = holder.run_jaro_winkler(
            0.4,
            ["NAME1", "STRAS"],
            "_KNA1.KUNNR",
            data_limit=1000,
            filter_field="LAND1",
            filter_value="DE",
        )
        self._check_pair_frame(frame)

    def test_run_cosine(self):
        """``run_cosine`` returns a DataFrame (only the type is checked here)."""
        frame = holder.run_cosine(
            0.7,
            ["NAME1", "STRAS"],
            "_KNA1.KUNNR",
            filter_field="LAND1",
            filter_value="DE",
        )
        self.assertIsInstance(frame, pd.DataFrame)
def plot_cosine_de(excel_sheet_path):
    """Call ``plot_cosine`` for the rows whose ``LAND1`` is ``"DE"``.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
    """
    pair_holder = PairHolder.get_holder_from_excel(excel_sheet_path)
    compared_fields = ["NAME1", "STRAS", "ORT01"]
    pair_holder.plot_cosine(
        0.8,
        compared_fields,
        "NAME1",
        filter_field="LAND1",
        filter_value="DE",
    )
def run_filter_cluster(excel_sheet_path):
    """Run ``run_cluster`` filtered to ``LAND1 == "DE"`` and print the result.

    The call uses ``min_cluster_size=3``, ``data_limit=None`` and ``plot=True``.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
    """
    pair_holder = PairHolder.get_holder_from_excel(excel_sheet_path)
    print(
        pair_holder.run_cluster(
            0.8,
            ["NAME1", "STRAS"],
            "NAME1",
            data_limit=None,
            min_cluster_size=3,
            filter_field="LAND1",
            filter_value="DE",
            plot=True,
        )
    )
def cluster_cosine(excel_sheet_path):
    """Run ``PairHolder.cluster_cosine`` for ``LAND1 == "DE"`` and print the result.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
    """
    pair_holder = PairHolder.get_holder_from_excel(excel_sheet_path)
    compared_fields = ["NAME1", "STRAS", "ORT01"]
    clustered = pair_holder.cluster_cosine(
        0.8,
        compared_fields,
        "NAME1",
        filter_field="LAND1",
        filter_value="DE",
    )
    print(clustered)
def run_all_countries(excel_sheet_path, result_path):
    """Run one unfiltered cosine comparison and save its result as CSV.

    Args:
        excel_sheet_path: Path handed to ``PairHolder.get_holder_from_excel``.
        result_path: Destination passed to ``save_to_csv``.
    """
    pair_holder = PairHolder.get_holder_from_excel(excel_sheet_path)
    compared_fields = ["NAME1", "STRAS", "ORT01", "PSTLZ", "LAND1"]
    all_pairs = pair_holder.run_cosine(
        0.8,
        compared_fields,
        "_KNA1.KUNNR",
        data_limit=None,
    )
    pair_holder.save_to_csv(all_pairs, result_path)