def test_caseless_match_fail(self):
    a = "TestString"
    b = "testString1"
    comp = Comparison()
    comp.set_comparator("CaseInsensitiveMatch")
    res = comp.compare(a, b)
    self.assertEqual(res, 0)

def test_case_match_match(self):
    a = "TestString"
    b = "TestString"
    comp = Comparison()
    comp.set_comparator("CaseSensitiveMatch")
    res = comp.compare(a, b)
    self.assertEqual(res, 1)

def test_damerau_match_fail(self):
    a = "aabc"
    b = "aaaa"
    threshold = 0.7
    comp = Comparison()
    comp.set_comparator("DamerauLevenshtein")
    res = comp.compare(a, b)
    self.assertLess(res, threshold)

def test_damerau_match_perfect(self):
    a = "aaaa"
    b = "aaaa"
    threshold = 1
    comp = Comparison()
    comp.set_comparator("DamerauLevenshtein")
    res = comp.compare(a, b)
    self.assertEqual(res, threshold)

def test_damerau_match_success(self):
    a = "aaaa"
    b = "aaaa1"
    threshold = 0.7
    comp = Comparison()
    comp.set_comparator("DamerauLevenshtein")
    res = comp.compare(a, b)
    self.assertGreaterEqual(res, threshold)

def test_levenshtein_match_success(self):
    a = "aaaa"
    b = "aaaa1"
    threshold = 0.7
    comp = Comparison()
    comp.set_comparator("LevenshteinDistance")
    res = comp.compare(a, b)
    self.assertGreaterEqual(res, threshold)
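# The tests above exercise a Comparison class defined elsewhere in the
# project. The sketch below is a minimal illustration, inferred from the
# assertions, of the interface they assume: set_comparator() selects an
# algorithm by name and compare() returns a similarity normalised to [0, 1].
# The implementations here are assumptions, not the project's actual code;
# "DamerauLevenshtein" (Levenshtein plus transpositions) is omitted for
# brevity.
def _levenshtein(a: str, b: str) -> int:
    # Classic dynamic-programming edit distance.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                # deletion
                            curr[j - 1] + 1,            # insertion
                            prev[j - 1] + (ca != cb)))  # substitution
        prev = curr
    return prev[-1]


class ComparisonSketch:
    def __init__(self):
        self._compare = None

    def set_comparator(self, name: str):
        # Exact-match comparators yield 0 or 1; edit-distance comparators
        # are normalised as 1 - distance / max(len(a), len(b)), which gives
        # 0.8 for "aaaa" vs "aaaa1", consistent with the tests above.
        self._compare = {
            "CaseSensitiveMatch":
                lambda a, b: int(a == b),
            "CaseInsensitiveMatch":
                lambda a, b: int(a.lower() == b.lower()),
            "LevenshteinDistance":
                lambda a, b: 1 - _levenshtein(a, b) / max(len(a), len(b)),
        }[name]

    def compare(self, a: str, b: str) -> float:
        return self._compare(a, b)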
import json


class Reconciliation:
    def __init__(self):
        self.processor = Processing()
        self.mappings = None
        self.comparator = Comparison()
        # comparison_alg is assumed to be a module-level configuration value
        # naming the comparator to use (e.g. "DamerauLevenshtein").
        self.comparator.set_comparator(comparison_alg)

    def set_mappings(self, mappings: list[dict]):
        self.mappings = []
        for mt in mappings:
            self.mappings.append(cast_from_dict(mt, AttributeMap))

    def set_mappings_from_json(self, mappings: str):
        self.set_mappings(json.loads(mappings))

    def similarity(self, dataset_a: Dataset, dataset_b: Dataset):
        """Return a similarity level for two given datasets."""
        if not isinstance(dataset_a, Dataset) \
                or not isinstance(dataset_b, Dataset):
            raise MissingOrBadParams(
                "Passed parameters are not Dataset objects")

        # Build the tuple set to compare
        compare_tuples = self.processor.transform(dataset_a, dataset_b,
                                                  self.mappings)
        if not len(compare_tuples):
            raise NoMatchingRules("No compare tuples could be generated")

        # Set the similarity of each tuple
        for ctuple in compare_tuples:
            ctuple.normalised_similarity = self.comparator.compare(
                ctuple.items[0], ctuple.items[1])

        # Calculate the length-dependent weight: each tuple is weighted by
        # the length of its longest string, normalised over all tuples.
        # (Normalising by the single longest tuple, max_length / tuples_max,
        # was considered but dropped in favour of normalising by the sum.)
        for ctuple in compare_tuples:
            ctuple.max_length = len(max(ctuple.items, key=len))
        sum_lengths = sum(ctuple.max_length for ctuple in compare_tuples)

        # Calculate the normalised weight. use_length_weight is assumed to
        # be a module-level configuration flag.
        for ctuple in compare_tuples:
            ctuple.length_weight = 1.0
            if use_length_weight:
                ctuple.length_weight = ctuple.max_length / sum_lengths

        # Calculate the normalised, weighted similarity for each tuple.
        # ctuple.weight is the per-rule weight assigned during transform().
        similarities = []
        for ctuple in compare_tuples:
            similarities.append(ctuple.normalised_similarity *
                                ctuple.weight *
                                ctuple.length_weight)

        # Dataset Similarity Coefficient: when length weights are in use they
        # already sum to 1, so the plain sum is the weighted mean; otherwise
        # divide by the tuple count to take the arithmetic mean.
        sim = sum(similarities)
        if not use_length_weight:
            sim /= len(similarities)
        return sim
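# A hedged usage sketch for Reconciliation. Dataset construction and the
# exact AttributeMap fields are project-specific; the field names used here
# ("attribute_a", "attribute_b", "weight") are illustrative assumptions, not
# a confirmed schema.
#
#     reconciliation = Reconciliation()
#     reconciliation.set_mappings_from_json(json.dumps([
#         {"attribute_a": "name", "attribute_b": "full_name", "weight": 1.0},
#     ]))
#     score = reconciliation.similarity(dataset_a, dataset_b)
#     # score is the Dataset Similarity Coefficient, a float in [0, 1].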