def test_substitution_notfound(self):
    """An unknown "$..." placeholder substitutes to nothing; literals and
    known placeholders are still rendered."""
    pattern = ["$NotToBeFound", "ZZ", "$FamilyName"]
    result = Processing().substitute(pattern, self.datasets[0])
    self.assertEqual(result, "ZZARAGO MONZONIS")
def test_substitution(self):
    """Placeholders are replaced by dataset attribute values and literal
    fragments (here a space) are kept verbatim."""
    pattern = ["$GivenName", " ", "$FamilyName"]
    result = Processing().substitute(pattern, self.datasets[0])
    self.assertEqual(result, "FRANCISCO JOSE ARAGO MONZONIS")
def test_transform_exact_match(self):
    """Transforming two matching datasets yields at least one compare
    tuple, and both sides of every tuple are identical."""
    pairs = Processing().transform(self.datasets[0],
                                   self.datasets[1],
                                   self.mappings)
    self.assertGreaterEqual(len(pairs), 1)
    for pair in pairs:
        self.assertEqual(pair.items[0], pair.items[1])
def test_transform_similar_match_greek(self):
    """Greek-name fixture datasets produce exactly one compare tuple whose
    sides are near-identical transliterations."""
    pairs = Processing().transform(self.datasets[2],
                                   self.datasets[3],
                                   self.mappings)
    self.assertEqual(len(pairs), 1)
    first = pairs[0]
    self.assertEqual(first.items[0], "ANDREAS PETROU")
    self.assertEqual(first.items[1], "ANDREAS PETRO")
def test_no_match_set(self):
    """match_set returns an empty set when no pairing in the attribute
    map matches the dataset.

    NOTE(review): an earlier contract raised MapDatasetMatchNotFound
    here; the current one returns an empty set instead.
    """
    mapping = AttributeMap()
    pairing = Pairing()
    pairing.profile = "fail"
    pairing.issuer = "fail"
    pairing.categories = ["None", "to", "be", "found"]
    mapping.pairings = [pairing]
    p = Processing()
    pairings = p.match_set(self.datasets[0], mapping)
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use the canonical assertEqual.
    self.assertEqual(pairings, set())
def test_clean_string(self):
    """Tools.clean_string strips the unwanted characters and
    Tools.clean_spaces then collapses the remaining whitespace."""
    input_string = " legitimate-.,;:_·<>+\\|/'#@()\"\t\n\r!%&=?¡¿ text "
    expected_string = "legitimate text"
    # Both helpers are called on Tools directly; the original built an
    # unused Processing() instance, removed here.
    output_string = Tools.clean_string(input_string,
                                       StringProcessor.unwanted_chars)
    output_string = Tools.clean_spaces(output_string)
    self.assertEqual(expected_string, output_string)
def __init__(self):
    """Wire up the processing pipeline and the string comparator; the
    attribute mappings start out unset."""
    self.mappings = None
    self.processor = Processing()
    comparator = Comparison()
    comparator.set_comparator(comparison_alg)
    self.comparator = comparator
class Reconciliation:
    """Computes a normalised similarity coefficient between two Datasets
    using configurable attribute mappings and a string comparator."""

    def __init__(self):
        self.processor = Processing()
        self.mappings = None
        self.comparator = Comparison()
        self.comparator.set_comparator(comparison_alg)

    def set_mappings(self, mappings: list):
        """Load attribute maps from a list of plain dicts."""
        self.mappings = [cast_from_dict(mt, AttributeMap)
                         for mt in mappings]

    def set_mappings_from_json(self, mappings: str):
        """Load attribute maps from a JSON-encoded list of dicts."""
        self.set_mappings(json.loads(mappings))

    def similarity(self, dataset_a: Dataset, dataset_b: Dataset):
        """Return a similarity level for two given datasets.

        Raises:
            MissingOrBadParams: if either argument is not a Dataset.
            NoMatchingRules: if no compare tuples could be generated.
        """
        if not isinstance(dataset_a, Dataset) \
                or not isinstance(dataset_b, Dataset):
            raise MissingOrBadParams(
                "Passed parameters are not Dataset objects")
        # Build the tuple set to compare
        compare_tuples = self.processor.transform(dataset_a, dataset_b,
                                                  self.mappings)
        if not compare_tuples:
            raise NoMatchingRules("No compare tuples could be generated")
        # Set the similarity of each tuple
        for ctuple in compare_tuples:
            ctuple.normalised_similarity = self.comparator.compare(
                ctuple.items[0], ctuple.items[1])
        # Length of the longest item in each tuple drives its weight.
        # (The original also collected a set of these lengths and kept
        # commented-out code for a max-based weight; both were unused
        # and are removed here.)
        for ctuple in compare_tuples:
            ctuple.max_length = len(max(ctuple.items, key=len))
        sum_lengths = sum(ct.max_length for ct in compare_tuples)
        # Normalised length-dependent weight; 1.0 when the feature is off
        for ctuple in compare_tuples:
            if use_length_weight:
                ctuple.length_weight = ctuple.max_length / sum_lengths
            else:
                ctuple.length_weight = 1.0
        # Normalised-weighted similarity for each tuple
        similarities = [ct.normalised_similarity * ct.weight
                        * ct.length_weight
                        for ct in compare_tuples]
        # Dataset Similarity Coefficient: when length weights are in use
        # they already sum to 1, so a plain sum is normalised; otherwise
        # take the mean over the tuples.
        if use_length_weight:
            return sum(similarities)
        return sum(similarities) / len(similarities)
def on_pubmsg(self, serv, ev):
    """IRC public-message hook: hand the message text, the server handle
    and the channel target over to the processor."""
    Processing.processMessage(ev.arguments[0], serv, ev.target)
def test_substitution_no_attrs(self):
    """An empty attribute pattern substitutes to the empty string."""
    result = Processing().substitute([], self.datasets[0])
    self.assertEqual(result, "")
def test_no_getAttributeValue(self):
    """Looking up an attribute name that is absent from the dataset's
    attribute list returns None."""
    attributes = self.datasets[0].attributes
    value = Processing().getAttributeValue("fail", attributes)
    self.assertIsNone(value)
def test_clean_spaces(self):
    """Tools.clean_spaces trims the ends and collapses internal runs of
    whitespace to single spaces."""
    input_string = " a b c d "
    expected_string = "a b c d"
    # Tools.clean_spaces is called on the class directly; the original
    # built an unused Processing() instance, removed here.
    output_string = Tools.clean_spaces(input_string)
    self.assertEqual(expected_string, output_string)