Example 1
    def test_caseless_match_fail(self):
        a = "TestString"
        b = "testString1"

        comp = Comparison()
        comp.set_comparator("CaseInsensitiveMatch")
        res = comp.compare(a, b)

        self.assertEqual(res, 0)
Example 2
    def test_case_match_match(self):
        a = "TestString"
        b = "TestString"

        comp = Comparison()
        comp.set_comparator("CaseSensitiveMatch")
        res = comp.compare(a, b)

        self.assertEqual(res, 1)
Example 3
    def test_damerau_match_fail(self):
        a = "aabc"
        b = "aaaa"
        threshold = 0.7

        comp = Comparison()
        comp.set_comparator("DamerauLevenshtein")
        res = comp.compare(a, b)

        self.assertLess(res, threshold)
Example 4
    def test_damerau_match_perfect(self):
        a = "aaaa"
        b = "aaaa"
        threshold = 1

        comp = Comparison()
        comp.set_comparator("DamerauLevenshtein")
        res = comp.compare(a, b)

        self.assertEqual(res, threshold)
Example 5
    def test_damerau_match_success(self):
        a = "aaaa"
        b = "aaaa1"
        threshold = 0.7

        comp = Comparison()
        comp.set_comparator("DamerauLevenshtein")
        res = comp.compare(a, b)

        self.assertGreaterEqual(res, threshold)
Example 6
    def test_levenshtein_match_success(self):
        a = "aaaa"
        b = "aaaa1"
        threshold = 0.7

        comp = Comparison()
        comp.set_comparator("LevenshteinDistance")
        res = comp.compare(a, b)

        self.assertGreaterEqual(res, threshold)
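
The tests above pin down the contract of Comparison.compare: the exact matchers return 0 or 1, and the edit-distance comparators return a similarity normalised to the [0, 1] range. Below is a minimal sketch consistent with that contract; the class internals are an assumption, not the library's actual code, and the DamerauLevenshtein branch is approximated with plain Levenshtein (the real comparator would also count transpositions).

    def _levenshtein(a: str, b: str) -> int:
        # Classic two-row dynamic-programming edit distance.
        prev = list(range(len(b) + 1))
        for i, ca in enumerate(a, 1):
            curr = [i]
            for j, cb in enumerate(b, 1):
                curr.append(min(prev[j] + 1,                # deletion
                                curr[j - 1] + 1,            # insertion
                                prev[j - 1] + (ca != cb)))  # substitution
            prev = curr
        return prev[-1]

    class Comparison:
        # Hypothetical stand-in for the library's Comparison class.
        def __init__(self):
            self._name = None

        def set_comparator(self, name: str):
            self._name = name

        def compare(self, a: str, b: str) -> float:
            if self._name == "CaseSensitiveMatch":
                return 1.0 if a == b else 0.0
            if self._name == "CaseInsensitiveMatch":
                return 1.0 if a.lower() == b.lower() else 0.0
            if self._name in ("LevenshteinDistance", "DamerauLevenshtein"):
                # Similarity normalised to [0, 1]: 1 - distance / longest.
                longest = max(len(a), len(b), 1)
                return 1.0 - _levenshtein(a, b) / longest
            raise ValueError(f"unknown comparator: {self._name}")

With this sketch, compare("aaaa", "aaaa1") yields 1 - 1/5 = 0.8, matching the 0.7 thresholds asserted above, and compare("aabc", "aaaa") yields 1 - 2/4 = 0.5, which fails them.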
Example 7
import json

# Processing, Comparison, cast_from_dict, AttributeMap, Dataset and the
# exception classes are assumed to come from the surrounding package.


class Reconciliation:
    def __init__(self, comparison_alg="DamerauLevenshtein",
                 use_length_weight=True):
        # comparison_alg and use_length_weight were undefined free names in
        # the original snippet; they are taken here as constructor
        # parameters (the defaults are assumptions).
        self.processor = Processing()
        self.mappings = None
        self.use_length_weight = use_length_weight
        self.comparator = Comparison()
        self.comparator.set_comparator(comparison_alg)

    def set_mappings(self, mappings: list[dict]):
        self.mappings = []
        for mt in mappings:
            self.mappings.append(cast_from_dict(mt, AttributeMap))

    def set_mappings_from_json(self, mappings: str):
        self.set_mappings(json.loads(mappings))

    # Return a similarity level for two given datasets
    def similarity(self, dataset_a: Dataset, dataset_b: Dataset):

        if not isinstance(dataset_a, Dataset) \
                or not isinstance(dataset_b, Dataset):
            raise MissingOrBadParams(
                "Passed parameters are not Dataset objects")

        # Build the tuple set to compare
        compare_tuples = self.processor.transform(dataset_a, dataset_b,
                                                  self.mappings)
        if not compare_tuples:
            raise NoMatchingRules("No compare tuples could be generated")

        # Set the similarity of each tuple
        for ctuple in compare_tuples:
            ctuple.normalised_similarity = self.comparator.compare(
                ctuple.items[0], ctuple.items[1])

        # Calculate length dependent weight
        tuple_max_lengths = set()
        for ctuple in compare_tuples:
            # Get length of the maximum length string in tuple
            ctuple.max_length = len(max(ctuple.items, key=len))
            tuple_max_lengths.add(ctuple.max_length)

        # Maximum length of all tuples
        # tuples_max = max(tuple_max_lengths)

        # Sum of the per-tuple maximum lengths
        sum_lengths = sum(ct.max_length for ct in compare_tuples)

        # Calculate normalised weight
        for ctuple in compare_tuples:
            ctuple.length_weight = 1.0
            if self.use_length_weight:
                # ctuple.length_weight = ctuple.max_length / tuples_max
                ctuple.length_weight = ctuple.max_length / sum_lengths

        # Calculate the normalised-weighted similarity for each tuple.
        # ctuple.weight is assumed to be the per-rule weight assigned by
        # the processor's transform step.
        similarities = []
        for ctuple in compare_tuples:
            similarities.append(ctuple.normalised_similarity * ctuple.weight *
                                ctuple.length_weight)

        # Calculate the Dataset Similarity Coefficient: when length
        # weighting is on, the length weights already sum to 1, so a plain
        # sum suffices; otherwise take the mean of the similarities.
        if self.use_length_weight:
            sim = sum(similarities)
        else:
            sim = sum(similarities) / len(similarities)

        return sim
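
To make the weighting concrete, here is a short walk-through of the coefficient with two invented tuples. The attribute names mirror the code above, but the objects and numbers are illustrative only.

    # Hypothetical walk-through of the length-weighted coefficient; plain
    # dictionaries stand in for the processor's compare tuples.
    tuples = [
        {"similarity": 0.8, "weight": 1.0, "max_length": 10},
        {"similarity": 0.5, "weight": 1.0, "max_length": 5},
    ]

    sum_lengths = sum(t["max_length"] for t in tuples)        # 15
    for t in tuples:
        t["length_weight"] = t["max_length"] / sum_lengths    # 10/15 and 5/15

    # With length weighting the weights already sum to 1, so the
    # coefficient is a plain weighted sum:
    sim = sum(t["similarity"] * t["weight"] * t["length_weight"]
              for t in tuples)
    print(round(sim, 3))  # 0.8 * 2/3 + 0.5 * 1/3 = 0.7

The longer attribute pair dominates the coefficient, which is the point of the length weight: agreement on long strings counts for more than agreement on short ones.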