コード例 #1
0
 def test_3_words_comparison(self):
     """Check equals() on three-word phrases: one expected match, one expected mismatch."""
     self.comparator = MorphologicComparator()
     other_form = "Stanisławem Ignacym Krasickim"

     # All three words correspond to the words of `other_form`.
     same_words = self.comparator.equals("Stanisława Ignacego Krasickiego",
                                         other_form)
     self.assertTrue(same_words)

     # The middle word ("Piotra") does not correspond to "Ignacym".
     different_middle_word = self.comparator.equals(
         "Stanisława Piotra Krasickiego", other_form)
     self.assertFalse(different_middle_word)
コード例 #2
0
    def __init__(self, list_of_words):
        """Store every word of ``list_of_words`` in a bucket chosen by ``_get_internal_key``.

        Args:
            list_of_words: iterable of words to store.
        """
        self.analyser = morphologic_analyser
        self.data = defaultdict(set)
        self.comparator = MorphologicComparator()

        # To speed the algorithm up we assume that every word, no matter its form, starts with
        # the same letter, e.g.
        # "doktor"[0] = "doktora"[0] = "doktorów"[0] = "dr"[0] = "doktorem"[0] and so on...
        # Thanks to that, morphologic analysis does not have to be applied to each element of
        # the container, only to the elements sharing the same internal key.
        for word in list_of_words:
            bucket = self.data[self._get_internal_key(word)]
            bucket.add(word)
コード例 #3
0
    def test_ignore_title_case_sensitive_comparison_if_actual_is_upper_case(
            self):
        """Check equals() when both title-case sensitivity and the upper-case relaxation are on."""
        comparator = MorphologicComparator(
            title_case_sensitive=True,
            ignore_case_sensitivity_if_actual_upper_case=True)
        title_cased = "Stanisława Kowalska"
        upper_cased = "STANISŁAWY KOWALSKIEJ"

        # A lower-case actual value is still expected to be rejected.
        self.assertFalse(
            comparator.equals(title_cased, "stanisławy kowalskiej"))

        # An upper-case actual value is expected to be accepted.
        self.assertTrue(comparator.equals(title_cased, upper_cased))

        # Swapping the arguments must not help: only works if actual is uppercase.
        self.assertFalse(comparator.equals(upper_cased, title_cased))
コード例 #4
0
class MorphologicSet:
    """Container of words that can be queried with a word given in a different form.

    For example, a MorphologicSet that stores the word "doktor" satisfies:
        any((word in morphologic_set) for word in ['doktor', 'doktora', 'doktorów']) == True
    """

    @staticmethod
    def _get_internal_key(elem):
        """Return the bucket key of ``elem``, i.e. its first character."""
        return elem[0]

    def __init__(self, list_of_words):
        """Store every word of ``list_of_words``, bucketed by its first character.

        Args:
            list_of_words: iterable of non-empty words (each word is indexed with ``[0]``).
        """
        self.analyser = morphologic_analyser

        self.data = defaultdict(set)
        self.comparator = MorphologicComparator()
        # To speed the algorithm up we assume that every word, no matter its form, starts with
        # the same letter, e.g.
        # "doktor"[0] = "doktora"[0] = "doktorów"[0] = "dr"[0] = "doktorem"[0] and so on...
        # Thanks to that, morphologic analysis does not have to be applied to each element of
        # the container, only to the elements starting with the same letter.
        for elem in list_of_words:
            self.data[self._get_internal_key(elem)].add(elem)

        # Memoise lookups per instance. Decorating __contains__ itself with lru_cache would
        # create a single class-level cache keyed on `self`, which keeps every instance alive
        # for as long as its entries stay in the cache.
        self._cached_lookup = functools.lru_cache(
            maxsize=config["cache_size"])(self._lookup)

    def _lookup(self, key):
        """Return True when a stored word compares equal to ``key`` (uncached)."""
        if not key:
            # An empty key has no first character and therefore no bucket.
            return False

        # .get() instead of indexing: indexing a defaultdict would insert an empty
        # bucket for every missed lookup.
        bucket = self.data.get(self._get_internal_key(key), ())
        return any(self.comparator.equals(elem, key) for elem in bucket)

    def __contains__(self, key):
        """Return True when ``key`` matches a stored word according to the comparator.

        Results are memoised per instance, so ``key`` must be hashable.
        """
        return self._cached_lookup(key)
コード例 #5
0
class NameComparator:
    """Compares two person names after splitting them into first-name and last-name parts."""

    def __init__(self, ignore_case_sensitivity_if_actual_upper_case=False):
        """Set up the name parser and the comparator used for individual name parts.

        Args:
            ignore_case_sensitivity_if_actual_upper_case: forwarded unchanged to
                MorphologicComparator.
        """
        self.name_parser = human_name_parser
        self.comparator = MorphologicComparator(
            ignore_case_sensitivity_if_actual_upper_case=ignore_case_sensitivity_if_actual_upper_case)

    def _all_equal(self, lhs, rhs):
        """Return True when the parts of ``lhs`` and ``rhs`` are pairwise equal.

        The shorter sequence is padded with empty strings, so a missing part is
        compared against "".
        """
        for left_part, right_part in zip_longest(lhs, rhs, fillvalue=""):
            if not self.comparator.equals(left_part, right_part):
                return False
        return True

    def equals(self, expected: str, actual: str) -> bool:
        """Return True when ``expected`` and ``actual`` are considered the same name.

        When at least one side has no first name but a surname is present on either
        side, only the surnames are compared; otherwise both first names and surnames
        have to match. Returns False when parsing raises FFE_InvalidArgument.
        """
        try:
            name_expected = self.name_parser.parse(expected)
            name_actual = self.name_parser.parse(actual)
        except FFE_InvalidArgument:
            return False

        first_name_missing = not (name_expected.first_name and name_actual.first_name)
        surname_present = bool(name_expected.last_name or name_actual.last_name)

        if first_name_missing and surname_present:
            return self._all_equal(name_expected.last_name, name_actual.last_name)

        if not self._all_equal(name_expected.first_name, name_actual.first_name):
            return False
        return self._all_equal(name_expected.last_name, name_actual.last_name)
コード例 #6
0
 def __init__(self,
              subject,
              rule_type,
              *,
              comparator=MorphologicComparator().equals):
     """Store the rule's subject, its type and the equality callable.

     Args:
         subject: stored as ``self.subject``.
         rule_type: stored as ``self.rule_type``.
         comparator: keyword-only callable stored as ``self._comparator``; defaults to the
             ``equals`` method of a MorphologicComparator built when this function is defined.
     """
     self.subject, self.rule_type, self._comparator = (
         subject, rule_type, comparator)
コード例 #7
0
    def _get_comparator(self, location_name):
        """Return the ``equals`` callable suited to ``location_name``.

        Names for which the tagger reports a person's first name get a NameComparator;
        all others get a title-case-sensitive MorphologicComparator that uses
        ``self.comparison_rules``. Both ignore case sensitivity when the actual value
        is upper case.
        """
        if not tagger.does_contain_person_first_name(location_name):
            return MorphologicComparator(
                comparison_rules=self.comparison_rules,
                title_case_sensitive=True,
                ignore_case_sensitivity_if_actual_upper_case=True).equals

        return NameComparator(
            ignore_case_sensitivity_if_actual_upper_case=True).equals
コード例 #8
0
    def __call__(self, description: str):
        """Return ``{True}`` when ``description`` contains one of the phrases, else ``{False}``.

        Phrases are searched with TextSearcher.find using a MorphologicComparator for
        equality; the search stops at the first phrase that is found.
        """
        morphologic_equals = MorphologicComparator().equals

        search_results = (
            TextSearcher.find(phrase_to_find=phrase,
                              text=description,
                              equality_comparator=morphologic_equals)
            for phrase in ('przechodni pokój', 'pokój przechodni')
        )
        return {any(found for found, _ in search_results)}
コード例 #9
0
    def __call__(self, description: str):
        """Return ``{True}`` when ``description`` contains one of the phrases, else ``{False}``.

        Phrases are searched with TextSearcher.find using a MorphologicComparator for
        equality; the search stops at the first phrase that is found.
        """
        morphologic_equals = MorphologicComparator().equals

        search_results = (
            TextSearcher.find(phrase_to_find=phrase,
                              text=description,
                              equality_comparator=morphologic_equals)
            for phrase in ('aneks kuchenny', 'aneks', 'salon z kuchnią')
        )
        return {any(found for found, _ in search_results)}
コード例 #10
0
    def __call__(self, description: str):
        """Return ``{True}`` when ``description`` contains one of the phrases, else ``{False}``.

        Phrases are searched with TextSearcher.find using a MorphologicComparator for
        equality; the search stops at the first phrase that is found.
        """
        morphologic_equals = MorphologicComparator().equals
        candidate_phrases = (
            'kawalerka',
            'studio',
            'garsoniera',
            'jednopokojowe',
            '1-pokojowe',
            '1 pokojowe',
            'jedno pokojowe',
        )

        search_results = (
            TextSearcher.find(phrase_to_find=phrase,
                              text=description,
                              equality_comparator=morphologic_equals)
            for phrase in candidate_phrases
        )
        return {any(found for found, _ in search_results)}
コード例 #11
0
    def test_title_case_sensitive_comparison(self):
        """Check equals() with title_case_sensitive=True on same and differing casing."""
        comparator = MorphologicComparator(title_case_sensitive=True)

        # Both sides use the same casing style: expected to compare equal.
        same_casing = (
            ("Stanisława Kowalska", "Stanisławy Kowalskiej"),
            ("STANISŁAWY KOWALSKIEJ", "STANISŁAWY KOWALSKIEJ"),
            ("stanisława kowalska", "stanisławy kowalskiej"),
        )
        for expected, actual in same_casing:
            self.assertTrue(comparator.equals(expected, actual))

        # The casing style differs between the sides: expected to compare unequal.
        different_casing = (
            ("Stanisława Kowalska", "stanisławy kowalskiej"),
            ("Stanisława Kowalska", "STANISŁAWY KOWALSKIEJ"),
        )
        for expected, actual in different_casing:
            self.assertFalse(comparator.equals(expected, actual))
コード例 #12
0
ファイル: tagger.py プロジェクト: jakubgros/FlatFinder
    def does_contain_person_first_name(self, text):
        """Tell whether ``text`` contains a person's first name.

        Configured exception phrases are checked first: when one of them is found in
        ``text``, the value stored for that phrase is returned. Otherwise every
        whitespace-separated word is reduced to its base forms, and the result is True
        when any base form is a known first name.

        Args:
            text: text to analyse.

        Returns:
            The value configured for the first matching exception phrase, else True or
            False depending on whether a known first name was found.
        """
        # check contain_exceptions
        exceptions = self._contain_person_first_name_exceptions
        # One comparator serves every exception phrase; none is built when there is
        # nothing to check.
        comparator = MorphologicComparator().equals if exceptions else None
        for contain_exception, ret_val in exceptions.items():
            does_contain, *_ = TextSearcher.find(
                phrase_to_find=contain_exception,
                text=text,
                equality_comparator=comparator)
            if does_contain:
                return ret_val

        # do normal text analysis
        return any(
            base_form in self._first_names
            for word in text.split()
            for base_form in self._analyser.get_base_form(word))
コード例 #13
0
    def test_custom_comparators(self):
        """Check that TextSearcher.find locates a phrase when given a morphologic comparator."""
        morphologic_equals = MorphologicComparator().equals
        phrase = "Nowy Kleparz"

        # Each text contains the phrase in a different word form.
        texts = (
            "na Nowym Kleparzu",
            "lokalizacja Nowy Kleparz",
            "niedaleko od Nowego Kleparza",
        )
        for text in texts:
            found, *_ = TextSearcher.find(phrase_to_find=phrase,
                                          text=text,
                                          equality_comparator=morphologic_equals)
            self.assertTrue(found)
コード例 #14
0
    def _contains_phrase(self, phrase, flat):
        """Return True when ``phrase`` is found in ``flat.title`` or ``flat.description``.

        The title is searched first; the description is only read and searched when the
        title does not contain the phrase.
        """
        morphologic_equals = MorphologicComparator().equals

        for field_name in ("title", "description"):
            found, _ = TextSearcher.find(
                phrase_to_find=phrase,
                text=getattr(flat, field_name),
                equality_comparator=morphologic_equals)
            if found:
                return True

        return False
コード例 #15
0
    def test_comparison_rule_with_morphologic_comparator(self):
        """Check that a FORCE_CASE_INSENSITIVITY rule relaxes a title-case-sensitive comparison."""
        expected, actual = "Osiedle Kowalskiego", "osiedle Kowalskiego"

        # Without any rule the differing case of the first word is expected to matter.
        strict = MorphologicComparator(title_case_sensitive=True)
        self.assertFalse(strict.equals(expected, actual))

        # With the rule for "osiedle" the same pair is expected to compare equal.
        force_insensitive = ComparisonRule(
            "osiedle", ComparisonRuleType.FORCE_CASE_INSENSITIVITY)
        relaxed = MorphologicComparator(
            title_case_sensitive=True,
            comparison_rules=ComparisonRulesContainer([force_insensitive]))
        self.assertTrue(relaxed.equals(expected, actual))
コード例 #16
0
 def __init__(self, ignore_case_sensitivity_if_actual_upper_case=False):
     """Set up the name parser and the comparator used for individual name parts.

     Args:
         ignore_case_sensitivity_if_actual_upper_case: forwarded unchanged to
             MorphologicComparator.
     """
     self.name_parser = human_name_parser

     relax_for_upper_case = ignore_case_sensitivity_if_actual_upper_case
     self.comparator = MorphologicComparator(
         ignore_case_sensitivity_if_actual_upper_case=relax_for_upper_case)
コード例 #17
0
class MorphologicComparatorTest(unittest.TestCase):
    """Tests of MorphologicComparator.equals for word forms, casing and whitespace."""

    def setUp(self):
        # Fresh default-configured comparator for every test.
        self.comparator = MorphologicComparator()

    def test_1_word_comparison(self):
        """Single words: one expected match and one expected mismatch."""
        self.assertTrue(self.comparator.equals("Stanisława", "Stanisławem"))
        self.assertFalse(self.comparator.equals("Piotra", "Stanisławem"))

    def test_2_words_comparison(self):
        """Two-word phrase: expected match, and mismatch against a three-word phrase."""
        self.assertTrue(
            self.comparator.equals("Stanisława Krasickiego",
                                   "Stanisławem Krasickim"))
        self.assertFalse(
            self.comparator.equals("Piotr Krasicki",
                                   "Stanisławem Ignacym Krasickim"))

    def test_3_words_comparison(self):
        """Three-word phrases: expected match, and mismatch when the middle word differs."""
        # The comparator comes from setUp; re-creating it here was redundant.
        self.assertTrue(
            self.comparator.equals("Stanisława Ignacego Krasickiego",
                                   "Stanisławem Ignacym Krasickim"))
        self.assertFalse(
            self.comparator.equals("Stanisława Piotra Krasickiego",
                                   "Stanisławem Ignacym Krasickim"))

    def test_case_insensitive_comparison(self):
        """The default comparator is expected to ignore letter case."""
        self.assertTrue(self.comparator.equals("stanisława", "Stanisława"))
        self.assertTrue(self.comparator.equals("stanisława", "STANISłAWA"))

    def test_title_case_sensitive_comparison(self):
        """With title_case_sensitive=True the casing style of both sides has to agree."""
        comparator = MorphologicComparator(title_case_sensitive=True)

        self.assertTrue(
            comparator.equals("Stanisława Kowalska", "Stanisławy Kowalskiej"))
        self.assertTrue(
            comparator.equals("STANISŁAWY KOWALSKIEJ",
                              "STANISŁAWY KOWALSKIEJ"))
        self.assertTrue(
            comparator.equals("stanisława kowalska", "stanisławy kowalskiej"))

        self.assertFalse(
            comparator.equals("Stanisława Kowalska", "stanisławy kowalskiej"))
        self.assertFalse(
            comparator.equals("Stanisława Kowalska", "STANISŁAWY KOWALSKIEJ"))

    def test_ignore_title_case_sensitive_comparison_if_actual_is_upper_case(
            self):
        """The upper-case relaxation applies to the actual (second) argument only."""
        c = MorphologicComparator(
            title_case_sensitive=True,
            ignore_case_sensitivity_if_actual_upper_case=True)

        self.assertFalse(
            c.equals("Stanisława Kowalska", "stanisławy kowalskiej"))
        self.assertTrue(
            c.equals("Stanisława Kowalska", "STANISŁAWY KOWALSKIEJ"))
        self.assertFalse(
            c.equals(
                "STANISŁAWY KOWALSKIEJ",
                "Stanisława Kowalska"))  # only works if actual is uppercase

    def test_comparison_ignores_white_spaces(self):
        """Leading, trailing and repeated whitespace is expected to be ignored."""
        # This test used to be defined twice with identical bodies; the second
        # definition silently replaced the first, so only one copy is kept.
        self.assertTrue(
            self.comparator.equals("jakieś słowo  ", "    jakieś    słowo"))
コード例 #18
0
 def setUp(self):
     """Create a default-configured MorphologicComparator and store it on the test instance."""
     self.comparator = MorphologicComparator()