def test_3_words_comparison(self):
    """Compare three-word phrases: one matching pair and one mismatching pair."""
    self.comparator = MorphologicComparator()
    equals = self.comparator.equals
    inflected = "Stanisławem Ignacym Krasickim"

    # Same three words in a different grammatical case.
    self.assertTrue(equals("Stanisława Ignacego Krasickiego", inflected))
    # The middle word differs, so the phrases must not match.
    self.assertFalse(equals("Stanisława Piotra Krasickiego", inflected))
def __init__(self, list_of_words):
    """Build the container from an iterable of words.

    Words are grouped into buckets keyed by ``_get_internal_key``.
    """
    self.analyser = morphologic_analyser
    self.data = defaultdict(set)
    self.comparator = MorphologicComparator()

    # To speed up lookups we assume that every form of a word starts with
    # the same letter, e.g.
    # "doktor"[0] == "doktora"[0] == "doktorów"[0] == "dr"[0] == "doktorem"[0].
    # Thanks to that, morphologic analysis is only needed for the stored
    # elements that share the first letter, not for the whole container.
    for word in list_of_words:
        self.data[self._get_internal_key(word)].add(word)
def test_ignore_title_case_sensitive_comparison_if_actual_is_upper_case(self):
    """Exercise title-case-sensitive comparison with the upper-case exemption on."""
    equals = MorphologicComparator(
        title_case_sensitive=True,
        ignore_case_sensitivity_if_actual_upper_case=True,
    ).equals
    expected = "Stanisława Kowalska"

    self.assertFalse(equals(expected, "stanisławy kowalskiej"))
    self.assertTrue(equals(expected, "STANISŁAWY KOWALSKIEJ"))
    # Only works if *actual* (the second argument) is upper case.
    self.assertFalse(equals("STANISŁAWY KOWALSKIEJ", expected))
class MorphologicSet:
    """Container for words that can be queried with words in other forms.

    For example, a MorphologicSet that stores the word "doktor" satisfies:

        any((word in morphologic_set)
            for word in ['doktor', 'doktora', 'doktorów']) == True
    """

    @staticmethod
    def _get_internal_key(elem):
        """Return the bucket key for ``elem``: its first character.

        Slicing (instead of ``elem[0]``) makes an empty string map to the
        empty key rather than raising ``IndexError``.
        """
        return elem[:1]

    def __init__(self, list_of_words):
        """Group every word from ``list_of_words`` into first-letter buckets."""
        self.analyser = morphologic_analyser
        self.data = defaultdict(set)
        self.comparator = MorphologicComparator()

        # To speed up lookups we assume that every form of a word starts with
        # the same letter, e.g.
        # "doktor"[0] == "doktora"[0] == "doktorów"[0] == "dr"[0] == "doktorem"[0].
        # Thanks to that, morphologic analysis is only needed for the stored
        # elements that share the first letter, not for the whole container.
        for elem in list_of_words:
            internal_key = self._get_internal_key(elem)
            self.data[internal_key].add(elem)

    # NOTE(review): lru_cache on a method keys every entry on (self, key), so
    # the class-level cache keeps instances referenced while entries live.
    @functools.lru_cache(maxsize=config["cache_size"])
    def __contains__(self, key):
        """Return True if any stored word compares equal to ``key``.

        Only the bucket sharing ``key``'s first character is examined.
        """
        internal_key = self._get_internal_key(key)
        # Use .get() so a membership test never inserts an empty bucket into
        # the defaultdict (plain indexing would create one for unseen letters).
        candidates = self.data.get(internal_key, ())
        return any(self.comparator.equals(elem, key) for elem in candidates)
class NameComparator:
    """Compare two human names part by part using a morphologic comparator."""

    def __init__(self, ignore_case_sensitivity_if_actual_upper_case=False):
        """Store the name parser and build the underlying word comparator."""
        self.name_parser = human_name_parser
        self.comparator = MorphologicComparator(
            ignore_case_sensitivity_if_actual_upper_case=ignore_case_sensitivity_if_actual_upper_case)

    def _all_equal(self, lhs, rhs):
        """Return True if ``lhs`` and ``rhs`` are pairwise equal.

        The shorter sequence is padded with empty strings, so a missing part
        is compared against "".
        """
        for left, right in zip_longest(lhs, rhs, fillvalue=""):
            if not self.comparator.equals(left, right):
                return False
        return True

    def equals(self, expected: str, actual: str) -> bool:
        """Return True if ``expected`` and ``actual`` denote the same name.

        Returns False when either string cannot be parsed. When a first name
        is missing on either side and at least one surname is present, only
        the surnames are compared; otherwise both first names and surnames
        must match.
        """
        try:
            parsed_expected = self.name_parser.parse(expected)
            parsed_actual = self.name_parser.parse(actual)
        except FFE_InvalidArgument:
            return False

        both_first_names_given = parsed_expected.first_name and parsed_actual.first_name
        some_surname_given = parsed_expected.last_name or parsed_actual.last_name

        # First names take part in the comparison unless one of them is
        # missing while a surname is available to compare instead.
        if both_first_names_given or not some_surname_given:
            if not self._all_equal(parsed_expected.first_name, parsed_actual.first_name):
                return False
        return self._all_equal(parsed_expected.last_name, parsed_actual.last_name)
def __init__(self, subject, rule_type, *,
             comparator=MorphologicComparator().equals):
    """Store the rule's subject, its type and the equality callable.

    ``comparator`` is keyword-only; its default is the ``equals`` method of a
    ``MorphologicComparator`` created once, when this function is defined.
    """
    self._comparator = comparator
    self.rule_type = rule_type
    self.subject = subject
def _get_comparator(self, location_name):
    """Return the ``equals`` callable suited to ``location_name``.

    Names containing a person's first name get a ``NameComparator``; all
    others get a title-case-sensitive ``MorphologicComparator`` configured
    with this object's comparison rules.
    """
    if not tagger.does_contain_person_first_name(location_name):
        return MorphologicComparator(
            comparison_rules=self.comparison_rules,
            title_case_sensitive=True,
            ignore_case_sensitivity_if_actual_upper_case=True,
        ).equals

    return NameComparator(
        ignore_case_sensitivity_if_actual_upper_case=True,
    ).equals
def __call__(self, description: str):
    """Return ``{True}`` if ``description`` contains any searched phrase, else ``{False}``."""
    equals = MorphologicComparator().equals
    # Lazy generator: the search stops at the first phrase that is found.
    search_results = (
        TextSearcher.find(phrase_to_find=phrase,
                          text=description,
                          equality_comparator=equals)
        for phrase in ('przechodni pokój', 'pokój przechodni')
    )
    return {any(found for found, _ in search_results)}
def __call__(self, description: str):
    """Return ``{True}`` if ``description`` contains any searched phrase, else ``{False}``."""
    equals = MorphologicComparator().equals
    # Lazy generator: the search stops at the first phrase that is found.
    search_results = (
        TextSearcher.find(phrase_to_find=phrase,
                          text=description,
                          equality_comparator=equals)
        for phrase in ('aneks kuchenny', 'aneks', 'salon z kuchnią')
    )
    return {any(found for found, _ in search_results)}
def __call__(self, description: str):
    """Return ``{True}`` if ``description`` contains any searched phrase, else ``{False}``."""
    equals = MorphologicComparator().equals
    candidate_phrases = (
        'kawalerka',
        'studio',
        'garsoniera',
        'jednopokojowe',
        '1-pokojowe',
        '1 pokojowe',
        'jedno pokojowe',
    )
    # Lazy generator: the search stops at the first phrase that is found.
    search_results = (
        TextSearcher.find(phrase_to_find=phrase,
                          text=description,
                          equality_comparator=equals)
        for phrase in candidate_phrases
    )
    return {any(found for found, _ in search_results)}
def test_title_case_sensitive_comparison(self):
    """Exercise ``equals`` with ``title_case_sensitive=True`` on mixed casings."""
    equals = MorphologicComparator(title_case_sensitive=True).equals

    # Both sides share the same casing style: expected to match.
    self.assertTrue(equals("Stanisława Kowalska", "Stanisławy Kowalskiej"))
    self.assertTrue(equals("STANISŁAWY KOWALSKIEJ", "STANISŁAWY KOWALSKIEJ"))
    self.assertTrue(equals("stanisława kowalska", "stanisławy kowalskiej"))

    # Title case against lower or upper case: expected not to match.
    self.assertFalse(equals("Stanisława Kowalska", "stanisławy kowalskiej"))
    self.assertFalse(equals("Stanisława Kowalska", "STANISŁAWY KOWALSKIEJ"))
def does_contain_person_first_name(self, text):
    """Tell whether ``text`` contains a person's first name.

    Configured exception phrases are checked first: the first one found in
    ``text`` decides the result with its stored value. Otherwise every
    whitespace-separated word is reduced to its base forms and looked up in
    the known first names.
    """
    # The comparator does not depend on the loop variable, so build it once
    # instead of once per exception phrase.
    comparator = MorphologicComparator().equals

    # Check the explicit exceptions first.
    for contain_exception, ret_val in self._contain_person_first_name_exceptions.items():
        does_contain, *_ = TextSearcher.find(
            phrase_to_find=contain_exception,
            text=text,
            equality_comparator=comparator)
        if does_contain:
            return ret_val

    # Regular analysis: any base form of any word being a known first name.
    return any(
        inflection in self._first_names
        for word in text.split()
        for inflection in self._analyser.get_base_form(word)
    )
def test_custom_comparators(self):
    """Find "Nowy Kleparz" in texts using a morphologic equality comparator."""
    eq_comparator = MorphologicComparator().equals
    to_find = "Nowy Kleparz"
    texts = (
        "na Nowym Kleparzu",
        "lokalizacja Nowy Kleparz",
        "niedaleko od Nowego Kleparza",
    )

    for text in texts:
        found, *_ = TextSearcher.find(phrase_to_find=to_find,
                                      text=text,
                                      equality_comparator=eq_comparator)
        self.assertTrue(found)
def _contains_phrase(self, phrase, flat):
    """Return True if ``phrase`` occurs in the flat's title or description.

    The title is searched first; the description is only read when the title
    does not contain the phrase.
    """
    equals = MorphologicComparator().equals
    for field_name in ("title", "description"):
        found, _ = TextSearcher.find(phrase_to_find=phrase,
                                     text=getattr(flat, field_name),
                                     equality_comparator=equals)
        if found:
            return True
    return False
def test_comparison_rule_with_morphologic_comparator(self):
    """Check that a FORCE_CASE_INSENSITIVITY rule for "osiedle" changes the result."""
    expected, actual = "Osiedle Kowalskiego", "osiedle Kowalskiego"

    # Without rules the differing case of the first word makes the phrases unequal.
    plain = MorphologicComparator(title_case_sensitive=True)
    self.assertFalse(plain.equals(expected, actual))

    # With the rule in place the same pair compares equal.
    rules = [
        ComparisonRule("osiedle", ComparisonRuleType.FORCE_CASE_INSENSITIVITY)
    ]
    with_rules = MorphologicComparator(
        title_case_sensitive=True,
        comparison_rules=ComparisonRulesContainer(rules))
    self.assertTrue(with_rules.equals(expected, actual))
def __init__(self, ignore_case_sensitivity_if_actual_upper_case=False):
    """Store the name parser and build the underlying word comparator.

    The flag is forwarded unchanged to ``MorphologicComparator``.
    """
    comparator_options = {
        "ignore_case_sensitivity_if_actual_upper_case":
            ignore_case_sensitivity_if_actual_upper_case,
    }
    self.name_parser = human_name_parser
    self.comparator = MorphologicComparator(**comparator_options)
class MorphologicComparatorTest(unittest.TestCase):
    """Unit tests for ``MorphologicComparator.equals``."""

    def setUp(self):
        """Give every test a default-configured comparator."""
        self.comparator = MorphologicComparator()

    def test_1_word_comparison(self):
        """Single words: one matching pair and one mismatching pair."""
        equals = self.comparator.equals
        self.assertTrue(equals("Stanisława", "Stanisławem"))
        self.assertFalse(equals("Piotra", "Stanisławem"))

    def test_2_words_comparison(self):
        """Two-word phrase against a matching phrase, and a mismatching pair."""
        equals = self.comparator.equals
        self.assertTrue(equals("Stanisława Krasickiego", "Stanisławem Krasickim"))
        self.assertFalse(equals("Piotr Krasicki", "Stanisławem Ignacym Krasickim"))

    def test_3_words_comparison(self):
        """Three-word phrases: one matching pair and one mismatching pair."""
        self.comparator = MorphologicComparator()
        equals = self.comparator.equals
        inflected = "Stanisławem Ignacym Krasickim"
        self.assertTrue(equals("Stanisława Ignacego Krasickiego", inflected))
        self.assertFalse(equals("Stanisława Piotra Krasickiego", inflected))

    def test_case_insensitive_comparison(self):
        """The default comparator is expected to ignore letter case."""
        equals = self.comparator.equals
        self.assertTrue(equals("stanisława", "Stanisława"))
        self.assertTrue(equals("stanisława", "STANISłAWA"))

    def test_title_case_sensitive_comparison(self):
        """Exercise ``title_case_sensitive=True`` on mixed casings."""
        equals = MorphologicComparator(title_case_sensitive=True).equals

        # Both sides share the same casing style: expected to match.
        self.assertTrue(equals("Stanisława Kowalska", "Stanisławy Kowalskiej"))
        self.assertTrue(equals("STANISŁAWY KOWALSKIEJ", "STANISŁAWY KOWALSKIEJ"))
        self.assertTrue(equals("stanisława kowalska", "stanisławy kowalskiej"))

        # Title case against lower or upper case: expected not to match.
        self.assertFalse(equals("Stanisława Kowalska", "stanisławy kowalskiej"))
        self.assertFalse(equals("Stanisława Kowalska", "STANISŁAWY KOWALSKIEJ"))

    def test_ignore_title_case_sensitive_comparison_if_actual_is_upper_case(self):
        """Exercise title-case sensitivity with the upper-case exemption on."""
        equals = MorphologicComparator(
            title_case_sensitive=True,
            ignore_case_sensitivity_if_actual_upper_case=True,
        ).equals
        expected = "Stanisława Kowalska"

        self.assertFalse(equals(expected, "stanisławy kowalskiej"))
        self.assertTrue(equals(expected, "STANISŁAWY KOWALSKIEJ"))
        # Only works if *actual* (the second argument) is upper case.
        self.assertFalse(equals("STANISŁAWY KOWALSKIEJ", expected))

    def test_comparison_ignores_white_spaces(self):
        """Leading and trailing spaces are expected not to affect equality."""
        self.assertTrue(
            self.comparator.equals("jakieś słowo ", " jakieś słowo"))


# Stand-alone repeat of the whitespace test, kept as it appears in the source.
def test_comparison_ignores_white_spaces(self):
    """Leading and trailing spaces are expected not to affect equality."""
    self.assertTrue(
        self.comparator.equals("jakieś słowo ", " jakieś słowo"))
def setUp(self):
    """Create a default-configured comparator before each test."""
    default_comparator = MorphologicComparator()
    self.comparator = default_comparator