def equals(self, expected, actual):
    """Compare two phrases word-by-word using morphological base forms.

    Words match when their (lower-cased) base-form sets share at least one
    element; when a comparison rule demands case sensitivity, the original
    words must also agree on title-casing.  Returns True on a full match.
    """
    expected_words = split_on_special_characters(expected)
    actual_words = split_on_special_characters(actual)
    if len(actual_words) != len(expected_words):
        return False
    analyser = morphologic_analyser
    expected_bases = [analyser.get_base_form(word) for word in expected_words]
    actual_bases = [analyser.get_base_form(word) for word in actual_words]
    paired = zip(actual_bases, expected_bases, actual_words, expected_words)
    for actual_base, expected_base, actual_orig, expected_orig in paired:
        case_sensitive = self._is_case_sensitive_comparison(
            actual_orig, self.rules, actual_words)
        # Base forms compare case-insensitively as sets: any shared base
        # form counts as a match for this word position.
        actual_lowered = {word.lower() for word in actual_base}
        expected_lowered = {word.lower() for word in expected_base}
        if actual_lowered.isdisjoint(expected_lowered):
            return False
        if case_sensitive and actual_orig.istitle() != expected_orig.istitle():
            return False
    return True
def test_comparison_rule_with_context(self):
    """A FORCE_CASE_INSENSITIVITY rule applies only in multi-word contexts."""
    rule = ComparisonRule(
        "osiedle", rule_type=ComparisonRuleType.FORCE_CASE_INSENSITIVITY)
    single_word_context = split_on_special_characters('osiedle')
    self.assertFalse(
        rule.does_apply(subject="Osiedle", context=single_word_context))
    multi_word_context = split_on_special_characters('Bardzo ładne osiedle')
    self.assertTrue(
        rule.does_apply(subject="Osiedle", context=multi_word_context))
def _test_nearby_location_context_helper(self, *, sentence,
                                         subject_slice_beg_end,
                                         expected_result, conjunctions,
                                         location_type_prefixes=None,
                                         introducers, address_provider):
    """Run NearbyLocationContext (and its negation) against one sentence.

    Builds an AddressMatch over the split *sentence* at the given slice and
    asserts the analyser yields *expected_result* — and that a negated
    analyser yields the opposite.

    Fix: the original declared ``location_type_prefixes={}`` — a mutable
    default argument shared across calls.  A ``None`` sentinel replaced by a
    fresh dict per call is backward-compatible and safe.
    """
    if location_type_prefixes is None:
        location_type_prefixes = {}
    with self.subTest(sentence=sentence, subject=subject_slice_beg_end,
                      expected_result=expected_result):
        source = split_on_special_characters(
            sentence, preserve_special_characters=True)
        match = AddressMatch(
            source=source,
            match_slice_position=subject_slice_beg_end,
            location=''  # doesn't matter
        )
        ctx_analyser = NearbyLocationContext(
            introducers=introducers,
            conjunctions=conjunctions,
            address_provider=address_provider,
            location_type_prefixes=location_type_prefixes)
        self.assertEqual(expected_result, ctx_analyser(match))
        # The negated analyser must give the exact opposite verdict.
        negated_ctx_analyser = NearbyLocationContext(
            negate=True,
            introducers=introducers,
            conjunctions=conjunctions,
            address_provider=address_provider,
            location_type_prefixes=location_type_prefixes)
        self.assertEqual(not expected_result, negated_ctx_analyser(match))
def parse(self, name):
    """Parse *name* into a HumanName of titles, first names, last names
    and roman-numeral epithets.

    Raises FFE_InvalidArgument when the input carries leading or trailing
    special characters (detected by comparing against the stripped match
    mask of the special-character-preserving split).
    """
    words = split_on_special_characters(name)
    human_name = HumanName()
    pos = 0
    # Leading titles.
    while pos < len(words) and words[pos] in self._all_valid_titles:
        human_name.title.append(words[pos])
        pos += 1
    # First names directly follow the titles.
    while pos < len(words) and words[pos] in self._all_valid_first_names:
        human_name.first_name.append(words[pos])
        pos += 1
    # Everything left is either a roman-numeral epithet or a last name.
    for word in words[pos:]:
        if RomanNumeralsParser.is_roman_number(word):
            human_name.numerical_epithet.append(word)
        else:
            human_name.last_name.append(word)
    preserved_words = split_on_special_characters(
        name, preserve_special_characters=True)
    all_matched = human_name.to_list()
    match_flags = [word in all_matched for word in preserved_words]
    stripped_flags = strip_list(match_flags, strip_if_in=[False])
    # Stripping removed something => unmatched tokens sat at an edge.
    if len(match_flags) != len(stripped_flags):
        raise FFE_InvalidArgument("Provided string contains leading or trailing special characters")
    return human_name
def _test_first_word_of_sentence_helper(self, sentence, analysis_subject,
                                        expected_result):
    """Check FirstWordOfSentenceContext (and its negation) on one subject.

    Locates *analysis_subject* inside *sentence*, wraps it in an
    AddressMatch and asserts the analyser verdict and its negation.
    """
    source = split_on_special_characters(
        sentence, preserve_special_characters=True)
    subject_words = split_on_special_characters(
        analysis_subject, preserve_special_characters=True)
    slice_beg = find_slice_beg(source, slice_to_find=subject_words)
    assert slice_beg is not None
    slice_end = slice_beg + len(subject_words)
    match = AddressMatch(
        source=source,
        match_slice_position=(slice_beg, slice_end),
        location=''  # doesn't matter
    )
    # Sanity check: the match really covers the requested subject.
    assert match.matched_phrase == ' '.join(subject_words)
    self.assertEqual(FirstWordOfSentenceContext()(match), expected_result)
    negated_analyser = FirstWordOfSentenceContext(negate=True)
    self.assertEqual(negated_analyser(match), not expected_result)
def process(self, text, min_snippet_remove_size=5):
    """Repeatedly strip the largest detected English snippet from *text*.

    Stops when the biggest remaining snippet is shorter than
    *min_snippet_remove_size* tokens and returns the re-joined,
    whitespace-stripped remainder.
    """
    words = split_on_special_characters(
        text, preserve_special_characters=True, ignore_spaces=False)
    while True:
        beg, end = self._get_biggest_english_part(words)
        if end - beg < min_snippet_remove_size:
            return ''.join(words).strip()
        # Remove the snippet, inclusive of both slice endpoints.
        words = words[:beg] + words[end + 1:]
def _find_all_introducers(self, source: List[str]):
    """Find every introducer occurrence in *source*.

    Returns (index, introducer_words) pairs sorted by index; matching is
    case-insensitive and covers all occurrences of each introducer.
    """
    occurrences = []
    for raw_introducer in self.introducers:
        introducer_words = split_on_special_characters(
            raw_introducer, preserve_special_characters=True)
        indexes = find_slice_beg(source,
                                 slice_to_find=introducer_words,
                                 find_all=True,
                                 case_insensitive=True)
        for index in indexes:
            occurrences.append((index, introducer_words))
    occurrences.sort(key=lambda occurrence: occurrence[0])
    return occurrences
def find(*, phrase_to_find, text: str,
         equality_comparator=lambda lhs, rhs: lhs == rhs):
    """Search *text* for frames matching *phrase_to_find*.

    Tries every frame size from 1 up to the word count of the phrase and
    collects the slice positions the comparator accepts.  Returns
    ``(found_positions, all_words)`` where ``all_words`` comes from the
    last frame inspected.

    NOTE(review): if *phrase_to_find* contains no words, no frame is ever
    created and the final ``text_frame.all_words`` raises NameError —
    present in the original as well; confirm intended handling upstream.
    """
    word_list = split_on_special_characters(
        text, preserve_special_characters=True)
    longest_frame = len(phrase_to_find.split())
    found = []
    for frame_size in range(1, longest_frame + 1):
        text_frame = TextFrame(word_list, frame_size)
        for slice_position, frame in text_frame:
            if equality_comparator(phrase_to_find, frame):
                found.append(slice_position)
    return found, text_frame.all_words
def does_apply(self, *, subject, context=None):
    """Decide whether this comparison rule applies to *subject*.

    When a non-empty *context* is given and the split subject is at least
    as long as that context, the subject effectively IS the context, so
    the rule does not apply; otherwise defer to the rule's comparator.
    """
    if context and len(split_on_special_characters(subject)) >= len(context):
        return False
    return self._comparator(self.subject, subject)
def test_split_preserves_newline_character(self):
    """A newline must survive splitting when special characters are kept."""
    word_list = split_on_special_characters(
        "this is a \n sample text", preserve_special_characters=True)
    self.assertIn('\n', word_list)