コード例 #1
0
    def equals(self, expected, actual):
        """Compare two phrases word-by-word via lemmatized base forms.

        Returns True when both phrases have the same word count and every
        word pair shares at least one lowercased base form, with title-case
        required to match only where the comparison rules demand it.
        """
        expected_words = split_on_special_characters(expected)
        actual_words = split_on_special_characters(actual)

        # Different word counts can never match.
        if len(expected_words) != len(actual_words):
            return False

        expected_bases = [morphologic_analyser.get_base_form(word)
                          for word in expected_words]
        actual_bases = [morphologic_analyser.get_base_form(word)
                        for word in actual_words]

        paired = zip(actual_bases, expected_bases, actual_words, expected_words)
        for act_base, exp_base, act_orig, exp_orig in paired:

            check_case = self._is_case_sensitive_comparison(
                act_orig, self.rules, actual_words)

            act_base_lower = {form.lower() for form in act_base}
            exp_base_lower = {form.lower() for form in exp_base}

            # No shared base form means the words differ.
            if act_base_lower.isdisjoint(exp_base_lower):
                return False

            # Title-case mismatch only matters for case-sensitive words.
            if check_case and act_orig.istitle() != exp_orig.istitle():
                return False

        return True
コード例 #2
0
 def test_comparison_rule_with_context(self):
     """FORCE_CASE_INSENSITIVITY applies only inside a wider context."""
     rule = ComparisonRule(
         "osiedle", rule_type=ComparisonRuleType.FORCE_CASE_INSENSITIVITY)

     # The subject alone does not provide enough context for the rule.
     single_word_context = split_on_special_characters('osiedle')
     self.assertFalse(
         rule.does_apply(subject="Osiedle", context=single_word_context))

     # Inside a longer sentence the rule kicks in.
     sentence_context = split_on_special_characters('Bardzo ładne osiedle')
     self.assertTrue(
         rule.does_apply(subject="Osiedle", context=sentence_context))
コード例 #3
0
    def _test_nearby_location_context_helper(self,
                                             *,
                                             sentence,
                                             subject_slice_beg_end,
                                             expected_result,
                                             conjunctions,
                                             location_type_prefixes=None,
                                             introducers,
                                             address_provider):
        """Run NearbyLocationContext against *sentence* and assert the result.

        Builds an AddressMatch for the words at *subject_slice_beg_end* and
        checks that the analyser returns *expected_result* — and that a
        negated analyser returns the opposite.
        """
        # Fix: the original default `location_type_prefixes={}` was a shared
        # mutable default argument, evaluated once and reused across calls.
        if location_type_prefixes is None:
            location_type_prefixes = {}

        with self.subTest(sentence=sentence,
                          subject=subject_slice_beg_end,
                          expected_result=expected_result):
            source = split_on_special_characters(
                sentence, preserve_special_characters=True)

            match = AddressMatch(
                source=source,
                match_slice_position=subject_slice_beg_end,
                location=''  # doesn't matter
            )

            ctx_analyser = NearbyLocationContext(
                introducers=introducers,
                conjunctions=conjunctions,
                address_provider=address_provider,
                location_type_prefixes=location_type_prefixes)
            self.assertEqual(expected_result, ctx_analyser(match))

            # The negated analyser must always give the complementary answer.
            negated_ctx_analyser = NearbyLocationContext(
                negate=True,
                introducers=introducers,
                conjunctions=conjunctions,
                address_provider=address_provider,
                location_type_prefixes=location_type_prefixes)
            self.assertEqual(not expected_result, negated_ctx_analyser(match))
コード例 #4
0
    def parse(self, name):
        """Parse *name* into a HumanName.

        Words are consumed left to right: leading known titles first, then
        known first names; every remaining word becomes a last-name part or,
        when it is a Roman numeral, a numerical epithet.

        Raises:
            FFE_InvalidArgument: if the input has leading or trailing
                special characters (i.e. words the parser never consumed
                at either end).
        """
        name_split = split_on_special_characters(name)

        human_name = HumanName()

        word_it = iter(name_split)
        try:
            word = next(word_it)

            # Leading titles.
            # (Idiom fix: the original `while True: if ... else: break`
            # loops are plain condition loops.)
            while word in self._all_valid_titles:
                human_name.title.append(word)
                word = next(word_it)

            # First names.
            while word in self._all_valid_first_names:
                human_name.first_name.append(word)
                word = next(word_it)

            # The rest is parsed as last_name or numerical epithet.
            while True:
                if RomanNumeralsParser.is_roman_number(word):
                    human_name.numerical_epithet.append(word)
                else:
                    human_name.last_name.append(word)

                word = next(word_it)

        except StopIteration:
            pass  # all words consumed

        # Re-split with special characters preserved to detect stray
        # leading/trailing separators in the original input.
        name_split_with_special_characters_preserved \
            = split_on_special_characters(name, preserve_special_characters=True)
        all_matched = human_name.to_list()
        has_matched = [elem in all_matched
                       for elem in name_split_with_special_characters_preserved]
        stripped_has_matched = strip_list(has_matched, strip_if_in=[False])
        if len(has_matched) != len(stripped_has_matched):
            raise FFE_InvalidArgument("Provided string contains leading or trailing special characters")

        return human_name
コード例 #5
0
    def _test_first_word_of_sentence_helper(self, sentence, analysis_subject,
                                            expected_result):
        """Assert FirstWordOfSentenceContext yields *expected_result* for the
        subject located inside *sentence* (and the opposite when negated)."""
        source_words = split_on_special_characters(
            sentence, preserve_special_characters=True)
        subject_words = split_on_special_characters(
            analysis_subject, preserve_special_characters=True)

        beg = find_slice_beg(source_words, slice_to_find=subject_words)
        assert beg is not None
        end = beg + len(subject_words)

        match = AddressMatch(
            source=source_words,
            match_slice_position=(beg, end),
            location=''  # doesn't matter
        )
        # Sanity check: the match really covers the requested subject.
        assert match.matched_phrase == ' '.join(subject_words)

        self.assertEqual(FirstWordOfSentenceContext()(match),
                         expected_result)
        self.assertEqual(FirstWordOfSentenceContext(negate=True)(match),
                         not expected_result)
コード例 #6
0
    def process(self, text, min_snippet_remove_size=5):
        """Repeatedly strip the biggest English snippet from *text* until only
        fragments shorter than *min_snippet_remove_size* remain, then return
        the re-joined, stripped text."""
        tokens = split_on_special_characters(
            text, preserve_special_characters=True, ignore_spaces=False)

        while True:
            beg, end = self._get_biggest_english_part(tokens)

            # Remaining English parts are too small to bother removing.
            if end - beg < min_snippet_remove_size:
                return ''.join(tokens).strip()

            # Drop the snippet (end index inclusive) and search again.
            tokens = tokens[:beg] + tokens[end + 1:]
コード例 #7
0
    def _find_all_introducers(self, source: List[str]):
        """Locate every introducer phrase within *source*.

        Returns (start_index, introducer_words) pairs sorted by start index;
        matching is case-insensitive.
        """
        hits = []

        for phrase in self.introducers:
            phrase_words = split_on_special_characters(
                phrase, preserve_special_characters=True)
            start_indexes = find_slice_beg(source,
                                           slice_to_find=phrase_words,
                                           find_all=True,
                                           case_insensitive=True)
            hits += [(idx, phrase_words) for idx in start_indexes]

        hits.sort(key=lambda hit: hit[0])

        return hits
コード例 #8
0
ファイル: text_searcher.py プロジェクト: jakubgros/FlatFinder
    def find(*,
             phrase_to_find,
             text: str,
             equality_comparator=lambda lhs, rhs: lhs == rhs):
        # Finds every occurrence of `phrase_to_find` in `text` by sliding word
        # frames of size 1..len(phrase_to_find.split()) over the split text and
        # collecting the slice positions the comparator judges equal.
        # Returns a (found_slice_positions, all_words) tuple.
        #
        # NOTE(review): there is no `self`/`cls` parameter — presumably this is
        # decorated as a @staticmethod outside this view; confirm at the
        # definition site.

        word_list = split_on_special_characters(
            text, preserve_special_characters=True)

        max_frame_size = len(phrase_to_find.split())
        all_text_frames = (TextFrame(word_list, frame_size)
                           for frame_size in range(1, max_frame_size + 1))

        found = []
        for text_frame in all_text_frames:
            for slice_position, frame in text_frame:
                if equality_comparator(phrase_to_find, frame):
                    found.append(slice_position)

        # NOTE(review): `text_frame` is the loop variable left over from the
        # last iteration; if `phrase_to_find` contains no words the generator
        # is empty and this line raises UnboundLocalError — confirm callers
        # never pass an empty phrase.
        return found, text_frame.all_words
コード例 #9
0
 def does_apply(self, *, subject, context=None):
     """Tell whether this rule applies to *subject* within *context*.

     The rule never applies when the subject spans the whole context (or
     more); otherwise the configured comparator decides.
     """
     if context:
         subject_word_count = len(split_on_special_characters(subject))
         if subject_word_count >= len(context):
             return False
     return self._comparator(self.subject, subject)
コード例 #10
0
 def test_split_preserves_newline_character(self):
     """Splitting with preserved special characters keeps newline tokens."""
     words = split_on_special_characters(
         "this is a \n sample text", preserve_special_characters=True)
     self.assertIn('\n', words)