def testFindWord(self):
        """Test find_word function in utils"""

        # The text is the word
        word = "ete"
        text = "ete"
        result = find_word(word, text)
        expected_result = (0,)
        self.assertEqual(result, expected_result)

        # Many words
        text = "l'ete ou ete"
        result = find_word(word, text)
        expected_result = (2, 9,)
        self.assertEqual(result, expected_result)
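For reference, here is a minimal sketch of what find_word is assumed to do, inferred only from the expected results in this test (the real implementation lives in utils and may differ): it returns a tuple with the start offset of every whole-word occurrence of word in text.

import re

def find_word(word, text):
    # Hypothetical re-implementation: collect the start position of each
    # whole-word match, e.g. find_word("ete", "l'ete ou ete") -> (2, 9).
    pattern = r'\b%s\b' % re.escape(word)
    return tuple(match.start() for match in re.finditer(pattern, text))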
    def _getTextRelatedTermItems(self, text, glossary_term_items,
                                 excluded_terms=()):
        """
        @param text: charset encoded text
        @param excluded_terms: charset encoded terms to exclude from search
        """

        utext = text.decode(SITE_CHARSET, "replace")
        usplitted_text_terms = self._split(utext)
        atext = encode_ascii(utext)

        aexcluded_terms = [encode_ascii(t.decode(SITE_CHARSET, "replace"))
                           for t in excluded_terms]

        result = []

        # Search glossary terms in text
        analyzed_terms = []
        for item in glossary_term_items:
            # Take into account the word and its variants
            terms = []
            item_title = item['title']
            item_variants = item['variants']

            if isinstance(item_title, str):
                terms.append(item_title)
            if isinstance(item_variants, (list, tuple)):
                terms.extend(item_variants)

            # Loop on glossary terms and intersect with object terms
            for term in terms:
                if term in analyzed_terms:
                    continue

                # Analyze term
                analyzed_terms.append(term)
                uterm = term.decode(SITE_CHARSET, "replace")
                aterm = encode_ascii(uterm)
                if aterm in aexcluded_terms:
                    continue

                # Search the word in the text
                found_pos = find_word(aterm, atext)
                if not found_pos:
                    continue

                # Extract terms from obj text
                term_length = len(aterm)
                text_terms = []
                for pos in found_pos:
                    utext_term = utext[pos:(pos + term_length)]

                    # FIXME: workaround for compound words; works in ~99% of
                    # cases. Check that the match is a real word occurring in
                    # the text rather than a fragment of a longer word.
                    if not [x for x in self._split(utext_term) if x in usplitted_text_terms]:
                        continue

                    # Encode the term and make sure there are no duplicates
                    text_term = utext_term.encode(SITE_CHARSET, "replace")
                    if text_term in text_terms:
                        continue
                    text_terms.append(text_term)

                if not text_terms:
                    continue

                # Append object term item
                new_item = item.copy()
                new_item['terms'] = text_terms
                result.append(new_item)

        return result
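
Both versions of _getTextRelatedTermItems rely on helpers that are not shown here. Below is a hedged sketch of what they are assumed to do; the real implementations live in the product's utils module and on this class, and may differ. Note that encode_ascii must map each input character to exactly one ASCII character, because positions found in the ASCII text (atext) are reused to slice the unicode text (utext).

import re
import unicodedata

def encode_ascii(utext):
    # Hypothetical sketch: replace each character by its base ASCII letter
    # (accents stripped via NFKD decomposition), or '?' when there is no
    # ASCII equivalent, so the output has the same length as the input.
    chars = []
    for ch in utext:
        base = unicodedata.normalize('NFKD', ch)[0]
        chars.append(base if ord(base) < 128 else '?')
    return ''.join(chars)

def _split(utext):
    # Hypothetical sketch of the tokenizer (shown as a plain function here,
    # while the original is a method): split the text into word tokens.
    return re.findall(r"\w+", utext, re.UNICODE)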
Example #4
    def _getTextRelatedTermItems(self,
                                 text,
                                 glossary_term_items,
                                 excluded_terms=()):
        """
        @param text: charset encoded text
        @param excluded_terms: charset encoded terms to exclude from search
        """
        utext = safe_unicode(text)
        usplitted_text_terms = self._split(utext)
        atext = encode_ascii(text)

        aexcluded_terms = [encode_ascii(t) for t in excluded_terms]

        result = []

        # Search glossary terms in text
        analyzed_terms = []
        for item in glossary_term_items:
            # Take into account the word and its variants
            terms = []
            item_title = item['title'].strip()
            item_variants = item['variants']

            if isinstance(item_title, str):
                terms.append(item_title)
            if isinstance(item_variants, (list, tuple)):
                terms.extend(item_variants)

            # Loop on glossary terms and intersect with object terms
            for term in terms:
                if term in analyzed_terms:
                    continue

                # Analyze term
                analyzed_terms.append(term)
                aterm = encode_ascii(term)
                if aterm in aexcluded_terms:
                    continue

                # Search the word in the text
                found_pos = find_word(aterm, atext)
                if not found_pos:
                    continue

                # Extract terms from obj text
                term_length = len(aterm)
                text_terms = []
                for pos in found_pos:
                    utext_term = utext[pos:(pos + term_length)]

                    # FIXME: workaround for compound words; works in ~99% of
                    # cases. Check that the match is a real word occurring in
                    # the text rather than a fragment of a longer word.
                    if not [
                            x for x in self._split(utext_term)
                            if x in usplitted_text_terms
                    ]:
                        continue

                    # Encode the term and make sure there are no duplicates
                    text_term = utext_term.encode('utf-8', "replace")
                    if text_term in text_terms:
                        continue
                    text_terms.append(text_term)

                if not text_terms:
                    continue

                # Append object term item
                new_item = item.copy()
                new_item['terms'] = text_terms
                result.append(new_item)

        return result
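
For illustration, a hedged usage sketch, assuming that tool stands in for an instance of the class providing this method and that glossary term items are mappings with 'title' and 'variants' keys, as read by the code above:

glossary_term_items = [
    {'title': 'ete', 'variants': []},
]
text = "l'ete ou ete"
items = tool._getTextRelatedTermItems(text, glossary_term_items)
for item in items:
    # Each returned item is a copy of the matching glossary item with an
    # extra 'terms' key listing the spellings actually found in the text.
    print(item['title'], item['terms'])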