    def testEncodeAscii(self):
        """Test encode_ascii function from the utils module"""

        utext = u'Ellipsis\u2026'
        atext = encode_ascii(utext)
        self.assertEqual(len(utext), len(atext))
        self.assertEqual(atext, "ellipsis.")
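For reference, a minimal sketch of what encode_ascii might look like, consistent only with the assertions above (lowercase output, exactly one ASCII character per input character); the real function in the utils module may differ:

    import unicodedata

    def encode_ascii(utext):
        """Hypothetical sketch: lowercase the text, then map each
        character to exactly one ASCII character so the output keeps
        the input's length (u'\u2026' becomes '.')."""
        out = []
        for uchar in utext.lower():
            # NFKD may expand one character into several; keep only the
            # first ASCII byte so offsets computed on the result still
            # line up with the unicode text.
            achar = unicodedata.normalize('NFKD', uchar).encode('ascii', 'replace')
            out.append(achar.decode('ascii')[0])
        return ''.join(out)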
    def _normalize(self, text):
        """Normalize text: return an ASCII version of it

        @param text: text to normalize"""

        utext = text
        if not isinstance(text, unicode):  # Encoded string: decode it first
            utext = text.decode(SITE_CHARSET, "replace")
        return encode_ascii(utext)
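A quick sanity check of the two input paths, in Python 2 like the snippet above (SITE_CHARSET is defined elsewhere in the product; 'utf-8' and the encode_ascii sketch above are assumptions here):

    SITE_CHARSET = 'utf-8'  # assumed for the example

    assert encode_ascii('caf\xc3\xa9'.decode(SITE_CHARSET, 'replace')) == 'cafe'
    assert encode_ascii(u'caf\xe9') == 'cafe'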
Example #5
    def _list_results(self):
        """Terms list (brains) depending on the request"""

        gtool = self.gtool
        if self.search_letter:
            # User clicked a letter
            results = gtool.getAbcedaireBrains([self.uid],
                                               letters=[self.search_letter])
        elif self.search_text:
            # User searches for text
            results = gtool.searchResults([self.uid],
                                          SearchableText=self.search_text)
            # We redirect to the result if unique
            if len(results) == 1:
                target = results[0].getURL()
                raise Redirect(target)
        else:
            # Viewing all terms
            results = gtool.searchResults([self.uid])
        results = list(results)
        results.sort(key=lambda brain: encode_ascii(brain.Title))
        return tuple(results)
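The Redirect raised on a unique match is presumably Zope's zExceptions.Redirect, which the publisher turns into an HTTP redirect to the given URL:

    from zExceptions import Redirect  # assumed import for the example

    # raise Redirect('http://example.org/glossary/term') -> HTTP 302 to that URL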
    def _getTextRelatedTermItems(self, text, glossary_term_items,
                                 excluded_terms=()):
        """
        @param text: charset encoded text
        @param excluded_terms: charset encoded terms to exclude from search
        """

        utext = text.decode(SITE_CHARSET, "replace")
        usplitted_text_terms = self._split(utext)
        atext = encode_ascii(utext)

        aexcluded_terms = [encode_ascii(t.decode(SITE_CHARSET, "replace"))
                           for t in excluded_terms]

        result = []

        # Search glossary terms in text
        analyzed_terms = []
        for item in glossary_term_items:
            # Take into account the word and its variants
            terms = []
            item_title = item['title']
            item_variants = item['variants']

            if isinstance(item_title, str):
                terms.append(item_title)
            if isinstance(item_variants, (list, tuple)):
                terms.extend(item_variants)

            # Loop on glossary terms and intersect with object terms
            for term in terms:
                if term in analyzed_terms:
                    continue

                # Analyze term
                analyzed_terms.append(term)
                uterm = term.decode(SITE_CHARSET, "replace")
                aterm = encode_ascii(uterm)
                if aterm in aexcluded_terms:
                    continue

                # Search the word in the text
                found_pos = find_word(aterm, atext)
                if not found_pos:
                    continue

                # Extract terms from obj text
                term_length = len(aterm)
                text_terms = []
                for pos in found_pos:
                    utext_term = utext[pos:(pos + term_length)]

                    # FIXME: workaround for compound words; works in ~99% of cases.
                    # Check the match is a real word of the text, not a
                    # substring of a longer word.
                    if not [x for x in self._split(utext_term) if x in usplitted_text_terms]:
                        continue

                    # Encode the term and make sure there are no duplicates
                    text_term = utext_term.encode(SITE_CHARSET, "replace")
                    if text_term in text_terms:
                        continue
                    text_terms.append(text_term)

                if not text_terms:
                    continue

                # Append object term item
                new_item = item.copy()
                new_item['terms'] = text_terms
                result.append(new_item)

        return result
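found_pos comes from find_word, which returns the start offsets of aterm inside atext; because atext has the same length as utext (see the encode_ascii sketch), those offsets can slice the unicode text directly. A minimal sketch of that contract, assuming whole-word matching; the real helper may match more loosely, which is why the code re-checks each slice against the split text:

    import re

    def find_word(word, text):
        """Hypothetical sketch: start offset of every whole-word
        occurrence of `word` in `text`."""
        return [m.start() for m in re.finditer(r'\b%s\b' % re.escape(word), text)]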
Example #7
    def processGlob(self, lst):
        """Normalize each word to ASCII, then split it into tokens
        with the class's rxGlob pattern."""
        result = []
        for word in lst:
            norm_word = encode_ascii(word)
            result.extend(self.rxGlob.findall(norm_word))
        return result
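rxGlob is a compiled regex defined elsewhere on the class; a hypothetical stand-in showing the normalize-then-tokenize flow:

    import re

    rxGlob = re.compile(r"\w+")  # assumed pattern; the real one lives on the class

    # With the encode_ascii sketch above:
    #   rxGlob.findall(encode_ascii(u'Ellipsis\u2026')) -> ['ellipsis']
    #   rxGlob.findall(encode_ascii(u'caf\xe9'))        -> ['cafe']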
Example #8
    def _getTextRelatedTermItems(self,
                                 text,
                                 glossary_term_items,
                                 excluded_terms=()):
        """
        @param text: charset encoded text
        @param excluded_terms: charset encoded terms to exclude from search
        """
        utext = safe_unicode(text)
        usplitted_text_terms = self._split(utext)
        atext = encode_ascii(utext)

        aexcluded_terms = [encode_ascii(t) for t in excluded_terms]

        result = []

        # Search glossary terms in text
        analyzed_terms = []
        for item in glossary_term_items:
            # Take into account the word and its variants
            terms = []
            item_title = item['title'].strip()
            item_variants = item['variants']

            if isinstance(item_title, str):
                terms.append(item_title)
            if isinstance(item_variants, (list, tuple)):
                terms.extend(item_variants)

            # Loop on glossary terms and intersect with object terms
            for term in terms:
                if term in analyzed_terms:
                    continue

                # Analyze term
                analyzed_terms.append(term)
                aterm = encode_ascii(term)
                if aterm in aexcluded_terms:
                    continue

                # Search the word in the text
                found_pos = find_word(aterm, atext)
                if not found_pos:
                    continue

                # Extract terms from obj text
                term_length = len(aterm)
                text_terms = []
                for pos in found_pos:
                    utext_term = utext[pos:(pos + term_length)]

                    # FIXME: workaround for compound words; works in ~99% of cases.
                    # Check the match is a real word of the text, not a
                    # substring of a longer word.
                    if not [
                            x for x in self._split(utext_term)
                            if x in usplitted_text_terms
                    ]:
                        continue

                    # Encode the term and make sure there are no duplicates
                    text_term = utext_term.encode('utf-8', "replace")
                    if text_term in text_terms:
                        continue
                    text_terms.append(text_term)

                if not text_terms:
                    continue

                # Append object term item
                new_item = item.copy()
                new_item['terms'] = text_terms
                result.append(new_item)

        return result
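A hypothetical call showing the expected shapes: each glossary item is a dict with at least 'title' and 'variants', and each returned item is a copy of a matching one with an extra 'terms' key listing the spellings actually found in the text (bytes on Python 3, since the method encodes them):

    items = [{'title': 'glossary', 'variants': []},
             {'title': 'folder', 'variants': []}]
    related = view._getTextRelatedTermItems(       # `view` is assumed
        'Every Glossary entry links back here.', items)
    # -> [{'title': 'glossary', 'variants': [], 'terms': [b'Glossary']}]
    # 'folder' does not occur in the text, so it is dropped; matched terms
    # keep the spelling used in the text itself.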