def testEncodeAscii(self):
    """Test encode_ascii function from the utils module"""
    utext = 'Ellipsis\u2026'
    atext = encode_ascii(utext)
    self.assertEqual(len(utext), len(atext))
    self.assertEqual(atext, "ellipsis.")
def testEncodeAscii(self):
    """Test encode_ascii function from the utils module"""
    utext = u'Ellipsis\u2026'
    atext = encode_ascii(utext)
    self.assertEqual(len(utext), len(atext))
    self.assertEqual(atext, "ellipsis.")
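The two test variants above pin down the contract of encode_ascii: it lowercases, maps each non-ASCII character (such as u'\u2026') to a single ASCII character, and preserves the string length. A minimal sketch consistent with those assertions; the real implementation in the utils module may differ:

import unicodedata

def encode_ascii(utext):
    # Hypothetical sketch inferred from the tests above; the real
    # utils implementation may differ.
    chars = []
    for char in utext:
        # Decompose typographic characters (e.g. u'\u2026' -> '...')
        decomposed = unicodedata.normalize('NFKD', char)
        first = decomposed[0] if decomposed else char
        # Keep exactly one ASCII character per input character so the
        # lengths match, falling back to '.' for anything non-ASCII.
        chars.append(first.lower() if ord(first) < 128 else '.')
    return ''.join(chars)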
def _normalize(self, text):
    """Normalize text: returns an ASCII text.

    @param text: Text to normalize
    """
    utext = text
    if not isinstance(text, unicode):
        # Not a unicode string: decode it from the site charset first
        utext = text.decode(SITE_CHARSET, "replace")
    return encode_ascii(utext)
def _list_results(self):
    """Terms list (brains) depending on the request"""
    gtool = self.gtool
    if self.search_letter:
        # User clicked a letter
        results = gtool.getAbcedaireBrains([self.uid],
                                           letters=[self.search_letter])
    elif self.search_text:
        # User searches for text
        results = gtool.searchResults([self.uid],
                                      SearchableText=self.search_text)
        # Redirect straight to the result if it is unique
        if len(results) == 1:
            target = results[0].getURL()
            raise Redirect(target)
    else:
        # Viewing all terms
        results = gtool.searchResults([self.uid])
    results = list(results)
    results.sort(lambda x, y: cmp(encode_ascii(x.Title), encode_ascii(y.Title)))
    return tuple(results)
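The cmp-style sort above is Python 2 only: list.sort() lost its comparison-function argument in Python 3, and the built-in cmp() is gone. A ported version would precompute a sort key instead (a sketch, not part of the original code):

# Python 3 equivalent of the cmp-based sort above (sketch)
results.sort(key=lambda brain: encode_ascii(brain.Title))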
def _getTextRelatedTermItems(self, text, glossary_term_items, excluded_terms=()):
    """
    @param text: charset-encoded text
    @param excluded_terms: charset-encoded terms to exclude from the search
    """
    utext = text.decode(SITE_CHARSET, "replace")
    usplitted_text_terms = self._split(utext)
    atext = encode_ascii(utext)
    aexcluded_terms = [encode_ascii(t.decode(SITE_CHARSET, "replace"))
                       for t in excluded_terms]
    result = []
    # Search glossary terms in text
    analyzed_terms = []
    for item in glossary_term_items:
        # Take into account the word and its variants
        terms = []
        item_title = item['title']
        item_variants = item['variants']
        if isinstance(item_title, str):
            terms.append(item_title)
        if isinstance(item_variants, (list, tuple)):
            terms.extend(item_variants)
        # Loop on glossary terms and intersect with object terms
        for term in terms:
            if term in analyzed_terms:
                continue
            # Analyze term
            analyzed_terms.append(term)
            uterm = term.decode(SITE_CHARSET, "replace")
            aterm = encode_ascii(uterm)
            if aterm in aexcluded_terms:
                continue
            # Search the word in the text
            found_pos = find_word(aterm, atext)
            if not found_pos:
                continue
            # Extract terms from the object text
            term_length = len(aterm)
            text_terms = []
            for pos in found_pos:
                utext_term = utext[pos:(pos + term_length)]
                # FIXME: Workaround for composed words; works in 99% of cases.
                # Check the word is not a subword but a real word composing
                # the text.
                if not [x for x in self._split(utext_term)
                        if x in usplitted_text_terms]:
                    continue
                # Encode the term and make sure there are no duplicates
                text_term = utext_term.encode(SITE_CHARSET, "replace")
                if text_term in text_terms:
                    continue
                text_terms.append(text_term)
            if not text_terms:
                continue
            # Append object term item
            new_item = item.copy()
            new_item['terms'] = text_terms
            result.append(new_item)
    return result
def processGlob(self, lst):
    """Normalize each word to ASCII and keep only the glob tokens."""
    result = []
    for word in lst:
        norm_word = encode_ascii(word)
        result.extend(self.rxGlob.findall(norm_word))
    return result
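processGlob relies on a precompiled rxGlob regex on the instance that is not shown here. A plausible definition, assuming it extracts word tokens with an optional trailing wildcard (the pattern below is a guess, not the actual code):

import re

# Hypothetical pattern for rxGlob: word characters, optionally
# followed by a single '*' or '?' glob wildcard.
rxGlob = re.compile(r"\w+[*?]?")

# e.g. rxGlob.findall("cafe* latte?") -> ['cafe*', 'latte?']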
def _getTextRelatedTermItems(self, text, glossary_term_items, excluded_terms=()):
    """
    @param text: charset-encoded text
    @param excluded_terms: charset-encoded terms to exclude from the search
    """
    utext = safe_unicode(text)
    usplitted_text_terms = self._split(utext)
    atext = encode_ascii(text)
    aexcluded_terms = [encode_ascii(t) for t in excluded_terms]
    result = []
    # Search glossary terms in text
    analyzed_terms = []
    for item in glossary_term_items:
        # Take into account the word and its variants
        terms = []
        item_title = item['title'].strip()
        item_variants = item['variants']
        if isinstance(item_title, str):
            terms.append(item_title)
        if isinstance(item_variants, (list, tuple)):
            terms.extend(item_variants)
        # Loop on glossary terms and intersect with object terms
        for term in terms:
            if term in analyzed_terms:
                continue
            # Analyze term
            analyzed_terms.append(term)
            aterm = encode_ascii(term)
            if aterm in aexcluded_terms:
                continue
            # Search the word in the text
            found_pos = find_word(aterm, atext)
            if not found_pos:
                continue
            # Extract terms from the object text
            term_length = len(aterm)
            text_terms = []
            for pos in found_pos:
                utext_term = utext[pos:(pos + term_length)]
                # FIXME: Workaround for composed words; works in 99% of cases.
                # Check the word is not a subword but a real word composing
                # the text.
                if not [x for x in self._split(utext_term)
                        if x in usplitted_text_terms]:
                    continue
                # Encode the term and make sure there are no duplicates
                text_term = utext_term.encode('utf-8', "replace")
                if text_term in text_terms:
                    continue
                text_terms.append(text_term)
            if not text_terms:
                continue
            # Append object term item
            new_item = item.copy()
            new_item['terms'] = text_terms
            result.append(new_item)
    return result
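Both versions of _getTextRelatedTermItems rely on a find_word helper imported from elsewhere; from its use here it must return the start offsets of whole-word occurrences of aterm inside atext. A minimal sketch under that assumption (the real helper may define word boundaries differently):

import re

def find_word(word, text):
    # Hypothetical sketch inferred from usage above: return the start
    # offsets of whole-word matches of `word` in `text`.
    pattern = r"\b%s\b" % re.escape(word)
    return [match.start() for match in re.finditer(pattern, text)]

# e.g. find_word("cafe", "a cafe, no cafeteria") -> [2]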