def testFindWord(self):
    """Test the find_word function in utils.

    find_word(word, text) is expected to return a tuple of the start
    positions of every whole-word occurrence of `word` in `text`.
    """
    # The text is exactly the word: single match at position 0.
    word = "ete"
    text = "ete"
    result = find_word(word, text)
    expected_result = (0,)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead.
    self.assertEqual(result, expected_result)

    # Several occurrences inside a longer text: "l'ete ou ete".
    # Note the match inside "l'ete" starts after the apostrophe (pos 2).
    text = "l'ete ou ete"
    result = find_word(word, text)
    expected_result = (2, 9,)
    self.assertEqual(result, expected_result)
def _getTextRelatedTermItems(self, text, glossary_term_items,
                             excluded_terms=()):
    """Return the glossary term items whose terms occur in a text.

    @param text: charset encoded text
    @param glossary_term_items: sequence of dicts, each with a 'title'
        (charset encoded string) and 'variants' (list/tuple of charset
        encoded strings) key
    @param excluded_terms: charset encoded terms to exclude from search
    @return: list of copies of the matching items, each with an extra
        'terms' key holding the charset encoded words actually found
        in the text
    """
    utext = text.decode(SITE_CHARSET, "replace")
    usplitted_text_terms = self._split(utext)
    # find_word operates on ascii-normalized text; positions are then
    # mapped back onto the unicode text (assumes encode_ascii is
    # length-preserving — TODO confirm).
    atext = encode_ascii(utext)
    # Sets give O(1) membership tests in the loops below.
    aexcluded_terms = set(
        encode_ascii(t.decode(SITE_CHARSET, "replace"))
        for t in excluded_terms)

    result = []
    analyzed_terms = set()  # terms already searched, to avoid rework
    for item in glossary_term_items:
        # Collect the word and its variants for this glossary item.
        terms = []
        item_title = item['title']
        item_variants = item['variants']
        if isinstance(item_title, str):
            terms.append(item_title)
        if isinstance(item_variants, (list, tuple)):
            terms.extend(item_variants)

        # Loop on glossary terms and intersect with the text terms.
        for term in terms:
            if term in analyzed_terms:
                continue
            analyzed_terms.add(term)
            uterm = term.decode(SITE_CHARSET, "replace")
            aterm = encode_ascii(uterm)
            if aterm in aexcluded_terms:
                continue

            # Locate every whole-word occurrence in the text.
            found_pos = find_word(aterm, atext)
            if not found_pos:
                continue

            # Extract the matching words from the original unicode text.
            term_length = len(aterm)
            text_terms = []
            for pos in found_pos:
                utext_term = utext[pos:(pos + term_length)]
                # FIX ME: Workaround for composed words. Works in 99%.
                # Check the match is a real word composing the text,
                # not a subword of a longer one.
                if not [x for x in self._split(utext_term)
                        if x in usplitted_text_terms]:
                    continue
                # Encode the term and make sure there are no doublons.
                text_term = utext_term.encode(SITE_CHARSET, "replace")
                if text_term not in text_terms:
                    text_terms.append(text_term)
            if not text_terms:
                continue

            # Append a copy of the item enriched with the found terms.
            new_item = item.copy()
            new_item['terms'] = text_terms
            result.append(new_item)
    return result
def _getTextRelatedTermItems(self, text, glossary_term_items,
                             excluded_terms=()):
    """Return the glossary term items whose terms occur in a text.

    @param text: charset encoded text
    @param glossary_term_items: sequence of dicts, each with a 'title'
        and 'variants' (list/tuple of strings) key
    @param excluded_terms: charset encoded terms to exclude from search
    @return: list of copies of the matching items, each with an extra
        'terms' key holding the utf-8 encoded words actually found in
        the text
    """
    utext = safe_unicode(text)
    usplitted_text_terms = self._split(utext)
    # find_word operates on ascii-normalized text; positions are then
    # mapped back onto the unicode text (assumes encode_ascii is
    # length-preserving — TODO confirm).
    atext = encode_ascii(text)
    # A set gives O(1) membership tests in the loop below.
    aexcluded_terms = {encode_ascii(t) for t in excluded_terms}

    result = []
    analyzed_terms = set()  # terms already searched, to avoid rework
    for item in glossary_term_items:
        # Collect the word and its variants for this glossary item.
        terms = []
        # NOTE(review): strip() is called before the isinstance check,
        # so a non-string 'title' would raise here — confirm titles are
        # always strings.
        item_title = item['title'].strip()
        item_variants = item['variants']
        if isinstance(item_title, str):
            terms.append(item_title)
        if isinstance(item_variants, (list, tuple)):
            terms.extend(item_variants)

        # Loop on glossary terms and intersect with the text terms.
        for term in terms:
            if term in analyzed_terms:
                continue
            analyzed_terms.add(term)
            aterm = encode_ascii(term)
            if aterm in aexcluded_terms:
                continue

            # Locate every whole-word occurrence in the text.
            found_pos = find_word(aterm, atext)
            if not found_pos:
                continue

            # Extract the matching words from the original unicode text.
            term_length = len(aterm)
            text_terms = []
            for pos in found_pos:
                utext_term = utext[pos:(pos + term_length)]
                # FIX ME: Workaround for composed words. Works in 99%.
                # Check the match is a real word composing the text,
                # not a subword of a longer one.
                if not [x for x in self._split(utext_term)
                        if x in usplitted_text_terms]:
                    continue
                # Encode the term and make sure there are no doublons.
                text_term = utext_term.encode('utf-8', "replace")
                if text_term not in text_terms:
                    text_terms.append(text_term)
            if not text_terms:
                continue

            # Append a copy of the item enriched with the found terms.
            new_item = item.copy()
            new_item['terms'] = text_terms
            result.append(new_item)
    return result