def stem_words(self, words: List[str]) -> List[str]:
    """Return one stem per entry of ``words``, falling back to the word itself.

    Entries that are ``None`` or empty are passed through untouched. Any
    word the stemmer fails on (it raises, or yields ``None`` / an empty
    string) is kept in its original form and a warning is logged.

    :param words: words to stem; run through
        ``decode_object_from_bytes_if_needed()`` before use.
    :return: list of stems, in the same order as the input words.
    :raises McLanguageException: if ``words`` is ``None`` after decoding.
    """
    words = decode_object_from_bytes_if_needed(words)
    if words is None:
        raise McLanguageException("Words to stem is None.")

    stems = []
    for token in words:
        # Nothing to stem: keep the placeholder so positions stay aligned.
        if token is None or len(token) == 0:
            log.debug("Word is empty or None.")
            stems.append(token)
            continue

        try:
            # Stemmer might raise an exception
            result = hausastemmer.stem(token)

            # ...or it might return an empty string / None
            if result is None or len(result) == 0:
                raise Exception("Stem is empty.")
        except Exception as ex:
            log.warning("Unable to stem word '{}': {}".format(token, str(ex)))
            result = token

        stems.append(result)

    # Sanity check on the one-stem-per-word invariant.
    word_count = len(words)
    stem_count = len(stems)
    if word_count != stem_count:
        log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

    return stems
def stem_words(self, words: List[str]) -> List[str]:
    """Stem every word in ``words``, keeping the original word on failure.

    ``None`` / empty entries are passed through unchanged. If the stemmer
    raises for a word, or returns ``None`` / an empty string, the original
    word is used as its own stem so that a single bad token cannot abort
    stemming of the whole list.

    :param words: words to stem; run through
        ``decode_object_from_bytes_if_needed()`` before use.
    :return: list of stems, in the same order as the input words.
    :raises McLanguageException: if ``words`` is ``None`` after decoding.
    """
    words = decode_object_from_bytes_if_needed(words)
    if words is None:
        raise McLanguageException("Words to stem is None.")

    stems = []
    for word in words:
        if word is None or len(word) == 0:
            log.debug("Word is empty or None.")
            stem = word
        else:
            try:
                # The stemmer might raise on unexpected input; treat that as
                # "could not stem" instead of failing the whole batch.
                stem = hausastemmer.stem(word)
            except Exception as ex:
                log.warning("Unable to stem word '%s': %s" % (word, str(ex)))
                stem = word
            else:
                # ...or it might return an empty string / None
                if stem is None or len(stem) == 0:
                    log.debug("Unable to stem word '%s'" % word)
                    stem = word

        stems.append(stem)

    # Sanity check on the one-stem-per-word invariant.
    if len(words) != len(stems):
        log.warning(
            "Stem count is not the same as word count; words: %s; stems: %s" % (
                str(words),
                str(stems),
            ))

    return stems
def test_stemmer_without_dict_lookup():
    """Check ``stem(term, lookup=False)`` against every expected stem.

    Iterates ``TEST_STEMS_WITHOUT_DICT_LOOKUP`` in sorted order; on a
    mismatch the offending term is printed before the assertion fails.
    """
    cases = sorted(TEST_STEMS_WITHOUT_DICT_LOOKUP.items())
    for term, expected in cases:
        got = stem(term, lookup=False)

        # Print the details first so the failing term shows up in the output.
        if expected != got:
            message = "Testing '%s' without dictionary lookup, expecting to get '%s', got '%s'" % (
                term, expected, got)
            print(message)

        assert got == expected
def stem_words(self, words: List[str]) -> List[str]:
    """Map each word to its Hausa stem, or to itself when stemming fails.

    :param words: words to stem; run through
        ``decode_object_from_bytes_if_needed()`` before use.
    :return: list with one stem per input word, in input order.
    :raises McLanguageException: if ``words`` is ``None`` after decoding.
    """

    def _stem_or_original(token):
        """Return the stem of ``token``, or ``token`` itself if it can't be stemmed."""
        if token is None or len(token) == 0:
            log.debug("Word is empty or None.")
            return token

        try:
            # Stemmer might raise an exception
            stemmed = hausastemmer.stem(token)

            # ...or it might return an empty string / None
            if stemmed is None or len(stemmed) == 0:
                raise Exception("Stem is empty.")
        except Exception as ex:
            log.warning("Unable to stem word '{}': {}".format(
                token, str(ex)))
            return token

        return stemmed

    words = decode_object_from_bytes_if_needed(words)
    if words is None:
        raise McLanguageException("Words to stem is None.")

    stems = [_stem_or_original(token) for token in words]

    # Sanity check on the one-stem-per-word invariant.
    if len(words) != len(stems):
        log.warning(
            "Stem count is not the same as word count; words: %s; stems: %s" % (
                str(words),
                str(stems),
            ))

    return stems
def test_stemmer_with_bad_data():
    """Exercise ``stem()`` with an empty string, a lone 'ą' and a very long string."""
    empty = ''
    assert stem(empty) == ''

    lone_char = 'ą'
    assert stem(lone_char) == 'ą'

    # Won't necessarily return the same string
    very_long = 'ą' * 1024 * 1024
    stemmed_length = len(stem(very_long))
    assert stemmed_length > 1024 * 1023
def py_hausa_stem(token):
    """Stem a single Hausa token; entry point used by the Perl code.

    The token is passed through ``decode_object_from_bytes_if_needed()``
    before being handed to ``hausastemmer.stem()``.
    """
    # MC_REWRITE_TO_PYTHON: simplify after rewriting language module to Python.
    decoded_token = decode_object_from_bytes_if_needed(token)
    stemmed_token = hausastemmer.stem(decoded_token)
    return stemmed_token
def py_hausa_stem(token):
    """Stem a single Hausa token; entry point used by the Perl code.

    The token is passed through ``decode_string_from_bytes_if_needed()``
    before being handed to ``hausastemmer.stem()``.
    """
    # FIXME MC_REWRITE_TO_PYTHON: simplify after rewriting language module to Python.
    decoded_token = decode_string_from_bytes_if_needed(token)
    stemmed_token = hausastemmer.stem(decoded_token)
    return stemmed_token