def __init__(self):
    """Constructor."""
    super().__init__()

    # Sentence tokenizer: capture runs of text ending in Japanese terminal punctuation.
    self.__japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    self.__english_language = EnglishLanguage()

    mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

    try:
        # Assemble the command-line style options MeCab's Tagger expects.
        options = ' '.join([
            f'--dicdir={mecab_dictionary_path}',
            '--rcfile=/dev/null',
            f'--node-format=%m{self.__MECAB_TOKEN_POS_SEPARATOR}%h{self.__EOL_SEPARATOR}',
            f'--eos-format={self.__MECAB_EOS_MARK}{self.__EOL_SEPARATOR}',
        ])
        self.__mecab = MeCab.Tagger(options)
    except Exception as ex:
        raise McLanguageException("Unable to initialize MeCab: %s" % str(ex))

    # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
    mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
    try:
        sample_words = self.split_sentence_to_words('pythonが大好きです')
        self_test_passed = len(sample_words) >= 2 and sample_words[1] == '大好き'
    except Exception as _:
        self_test_passed = False
    if not self_test_passed:
        raise McLanguageException(mecab_exc_message)
def stem_words(self, words: List[str]) -> List[str]:
    """Stem *words* with the Hindi Hunspell dictionary.

    Falls back to the unmodified word when no stem can be found; always
    returns a list the same length (and order) as *words*.
    """
    words = decode_object_from_bytes_if_needed(words)
    if words is None:
        raise McLanguageException("Words to stem is None.")

    stems = []
    for word in words:
        # Default to the word itself; replaced only by a usable stem.
        stem = word

        if not word:
            log.debug("Word is empty or None.")
        else:
            candidates = self.__hindi_hunspell.stem(word)
            if candidates:
                first_candidate = candidates[0]
                if not first_candidate:
                    log.debug("Stem for word '%s' is empty or None." % word)
                else:
                    stem = first_candidate
            else:
                log.debug("Stem for word '%s' was not found." % word)

        stems.append(stem)

    if len(words) != len(stems):
        log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

    return stems
def stem_words(self, words: List[str]) -> List[str]:
    """Stem Hausa *words*, falling back to the unmodified word on failure.

    :param words: list of words to stem (must not be None)
    :return: list of stems, same length and order as *words*
    :raise McLanguageException: if *words* is None
    """
    words = decode_object_from_bytes_if_needed(words)
    if words is None:
        raise McLanguageException("Words to stem is None.")

    stems = []
    for word in words:
        if word is None or len(word) == 0:
            log.debug("Word is empty or None.")
            stem = word
        else:
            # Guard the third-party stemmer call: a failure on one word
            # should not abort stemming of the whole batch (matches the
            # behavior of the exception-safe variant of this method).
            try:
                stem = hausastemmer.stem(word)
            except Exception as ex:
                log.debug("Stemmer raised for word '%s': %s" % (word, str(ex)))
                stem = None
            if stem is None or len(stem) == 0:
                log.debug("Unable to stem word '%s'" % word)
                stem = word
        stems.append(stem)

    if len(words) != len(stems):
        log.warning(
            "Stem count is not the same as word count; words: %s; stems: %s" % (
                str(words), str(stems),
            ))

    return stems
def __init__(self):
    """Constructor."""
    super().__init__()

    # Split text into sentences on Japanese terminal punctuation.
    self.__japanese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Japanese text
        discard_empty=True,
    )

    self.__english_language = EnglishLanguage()

    mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

    try:
        # '\\n' reaches MeCab as a literal "\n" escape in its format strings.
        self.__mecab = MeCab.Tagger(
            f'--dicdir={mecab_dictionary_path} '
            f'--node-format=%m{self.__MECAB_TOKEN_POS_SEPARATOR}%h\\n '
            f'--eos-format={self.__MECAB_EOS_MARK}\\n'
        )
    except Exception as ex:
        raise McLanguageException("Unable to initialize MeCab: %s" % str(ex))

    # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
    mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
    try:
        test_words = self.split_sentence_to_words('pythonが大好きです')
    except Exception as _:
        raise McLanguageException(mecab_exc_message)
    if len(test_words) < 2 or test_words[1] != '大好き':
        raise McLanguageException(mecab_exc_message)
def _mecab_ipadic_neologd_path() -> str:  # (protected and not private because used by the unit test)
    """Return path to mecab-ipadic-neologd dictionary installed on system."""
    candidate_paths = JapaneseLanguage.__MECAB_DICTIONARY_PATHS

    # A usable dictionary directory is one containing a compiled 'sys.dic'.
    dictionary_path = next(
        (
            path for path in candidate_paths
            if os.path.isdir(path) and os.path.isfile(os.path.join(path, 'sys.dic'))
        ),
        None,
    )

    if dictionary_path is None:
        raise McLanguageException(
            "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths)
        )

    return dictionary_path
def stem_words(self, words: List[str]) -> List[str]:
    """Stem *words* with the Lithuanian Snowball stemmer; stems are lowercased."""
    words = decode_object_from_bytes_if_needed(words)
    if words is None:
        raise McLanguageException("Words to stem is None.")

    raw_stems = self.__lt_stemmer.stemWords(words)

    if len(raw_stems) != len(words):
        log.warning(
            "Stem count is not the same as word count; words: %s; stems: %s" % (
                str(words), str(raw_stems),
            ))

    # Perl's Snowball implementation used to return lowercase stems
    return [raw_stem.lower() for raw_stem in raw_stems]
def __init__(self):
    """Constructor."""
    super().__init__()

    self.__treebank_tokenizer = TreebankWordTokenizer()

    # Hunspell data lives next to this module (fetched as a Git submodule).
    hunspell_dict_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'hindi-hunspell',
        'dict-hi_IN',
    )

    if not os.path.isdir(hunspell_dict_dir):
        raise McLanguageException(
            "Hunspell dictionary directory does not exist at path: %s." % hunspell_dict_dir)

    # Both the .dic and .aff halves of the dictionary must be present.
    for required_file, kind in (('hi_IN.dic', 'dictionary'), ('hi_IN.aff', 'affix')):
        if not os.path.isfile(os.path.join(hunspell_dict_dir, required_file)):
            raise McLanguageException(
                "Hunspell %s file does not exist at path: %s" % (kind, hunspell_dict_dir))

    try:
        self.__hindi_hunspell = Hunspell(
            lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
    except Exception as ex:
        raise McLanguageException(
            "Unable to initialize Hunspell with data directory '%s': %s" % (
                hunspell_dict_dir,
                str(ex),
            ))

    # Quick self-test to make sure that Hunspell is installed and dictionary is available
    hunspell_exc_message = """
        Hunspell self-test failed; make sure that Hunspell is installed and
        dictionaries are accessible, e.g. you might need to fetch Git submodules by running:

            git submodule update --init --recursive
    """
    try:
        test_stems = self.stem_words(['गुरुओं'])
    except Exception as _:
        raise McLanguageException(hunspell_exc_message)
    if not test_stems or test_stems[0] != 'गुरु':
        raise McLanguageException(hunspell_exc_message)
def __init__(self):
    """Constructor.

    Sets up the sentence tokenizer and the Jieba word segmenter, verifies
    that the Jieba dictionaries exist on disk, then runs a quick self-test.

    :raise McLanguageException: if dictionaries are missing, Jieba fails to
        initialize, or the self-test does not produce the expected tokens.
    """
    super().__init__()

    # Text -> sentence tokenizer for Chinese text
    self.__chinese_sentence_tokenizer = RegexpTokenizer(
        r'([^!?。]*[!?。])',
        gaps=True,  # don't discard non-Chinese text
        discard_empty=True,
    )

    self.__english_language = EnglishLanguage()

    self.__jieba = jieba.Tokenizer()
    self.__jieba.cache_file = self.__CACHE_PATH

    if not os.path.isdir(self.__DICT_PATH):
        raise McLanguageException(
            "Jieba dictionary directory was not found: %s" % self.__DICT_PATH)
    if not os.path.isfile(self.__JIEBA_DICT_PATH):
        raise McLanguageException(
            "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH)
    if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
        raise McLanguageException(
            "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH)

    try:
        # Cleanup: single-argument os.path.join() is a no-op, so the paths
        # are passed to Jieba directly.
        self.__jieba.set_dictionary(self.__JIEBA_DICT_PATH)
        self.__jieba.load_userdict(self.__JIEBA_USERDICT_PATH)
    except Exception as ex:
        raise McLanguageException("Unable to initialize Jieba: %s" % str(ex))

    # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
    jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible."
    try:
        test_words = self.split_sentence_to_words('python課程')
    except Exception as _:
        raise McLanguageException(jieba_exc_message)
    else:
        if len(test_words) < 2 or test_words[1] != '課程':
            raise McLanguageException(jieba_exc_message)
def stem_words(self, words: List[str]) -> List[str]:
    """Stem Hausa *words*, keeping the original word whenever stemming fails."""
    words = decode_object_from_bytes_if_needed(words)
    if words is None:
        raise McLanguageException("Words to stem is None.")

    stems = []
    for word in words:
        # Guard clause: empty / None words pass through untouched.
        if not word:
            log.debug("Word is empty or None.")
            stems.append(word)
            continue

        try:
            # Stemmer might raise an exception
            candidate = hausastemmer.stem(word)
            # ...or it might return an empty string / None
            if candidate is None or len(candidate) == 0:
                raise Exception("Stem is empty.")
        except Exception as ex:
            log.warning("Unable to stem word '{}': {}".format(word, str(ex)))
            candidate = word

        stems.append(candidate)

    if len(stems) != len(words):
        log.warning(
            "Stem count is not the same as word count; words: %s; stems: %s" % (
                str(words), str(stems),
            ))

    return stems