Example #1
0
    def __init__(self):
        """Create the sentence tokenizer, the English helper and the MeCab tagger, then self-test MeCab.

        :raises McLanguageException: if the MeCab tagger can't be constructed, or if the tokenization
            self-test raises or returns an unexpected result. The underlying exception, when there is
            one, is attached as the explicit cause.
        """
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            tagger_args = [
                f'--dicdir={mecab_dictionary_path}',
                # NOTE(review): presumably keeps MeCab from reading a system-wide / per-user mecabrc -- confirm
                '--rcfile=/dev/null',
                # Per-token output: surface form, separator, then the %h field, terminated by the EOL separator
                f'--node-format=%m{self.__MECAB_TOKEN_POS_SEPARATOR}%h{self.__EOL_SEPARATOR}',
                f'--eos-format={self.__MECAB_EOS_MARK}{self.__EOL_SEPARATOR}',
            ]
            self.__mecab = MeCab.Tagger(' '.join(tagger_args))
        except Exception as ex:
            # Chain the original error so the real reason shows up as the direct cause in tracebacks
            raise McLanguageException("Unable to initialize MeCab: %s" % str(ex)) from ex

        # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as ex:
            raise McLanguageException(mecab_exc_message) from ex
        else:
            # The second token of the test sentence is expected to be '大好き'
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)
Example #2
0
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem every word with the instance's Hunspell object, falling back to the word itself.

        The input is first passed through decode_object_from_bytes_if_needed(). A word is returned
        unchanged when it is empty / None, when Hunspell returns no stems for it, or when the first
        returned stem is empty / None.

        :param words: words to stem
        :return: list with one entry per input word, in input order
        :raises McLanguageException: if the list of words is None
        """
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        stems = []

        for word in words:
            # Nothing to stem -- pass the entry through untouched
            if word is None or len(word) == 0:
                log.debug("Word is empty or None.")
                stems.append(word)
                continue

            candidates = self.__hindi_hunspell.stem(word)

            if len(candidates) == 0:
                log.debug("Stem for word '%s' was not found." % word)
                stems.append(word)
                continue

            # Only the first candidate is ever used
            chosen = candidates[0]
            if chosen is None or len(chosen) == 0:
                log.debug("Stem for word '%s' is empty or None." % word)
                chosen = word

            stems.append(chosen)

        if len(stems) != len(words):
            log.warning("Stem count is not the same as word count; words: %s; stems: %s" % (str(words), str(stems),))

        return stems
Example #3
0
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem every word with hausastemmer, falling back to the word itself when stemming fails.

        The input is first passed through decode_object_from_bytes_if_needed(). A word is returned
        unchanged when it is empty / None, when the stemmer raises for it, or when the stemmer
        returns an empty / None stem -- so one problematic word does not abort the whole batch.

        :param words: words to stem
        :return: list with one entry per input word, in input order
        :raises McLanguageException: if the list of words is None
        """
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        stems = []

        for word in words:
            if word is None or len(word) == 0:
                log.debug("Word is empty or None.")
                stem = word
            else:
                try:
                    stem = hausastemmer.stem(word)
                except Exception as ex:
                    # The exception types the third-party stemmer may raise aren't known here, hence
                    # the broad catch; the failure is logged and the word is kept as its own stem
                    log.warning("Unable to stem word '%s': %s" % (word, str(ex)))
                    stem = word
                else:
                    if stem is None or len(stem) == 0:
                        log.debug("Unable to stem word '%s'" % word)
                        stem = word

            stems.append(stem)

        if len(words) != len(stems):
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s"
                % (
                    str(words),
                    str(stems),
                ))

        return stems
Example #4
0
    def __init__(self):
        """Create the sentence tokenizer, the English helper and the MeCab tagger, then self-test MeCab.

        :raises McLanguageException: if the MeCab tagger can't be constructed, or if the tokenization
            self-test raises or returns an unexpected result. The underlying exception, when there is
            one, is attached as the explicit cause.
        """
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            # '%%' yields a literal '%' for MeCab's own format codes after Python's %-formatting, and
            # '\\n' passes a backslash-n escape through to MeCab rather than an actual newline
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            # Chain the original error so the real reason shows up as the direct cause in tracebacks
            raise McLanguageException("Unable to initialize MeCab: %s" %
                                      str(ex)) from ex

        # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as ex:
            raise McLanguageException(mecab_exc_message) from ex
        else:
            # The second token of the test sentence is expected to be '大好き'
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)
Example #5
0
    def _mecab_ipadic_neologd_path() -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system.

        Candidate paths are checked in order; the first one that is a directory and contains a
        'sys.dic' file is returned.

        :return: path of the first matching candidate directory
        :raises McLanguageException: if no candidate path matches
        """
        candidate_paths = JapaneseLanguage.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if not os.path.isdir(candidate_path):
                continue

            if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                return candidate_path

        # Fell through the loop: nothing usable among the candidates
        raise McLanguageException(
            "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths)
        )
Example #6
0
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem words with the instance's `__lt_stemmer` and return the stems lowercased.

        The input is first passed through decode_object_from_bytes_if_needed(). A warning is logged
        (but nothing is raised) if the stemmer returns a different number of items than it was given.

        :param words: words to stem
        :return: lowercased stems as returned by the stemmer's stemWords()
        :raises McLanguageException: if the list of words is None
        """
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        raw_stems = self.__lt_stemmer.stemWords(words)

        if len(raw_stems) != len(words):
            # Report the stems as returned by the stemmer, i.e. before lowercasing
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s"
                % (str(words), str(raw_stems))
            )

        # Perl's Snowball implementation used to return lowercase stems
        return [raw_stem.lower() for raw_stem in raw_stems]
Example #7
0
    def __init__(self):
        """Create the word tokenizer and the Hunspell instance for 'hi_IN', then self-test stemming.

        The Hunspell data directory is expected at 'hindi-hunspell/dict-hi_IN' next to this file and
        must contain 'hi_IN.dic' and 'hi_IN.aff'.

        :raises McLanguageException: if the data directory or either required file is missing, if
            Hunspell can't be constructed, or if the stemming self-test raises or returns an
            unexpected result. The underlying exception, when there is one, is attached as the
            explicit cause.
        """
        super().__init__()

        self.__treebank_tokenizer = TreebankWordTokenizer()

        hunspell_dict_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'hindi-hunspell',
            'dict-hi_IN',
        )
        if not os.path.isdir(hunspell_dict_dir):
            raise McLanguageException(
                "Hunspell dictionary directory does not exist at path: %s." %
                hunspell_dict_dir)

        # Both messages below report the containing directory, not the full path of the missing file
        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.dic')):
            raise McLanguageException(
                "Hunspell dictionary file does not exist at path: %s" %
                hunspell_dict_dir)
        if not os.path.isfile(os.path.join(hunspell_dict_dir, 'hi_IN.aff')):
            raise McLanguageException(
                "Hunspell affix file does not exist at path: %s" %
                hunspell_dict_dir)

        try:
            self.__hindi_hunspell = Hunspell(
                lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
        except Exception as ex:
            # Chain the original error so the real reason shows up as the direct cause in tracebacks
            raise McLanguageException(
                "Unable to initialize Hunspell with data directory '%s': %s" %
                (
                    hunspell_dict_dir,
                    str(ex),
                )) from ex

        # Quick self-test to make sure that Hunspell is installed and dictionary is available
        hunspell_exc_message = """
            Hunspell self-test failed; make sure that Hunspell is installed and dictionaries are accessible, e.g.
            you might need to fetch Git submodules by running:

                git submodule update --init --recursive
        """
        try:
            test_stems = self.stem_words(['गुरुओं'])
        except Exception as ex:
            raise McLanguageException(hunspell_exc_message) from ex
        else:
            # The test word is expected to stem to 'गुरु'
            if len(test_stems) == 0 or test_stems[0] != 'गुरु':
                raise McLanguageException(hunspell_exc_message)
Example #8
0
    def __init__(self):
        """Create the sentence tokenizer, the English helper and the Jieba tokenizer, then self-test Jieba.

        :raises McLanguageException: if the dictionary directory, the default dictionary or the user
            dictionary is missing, if loading the dictionaries into Jieba fails, or if the
            tokenization self-test raises or returns an unexpected result. The underlying exception,
            when there is one, is attached as the explicit cause.
        """
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = jieba.Tokenizer()
        self.__jieba.cache_file = self.__CACHE_PATH

        # Verify that all dictionary paths exist before handing them to Jieba
        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException(
                "Jieba dictionary directory was not found: %s" %
                self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s"
                % self.__DICT_PATH)
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in Jieba dictionary directory: %s" %
                self.__DICT_PATH)
        try:
            self.__jieba.set_dictionary(os.path.join(self.__JIEBA_DICT_PATH))
            self.__jieba.load_userdict(os.path.join(
                self.__JIEBA_USERDICT_PATH))
        except Exception as ex:
            # Chain the original error so the real reason shows up as the direct cause in tracebacks
            raise McLanguageException("Unable to initialize Jieba: %s" %
                                      str(ex)) from ex

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as ex:
            raise McLanguageException(jieba_exc_message) from ex
        else:
            # The second token of the test sentence is expected to be '課程'
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)
Example #9
0
    def stem_words(self, words: List[str]) -> List[str]:
        """Stem every word with hausastemmer, falling back to the word itself when stemming fails.

        The input is first passed through decode_object_from_bytes_if_needed(). A word is returned
        unchanged when it is empty / None; it is returned unchanged with a warning logged when the
        stemmer raises for it or returns an empty / None stem.

        :param words: words to stem
        :return: list with one entry per input word, in input order
        :raises McLanguageException: if the list of words is None
        """
        words = decode_object_from_bytes_if_needed(words)
        if words is None:
            raise McLanguageException("Words to stem is None.")

        stems = []

        for word in words:
            if word is None or len(word) == 0:
                log.debug("Word is empty or None.")
                stems.append(word)
                continue

            # Reason why stemming failed, or None if it succeeded
            failure_reason = None
            candidate = None

            try:
                # Stemmer might raise an exception
                candidate = hausastemmer.stem(word)

                # ...or it might return an empty string / None
                if candidate is None or len(candidate) == 0:
                    failure_reason = "Stem is empty."
            except Exception as ex:
                failure_reason = str(ex)

            if failure_reason is None:
                stems.append(candidate)
            else:
                log.warning("Unable to stem word '{}': {}".format(
                    word, failure_reason))
                stems.append(word)

        if len(stems) != len(words):
            log.warning(
                "Stem count is not the same as word count; words: %s; stems: %s"
                % (str(words), str(stems)))

        return stems