Example #1
    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            tagger_args = [
                f'--dicdir={mecab_dictionary_path}',
                '--rcfile=/dev/null',
                f'--node-format=%m{self.__MECAB_TOKEN_POS_SEPARATOR}%h{self.__EOL_SEPARATOR}',
                f'--eos-format={self.__MECAB_EOS_MARK}{self.__EOL_SEPARATOR}',
            ]
            self.__mecab = MeCab.Tagger(' '.join(tagger_args))
        except Exception as ex:
            raise McLanguageException("Unable to initialize MeCab: %s" % str(ex))

        # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as _:
            raise McLanguageException(mecab_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)
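
For orientation, a tiny illustrative sketch of the output shape this node format produces and how it gets split back apart later (the separator literal and the POS IDs below are invented for illustration; the real separator is the random 16-character __MECAB_TOKEN_POS_SEPARATOR):

# Illustrative sketch only: the real separator is a random 16-character string and the
# POS IDs below are made-up examples, not guaranteed MeCab output.
SEPARATOR = 'SEP'

sample_mecab_output = (
    'python' + SEPARATOR + '38\n'   # %m (token) + separator + %h (POS ID)
    '大好き' + SEPARATOR + '40\n'
    'EOS\n'                         # the configured EOS mark ends the sentence
)

for line in sample_mecab_output.strip().split('\n'):
    if SEPARATOR in line:
        token, pos_id = line.split(SEPARATOR)
        print(token, int(pos_id))
    # Lines without the separator (e.g. the EOS mark) are skipped, as in split_sentence_to_words().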
Example #2
    def split_text_to_sentences(self, text: str) -> List[str]:
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        # Append a double line break after Hindi's danda ("।") so that the tokenizer splits on both "।" and the period
        text = text.replace("।", "।\n\n")

        # No non-breaking prefix file for Hindi, so reuse the English one
        en = EnglishLanguage()
        return en.split_text_to_sentences(text)
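
A minimal usage sketch of the splitter above; the import path and class name are assumptions modeled on the other language modules in this section, not taken from the snippet itself:

# Usage sketch; import path and class name are assumptions based on the other
# language modules shown here.
from mediawords.languages.hi import HindiLanguage

hi = HindiLanguage()

# Two danda-terminated sentences
sentences = hi.split_text_to_sentences("यह पहला वाक्य है। यह दूसरा वाक्य है।")

# Because a double line break is appended after every "।" before the English splitter
# runs, each danda-terminated clause should come back as a separate sentence.
print(sentences)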
Example #3
    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = jieba.Tokenizer()
        self.__jieba.cache_file = self.__CACHE_PATH

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException(
                "Jieba dictionary directory was not found: %s" %
                self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s"
                % self.__DICT_PATH)
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in Jieba dictionary directory: %s" %
                self.__DICT_PATH)
        try:
            # Dictionary paths are already absolute, so pass them to Jieba directly
            self.__jieba.set_dictionary(self.__JIEBA_DICT_PATH)
            self.__jieba.load_userdict(self.__JIEBA_USERDICT_PATH)
        except Exception as ex:
            raise McLanguageException("Unable to initialize Jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)
Example #4
    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McLanguageException("Unable to initialize MeCab: %s" %
                                      str(ex))

        # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as _:
            raise McLanguageException(mecab_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)
Example #5
    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        try:
            # Dictionary paths are already absolute, so pass them to Jieba directly
            self.__jieba.set_dictionary(self.__JIEBA_DICT_PATH)
            self.__jieba.load_userdict(self.__JIEBA_USERDICT_PATH)
        except Exception as ex:
            raise McLanguageException("Unable to initialize Jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)
Example #6
    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                }
            )
        except Exception as ex:
            raise McLanguageException("Unable to initialize MeCab: %s" % str(ex))

        # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as _:
            raise McLanguageException(mecab_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)
Example #7
    def split_text_to_sentences(self, text: str) -> List[str]:
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        # No non-breaking prefix file for Hausa, so reuse the English one
        en = EnglishLanguage()
        return en.split_text_to_sentences(text)
Example #8
 def default_language_code() -> str:
     """Return default language code ('en' for English)."""
     return EnglishLanguage.language_code()
Example #9
 def default_language_code() -> str:
     """Return default language code ('en' for English)."""
     return EnglishLanguage.language_code()
Example #10
class LanguageFactory(object):
    """Language instance factory."""

    # Supported + enabled language codes and their corresponding classes
    __ENABLED_LANGUAGES = {
        CatalanLanguage.language_code(): CatalanLanguage,
        ChineseLanguage.language_code(): ChineseLanguage,
        DanishLanguage.language_code(): DanishLanguage,
        DutchLanguage.language_code(): DutchLanguage,
        EnglishLanguage.language_code(): EnglishLanguage,
        FinnishLanguage.language_code(): FinnishLanguage,
        FrenchLanguage.language_code(): FrenchLanguage,
        GermanLanguage.language_code(): GermanLanguage,
        HausaLanguage.language_code(): HausaLanguage,
        HindiLanguage.language_code(): HindiLanguage,
        HungarianLanguage.language_code(): HungarianLanguage,
        ItalianLanguage.language_code(): ItalianLanguage,
        JapaneseLanguage.language_code(): JapaneseLanguage,
        LithuanianLanguage.language_code(): LithuanianLanguage,
        NorwegianLanguage.language_code(): NorwegianLanguage,
        PortugueseLanguage.language_code(): PortugueseLanguage,
        RomanianLanguage.language_code(): RomanianLanguage,
        RussianLanguage.language_code(): RussianLanguage,
        SpanishLanguage.language_code(): SpanishLanguage,
        SwedishLanguage.language_code(): SwedishLanguage,
        TurkishLanguage.language_code(): TurkishLanguage,
    }

    # Static language object instances ({'language code': language object, ... })
    __language_instances = dict()

    @staticmethod
    def enabled_languages() -> set:
        """Return set of enabled languages (their codes)."""
        return set(LanguageFactory.__ENABLED_LANGUAGES.keys())

    @staticmethod
    def language_is_enabled(language_code: str) -> bool:
        """Return True if language is supported + enabled, False if it's not."""

        language_code = decode_object_from_bytes_if_needed(language_code)

        if language_code is None:
            log.warning("Language code is None.")
            return False

        return language_code in LanguageFactory.__ENABLED_LANGUAGES

    @staticmethod
    def language_for_code(language_code: str) -> Union[AbstractLanguage, None]:
        """Return language module instance for the language code, None if language is not supported."""

        language_code = decode_object_from_bytes_if_needed(language_code)

        if not LanguageFactory.language_is_enabled(language_code):
            return None

        if language_code not in LanguageFactory.__language_instances:
            language_class = LanguageFactory.__ENABLED_LANGUAGES[language_code]
            language = language_class()
            LanguageFactory.__language_instances[language_code] = language

        return LanguageFactory.__language_instances[language_code]

    @staticmethod
    def default_language_code() -> str:
        """Return default language code ('en' for English)."""
        return EnglishLanguage.language_code()

    @staticmethod
    def default_language() -> AbstractLanguage:
        """Return default language module instance (English)."""
        return LanguageFactory.language_for_code(
            LanguageFactory.default_language_code())
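
A short usage sketch of the factory; every call maps directly onto a method defined above:

# Usage sketch; relies only on the factory methods defined above.
print(LanguageFactory.default_language_code())      # 'en'
print(LanguageFactory.language_is_enabled('ja'))    # True
print(LanguageFactory.language_is_enabled('xx'))    # False -- not in __ENABLED_LANGUAGES

language = LanguageFactory.language_for_code('ja')
if language is not None:
    print(language.language_code())                 # 'ja'

# Instances are cached per language code, so repeated lookups return the same object.
assert LanguageFactory.language_for_code('ja') is language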
Example #11
class JapaneseLanguage(StopWordsFromFileMixIn):
    """Japanese language support module."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [

        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    __MECAB_TOKEN_POS_SEPARATOR = random_string(
        length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    __slots__ = [
        # MeCab instance
        '__mecab',

        # Text -> sentence tokenizer for Japanese text
        '__japanese_sentence_tokenizer',

        # English language instance for tokenizing non-Japanese (e.g. English) text
        '__english_language',
    ]

    @staticmethod
    def _mecab_ipadic_neologd_path(
    ) -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""
        mecab_dictionary_path = None
        candidate_paths = JapaneseLanguage.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McLanguageException(
                "mecab-ipadic-neologd was not found in paths: %s" %
                str(candidate_paths))

        return mecab_dictionary_path

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.

        Definitions don't do much in the language module itself, they're used by unit tests to verify that pos-id.def
        didn't change in some unexpected way and we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, surname (family name)
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun, place (general)
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                })
        except Exception as ex:
            raise McLanguageException("Unable to initialize MeCab: %s" %
                                      str(ex))

        # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as _:
            raise McLanguageException(mecab_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)

    @staticmethod
    def language_code() -> str:
        return "ja"

    @staticmethod
    def sample_sentence() -> str:
        return "いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす(ん)。"

    # noinspection PyMethodMayBeStatic
    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # MeCab's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Japanese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__english_language.split_text_to_sentences(
                        list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Japanese sentence into words.

        Removes punctuation and words that don't belong to part-of-speech whitelist."""

        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(
                    self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words
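
A brief usage sketch for the class above, mirroring the constructor's own self-test; the exact tokens returned depend on the installed mecab-ipadic-neologd build, so the commented output is indicative, not guaranteed:

# Usage sketch for JapaneseLanguage; output depends on the installed MeCab dictionary.
ja = JapaneseLanguage()

sentences = ja.split_text_to_sentences("これは最初の文です。これは二番目の文です。")
# Expected (indicative): ['これは最初の文です。', 'これは二番目の文です。']

words = ja.split_sentence_to_words('pythonが大好きです')
# Per the constructor's self-test, '大好き' should appear among the tokens,
# while particles such as 'が' are filtered out by the POS whitelist.
print(sentences)
print(words)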
Example #12
 def setUp(self):
     self.__tokenizer = EnglishLanguage()
Example #13
class TestEnglishLanguage(TestCase):

    def setUp(self):
        self.__tokenizer = EnglishLanguage()

    def test_language_code(self):
        assert self.__tokenizer.language_code() == "en"

    def test_sample_sentence(self):
        assert len(self.__tokenizer.sample_sentence())

    def test_stop_words_map(self):
        stop_words = self.__tokenizer.stop_words_map()
        assert "the" in stop_words
        assert "not_a_stopword" not in stop_words

    def test_stem(self):
        input_words = ["stemming"]
        expected_stems = ["stem"]
        actual_stems = self.__tokenizer.stem_words(input_words)
        assert expected_stems == actual_stems

    def test_stem_apostrophe_normal(self):
        """Stemming with normal apostrophe."""
        input_words = ["Katz's", "Delicatessen"]
        expected_stems = ['katz', 'delicatessen']
        actual_stems = self.__tokenizer.stem_words(input_words)
        assert expected_stems == actual_stems

    def test_stem_apostrophe_right_single_quotation_mark(self):
        """Stemming with right single quotation mark."""
        input_words = ["it’s", "toasted"]
        expected_stems = ['it', 'toast']
        actual_stems = self.__tokenizer.stem_words(input_words)
        assert expected_stems == actual_stems

    def test_split_text_to_sentences_period_in_number(self):
        """Period in number."""
        input_text = "Sentence contain version 2.0 of the text. Foo."
        expected_sentences = [
            'Sentence contain version 2.0 of the text.',
            'Foo.',
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_text_to_sentences_may_ending(self):
        """'May' ending."""
        input_text = "Sentence ends in May. This is the next sentence. Foo."
        expected_sentences = [
            'Sentence ends in May.',
            'This is the next sentence.',
            'Foo.',
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_text_to_sentences_punctuation(self):
        """'May' ending."""
        input_text = "Leave the city! [Mega No!], l."
        expected_sentences = [
            'Leave the city!',
            '[Mega No!], l.',
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_text_to_sentences_unicode(self):
        """Basic Unicode."""
        input_text = "Non Mega Não! [Mega No!], l."
        expected_sentences = [
            'Non Mega Não!',
            '[Mega No!], l.',
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_text_to_sentences_quotation(self):
        """Basic Unicode (with fancy Unicode quotation marks)."""
        input_text = """
            Perhaps that’s the best thing the Nobel Committee did by awarding this year’s literature prize to a
            non-dissident, someone whom Peter Englund of the Swedish Academy said was “more a critic of the system,
            sitting within the system.” They’ve given him a chance to bust out.
        """
        expected_sentences = [
            (
                'Perhaps that’s the best thing the Nobel Committee did by awarding this year’s literature prize to a '
                'non-dissident, someone whom Peter Englund of the Swedish Academy said was “more a critic of the '
                'system, sitting within the system.”'
            ),
            'They’ve given him a chance to bust out.',
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_text_to_sentences_two_spaces(self):
        """Two spaces in the middle of the sentence."""
        input_text = """
            Although several opposition groups have called for boycotting the coming June 12  presidential election, it
            seems the weight of boycotting groups is much less than four years ago.
        """
        expected_sentences = [
            (
                'Although several opposition groups have called for boycotting the coming June 12 presidential '
                'election, it seems the weight of boycotting groups is much less than four years ago.'
            ),
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_text_to_sentences_nbsp(self):
        """Non-breaking space."""
        input_text = """
            American Current TV journalists Laura Ling and Euna Lee have been  sentenced  to 12 years of hard labor
            (according to CNN).\u00a0 Jillian York  rounded up blog posts  for Global Voices prior to the journalists'
            sentencing.
        """
        expected_sentences = [
            (
                'American Current TV journalists Laura Ling and Euna Lee have been sentenced to 12 years of hard labor '
                '(according to CNN).'
            ),
            "Jillian York rounded up blog posts for Global Voices prior to the journalists' sentencing.",
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_text_to_sentences_no_space_after_period(self):
        """No space after a period."""
        input_text = """
            Anger is a waste of energy and what North Korea wants of you.We can and will work together and use our
            minds, to work this through.
        """
        expected_sentences = [
            'Anger is a waste of energy and what North Korea wants of you.',
            'We can and will work together and use our minds, to work this through.',
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_text_to_sentences_unicode_ellipsis(self):
        """Unicode "…"."""
        input_text = """
            One of the most popular Brahmin community, with 28, 726 members, randomly claims: “we r clever &
            hardworking. no one can fool us…” The Brahmans community with 41952 members and the Brahmins of India
            community with 30588 members are also very popular.
        """
        expected_sentences = [
            (
                'One of the most popular Brahmin community, with 28, 726 members, randomly claims: “we r clever & '
                'hardworking. no one can fool us...”'
            ),
            (
                'The Brahmans community with 41952 members and the Brahmins of India community with 30588 members are '
                'also very popular.'
            ),
        ]
        actual_sentences = self.__tokenizer.split_text_to_sentences(input_text)
        assert expected_sentences == actual_sentences

    def test_split_sentence_to_words_normal_apostrophe(self):
        """Normal apostrophe (')."""
        input_sentence = "It's always sunny in Philadelphia."
        expected_words = ["it's", "always", "sunny", "in", "philadelphia"]
        actual_words = self.__tokenizer.split_sentence_to_words(input_sentence)
        assert expected_words == actual_words

    def test_split_sentence_to_words_right_single_quotation_mark(self):
        """Right single quotation mark (’), normalized to apostrophe (')."""
        input_sentence = "It’s always sunny in Philadelphia."
        expected_words = ["it's", "always", "sunny", "in", "philadelphia"]
        actual_words = self.__tokenizer.split_sentence_to_words(input_sentence)
        assert expected_words == actual_words

    def test_split_sentence_to_words_hyphen_without_split(self):
        """Hyphen without split."""
        input_sentence = "near-total secrecy"
        expected_words = ["near-total", "secrecy"]
        actual_words = self.__tokenizer.split_sentence_to_words(input_sentence)
        assert expected_words == actual_words

    def test_split_sentence_to_words_hyphen_without_split_as_dash(self):
        """Hyphen with split (where it's being used as a dash)."""
        input_sentence = "A Pythagorean triple - named for the ancient Greek Pythagoras"
        expected_words = ['a', 'pythagorean', 'triple', 'named', 'for', 'the', 'ancient', 'greek', 'pythagoras']
        actual_words = self.__tokenizer.split_sentence_to_words(input_sentence)
        assert expected_words == actual_words

    def test_split_sentence_to_words_quotes(self):
        """Quotation marks."""
        input_sentence = 'it was in the Guinness Book of World Records as the "most difficult mathematical problem"'
        expected_words = [
            'it', 'was', 'in', 'the', 'guinness', 'book', 'of', 'world', 'records', 'as', 'the', 'most', 'difficult',
            'mathematical', 'problem'
        ]
        actual_words = self.__tokenizer.split_sentence_to_words(input_sentence)
        assert expected_words == actual_words
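
The same TestCase pattern extends naturally to the factory shown earlier; a minimal sketch (the test names, assertions, and import path are mine, not from the original suite):

from unittest import TestCase

# Import path is an assumption; adjust to wherever LanguageFactory lives in this codebase.
from mediawords.languages.factory import LanguageFactory


class TestLanguageFactory(TestCase):
    """Minimal smoke tests for LanguageFactory, in the style of the suite above."""

    def test_default_language_code(self):
        assert LanguageFactory.default_language_code() == 'en'

    def test_language_is_enabled(self):
        assert LanguageFactory.language_is_enabled('en')
        assert not LanguageFactory.language_is_enabled('not_a_language')

    def test_language_for_code_returns_cached_instance(self):
        first = LanguageFactory.language_for_code('en')
        second = LanguageFactory.language_for_code('en')
        assert first is not None
        assert first.language_code() == 'en'
        assert first is second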
Example #14
    def split_text_to_sentences(self, text: str) -> List[str]:
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        # No non-breaking prefix file for Hausa, so reuse the English one
        en = EnglishLanguage()
        return en.split_text_to_sentences(text)
Example #15
class ChineseLanguage(StopWordsFromFileMixIn):
    """Chinese language support module."""

    # Path to jieba dictionary(ies)
    __DICT_PATH = os.path.dirname(os.path.abspath(__file__))
    __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big')
    __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt')

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Jieba instance
        '__jieba',

        # Text -> sentence tokenizer for Chinese text
        '__chinese_sentence_tokenizer',

        # English language instance for tokenizing non-Chinese (e.g. English) text
        '__english_language',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        try:
            # Dictionary paths are already absolute, so pass them to Jieba directly
            self.__jieba.set_dictionary(self.__JIEBA_DICT_PATH)
            self.__jieba.load_userdict(self.__JIEBA_USERDICT_PATH)
        except Exception as ex:
            raise McLanguageException("Unable to initialize Jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)

    @staticmethod
    def language_code() -> str:
        return "zh"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特,"
            "为后者的连续5个参议员任期划上句点。"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # Jieba's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Chinese sentence into words.

        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass

        return words
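
As with the Japanese module, a short usage sketch; the exact segmentation depends on the bundled Jieba dictionaries, so the commented output is indicative only:

# Usage sketch for ChineseLanguage; segmentation depends on the Jieba dictionaries in __DICT_PATH.
zh = ChineseLanguage()

sentences = zh.split_text_to_sentences("这是第一句话。这是第二句话。")
# Expected (indicative): ['这是第一句话。', '这是第二句话。']

words = zh.split_sentence_to_words('python課程')
# Per the constructor's self-test, '課程' should be the second token.
print(sentences)
print(words)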
Example #16
class ChineseLanguage(StopWordsFromFileMixIn):
    """Chinese language support module."""

    # Path to jieba dictionary(ies)
    __DICT_PATH = os.path.dirname(os.path.abspath(__file__))
    __JIEBA_DICT_PATH = os.path.join(__DICT_PATH, 'dict.txt.big')
    __JIEBA_USERDICT_PATH = os.path.join(__DICT_PATH, 'userdict.txt')

    __slots__ = [
        # Stop words map
        '__stop_words_map',

        # Jieba instance
        '__jieba',

        # Text -> sentence tokenizer for Chinese text
        '__chinese_sentence_tokenizer',

        # English language instance for tokenizing non-Chinese (e.g. English) text
        '__english_language',
    ]

    def __init__(self):
        """Constructor."""
        super().__init__()

        # Text -> sentence tokenizer for Chinese text
        self.__chinese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Chinese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        self.__jieba = JiebaTokenizer()

        if not os.path.isdir(self.__DICT_PATH):
            raise McLanguageException("Jieba dictionary directory was not found: %s" % self.__DICT_PATH)

        if not os.path.isfile(self.__JIEBA_DICT_PATH):
            raise McLanguageException(
                "Default dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        if not os.path.isfile(self.__JIEBA_USERDICT_PATH):
            raise McLanguageException(
                "User dictionary not found in Jieba dictionary directory: %s" % self.__DICT_PATH
            )
        try:
            # Dictionary paths are already absolute, so pass them to Jieba directly
            self.__jieba.set_dictionary(self.__JIEBA_DICT_PATH)
            self.__jieba.load_userdict(self.__JIEBA_USERDICT_PATH)
        except Exception as ex:
            raise McLanguageException("Unable to initialize Jieba: %s" % str(ex))

        # Quick self-test to make sure that Jieba, its dictionaries and Python class are installed and working
        jieba_exc_message = "Jieba self-test failed; make sure that the dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('python課程')
        except Exception as _:
            raise McLanguageException(jieba_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '課程':
                raise McLanguageException(jieba_exc_message)

    @staticmethod
    def language_code() -> str:
        return "zh"

    @staticmethod
    def sample_sentence() -> str:
        return (
            "2010年宾夕法尼亚州联邦参议员选举民主党初选于2010年5月18日举行,联邦众议员乔·谢斯塔克战胜在任联邦参议员阿伦·斯佩克特,"
            "为后者的连续5个参议员任期划上句点。"
        )

    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # Jieba's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Chinese text into sentences."""

        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Chinese text
        chinese_sentences = self.__chinese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in chinese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Chinese text
                    non_chinese_sentences = self.__english_language.split_text_to_sentences(list_item)

                    sentences += non_chinese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Chinese sentence into words.

        Removes punctuation."""

        sentence = decode_object_from_bytes_if_needed(sentence)

        if sentence is None:
            log.warning("Sentence to tokenize into words is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__jieba.lcut(sentence, cut_all=False)
        parsed_tokens = [x for x in parsed_text if x.strip()]
        words = []
        for parsed_token in parsed_tokens:
            if re.search(r'\w+', parsed_token, flags=re.UNICODE) is not None:
                words.append(parsed_token)
            else:
                pass

        return words
Example #17
class JapaneseLanguage(StopWordsFromFileMixIn):
    """Japanese language support module."""

    # Paths where mecab-ipadic-neologd might be located
    __MECAB_DICTIONARY_PATHS = [

        # Ubuntu / Debian
        '/var/lib/mecab/dic/ipadic-neologd',

        # CentOS / Fedora
        '/usr/lib64/mecab/dic/ipadic-neologd/',

        # OS X
        '/usr/local/opt/mecab-ipadic-neologd/lib/mecab/dic/ipadic-neologd/',
    ]

    __MECAB_TOKEN_POS_SEPARATOR = random_string(length=16)  # for whatever reason tab doesn't work
    __MECAB_EOS_MARK = 'EOS'

    __slots__ = [
        # MeCab instance
        '__mecab',

        # Text -> sentence tokenizer for Japanese text
        '__japanese_sentence_tokenizer',

        # English language instance for tokenizing non-Japanese (e.g. English) text
        '__english_language',
    ]

    @staticmethod
    def _mecab_ipadic_neologd_path() -> str:  # (protected and not private because used by the unit test)
        """Return path to mecab-ipadic-neologd dictionary installed on system."""
        mecab_dictionary_path = None
        candidate_paths = JapaneseLanguage.__MECAB_DICTIONARY_PATHS

        for candidate_path in candidate_paths:
            if os.path.isdir(candidate_path):
                if os.path.isfile(os.path.join(candidate_path, 'sys.dic')):
                    mecab_dictionary_path = candidate_path
                    break

        if mecab_dictionary_path is None:
            raise McLanguageException(
                "mecab-ipadic-neologd was not found in paths: %s" % str(candidate_paths)
            )

        return mecab_dictionary_path

    @staticmethod
    def _mecab_allowed_pos_ids() -> Dict[int, str]:
        """Return allowed MeCab part-of-speech IDs and their definitions from pos-id.def.

        Definitions don't do much in the language module itself, they're used by unit tests to verify that pos-id.def
        didn't change in some unexpected way and we're not missing out on newly defined POSes.
        """
        return {
            36: '名詞,サ変接続,*,*',  # noun-verbal
            38: '名詞,一般,*,*',  # noun
            40: '名詞,形容動詞語幹,*,*',  # adjectival nouns or quasi-adjectives
            41: '名詞,固有名詞,一般,*',  # proper nouns
            42: '名詞,固有名詞,人名,一般',  # proper noun, names of people
            43: '名詞,固有名詞,人名,姓',  # proper noun, surname (family name)
            44: '名詞,固有名詞,人名,名',  # proper noun, given name
            45: '名詞,固有名詞,組織,*',  # proper noun, organization
            46: '名詞,固有名詞,地域,一般',  # proper noun, place (general)
            47: '名詞,固有名詞,地域,国',  # proper noun, country name
        }

    def __init__(self):
        """Constructor."""
        super().__init__()

        self.__japanese_sentence_tokenizer = RegexpTokenizer(
            r'([^!?。]*[!?。])',
            gaps=True,  # don't discard non-Japanese text
            discard_empty=True,
        )

        self.__english_language = EnglishLanguage()

        mecab_dictionary_path = JapaneseLanguage._mecab_ipadic_neologd_path()

        try:
            self.__mecab = MeCab.Tagger(
                '--dicdir=%(dictionary_path)s '
                '--node-format=%%m%(token_pos_separator)s%%h\\n '
                '--eos-format=%(eos_mark)s\\n' % {
                    'token_pos_separator': self.__MECAB_TOKEN_POS_SEPARATOR,
                    'eos_mark': self.__MECAB_EOS_MARK,
                    'dictionary_path': mecab_dictionary_path,
                }
            )
        except Exception as ex:
            raise McLanguageException("Unable to initialize MeCab: %s" % str(ex))

        # Quick self-test to make sure that MeCab, its dictionaries and Python class are installed and working
        mecab_exc_message = "MeCab self-test failed; make sure that MeCab is built and dictionaries are accessible."
        try:
            test_words = self.split_sentence_to_words('pythonが大好きです')
        except Exception as _:
            raise McLanguageException(mecab_exc_message)
        else:
            if len(test_words) < 2 or test_words[1] != '大好き':
                raise McLanguageException(mecab_exc_message)

    @staticmethod
    def language_code() -> str:
        return "ja"

    @staticmethod
    def sample_sentence() -> str:
        return "いろはにほへと ちりぬるを わかよたれそ つねならむ うゐのおくやま けふこえて あさきゆめみし ゑひもせす(ん)。"

    # noinspection PyMethodMayBeStatic
    def stem_words(self, words: List[str]) -> List[str]:
        words = decode_object_from_bytes_if_needed(words)

        # MeCab's sentence -> word tokenizer already returns "base forms" of every word
        return words

    def split_text_to_sentences(self, text: str) -> List[str]:
        """Tokenize Japanese text into sentences."""
        text = decode_object_from_bytes_if_needed(text)
        if text is None:
            log.warning("Text is None.")
            return []

        text = text.strip()

        if len(text) == 0:
            return []

        # First split Japanese text
        japanese_sentences = self.__japanese_sentence_tokenizer.tokenize(text)
        sentences = []
        for sentence in japanese_sentences:

            # Split paragraphs separated by two line breaks denoting a list
            paragraphs = re.split(r"\n\s*?\n", sentence)
            for paragraph in paragraphs:

                # Split lists separated by "* "
                list_items = re.split(r"\n\s*?(?=\* )", paragraph)
                for list_item in list_items:
                    # Split non-Japanese text
                    non_japanese_sentences = self.__english_language.split_text_to_sentences(list_item)

                    sentences += non_japanese_sentences

        # Trim whitespace
        sentences = [sentence.strip() for sentence in sentences]

        return sentences

    def split_sentence_to_words(self, sentence: str) -> List[str]:
        """Tokenize Japanese sentence into words.

        Removes punctuation and words that don't belong to part-of-speech whitelist."""

        sentence = decode_object_from_bytes_if_needed(sentence)
        if sentence is None:
            log.warning("Sentence is None.")
            return []

        sentence = sentence.strip()

        if len(sentence) == 0:
            return []

        parsed_text = self.__mecab.parse(sentence).strip()
        parsed_tokens = parsed_text.split("\n")

        allowed_pos_ids = self._mecab_allowed_pos_ids()

        words = []
        for parsed_token_line in parsed_tokens:
            if self.__MECAB_TOKEN_POS_SEPARATOR in parsed_token_line:

                primary_form_and_pos_number = parsed_token_line.split(self.__MECAB_TOKEN_POS_SEPARATOR)

                primary_form = primary_form_and_pos_number[0]
                pos_number = primary_form_and_pos_number[1]

                if pos_number.isdigit():
                    pos_number = int(pos_number)

                    if pos_number in allowed_pos_ids:
                        words.append(primary_form)

            else:
                # Ignore all the "EOS" stuff
                pass

        return words