def test_mecab_tokenizer_no_normalize(self): tokenizer = MecabTokenizer(normalize_text=False, mecab_dic="ipadic") self.assertListEqual( tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", " ", "。"], )
def test_mecab_tokenizer_lower(self): tokenizer = MecabTokenizer(do_lower_case=True, mecab_dic="ipadic") self.assertListEqual( tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), ["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"], )
def test_mecab_tokenizer_unidic(self): try: tokenizer = MecabTokenizer(mecab_dic="unidic") except ModuleNotFoundError: return self.assertListEqual( tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), ["アップル", "ストア", "で", "iPhone", "8", "が", "発売", "さ", "れ", "た", "。"], )
def test_mecab_tokenizer_with_option(self): try: tokenizer = MecabTokenizer( do_lower_case=True, normalize_text=False, mecab_option="-d /usr/local/lib/mecab/dic/jumandic" ) except RuntimeError: # if dict doesn't exist in the system, previous code raises this error. return self.assertListEqual( tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "\u3000", "。"], )