    def test__tokens_map2(self):
        s = japanese_normalize(test_str2)
        tokens = JapaneseTokenizer._tokenize(nlp(s))
        self.assertEqual(
            my_tokenizer._tokens_filter(tokens, my_tokenizer._token_map_callback),
            [
                "一時期", ",", "エルサレム", "の", "近く", "の", "古代", "の",
                "トフェト", "で", "も", ",", "同様", "の", "儀式", "が",
                "行なわ", "れ", "て", "い", "まし", "た", ".",
            ],
        )
    def test_tokenize3(self):
        s = japanese_normalize(test_str3)
        tokens = my_tokenizer.tokenize(s)
        self.assertEqual(
            tokens,
            (
                [
                    "また", ",", "[ORG]", "誌", "に", "掲載", "さ", "れ", "た", ",",
                    "日本", "から", "の", "報道", "は", ",", "日本", "の", "子供",
                    "たち", "が", "テレビ", "で", "「", "ほとんど", "際限", "なく",
                    "流血", "と", "暴力", "を", "見せ", "られ", "て", "いる", "」",
                    "こと", "を", "示し", "て", "い", "ます", ".",
                ],
                [("アトラス", "ORG"), ("日本", "LOC"), ("日本", "LOC")],
            ),
        )
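    # Note (inferred from the fixtures above): my_tokenizer.tokenize() returns a
    # pair of (token strings, entity pairs). In the token strings, some entity
    # types are mapped to placeholders such as "[ORG]", "[DATE]", "[NUM]", while
    # others such as LOC ("日本") are left as surface forms; the second element
    # lists every detected entity as a (surface, label) tuple.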
    def test__tokens_map3(self):
        s = japanese_normalize(test_str3)
        tokens = JapaneseTokenizer._tokenize(nlp(s))
        self.assertEqual(
            JapaneseTokenizer._tokens_filter(tokens, my_tokenizer._token_map_callback),
            [
                "また", ",", "[ORG]", "誌", "に", "掲載", "さ", "れ", "た", ",",
                "日本", "から", "の", "報道", "は", ",", "日本", "の", "子供",
                "たち", "が", "テレビ", "で", "「", "ほとんど", "際限", "なく",
                "流血", "と", "暴力", "を", "見せ", "られ", "て", "いる", "」",
                "こと", "を", "示し", "て", "い", "ます", ".",
            ],
        )
    def test_tokenize(self):
        s = japanese_normalize(test_str)
        tokens = my_tokenizer.tokenize(s)
        self.assertEqual(
            tokens,
            (
                [
                    "その", "後", ",", "国", "が", "立て", "た", "捕食", "動物",
                    "抑制", "計画", "に", "より", ",", "[DATE]", "から", "[DATE]",
                    "の", "間", "に", "さらに", "[NUM]", "匹", "の", "コヨテ", "が",
                    "殺さ", "れ", "まし", "た", ".",
                ],
                [("1955年", "DATE"), ("1964年", "DATE"), ("2万7,646", "NUM")],
            ),
        )
    def test__tokenize(self):
        s = japanese_normalize(test_str)
        tokens = JapaneseTokenizer._tokenize(nlp(s))
        self.assertEqual(
            tokens,
            [
                ("その", False, False, False, ""),
                ("後", False, False, False, ""),
                (",", False, False, False, ""),
                ("国", False, False, False, ""),
                ("が", False, False, False, ""),
                ("立て", False, False, False, ""),
                ("た", False, False, False, ""),
                ("捕食", False, False, False, ""),
                ("動物", False, False, False, ""),
                ("抑制", False, False, False, ""),
                ("計画", False, False, False, ""),
                ("に", False, False, False, ""),
                ("より", False, False, False, ""),
                (",", False, False, False, ""),
                ("1955", True, False, False, "DATE"),
                ("年", False, False, False, "DATE"),
                ("から", False, False, False, ""),
                ("1964", True, False, False, "DATE"),
                ("年", False, False, False, "DATE"),
                ("の", False, False, False, ""),
                ("間", False, False, False, ""),
                ("に", False, False, False, ""),
                ("さらに", False, False, False, ""),
                ("2万7,646", True, False, False, ""),  # check that the kanji numeral is handled as a number
                ("匹", False, False, False, ""),
                ("の", False, False, False, ""),
                ("コヨテ", False, False, False, ""),
                ("が", False, False, False, ""),
                ("殺さ", False, False, False, ""),
                ("れ", False, False, False, ""),
                ("まし", False, False, False, ""),
                ("た", False, False, False, ""),
                (".", False, False, False, ""),
            ],
        )
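    # Note (inferred from the fixture above, not from JapaneseTokenizer itself):
    # each element returned by _tokenize appears to be a 5-tuple roughly of the form
    #     (surface, like_num, <bool flag>, <bool flag>, ent_type)
    # where the second field is True for numeric tokens ("1955", "2万7,646") and
    # the last field carries the entity label ("DATE", or "" for non-entities).
    # The two middle booleans are always False in this sentence, so their exact
    # meaning cannot be determined from this test alone.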
    def test__tokens_map(self):
        s = japanese_normalize(test_str)
        tokens = JapaneseTokenizer._tokenize(nlp(s))
        self.assertEqual(
            my_tokenizer._tokens_filter(tokens, my_tokenizer._token_map_callback),
            [
                "その", "後", ",", "国", "が", "立て", "た", "捕食", "動物",
                "抑制", "計画", "に", "より", ",", "[DATE]", "から", "[DATE]",
                "の", "間", "に", "さらに", "[NUM]", "匹", "の", "コヨテ", "が",
                "殺さ", "れ", "まし", "た", ".",
            ],
        )
    def test__tokens_filter(self):
        s = japanese_normalize(test_str)
        tokens = JapaneseTokenizer._tokenize(nlp(s))
        self.assertEqual(
            JapaneseTokenizer._tokens_filter(tokens, lambda token: token),
            [
                ("その", False, False, False, ""),
                ("後", False, False, False, ""),
                (",", False, False, False, ""),
                ("国", False, False, False, ""),
                ("が", False, False, False, ""),
                ("立て", False, False, False, ""),
                ("た", False, False, False, ""),
                ("捕食", False, False, False, ""),
                ("動物", False, False, False, ""),
                ("抑制", False, False, False, ""),
                ("計画", False, False, False, ""),
                ("に", False, False, False, ""),
                ("より", False, False, False, ""),
                (",", False, False, False, ""),
                ("1955", True, False, False, "DATE"),  # fewer tokens than _tokenize: the following "年" is gone
                ("から", False, False, False, ""),
                ("1964", True, False, False, "DATE"),  # fewer tokens than _tokenize: the following "年" is gone
                ("の", False, False, False, ""),
                ("間", False, False, False, ""),
                ("に", False, False, False, ""),
                ("さらに", False, False, False, ""),
                ("2万7,646", True, False, False, ""),
                ("匹", False, False, False, ""),
                ("の", False, False, False, ""),
                ("コヨテ", False, False, False, ""),
                ("が", False, False, False, ""),
                ("殺さ", False, False, False, ""),
                ("れ", False, False, False, ""),
                ("まし", False, False, False, ""),
                ("た", False, False, False, ""),
                (".", False, False, False, ""),
            ],
        )
    def test__entity2(self):
        s = japanese_normalize(test_str2)
        entities = JapaneseTokenizer._entity(nlp(s))
        self.assertEqual(entities, [("エルサレム", "LOC")])
    def test__entity(self):
        s = japanese_normalize(test_str)
        entities = JapaneseTokenizer._entity(nlp(s))
        self.assertEqual(entities, [("1955年", "DATE"), ("1964年", "DATE")])
    def test_japanese_normalize2(self):
        s = japanese_normalize(test_str2)
        self.assertEqual(
            s,
            "その後,国が立てた捕食動物抑制計画により,1955年から1964年の間にさらに2万7,646匹のコヨテが殺されました。",
        )
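    # The tests above rely on module-level fixtures defined elsewhere in this file:
    # a shared spaCy pipeline `nlp`, the tokenizer under test, and the raw
    # sentences test_str / test_str2 / test_str3. The commented sketch below only
    # illustrates the assumed shape of that setup; the model name and the exact
    # fixture sentences are assumptions, not the original definitions.
    #
    #   import spacy
    #   from JapaneseTokenizer import JapaneseTokenizer
    #   from normlizer import japanese_normalize
    #
    #   nlp = spacy.load("ja_core_news_md")  # assumed: any Japanese pipeline with NER
    #   my_tokenizer = JapaneseTokenizer()
    #   test_str = "..."   # predator-control / coyote sentence (see test_tokenize)
    #   test_str2 = "..."  # Topheth near Jerusalem sentence (see test__tokens_map2)
    #   test_str3 = "..."  # アトラス magazine report sentence (see test_tokenize3)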
from tqdm import tqdm

from JapaneseTokenizer import JapaneseTokenizer
from normlizer import japanese_normalize

my_tokenizer = JapaneseTokenizer()

if __name__ == "__main__":
    # Count the lines first so tqdm can show a proper progress bar.
    with open("./raw/ja.txt", "r") as f:
        num_lines = sum(1 for _ in f)

    # Write the normalized corpus line by line.
    with open("./raw/ja.txt", "r") as f, open("./mod/ja.txt", "w") as w:
        for line in tqdm(f, total=num_lines):
            w.write(japanese_normalize(line) + "\n")
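# my_tokenizer is instantiated above but never used in this script. If the
# normalized corpus should also be tokenized, a follow-up pass could look like
# the commented sketch below; the output path "./mod/ja.tok.txt" and the
# space-joined output format are assumptions for illustration only.
#
#   with open("./mod/ja.txt", "r") as f, open("./mod/ja.tok.txt", "w") as w:
#       for line in tqdm(f):
#           tokens, entities = my_tokenizer.tokenize(line.strip())
#           w.write(" ".join(tokens) + "\n")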