def test_tokenizer_template_texts_to_words_jp(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.is_template = True
    self.assertEqual(["こんにちは「良い天気」ですね"],
                     tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
def test_tokenizer_texts_to_words_en(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    self.assertEqual([], tokenizer.texts_to_words(""))
    self.assertEqual(["Hello,", "he", "is", "Mr.A", "(No", "name)"],
                     tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
def test_tokenizer_template_texts_to_words_en(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.is_template = True
    self.assertEqual([], tokenizer.texts_to_words(""))
    self.assertEqual(["Hello, he is Mr.A (No name)"],
                     tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
def test_tokenizer_texts_to_words_en_with_punctuation(self):
    punctuations = ';\'",!()[]:’”;、。!()「」'
    tokenizer = TokenizerJP(punctuation_chars=punctuations)
    self.assertIsNotNone(tokenizer)
    self.assertEqual([], tokenizer.texts_to_words(""))
    self.assertEqual(["Hello", "he", "is", "Mr.A", "No", "name"],
                     tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
def test_reload_normal_jp(self):
    storage_factory = StorageFactory()
    tokenizer = TokenizerJP()

    file_store_config = FileStorageConfiguration()
    file_store_config._normal_storage = FileStoreConfiguration(
        file=os.path.dirname(__file__) + os.sep + "test_files" + os.sep + "normal_jp.txt",
        format="text",
        extension="txt",
        encoding="utf-8",
        delete_on_start=False)
    storage_engine = FileStorageEngine(file_store_config)
    storage_factory._storage_engines[StorageFactory.NORMAL] = storage_engine
    storage_factory._store_to_engine_map[StorageFactory.NORMAL] = storage_engine

    collection = NormalCollection()
    self.assertIsNotNone(collection)

    collection.load(storage_factory)
    self.assertEqual("丸1の回答", collection.normalise_string(tokenizer, "①の回答"))

    collection.reload(storage_factory)
    self.assertEqual("丸1の回答", collection.normalise_string(tokenizer, "①の回答"))
def test_reload_gender_jp(self):
    storage_factory = StorageFactory()
    tokenizer = TokenizerJP()

    file_store_config = FileStorageConfiguration()
    file_store_config._gender_storage = FileStoreConfiguration(
        file=os.path.dirname(__file__) + os.sep + "test_files" + os.sep + "gender_jp.txt",
        format="text",
        extension="txt",
        encoding="utf-8",
        delete_on_start=False)
    storage_engine = FileStorageEngine(file_store_config)
    storage_factory._storage_engines[StorageFactory.GENDER] = storage_engine
    storage_factory._store_to_engine_map[StorageFactory.GENDER] = storage_engine

    collection = GenderCollection()
    self.assertIsNotNone(collection)

    collection.load(storage_factory)
    self.assertEqual(collection.gender("彼"), '彼女')
    self.assertEqual(collection.genderise_string(tokenizer, "彼が来た"), "彼女が来た")

    collection.reload(storage_factory)
    self.assertEqual(collection.gender("彼"), '彼女')
    self.assertEqual(collection.genderise_string(tokenizer, "彼が来た"), "彼女が来た")
def test_reload_person2_jp(self):
    storage_factory = StorageFactory()
    tokenizer = TokenizerJP()

    file_store_config = FileStorageConfiguration()
    file_store_config._person2_storage = FileStoreConfiguration(
        file=os.path.dirname(__file__) + os.sep + "test_files" + os.sep + "person2_jp.txt",
        format="text",
        extension="txt",
        encoding="utf-8",
        delete_on_start=False)
    storage_engine = FileStorageEngine(file_store_config)
    storage_factory._storage_engines[StorageFactory.PERSON2] = storage_engine
    storage_factory._store_to_engine_map[StorageFactory.PERSON2] = storage_engine

    collection = Person2Collection()
    self.assertIsNotNone(collection)

    collection.load(storage_factory)
    self.assertEqual(collection.personalise_string(tokenizer, "私"), "彼か彼女")
    self.assertEqual(collection.personalise_string(tokenizer, "彼か彼女が来た"), "私か私が来た")

    collection.reload(storage_factory)
    self.assertEqual(collection.personalise_string(tokenizer, "私"), "彼か彼女")
    self.assertEqual(collection.personalise_string(tokenizer, "彼か彼女が来た"), "私か私が来た")
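def _make_storage_factory(self, store_attr, store_name, filename):
    # Helper sketch (not part of the original suite) factoring out the
    # storage wiring shared by the three reload tests above. 'store_attr'
    # names the per-store slot on FileStorageConfiguration (e.g.
    # "_normal_storage") and 'store_name' is the StorageFactory key (e.g.
    # StorageFactory.NORMAL); both values are taken directly from the tests
    # above, nothing new is assumed. Usage:
    #     storage_factory = self._make_storage_factory(
    #         "_normal_storage", StorageFactory.NORMAL, "normal_jp.txt")
    storage_factory = StorageFactory()
    file_store_config = FileStorageConfiguration()
    setattr(file_store_config, store_attr, FileStoreConfiguration(
        file=os.path.dirname(__file__) + os.sep + "test_files" + os.sep + filename,
        format="text",
        extension="txt",
        encoding="utf-8",
        delete_on_start=False))
    storage_engine = FileStorageEngine(file_store_config)
    storage_factory._storage_engines[store_name] = storage_engine
    storage_factory._store_to_engine_map[store_name] = storage_engine
    return storage_factory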
def test_collection_duplicate_jp(self):
    collection = NormalCollection()
    self.assertIsNotNone(collection)

    collection.add_to_lookup("①", '丸1')
    collection.add_to_lookup("①", '丸2')

    tokenizer = TokenizerJP()
    # The first mapping added for a key wins; the duplicate is ignored.
    self.assertEqual("丸1の回答", collection.normalise_string(tokenizer, "①の回答"))
def test_tokenizer_words_to_texts_with_quote(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    # '[ -~]' is the space-to-tilde range, i.e. any printable ASCII
    # character: a space is kept only where the preceding word ends with,
    # and the following word starts with, printable ASCII. Japanese words
    # are joined directly.
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')
    self.assertEqual('Hello " very good " World',
                     tokenizer.words_to_texts(["Hello", '"', "very", "good", '"', "World"]))
    self.assertEqual('Hello "very good" World',
                     tokenizer.words_to_texts(["Hello", '"very', 'good"', "World"]))
    self.assertEqual('こんにちは"良い天気"ですね',
                     tokenizer.words_to_texts(["こんにちは", '"', "良い天気", '"', "です", "ね"]))
    self.assertEqual('こんにちは"良い天気"ですね',
                     tokenizer.words_to_texts(["こんにちは", '"良い天気"', "です", "ね"]))
def test_collection_operations_normal_jp(self):
    collection = NormalCollection()
    self.assertIsNotNone(collection)

    collection.add_to_lookup("①", '丸1')

    tokenizer = TokenizerJP()
    self.assertTrue(collection.has_keyVal("①"))
    self.assertEqual('丸1', collection.value("①"))
    self.assertEqual("丸1の回答", collection.normalise_string(tokenizer, "①の回答"))
def test_collection_invalid_normal_jp(self):
    collection = NormalCollection()
    self.assertIsNotNone(collection)

    collection.add_to_lookup("彼岸", 'お彼岸')
    self.assertFalse(collection.has_keyVal("彼氏"))
    self.assertIsNone(collection.value("彼氏"))

    tokenizer = TokenizerJP()
    self.assertIsNone(collection.normalise("彼氏"))
    self.assertEqual("彼氏の回答", collection.normalise_string(tokenizer, "彼氏の回答"))
def test_collection_invalid_gender_jp(self):
    collection = GenderCollection()
    self.assertIsNotNone(collection)

    collection.add_to_lookup("彼", '彼女')
    self.assertFalse(collection.has_keyVal("彼氏"))
    self.assertIsNone(collection.value("彼氏"))

    tokenizer = TokenizerJP()
    self.assertIsNone(collection.gender("彼氏"))
    self.assertEqual(collection.genderise_string(tokenizer, "彼氏が来た"), "彼氏が来た")
def test_tokenizer_words_to_texts_url(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')
    self.assertEqual("http :// 192.168.1.10 / index.html",
                     tokenizer.words_to_texts(["http", "://", "192.168.1.10", "/", "index.html"]))
def test_collection_invalid_person2_jp(self):
    collection = Person2Collection()
    self.assertIsNotNone(collection)

    collection.add_to_lookup("私", "彼か彼女")
    self.assertFalse(collection.has_keyVal("彼"))
    self.assertIsNone(collection.value("彼"))

    tokenizer = TokenizerJP()
    self.assertIsNone(collection.person("彼"))
    self.assertEqual(collection.personalise_string(tokenizer, "彼が来た"), "彼が来た")
def test_collection_invalid_denormal_jp(self):
    collection = DenormalCollection()
    self.assertIsNotNone(collection)

    collection.add_to_lookup("丸1", "①")
    self.assertFalse(collection.has_keyVal("丸"))
    self.assertIsNone(collection.value("丸"))

    tokenizer = TokenizerJP()
    self.assertIsNone(collection.denormalise("丸"))
    self.assertEqual(collection.denormalise_string(tokenizer, "丸の回答"), "丸の回答")
def test_collection_operations_gender_jp(self):
    collection = GenderCollection()
    self.assertIsNotNone(collection)

    collection.add_to_lookup("彼", '彼女')

    tokenizer = TokenizerJP()
    self.assertTrue(collection.has_keyVal("彼"))
    self.assertEqual('彼女', collection.value("彼"))
    self.assertEqual(collection.gender("彼"), '彼女')
    self.assertEqual(collection.genderise_string(tokenizer, "彼が来た"), "彼女が来た")
def test_tokenizer_words_to_texts_mix(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')
    self.assertEqual("こんにちは10日はHappy dayですね",
                     tokenizer.words_to_texts(
                         ["こんにちは", "10", "日", "は", "Happy", "day", "です", "ね"]))
    self.assertEqual("=こんにちは10日はHappy dayですね=",
                     tokenizer.words_to_texts(
                         ["=", "こんにちは", "10", "日", "は", "Happy", "day", "です", "ね", "="]))
    self.assertEqual("pen lightはありますか",
                     tokenizer.words_to_texts(['pen', 'light', 'は', 'あり', 'ます', 'か']))
def test_tokenizer_no_text(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)

    words = tokenizer._texts_to_words_en(None)
    self.assertEqual(0, len(words))
    words = tokenizer._texts_to_words_en('')
    self.assertEqual(0, len(words))

    words = tokenizer._texts_to_words_jp(None)
    self.assertEqual(0, len(words))
    words = tokenizer._texts_to_words_jp('')
    self.assertEqual(0, len(words))

    words = tokenizer._template_texts_to_words_jp(None)
    self.assertEqual(0, len(words))
    words = tokenizer._template_texts_to_words_jp('')
    self.assertEqual(0, len(words))

    texts = tokenizer._words_to_texts(None)
    self.assertEqual("", texts)
    texts = tokenizer._words_to_texts('')
    self.assertEqual("", texts)
def test_tokenizer_template_texts_to_words_mix(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.is_template = True
    self.assertEqual(["こんにちはhappyですか"],
                     tokenizer.texts_to_words("こんにちはhappyですか"))
    self.assertEqual(["こんにちは happy ですか"],
                     tokenizer.texts_to_words("こんにちは happy ですか"))
    self.assertEqual(["こんにちは(happy)ですか"],
                     tokenizer.texts_to_words("こんにちは(happy)ですか"))
def test_tokenizer_texts_to_words_mix_with_punctuation(self):
    punctuations = ';\'",!()[]:’”;、。!()「」'
    tokenizer = TokenizerJP(punctuation_chars=punctuations)
    self.assertIsNotNone(tokenizer)
    self.assertEqual(["こんにちは", "happy", "です", "か"],
                     tokenizer.texts_to_words("こんにちはhappyですか"))
    self.assertEqual(["こんにちは", "happy", "です", "か"],
                     tokenizer.texts_to_words("こんにちは happy ですか"))
    self.assertEqual(["こんにちは", "happy", "です", "か"],
                     tokenizer.texts_to_words("こんにちは(happy)ですか"))
    self.assertEqual(["こんにちは", "happy", "です", "か"],
                     tokenizer.texts_to_words("こんにちは「happy]ですか"))
    self.assertEqual(["こんにちは", "happy", "unhappy", "です", "か"],
                     tokenizer.texts_to_words("こんにちは happy, unhappy ですか"))
def test_collection_operations_person2_jp(self):
    person2_text = """
    "私","彼か彼女"
    "彼","私"
    "彼女","私"
    """
    collection = Person2Collection()
    self.assertIsNotNone(collection)
    collection.load_from_text(person2_text)

    tokenizer = TokenizerJP()
    self.assertEqual(collection.personalise_string(tokenizer, "私"), "彼か彼女")
    self.assertEqual(collection.personalise_string(tokenizer, "彼か彼女が来た"), "私か私が来た")

    pattern = collection.person("私")
    self.assertIsNotNone(pattern)
    self.assertEqual("彼か彼女", pattern)
def test_collection_operations_person_jp(self):
    person_text = """
    "貴方","私"
    "私","貴方"
    "あなた","わたし"
    "わたし","あなた"
    """
    collection = PersonCollection()
    self.assertIsNotNone(collection)
    collection.load_from_text(person_text)

    tokenizer = TokenizerJP()
    self.assertEqual(collection.personalise_string(tokenizer, "私が正しい"), "貴方が正しい")
    self.assertEqual(collection.personalise_string(tokenizer, "あなたは変"), "わたしは変")

    pattern = collection.person("貴方")
    self.assertIsNotNone(pattern)
    self.assertEqual("私", pattern)
def test_tokenizer_words_to_texts_jp(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    self.assertEqual("こんにちは 良い天気ですね",
                     tokenizer.words_to_texts(["こんにちは", "", "良い", "天気", "です", "ね"]))
    self.assertEqual("こんにちは<良い天気>ですね",
                     tokenizer.words_to_texts(["こんにちは", "<", "良い", "天気", ">", "です", "ね"]))
    self.assertEqual("<こんにちは良い天気ですね>",
                     tokenizer.words_to_texts(["<", "こんにちは", "良い", "天気", "です", "ね", ">"]))
def test_tokenizer_convert_url(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    # The public texts_to_words keeps an unbroken ASCII run such as a URL
    # intact; only the JP-only pass below segments it character by character.
    words = tokenizer.texts_to_words("http://192.168.1.10/index.html")
    self.assertEqual(["http://192.168.1.10/index.html"], words)
    self.assertEqual("http://192.168.1.10/index.html", tokenizer.words_to_texts(words))

    words_en = tokenizer._texts_to_words_en("http://192.168.1.10/index.html")
    self.assertEqual(["http://192.168.1.10/index.html"], words_en)
    self.assertEqual("http://192.168.1.10/index.html", tokenizer.words_to_texts(words_en))

    words_jp = tokenizer._texts_to_words_jp("http://192.168.1.10/index.html")
    self.assertEqual(["http", "://", "192", ".", "168", ".", "1", ".", "10",
                      "/", "index", ".", "html"], words_jp)
    self.assertEqual("http :// 192 . 168 . 1 . 10 / index . html",
                     tokenizer.words_to_texts(words_jp))

    words_mix = tokenizer.texts_to_words("URLはhttp://192.168.1.10/index.html")
    self.assertEqual(["URL", "は", "http://192.168.1.10/index.html"], words_mix)
    self.assertEqual("URLはhttp://192.168.1.10/index.html",
                     tokenizer.words_to_texts(words_mix))
def test_tokenizer_words_to_texts_json_tag(self):
    # \uF010 / \uF011 are Private Use Area code points used as sentinels
    # marking the start and end of embedded JSON child data.
    JSON_CHILD_IN = '\uF010'
    JSON_CHILD_OUT = '\uF011'

    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    words1 = [JSON_CHILD_IN, "json", "data", JSON_CHILD_OUT]
    self.assertEqual("\uF010json data\uF011", tokenizer.words_to_texts(words1))

    words2 = [JSON_CHILD_IN, "データ", "設定", JSON_CHILD_OUT]
    self.assertEqual("\uF010データ設定\uF011", tokenizer.words_to_texts(words2))

    words1 = [JSON_CHILD_IN, "json", "設定", JSON_CHILD_OUT]
    self.assertEqual("\uF010json設定\uF011", tokenizer.words_to_texts(words1))

    words2 = [JSON_CHILD_IN, "データ", "json", JSON_CHILD_OUT]
    self.assertEqual("\uF010データjson\uF011", tokenizer.words_to_texts(words2))

    words1 = [JSON_CHILD_IN, "json", "設定", "data", JSON_CHILD_OUT]
    self.assertEqual("\uF010json設定data\uF011", tokenizer.words_to_texts(words1))

    words2 = [JSON_CHILD_IN, "データ", "json", "設定", JSON_CHILD_OUT]
    self.assertEqual("\uF010データjson設定\uF011", tokenizer.words_to_texts(words2))
def test_tokenizer_words_to_texts_with_text_jp_json_en(self):
    JSON_CHILD_IN = '\uF010'
    JSON_CHILD_OUT = '\uF011'

    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    words1 = ["こんにちは", JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, "です", "ね"]
    self.assertEqual("こんにちは\uF010json-data\uF011ですね",
                     tokenizer.words_to_texts(words1))

    words2 = ["こんにちは", '"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"', "です", "ね"]
    self.assertEqual('こんにちは"\uF010json-data\uF011"ですね',
                     tokenizer.words_to_texts(words2))

    words3 = ["こんにちは", JSON_CHILD_IN, "json-data", JSON_CHILD_OUT]
    self.assertEqual('こんにちは\uF010json-data\uF011',
                     tokenizer.words_to_texts(words3))

    words4 = ["こんにちは", '"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"']
    self.assertEqual('こんにちは"\uF010json-data\uF011"',
                     tokenizer.words_to_texts(words4))

    words5 = [JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, "です", "ね"]
    self.assertEqual('\uF010json-data\uF011ですね',
                     tokenizer.words_to_texts(words5))

    words6 = ['"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"', "です", "ね"]
    self.assertEqual('"\uF010json-data\uF011"ですね',
                     tokenizer.words_to_texts(words6))
def test_tokenizer_words_to_texts_with_text_en_json_jp(self):
    JSON_CHILD_IN = '\uF010'
    JSON_CHILD_OUT = '\uF011'

    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    words1 = ["Hello", JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "you"]
    self.assertEqual("Hello\uF010データ\uF011you",
                     tokenizer.words_to_texts(words1))

    words2 = ["Hello", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"', "you"]
    self.assertEqual('Hello "\uF010データ\uF011" you',
                     tokenizer.words_to_texts(words2))

    words3 = ["Hello", JSON_CHILD_IN, "データ", JSON_CHILD_OUT]
    self.assertEqual('Hello\uF010データ\uF011',
                     tokenizer.words_to_texts(words3))

    words4 = ["Hello", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"']
    self.assertEqual('Hello "\uF010データ\uF011"',
                     tokenizer.words_to_texts(words4))

    words5 = [JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "you"]
    self.assertEqual('\uF010データ\uF011you',
                     tokenizer.words_to_texts(words5))

    words6 = ['"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"', "you"]
    self.assertEqual('"\uF010データ\uF011" you',
                     tokenizer.words_to_texts(words6))
def test_tokenizer_words_from_current_pos_mix(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

    with self.assertRaises(Exception):
        tokenizer.words_from_current_pos(None, 0)

    words = ["Yes", "か", "No"]
    self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, 0))
    self.assertEqual("かNo", tokenizer.words_from_current_pos(words, 1))
    self.assertEqual("No", tokenizer.words_from_current_pos(words, 2))
    self.assertEqual("", tokenizer.words_from_current_pos(words, 3))
    self.assertEqual("No", tokenizer.words_from_current_pos(words, -1))
    self.assertEqual("かNo", tokenizer.words_from_current_pos(words, -2))
    self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, -3))
    self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, -4))
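# Note on the expectations above: they match plain Python slice semantics.
# A hypothetical equivalent (a sketch for illustration, not the actual
# implementation) would be:
#
#     def words_from_current_pos(self, words, pos):
#         return self.words_to_texts(words[pos:])
#
# words[3:] yields [] (hence ""), and words[-4:] clamps to the whole list
# (hence "YesかNo"), exactly as asserted.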
def test_tokenizer_texts_to_words_jp(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    self.assertEqual(["こんにちは", "「", "良い", "天気", "」", "です", "ね"],
                     tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
def test_tokenizer_texts_to_words_mix(self):
    tokenizer = TokenizerJP()
    self.assertIsNotNone(tokenizer)
    self.assertEqual(["こんにちは", "happy"],
                     tokenizer.texts_to_words("こんにちはhappy"))
    self.assertEqual(["こんにちは", "happy", "です", "か"],
                     tokenizer.texts_to_words("こんにちはhappyですか"))
    self.assertEqual(["こんにちは", "happy", "です", "か"],
                     tokenizer.texts_to_words("こんにちは happy ですか"))
    self.assertEqual(["こんにちは", "(happy)", "です", "か"],
                     tokenizer.texts_to_words("こんにちは(happy)ですか"))
    self.assertEqual(["Hello", "ハッピー"],
                     tokenizer.texts_to_words("Hello ハッピー"))
    self.assertEqual(["Hello", "ハッピー", "です", "か"],
                     tokenizer.texts_to_words("Helloハッピーですか"))
    self.assertEqual(["Hello", "(", "ハッピー", ")", "です", "か"],
                     tokenizer.texts_to_words("Hello (ハッピー)ですか"))
    self.assertEqual(["Hello", "ハッピー", "you"],
                     tokenizer.texts_to_words("Helloハッピーyou"))
    self.assertEqual(["Hello", "ハッピー", "you"],
                     tokenizer.texts_to_words("Hello ハッピー you"))
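def test_tokenizer_round_trip_mix(self):
    # Extra illustrative check (a sketch, not part of the original suite):
    # tokenising mixed Japanese/English text and reassembling it with the
    # printable-ASCII concatenation rules used above should reproduce the
    # input, because a space is re-inserted only between adjacent ASCII
    # words ("Happy day") while Japanese segments are joined directly.
    # The exact segmentation of the Japanese portion is assumed, not fixed.
    tokenizer = TokenizerJP()
    tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
    tokenizer.set_configuration_after_concatenation_rule('[ -~].*')
    text = "こんにちは10日はHappy dayですね"
    words = tokenizer.texts_to_words(text)
    self.assertEqual(text, tokenizer.words_to_texts(words))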