data_size = len(data)
train_split_index = int(data_size * 90 / 100)

training_input = data_input[:train_split_index]
training_output = data_output[:train_split_index]
validation_input = data_input[train_split_index:]
validation_output = data_output[train_split_index:]

# Encoding the data ----------------------
input_encoding, input_decoding, input_dict_size = encoding.build_characters_encoding(data_input)
output_encoding, output_decoding, output_dict_size = encoding.build_characters_encoding(data_output)

encoded_training_input = encoding.transform(
    input_encoding, training_input, vector_size=MAX_ENGLISH_INPUT_LENGTH)
encoded_training_output = encoding.transform(
    output_encoding, training_output, vector_size=MAX_KATAKANA_OUTPUT_LENGTH)
encoded_validation_input = encoding.transform(
    input_encoding, validation_input, vector_size=MAX_ENGLISH_INPUT_LENGTH)
encoded_validation_output = encoding.transform(
    output_encoding, validation_output, vector_size=MAX_KATAKANA_OUTPUT_LENGTH)

# Preparing the model data ----------------------
training_encoder_input, training_decoder_input, training_decoder_output = \
    model.create_model_data(encoded_training_input, encoded_training_output, output_dict_size)
validation_encoder_input, validation_decoder_input, validation_decoder_output = \
    model.create_model_data(encoded_validation_input, encoded_validation_output, output_dict_size)

# Building the model ----------------------
# The arguments below are assumed from the variables defined above;
# the original call was truncated after the opening parenthesis.
seq2seq_model = model.create_model(
    input_dict_size, output_dict_size,
    MAX_ENGLISH_INPUT_LENGTH, MAX_KATAKANA_OUTPUT_LENGTH)
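With the data prepared, training typically comes down to compiling and fitting the model. A minimal sketch, assuming model.create_model returns a standard Keras Model that maps [encoder_input, decoder_input] to the one-hot decoder output; the optimizer, loss, batch size, and epoch count are illustrative choices, not taken from the source:

# Illustrative training loop (hyperparameters are assumptions, not from the source)
seq2seq_model.compile(optimizer='adam', loss='categorical_crossentropy')
seq2seq_model.fit(
    x=[training_encoder_input, training_decoder_input],
    y=training_decoder_output,
    validation_data=(
        [validation_encoder_input, validation_decoder_input],
        validation_decoder_output),
    batch_size=64,
    epochs=100)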
import pandas as pd

print('Evaluating the model on random testing dataset...')

# header=None is assumed: the script indexes columns as data[0] and data[1],
# which only works when the CSV is read without a header row.
data = pd.read_csv('./dataset/data.csv', header=None)
data = data.sample(frac=1, random_state=11)

data_input = [s.lower() for s in data[0]]
data_output = [s.lower() for s in data[1]]

data_size = len(data)
test_split = int(data_size * 10 / 100)

# Take the last 10%: with the same random_state as training, the first 90%
# is the training split, so slicing from the front would test on training data.
test_input = data_input[-test_split:]
test_output = data_output[-test_split:]

# Pad/trim to the same vector sizes used during training
encoded_testing_input = encoding.transform(
    input_encoding, test_input, vector_size=MAX_ENGLISH_INPUT_LENGTH)
encoded_testing_output = encoding.transform(
    output_encoding, test_output, vector_size=MAX_KATAKANA_OUTPUT_LENGTH)

test_encoder_input, test_decoder_input, test_decoder_output = \
    model.create_model_data(encoded_testing_input, encoded_testing_output,
                            len(output_decoding) + 1)

testing_model.evaluate(x=[test_encoder_input, test_decoder_input], y=test_decoder_output)

# ===============================================================
print('Evaluating the model on random names...')


def to_katakana(english_text):
    return model.to_katakana(english_text, testing_model, input_encoding, output_decoding)
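For a quick spot check, the wrapper can be called on individual names. The exact katakana depends on the trained weights, so the transliterations in the comments are only what a well-trained model would be expected to produce:

print(to_katakana('james'))   # e.g. ジェームズ
print(to_katakana('banana'))  # e.g. バナナ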
import numpy as np

# .decode('utf-8') was a Python 2 idiom; pandas already yields str on Python 3.
data_output = [s for s in data[1]]

data_size = len(data)
train_split = int(data_size * 90 / 100)  # int() is required: a float slice index raises TypeError

training_input = data_input[:train_split]
training_output = data_output[:train_split]
validation_input = data_input[train_split:]
validation_output = data_output[train_split:]

# Encoding the data ----------------------
input_encoding, input_decoding, input_dict_size = encoding.build_characters_encoding(data_input)
output_encoding, output_decoding, output_dict_size = encoding.build_characters_encoding(data_output)

encoded_training_input = encoding.transform(
    input_encoding, training_input, vector_size=MAX_ENGLISH_INPUT_LENGTH)
encoded_training_output = encoding.transform(
    output_encoding, training_output, vector_size=MAX_KATAKANA_OUTPUT_LENGTH)
encoded_validation_input = encoding.transform(
    input_encoding, validation_input, vector_size=MAX_ENGLISH_INPUT_LENGTH)
encoded_validation_output = encoding.transform(
    output_encoding, validation_output, vector_size=MAX_KATAKANA_OUTPUT_LENGTH)

# Building the model data ----------------------
training_encoder_input = encoded_training_input

# Decoder input: the expected output shifted right one step, with START in front (teacher forcing).
training_decoder_input = np.zeros_like(encoded_training_output)
training_decoder_input[:, 1:] = encoded_training_output[:, :-1]
training_decoder_input[:, 0] = encoding.CHAR_CODE_START

# Decoder target: one-hot encode each character index.
training_decoder_output = np.eye(output_dict_size)[encoded_training_output.astype('int')]

# Same teacher-forcing shift and one-hot target for the validation split.
validation_encoder_input = encoded_validation_input
validation_decoder_input = np.zeros_like(encoded_validation_output)
validation_decoder_input[:, 1:] = encoded_validation_output[:, :-1]
validation_decoder_input[:, 0] = encoding.CHAR_CODE_START
validation_decoder_output = np.eye(output_dict_size)[encoded_validation_output.astype('int')]
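To see what the shift-by-one and the np.eye one-hot trick actually produce, here is a tiny self-contained example; the array values and the dictionary size of 8 are made up for illustration:

import numpy as np

START = 1  # stand-in for encoding.CHAR_CODE_START
encoded = np.array([[5, 3, 7, 0]])        # one sequence of character codes, 0 = padding

decoder_input = np.zeros_like(encoded)
decoder_input[:, 1:] = encoded[:, :-1]    # shift every code one step to the right
decoder_input[:, 0] = START               # prepend the START token
print(decoder_input)                      # [[1 5 3 7]]

one_hot = np.eye(8)[encoded]              # index rows of an 8x8 identity matrix
print(one_hot.shape)                      # (1, 4, 8): one one-hot vector per character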