# ### Randomize Data # As you saw from exploring the data above, the order of the samples are randomized. It doesn't hurt to randomize it again, but you don't need to for this dataset. # ## Preprocess all the data and save it # Running the code cell below will preprocess all the CIFAR-10 data and save it to file. The code below also uses 10% of the training data for validation. # In[5]: """ DON'T MODIFY ANYTHING IN THIS CELL """ # Preprocess Training, Validation, and Testing Data helper.preprocess_and_save_data(cifar10_dataset_folder_path, normalize, one_hot_encode) # # Check Point # This is your first checkpoint. If you ever decide to come back to this notebook or have to restart the notebook, you can start from here. The preprocessed data has been saved to disk. # In[6]: """ DON'T MODIFY ANYTHING IN THIS CELL """ import pickle import problem_unittests as tests import helper
return source_vocab_to_int, target_vocab_toint """ DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """ tests.test_text_to_ids(text_to_ids) # ### Preprocess all the data and save it # Running the code cell below will preprocess all the data and save it to file. # In[4]: """ DON'T MODIFY ANYTHING IN THIS CELL """ helper.preprocess_and_save_data(source_path, target_path, text_to_ids) # # Check Point # This is your first checkpoint. If you ever decide to come back to this notebook or have to restart the notebook, you can start from here. The preprocessed data has been saved to disk. # In[2]: """ DON'T MODIFY ANYTHING IN THIS CELL """ import numpy as np import helper import problem_unittests as tests (source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess()
def one_hot_encode(x): """ One hot encode a list of sample labels. Return a one-hot encoded vector for each label. : x: List of sample Labels : return: Numpy array of one-hot encoded labels """ # TODO: Implement Function # HINT: google "np.eye" or use label encoder from sklearn print(len(x)) return np.eye(10)[x] tests.test_one_hot_encode(one_hot_encode) helper.preprocess_and_save_data(cifar10_dataset_folder_path, normalize, one_hot_encode) import pickle import problem_unittests as tests import helper valid_features, valid_labels = pickle.load( open('preprocess_validation.p', mode='rb')) import tensorflow as tf def neural_net_image_input(image_shape): image = tf.placeholder( tf.float32, [None, image_shape[0], image_shape[1], image_shape[2]],
# 数字到文字的映射 int_to_vocab = dict(enumerate(vocab)) return vocab_to_int, int_to_vocab # 创建一个符号查询表,把逗号,句号等符号与一个标志一一对应,用于将『我。』和『我』这样的类似情况区分开来,排除标点符号的影响。 def token_lookup(): symbols = set(['。', ',', '“', "”", ';', '!', '?', '(', ')', '——', '\n']) tokens = ["P", "C", "Q", "T", "S", "E", "M", "I", "O", "D", "R"] return dict(zip(symbols, tokens)) # 预处理一下数据,并保存到磁盘,以便下次直接读取 ================================================================================================================ helper.preprocess_and_save_data(''.join(lines_of_text), token_lookup, create_lookup_tables) int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess() # 训练循环次数 num_epochs = 200 # batch大小 batch_size = 256 # lstm层中包含的unit个数 rnn_size = 512 # embedding layer的大小 embed_dim = 512
""" DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """
# Run the project unit test against the `token_lookup` implementation.
tests.test_tokenize(token_lookup)

# ## Preprocess and save all the data
# Running the code below preprocesses all the data and saves it to file.

# In[27]:

""" DON'T MODIFY ANYTHING IN THIS CELL """
# Preprocess Training, Validation, and Testing Data
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

# # Checkpoint
# This is your first checkpoint. If you want to come back to this notebook, or
# need to reopen it, you can start from here. The preprocessed data has already
# been saved.

# In[28]:

""" DON'T MODIFY ANYTHING IN THIS CELL """
import helper
import numpy as np
import problem_unittests as tests

# Reload what the preprocessing step saved: the integer-encoded text, both
# vocabulary lookup tables and the token dictionary.
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

# ## Build the neural network
[target_vocab_to_int['<EOS>']] for line in sentences] return source_id_text, target_id_text PREPROCESS_PATH = '18sep_reverse_preprocess.p' PARAM_PATH = '18sep_reverse_param.p' save_path = 'checkpoints/18sep_reverse' """ DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """ tests.test_text_to_ids(text_to_ids) """ DON'T MODIFY ANYTHING IN THIS CELL """ helper.preprocess_and_save_data(source_path, target_path, text_to_ids, PREPROCESS_PATH) """ DON'T MODIFY ANYTHING IN THIS CELL """ import numpy as np import helper import problem_unittests as tests (source_int_text, target_int_text), ( source_vocab_to_int, target_vocab_to_int), _ = helper.load_preprocess(PREPROCESS_PATH) """ DON'T MODIFY ANYTHING IN THIS CELL """ from distutils.version import LooseVersion import warnings
tkn_dict[')'] = '<RIGHT_PARENTHESIS>' tkn_dict['\n'] = '<NEW_LINE>' tkn_dict['--'] = '<DASH>' return tkn_dict ### HyperParameters batch_size = 512 every_n_batches = 4 learning_rate = 0.05 num_epochs = 40 rnn_size = 512 seq_length = 15 ### Build Network helper.preprocess_and_save_data('./data/simpsons/moes_tavern_lines.txt', \ token_lookup, create_lookup_tables) int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess() train_graph = tf.Graph() with train_graph.as_default(): vocab_size = len(int_to_vocab) input_text, targets, lr = get_inputs() input_data_shape = tf.shape(input_text) cell, initial_state = get_init_cell(input_data_shape[0], rnn_size) logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size) probs = tf.nn.softmax(logits, name='probs') cost = seq2seq.sequence_loss(logits, targets, \ tf.ones([input_data_shape[0], input_data_shape[1]])) optimizer = tf.train.AdamOptimizer(lr)
def Train(embed_dim=512, num_epochs=20, learning_rate=0.01, seq_length=10,
          rnn_size=700, batch_size=100):
    """Preprocess the script data, build the RNN graph, train it and save it.

    The function preprocesses ``./data/simpsons/moes_tavern_lines.txt`` through
    ``helper``, builds a TensorFlow graph with the sibling builders
    (``get_inputs``, ``get_init_cell``, ``build_nn``), trains it with Adam and
    value-clipped gradients, writes a checkpoint to ``./save`` and finally
    records ``(seq_length, "./save")`` via ``helper.save_params``.

    :param embed_dim: Forwarded to ``build_nn`` as its last argument.
    :param num_epochs: Number of full passes over the batches.
    :param learning_rate: Value fed on every step to the ``lr`` input returned
        by ``get_inputs``.
    :param seq_length: Forwarded to ``get_batches`` and stored with the saved
        parameters.
    :param rnn_size: Forwarded to ``get_init_cell`` and ``build_nn``.
    :param batch_size: Forwarded to ``get_batches``.
    :return: None. Progress is printed to stdout.
    """
    data_dir = './data/simpsons/moes_tavern_lines.txt'
    # Single source of truth for the checkpoint location, so the path that is
    # saved and the path that is recorded in the params cannot drift apart.
    save_dir = "./save"
    show_every_n_batches = 50

    # The helper is given the *path*; the raw text is not needed in this
    # function, so it is not loaded here.
    helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)
    int_text, _, int_to_vocab, _ = helper.load_preprocess()

    train_graph = tf.Graph()
    with train_graph.as_default():
        vocab_size = len(int_to_vocab)
        input_text, targets, lr = get_inputs()
        input_data_shape = tf.shape(input_text)
        cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
        logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size,
                                       embed_dim)

        # Probabilities for generating words. The return value is unused here;
        # the call matters because it adds an op named 'probs' to the graph
        # (presumably looked up by name at generation time - verify against
        # the loading code).
        tf.nn.softmax(logits, name='probs')

        # Loss function: every position is weighted equally (weights of ones
        # shaped like the first two dimensions of the input).
        cost = seq2seq.sequence_loss(
            logits,
            targets,
            tf.ones([input_data_shape[0], input_data_shape[1]]))

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient clipping: clamp each gradient element to [-1, 1] and skip
        # variables for which no gradient was produced.
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var)
                            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

    batches = get_batches(int_text, batch_size, seq_length)
    num_batches = len(batches)

    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())

        for epoch_i in range(num_epochs):
            # Start every epoch from a fresh initial state, evaluated with the
            # first batch's inputs.
            state = sess.run(initial_state, {input_text: batches[0][0]})

            for batch_i, (x, y) in enumerate(batches):
                feed = {
                    input_text: x,
                    targets: y,
                    initial_state: state,
                    lr: learning_rate
                }
                # The final state of this batch seeds the next batch.
                train_loss, state, _ = sess.run(
                    [cost, final_state, train_op], feed)

                # Show every <show_every_n_batches> batches, counted across
                # epochs rather than within one epoch.
                if (epoch_i * num_batches + batch_i) % show_every_n_batches == 0:
                    print('Epoch {:>3} Batch {:>4}/{} train_loss = {:.3f}'.format(
                        epoch_i, batch_i, num_batches, train_loss))

        # Save Model (must happen while the session is still open)
        saver = tf.train.Saver()
        saver.save(sess, save_dir)
        print('Model Trained and Saved')

    # Save parameters for checkpoint
    helper.save_params((seq_length, save_dir))
def createNumDict():
    """Load the text via ``helper`` and run its preprocess-and-save step.

    Whatever ``helper.load_data()`` returns is passed to
    ``helper.preprocess_and_save_data`` together with the helper module's own
    ``token_lookup`` and ``create_lookup_tables`` attributes. Nothing is
    returned.
    """
    corpus = helper.load_data()
    helper.preprocess_and_save_data(
        corpus,
        helper.token_lookup,
        helper.create_lookup_tables,
    )
pixels = np.ndarray((len(x), 32, 32, 3)) for p in x: p = p.flatten() p = abs((p - 128.) / 128.) p = p.reshape(1, 32, 32, 3) pixels[w, :, :, :] = p w += 1 return pixels tests.test_normalize(normalize) # %% def one_hot_encode(x): classes = list(range(10)) lb = preprocessing.LabelBinarizer() lb.fit(classes) return lb.transform(x) tests.test_one_hot_encode(one_hot_encode) # %% # Preprocess Training, Validation, and Testing Data print("Preprocessing and saving data...") helper.preprocess_and_save_data("cifar-10-batches-py", normalize, one_hot_encode)
def test_preprocess_and_save(self):
    """Smoke-test ``preprocess_and_save_data`` on the Seinfeld scripts file.

    There are no assertions: the test passes as long as the call completes
    without raising.
    """
    scripts_path = "../data/Seinfeld_Scripts.txt"
    preprocess_and_save_data(
        dataset_path=scripts_path,
        token_lookup=create_punctuation_map,
        create_lookup_tables=create_maps,
    )
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE """ tests.test_one_hot_encode(one_hot_encode) # ### Randomize Data # As you saw from exploring the data above, the order of the samples are randomized. It doesn't hurt to randomize it again, but you don't need to for this dataset. # ## Preprocess all the data and save it # Running the code cell below will preprocess all the Fashion-MNIST data and save it to file. The code below also uses 10% of the training data for validation. # In[6]: """ DON'T MODIFY ANYTHING IN THIS CELL """ # Preprocess Training, Validation, and Testing Data helper.preprocess_and_save_data(filename, normalize, one_hot_encode) # # Check Point # This is your first checkpoint. If you ever decide to come back to this notebook or have to restart the notebook, you can start from here. The preprocessed data has been saved to disk. # In[7]: """ DON'T MODIFY ANYTHING IN THIS CELL """ import pickle import problem_unittests as tests import helper # Load the Preprocessed Validation data valid_features, valid_labels = pickle.load( open('preprocess_validation.p', mode='rb'))