Example #1

# ### Randomize Data
# As you saw from exploring the data above, the order of the samples is already randomized. It doesn't hurt to randomize it again, but you don't need to for this dataset.
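
# If you do want to reshuffle, a minimal sketch is below (illustrative only: the
# arrays here are stand-ins, not variables defined elsewhere in this notebook).

import numpy as np

features = np.arange(20).reshape(10, 2)          # stand-in for image features
labels = np.arange(10)                           # stand-in for labels
perm = np.random.permutation(len(features))      # one shared permutation
features, labels = features[perm], labels[perm]  # keeps features and labels aligned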

# ## Preprocess all the data and save it
# Running the code cell below will preprocess all the CIFAR-10 data and save it to file. The code below also uses 10% of the training data for validation.

# In[5]:


"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
# Preprocess Training, Validation, and Testing Data
helper.preprocess_and_save_data(cifar10_dataset_folder_path, normalize, one_hot_encode)
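
# A rough sketch of the 10% validation split described above (the real logic lives
# inside helper.preprocess_and_save_data; the arrays here are stand-ins):
import numpy as np

batch_features = np.random.rand(100, 32, 32, 3)            # stand-in CIFAR-10 batch
batch_labels = np.eye(10)[np.random.randint(0, 10, 100)]   # stand-in one-hot labels
n_valid = int(len(batch_features) * 0.1)                    # hold out 10% for validation
train_features = batch_features[:-n_valid]
train_labels = batch_labels[:-n_valid]
valid_features = batch_features[-n_valid:]
valid_labels = batch_labels[-n_valid:]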


# # Check Point
# This is your first checkpoint.  If you ever decide to come back to this notebook or have to restart the notebook, you can start from here.  The preprocessed data has been saved to disk.

# In[6]:


"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import pickle
import problem_unittests as tests
import helper
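
# Load the preprocessed validation data back from disk (the file name matches the
# one used with helper.preprocess_and_save_data in the other examples below):
valid_features, valid_labels = pickle.load(
    open('preprocess_validation.p', mode='rb'))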
Example #2
    return source_vocab_to_int, target_vocab_to_int


"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_text_to_ids(text_to_ids)

# ### Preprocess all the data and save it
# Running the code cell below will preprocess all the data and save it to file.

# In[4]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
helper.preprocess_and_save_data(source_path, target_path, text_to_ids)

# # Check Point
# This is your first checkpoint. If you ever decide to come back to this notebook or have to restart the notebook, you can start from here. The preprocessed data has been saved to disk.

# In[2]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import numpy as np
import helper
import problem_unittests as tests

(source_int_text,
 target_int_text), (source_vocab_to_int,
                    target_vocab_to_int), _ = helper.load_preprocess()
Example #3
def one_hot_encode(x):
    """
    One hot encode a list of sample labels. Return a one-hot encoded vector for each label.
    : x: List of sample labels
    : return: Numpy array of one-hot encoded labels
    """
    # np.eye(10) is the 10x10 identity matrix; indexing it with the label values
    # selects the matching one-hot rows (sklearn's LabelBinarizer, used in
    # Example #9, is an alternative).
    return np.eye(10)[x]


tests.test_one_hot_encode(one_hot_encode)
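
# A quick illustrative call (not part of the original notebook): labels 0, 3 and 9
# map to the corresponding rows of the 10x10 identity matrix.
print(one_hot_encode([0, 3, 9]))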

helper.preprocess_and_save_data(cifar10_dataset_folder_path, normalize,
                                one_hot_encode)

import pickle
import problem_unittests as tests
import helper

valid_features, valid_labels = pickle.load(
    open('preprocess_validation.p', mode='rb'))

import tensorflow as tf


def neural_net_image_input(image_shape):
    # Placeholder for batches of images; naming the tensor follows the usual
    # convention in this project so it can be retrieved from a loaded graph.
    image = tf.placeholder(
        tf.float32, [None, image_shape[0], image_shape[1], image_shape[2]],
        name='x')
    return image
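
# Illustrative usage (an assumption, not from the original snippet): a placeholder
# sized for CIFAR-10 images has shape (None, 32, 32, 3).
x = neural_net_image_input((32, 32, 3))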
Example #4
    # Mapping from integers back to words
    int_to_vocab = dict(enumerate(vocab))

    return vocab_to_int, int_to_vocab

# Create a symbol lookup table that maps punctuation marks such as commas and periods to unique tokens, so that cases like 『我。』 and 『我』 can be told apart and punctuation does not skew the results.
def token_lookup():

    symbols = set(['。', ',', '“', "”", ';', '!', '?', '(', ')', '——', '\n'])

    tokens = ["P", "C", "Q", "T", "S", "E", "M", "I", "O", "D", "R"]

    return dict(zip(symbols, tokens))
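
# A sketch of how the lookup is typically applied during preprocessing (the sample
# string and the spacing around tokens are assumptions; the actual substitution
# happens inside helper.preprocess_and_save_data):
token_dict = token_lookup()
sample = '我很好,谢谢。'
for symbol, token in token_dict.items():
    sample = sample.replace(symbol, ' {} '.format(token))
print(sample)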

# Preprocess the data and save it to disk so it can be loaded directly next time
helper.preprocess_and_save_data(''.join(lines_of_text), token_lookup, create_lookup_tables)

int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

# Number of training epochs
num_epochs = 200

# Batch size
batch_size = 256

# Number of units in each LSTM layer
rnn_size = 512

# Size of the embedding layer
embed_dim = 512
Example #5

"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_tokenize(token_lookup)

# ## Preprocess all the data and save it
# Running the code cell below will preprocess all the data and save it to file.

# In[27]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
# Preprocess Training, Validation, and Testing Data
helper.preprocess_and_save_data(data_dir, token_lookup, create_lookup_tables)

# # Check Point
# This is your first checkpoint. If you ever decide to come back to this notebook or have to restart the notebook, you can start from here. The preprocessed data has been saved to disk.

# In[28]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import helper
import numpy as np
import problem_unittests as tests

int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

# ## Build the Neural Network
                      [target_vocab_to_int['<EOS>']] for line in sentences]

    return source_id_text, target_id_text


PREPROCESS_PATH = '18sep_reverse_preprocess.p'
PARAM_PATH = '18sep_reverse_param.p'
save_path = 'checkpoints/18sep_reverse'
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_text_to_ids(text_to_ids)
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
helper.preprocess_and_save_data(source_path, target_path, text_to_ids,
                                PREPROCESS_PATH)
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import numpy as np
import helper
import problem_unittests as tests

(source_int_text, target_int_text), (
    source_vocab_to_int,
    target_vocab_to_int), _ = helper.load_preprocess(PREPROCESS_PATH)
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
from distutils.version import LooseVersion
import warnings
    tkn_dict[')'] = '<RIGHT_PARENTHESIS>'
    tkn_dict['\n'] = '<NEW_LINE>'
    tkn_dict['--'] = '<DASH>'
    return tkn_dict


### Hyperparameters
batch_size = 512
every_n_batches = 4
learning_rate = 0.05
num_epochs = 40
rnn_size = 512
seq_length = 15

### Build Network
helper.preprocess_and_save_data('./data/simpsons/moes_tavern_lines.txt', \
    token_lookup, create_lookup_tables)
int_text, vocab_to_int, int_to_vocab, token_dict = helper.load_preprocess()

train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)
    input_text, targets, lr = get_inputs()
    input_data_shape = tf.shape(input_text)

    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size)

    probs = tf.nn.softmax(logits, name='probs')
    cost = seq2seq.sequence_loss(logits, targets, \
        tf.ones([input_data_shape[0], input_data_shape[1]]))
    optimizer = tf.train.AdamOptimizer(lr)
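
    # A sketch of the training op that typically follows (it mirrors the
    # gradient-clipping pattern in Example #8 below; not part of this snippet):
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var)
                        for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)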
Example #8
def Train(embed_dim=512,
          num_epochs=20,
          learning_rate=0.01,
          seq_length=10,
          rnn_size=700,
          batch_size=100):
    data_dir = './data/simpsons/moes_tavern_lines.txt'
    text = helper.load_data(data_dir)
    # Ignore notice, since we don't use it for analysing the data
    text = text[81:]
    helper.preprocess_and_save_data(data_dir, token_lookup,
                                    create_lookup_tables)
    int_text, _, int_to_vocab, _ = helper.load_preprocess()
    show_every_n_batches = 50
    train_graph = tf.Graph()
    with train_graph.as_default():
        vocab_size = len(int_to_vocab)
        input_text, targets, lr = get_inputs()
        input_data_shape = tf.shape(input_text)
        cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
        logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size,
                                       embed_dim)

        # Probabilities for generating words
        tf.nn.softmax(logits, name='probs')

        # Loss function
        cost = seq2seq.sequence_loss(
            logits, targets,
            tf.ones([input_data_shape[0], input_data_shape[1]]))

        # Optimizer
        optimizer = tf.train.AdamOptimizer(lr)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var)
                            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)

    batches = get_batches(int_text, batch_size, seq_length)

    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())

        for epoch_i in range(num_epochs):
            state = sess.run(initial_state, {input_text: batches[0][0]})

            for batch_i, (x, y) in enumerate(batches):
                feed = {
                    input_text: x,
                    targets: y,
                    initial_state: state,
                    lr: learning_rate
                }
                train_loss, state, _ = sess.run([cost, final_state, train_op],
                                                feed)

                # Show every <show_every_n_batches> batches
                if (epoch_i * len(batches) +
                        batch_i) % show_every_n_batches == 0:
                    print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.
                          format(epoch_i, batch_i, len(batches), train_loss))

        # Save Model
        saver = tf.train.Saver()
        saver.save(sess, "./save")
        print('Model Trained and Saved')

    # Save parameters for checkpoint
    helper.save_params((seq_length, "./save"))
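
# An illustrative invocation of Train (the values simply repeat the defaults above;
# training will take a while without a GPU):
Train(embed_dim=512, num_epochs=20, learning_rate=0.01,
      seq_length=10, rnn_size=700, batch_size=100)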
Example #9
def createNumDict():
    lines_of_text = helper.load_data()
    helper.preprocess_and_save_data(lines_of_text, helper.token_lookup,
                                    helper.create_lookup_tables)


def normalize(x):
    # Map each pixel into [0, 1] by its scaled distance from 128.
    pixels = np.ndarray((len(x), 32, 32, 3))
    w = 0
    for p in x:
        p = p.flatten()
        p = abs((p - 128.) / 128.)
        p = p.reshape(1, 32, 32, 3)
        pixels[w, :, :, :] = p
        w += 1
    return pixels


tests.test_normalize(normalize)

# %%


from sklearn import preprocessing


def one_hot_encode(x):
    # Binarize labels 0-9 into one-hot rows with scikit-learn's LabelBinarizer.
    classes = list(range(10))
    lb = preprocessing.LabelBinarizer()
    lb.fit(classes)
    return lb.transform(x)


tests.test_one_hot_encode(one_hot_encode)

# %%

# Preprocess Training, Validation, and Testing Data
print("Preprocessing and saving data...")
helper.preprocess_and_save_data("cifar-10-batches-py", normalize,
                                one_hot_encode)
Example #11
    def test_preprocess_and_save(self):
        preprocess_and_save_data(dataset_path="../data/Seinfeld_Scripts.txt",
                                 token_lookup=create_punctuation_map,
                                 create_lookup_tables=create_maps)
"""
DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE
"""
tests.test_one_hot_encode(one_hot_encode)

# ### Randomize Data
# As you saw from exploring the data above, the order of the samples is already randomized. It doesn't hurt to randomize it again, but you don't need to for this dataset.

# ## Preprocess all the data and save it
# Running the code cell below will preprocess all the Fashion-MNIST data and save it to file. The code below also uses 10% of the training data for validation.

# In[6]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
# Preprocess Training, Validation, and Testing Data
helper.preprocess_and_save_data(filename, normalize, one_hot_encode)

# # Check Point
# This is your first checkpoint.  If you ever decide to come back to this notebook or have to restart the notebook, you can start from here.  The preprocessed data has been saved to disk.

# In[7]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
import pickle
import problem_unittests as tests
import helper

# Load the Preprocessed Validation data
valid_features, valid_labels = pickle.load(
    open('preprocess_validation.p', mode='rb'))
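
# A quick sanity check (not in the original snippet): the reloaded feature and
# label arrays should have the same length before the network is built.
assert len(valid_features) == len(valid_labels)
print(valid_features.shape, valid_labels.shape)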