Example #1
def predict_next_word(word1, word2, word3, model, k):
    """
    Predicts the next word.
    Inputs:
      word1: The first word as a string.
      word2: The second word as a string.
      word3: The third word as a string.
      model: Model returned by the training script.
      k: The k most probable predictions are shown.
    Example usage:
      predict_next_word('john', 'might', 'be', model, 3)
      predict_next_word('life', 'in', 'new', model, 3)"""

    word_embedding_weights = model.word_embedding_weights

    vocab = model.vocab

    id1 = vocab.index(word1)
    id2 = vocab.index(word2)
    id3 = vocab.index(word3)

    input = np.r_[id1, id2, id3]  # stack the three word ids (assumes numpy is imported as np)

    embedding_layer_state, hidden_layer_state, output_layer_state = \
        fprop(input, model.word_embedding_weights, model.embed_to_hid_weights, model.hid_to_output_weights,
              model.hid_bias, model.output_bias)

    # sorted indices and probabilities
    indices = np.argsort(np.ravel(output_layer_state))[::-1]
    prob = np.ravel(output_layer_state)[indices]

    for i in range(k):
        print("{0} {1} {2} {3}\tprob: {4}".format(word1, word2, word3, vocab[indices[i]], prob[i]))
Example #2
def predict_next_word(word1, word2, word3, model, k):
    '''
    Predicts the next word.
    Inputs:
      word1: The first word as a string.
      word2: The second word as a string.
      word3: The third word as a string.
      model: Model returned by the training script.
      k: The k most probable predictions are shown.
    Example usage:
      predict_next_word('john', 'might', 'be', model, 3)
      predict_next_word('life', 'in', 'new', model, 3)
    '''

    word_embedding_weights = model['word_embedding_weights']
    vocab = list(model['vocab'])

    input = np.vstack(np.asarray([-1, -1, -1]))
    words = (word1, word2, word3)
    for i, w in enumerate(words):
        if w in vocab:
            input[i] = vocab.index(w)
        else:
            print("Word '%s' not in vocabulary." % w)
            return

    [embedding_layer_state, hidden_layer_state, output_layer_state] = fprop(input, model['word_embedding_weights'], model['embed_to_hid_weights'], model['hid_to_output_weights'], model['hid_bias'], model['output_bias'])
    # Sort word ids by descending output probability.
    indices = np.argsort(np.ravel(output_layer_state))[::-1]

    for i in range(k):
        print("%s %s %s %s Prob: %.5f" % (word1, word2, word3, vocab[indices[i]],
                                          np.ravel(output_layer_state)[indices[i]]))
Example #3
def ngen_nn2(weights, self):
    # Compute the next generation of the grid: for every active cell, gather its
    # neighbourhood (offsets in rad2), run it through the network, and activate
    # the cells selected by decision2.
    temp = [[0] * self.dimy for i in range(self.dimx)]
    for i in range(self.dimx):
        for j in range(self.dimy):
            if (self[i][j]):
                temp[i][j] = 1
                inputs = [
                    self[(i + k) % self.dimx][(j + l) % self.dimy]
                    for k, l in rad2
                ]
                for m, n in decision2(
                        fp.fprop(weights, inputs, fp.sigmoid3, None)):
                    temp[(i + m) % self.dimx][(j + n) % self.dimy] = 1
    self[:] = temp
Example #4
def predict_next_word(word1, word2, word3, model, k):
    """
    Predicts the next word.
    Inputs:
        word1: The first word as a string.
        word2: The second word as a string.
        word3: The third word as a string.
        model: Model returned by the training script.
        k: The k most probable predictions are shown.
    Example usage:
        predict_next_word('john', 'might', 'be', model, 3)
        predict_next_word('life', 'in', 'new', model, 3)
    """
    word_embedding_weights = model["word_embedding_weights"]
    vocab = model["vocab"]  # (250,)
    vocab = list(vocab)

    try:
        id1 = vocab.index(word1)
    except ValueError:
        print("Word '{}' not in vocabulary.\n".format(word1))
        return

    try:
        id2 = vocab.index(word2)
    except ValueError:
        print("Word '{}' not in vocabulary.\n".format(word2))
        return

    try:
        id3 = vocab.index(word3)
    except ValueError:
        print("Word '{}' not in vocabulary.\n".format(word3))
        return

    input = np.array([id1, id2, id3])  # (3,)
    input = np.expand_dims(input, axis=1)  # (3, 1)
    embedding_layer_state, hidden_layer_state, output_layer_state = \
            fprop(input, model["word_embedding_weights"], model["embed_to_hid_weights"], model["hid_to_output_weights"], model["hid_bias"], model["output_bias"])
    prob = np.sort(
        output_layer_state,
        axis=None)[::-1]  # (250,); output_layer_state.shape is (250, 1).
    indices = np.argsort(-output_layer_state, axis=None)

    for i in range(k):
        print("{} {} {} {}    Prob: {:.5f}".format(word1, word2, word3,
                                                   vocab[int(indices[i])],
                                                   prob[i]))
    print("")
Example #5
def complexity(ind, sampling=500):
    # Sample random 24-cell neighbourhoods (accepted by posneighbourhood), run each
    # through the network 'ind', and report how often every decision pattern occurs.
    reg = {}
    for i in range(sampling):
        inputs = list(np.random.randint(2, size=24))
        while not posneighbourhood(inputs):
            inputs = list(np.random.randint(2, size=24))
        temp = str(decision2(fp.fprop(ind, inputs, fp.sigmoid3, None)))
        if temp in reg:
            reg[temp] += 1
        else:
            reg[temp] = 1
    res = [(k, '%.1f%%' % (100 * reg[k] / sampling))
           for k in sorted(reg, key=reg.get, reverse=True)]
    for i, j in res:
        print(i, j)
Example #6
def train(epochs):
    #% Inputs:
    #%   epochs: Number of epochs to run.
    #% Output:
    #%   model: A struct containing the learned weights and biases and vocabulary.

    start_time = time.time()

    #% SET HYPERPARAMETERS HERE.
    batchsize = 100  #% Mini-batch size.
    learning_rate = 0.1  #% Learning rate; default = 0.1.
    momentum = 0.9  #% Momentum; default = 0.9.
    numhid1 = 50  #% Dimensionality of embedding space; default = 50.
    numhid2 = 200  #% Number of units in hidden layer; default = 200.
    init_wt = 0.01  #% Standard deviation of the normal distribution
    #% which is sampled to get the initial weights; default = 0.01

    #% VARIABLES FOR TRACKING TRAINING PROGRESS.
    show_training_CE_after = 100
    show_validation_CE_after = 1000

    #% LOAD DATA.
    [
        train_input, train_target, valid_input, valid_target, test_input,
        test_target, vocab
    ] = load_data(batchsize)

    [numwords, batchsize, numbatches] = np.shape(train_input)
    vocab_size = len(vocab)

    # % INITIALIZE WEIGHTS AND BIASES.
    word_embedding_weights = init_wt * np.random.standard_normal(
        (vocab_size, numhid1))
    embed_to_hid_weights = init_wt * np.random.standard_normal(
        (numwords * numhid1, numhid2))

    hid_to_output_weights = init_wt * np.random.standard_normal(
        (numhid2, vocab_size))
    hid_bias = np.zeros((numhid2, 1))
    output_bias = np.zeros((vocab_size, 1))

    word_embedding_weights_delta = np.zeros((vocab_size, numhid1))
    word_embedding_weights_gradient = np.zeros((vocab_size, numhid1))
    embed_to_hid_weights_delta = np.zeros((numwords * numhid1, numhid2))
    hid_to_output_weights_delta = np.zeros((numhid2, vocab_size))
    hid_bias_delta = np.zeros((numhid2, 1))
    output_bias_delta = np.zeros((vocab_size, 1))
    expansion_matrix = np.identity(vocab_size)
    count = 0
    tiny = np.exp(-30)

    # % TRAIN.
    for epoch in range(1, epochs + 1):
        print("Epoch %d\n" % (epoch))
        this_chunk_CE = 0
        trainset_CE = 0
        #% LOOP OVER MINI-BATCHES.
        for m in range(0, numbatches):
            input_batch = np.asarray(train_input[:, :, m])
            target_batch = np.asarray(train_target[:, :, m])
            '''% FORWARD PROPAGATE.
            % Compute the state of each layer in the network given the input batch
            % and all weights and biases '''

            [embedding_layer_state, hidden_layer_state,
             output_layer_state] = fprop(input_batch, word_embedding_weights,
                                         embed_to_hid_weights,
                                         hid_to_output_weights, hid_bias,
                                         output_bias)
            '''% COMPUTE DERIVATIVE.
            %% Expand the target to a sparse 1-of-K vector.'''
            expanded_target_batch = np.vstack(
                (expansion_matrix[:, [i for i in target_batch[0]]]))
            # if m>=400:
            #     print(m)
            #     print(expanded_target_batch)
            #     exit()

            #%% Compute derivative of cross-entropy loss function.
            error_deriv = output_layer_state - expanded_target_batch
            # print(output_layer_state.shape)
            # print(tiny)
            # print(np.log(output_layer_state + tiny)-np.log(output_layer_state))
            # print(-np.sum(np.dot(expanded_target_batch, np.log(output_layer_state + tiny).T)) / batchsize)
            # print(-np.sum(np.multiply(expanded_target_batch, np.log(output_layer_state + tiny))) / batchsize)
            # #print()
            # exit()
            #% MEASURE LOSS FUNCTION.
            CE = -np.sum(
                np.multiply(expanded_target_batch,
                            np.log(output_layer_state + tiny))) / batchsize
            print(CE, end="\r")
            count = count + 1
            this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count
            trainset_CE = trainset_CE + (CE - trainset_CE) / (m + 1)
            print("'\rBatch %d Train CE %.3f" % (m + 1, this_chunk_CE),
                  end="\r")
            if np.mod(m + 1, show_training_CE_after) == 0:
                print("\n", end="\r")
                count = 0
                this_chunk_CE = 0

    #     % BACK PROPAGATE.
    #     %% OUTPUT LAYER.
            hid_to_output_weights_gradient = np.dot(hidden_layer_state,
                                                    error_deriv.T)
            output_bias_gradient = np.vstack(error_deriv.sum(axis=1))
            back_propagated_deriv_1 = np.dot(
                hid_to_output_weights,
                error_deriv) * hidden_layer_state * (1 - hidden_layer_state)

            #     %% HIDDEN LAYER.
            #     % FILL IN CODE. Replace the line below by one of the options.
            embed_to_hid_weights_gradient = np.zeros(
                (numhid1 * numwords, numhid2))
            #     % Options:
            #     % (a) embed_to_hid_weights_gradient = back_propagated_deriv_1.T * embedding_layer_state
            #     % (b) embed_to_hid_weights_gradient = np.dot(embedding_layer_state, back_propagated_deriv_1.T)
            #     % (c) embed_to_hid_weights_gradient = back_propagated_deriv_1
            #     % (d) embed_to_hid_weights_gradient = embedding_layer_state

            #     % FILL IN CODE. Replace the line below by one of the options.
            hid_bias_gradient = np.zeros((numhid2, 1))
            #     % Options
            #     % (a) hid_bias_gradient = np.vstack(back_propagated_deriv_1.sum(axis=1))
            #     % (b) hid_bias_gradient = np.sum(back_propagated_deriv_1[0]);
            #     % (c) hid_bias_gradient = back_propagated_deriv_1
            #     % (d) hid_bias_gradient = back_propagated_deriv_1.T

            #     % FILL IN CODE. Replace the line below by one of the options.
            back_propagated_deriv_2 = np.zeros((numhid2, batchsize))
            #     % Options
            #     % (a) back_propagated_deriv_2 = np.dot(embed_to_hid_weights, back_propagated_deriv_1)
            #     % (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights;
            #     % (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights;
            #     % (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights';

            word_embedding_weights_gradient[:] = 0
            #     %% EMBEDDING LAYER.
            for w in range(0, numwords):
                word_embedding_weights_gradient = word_embedding_weights_gradient + np.dot(
                    expansion_matrix[:, input_batch[w, :]],
                    back_propagated_deriv_2[w * numhid1:(w + 1) *
                                            numhid1, :].T)

    #     % UPDATE WEIGHTS AND BIASES.
            word_embedding_weights_delta = momentum * word_embedding_weights_delta + word_embedding_weights_gradient / batchsize
            word_embedding_weights = word_embedding_weights - learning_rate * word_embedding_weights_delta

            embed_to_hid_weights_delta = momentum * embed_to_hid_weights_delta + embed_to_hid_weights_gradient / batchsize
            embed_to_hid_weights = embed_to_hid_weights - learning_rate * embed_to_hid_weights_delta

            hid_to_output_weights_delta = momentum * hid_to_output_weights_delta + hid_to_output_weights_gradient / batchsize
            hid_to_output_weights = hid_to_output_weights - learning_rate * hid_to_output_weights_delta

            hid_bias_delta = momentum * hid_bias_delta + hid_bias_gradient / batchsize
            hid_bias = hid_bias - learning_rate * hid_bias_delta

            output_bias_delta = momentum * output_bias_delta + output_bias_gradient / batchsize
            output_bias = output_bias - learning_rate * output_bias_delta

            #% VALIDATE.
            if np.mod(m + 1, show_validation_CE_after) == 0:
                print("\rRunning validation ...")
                [
                    embedding_layer_state, hidden_layer_state,
                    output_layer_state
                ] = fprop(valid_input, word_embedding_weights,
                          embed_to_hid_weights, hid_to_output_weights,
                          hid_bias, output_bias)
                datasetsize = valid_input.shape[1]
                expanded_valid_target = expansion_matrix[:, np.ravel(valid_target)]
                CE = -np.sum(
                    np.multiply(
                        expanded_valid_target,
                        np.log(output_layer_state + tiny))) / datasetsize
                print(" Validation CE %.3f\n" % (CE))

        print("\rAverage Training CE %.3f\n" % (trainset_CE))

    print("Finished Training.\n")
    print("Final Training CE %.3f\n" % (trainset_CE))

    # % EVALUATE ON VALIDATION SET.
    print("\rRunning validation ...")

    [embedding_layer_state, hidden_layer_state,
     output_layer_state] = fprop(valid_input, word_embedding_weights,
                                 embed_to_hid_weights, hid_to_output_weights,
                                 hid_bias, output_bias)
    datasetsize = valid_input.shape[1]
    expanded_valid_target = expansion_matrix[:, np.ravel(valid_target)]
    CE = -np.sum(
        np.multiply(expanded_valid_target,
                    np.log(output_layer_state + tiny))) / datasetsize
    print("\rFinal Validation CE %.3f\n" % (CE))

    # % EVALUATE ON TEST SET.
    print("\rRunning test ...")
    [embedding_layer_state, hidden_layer_state,
     output_layer_state] = fprop(test_input, word_embedding_weights,
                                 embed_to_hid_weights, hid_to_output_weights,
                                 hid_bias, output_bias)
    datasetsize = test_input.shape[1]
    expanded_test_target = expansion_matrix[:, np.ravel(test_target)]
    CE = -np.sum(
        np.multiply(expanded_test_target,
                    np.log(output_layer_state + tiny))) / datasetsize
    print("\rFinal Test CE %.3f\n" % (CE))

    model = dict()
    model['word_embedding_weights'] = word_embedding_weights
    model['embed_to_hid_weights'] = embed_to_hid_weights
    model['hid_to_output_weights'] = hid_to_output_weights
    model['hid_bias'] = hid_bias
    model['output_bias'] = output_bias
    model['vocab'] = vocab

    end_time = time.time()
    print("Training took %.2f seconds\n" % (end_time - start_time))
    return model
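The loss measured during training above is the average cross-entropy per training case: CE = -sum(target * log(output + tiny)) / batchsize. A small standalone illustration of the same computation; the numbers below are purely illustrative:

import numpy as np

tiny = np.exp(-30)                       # keeps log() away from zero
output = np.array([[0.7, 0.1],           # predicted distribution over a 3-word vocabulary,
                   [0.2, 0.8],           # one column per training case
                   [0.1, 0.1]])
target = np.array([[1, 0],               # 1-of-K targets: case 0 -> word 0, case 1 -> word 1
                   [0, 1],
                   [0, 0]])
batchsize = output.shape[1]
CE = -np.sum(target * np.log(output + tiny)) / batchsize
print(CE)                                # -(log(0.7) + log(0.8)) / 2, roughly 0.29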
Example #7
def train(epochs):
    """
% Inputs:
%   epochs: Number of epochs to run.
% Output:
%   model: A struct containing the learned weights and biases and vocabulary.
	"""
    """
if size(ver('Octave'),1)
  OctaveMode = 1;
  warning('error', 'Octave:broadcast');
  start_time = time;
else
  OctaveMode = 0;
  start_time = clock;
end
	"""

    #% SET HYPERPARAMETERS HERE.
    batchsize = 100
    #% Mini-batch size.
    learning_rate = 0.1
    #% Learning rate; default = 0.1.
    momentum = 0.9
    #% Momentum; default = 0.9.
    numhid1 = 50
    #% Dimensionality of embedding space; default = 50.
    numhid2 = 200
    #% Number of units in hidden layer; default = 200.
    init_wt = 0.01
    #% Standard deviation of the normal distribution
    #% which is sampled to get the initial weights; default = 0.01

    #% VARIABLES FOR TRACKING TRAINING PROGRESS.
    show_training_CE_after = 100
    show_validation_CE_after = 1000

    #% LOAD DATA.
    #[train_input, train_target, valid_input, valid_target, ...
    #  test_input, test_target, vocab] = load_data(batchsize);
    train_input, train_target, valid_input, valid_target, test_input, test_target, vocab = load_data(
        batchsize)

    #[numwords, batchsize, numbatches] = size(train_input);
    numwords, batchsize, numbatches = train_input.shape  # 3, 100, 3725
    #% Size(vector, [dimension requried]) - get size of first dimension (rows)
    #vocab_size = size(vocab, 2);
    vocab_size = vocab.shape[1]  # 250
    #print(numwords, batchsize, numbatches, vocab_size)

    #% INITIALIZE WEIGHTS AND BIASES.
    #% randn(rows, cols) - random matrix with zero mean and variance one
    # randn seems to produce a matrix instead of an array --> convert!!
    word_embedding_weights = init_wt * asarray(randn(vocab_size, numhid1))
    embed_to_hid_weights = init_wt * asarray(randn(numwords * numhid1,
                                                   numhid2))
    hid_to_output_weights = init_wt * asarray(randn(numhid2, vocab_size))
    hid_bias = zeros((numhid2, 1))
    output_bias = zeros((vocab_size, 1))

    word_embedding_weights_delta = zeros((vocab_size, numhid1))
    word_embedding_weights_gradient = zeros((vocab_size, numhid1))
    embed_to_hid_weights_delta = zeros((numwords * numhid1, numhid2))
    hid_to_output_weights_delta = zeros((numhid2, vocab_size))
    hid_bias_delta = zeros((numhid2, 1))
    output_bias_delta = zeros((vocab_size, 1))
    expansion_matrix = eye((vocab_size))
    count = 0
    tiny = exp(-30)

    #% TRAIN.
    #for epoch = 1:epochs
    for epoch in range(epochs):
        #  fprintf(1, 'Epoch %d\n', epoch);
        print('Epoch %d\n' % (epoch + 1))  # don't forget offset later on!
        this_chunk_CE = 0
        trainset_CE = 0
        #% LOOP OVER MINI-BATCHES.
        #for m = 1:numbatches
        for m in range(numbatches):
            #input_batch = train_input(:, :, m);
            input_batch = train_input[:, :, m]
            #target_batch = train_target(:, :, m);
            target_batch = train_target[:, :, m]

            #% FORWARD PROPAGATE.
            #% Compute the state of each layer in the network given the input batch
            #% and all weights and biases
            #[embedding_layer_state, hidden_layer_state, output_layer_state] = ...
            #  fprop(input_batch, ...
            #        word_embedding_weights, embed_to_hid_weights, ...
            #        hid_to_output_weights, hid_bias, output_bias);
            embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
                input_batch, word_embedding_weights, embed_to_hid_weights,
                hid_to_output_weights, hid_bias, output_bias)

            # test for batch 5 word 0
            if m == 5:
                test_words = input_batch[:, 0]
                w1 = word_embedding_weights[test_words[0] - 1, 0:5]
                w2 = word_embedding_weights[test_words[1] - 1, 0:5]
                w3 = word_embedding_weights[test_words[2] - 1, 0:5]
                s1 = embedding_layer_state[0:5, 0]
                s2 = embedding_layer_state[50:55, 0]
                s3 = embedding_layer_state[100:105, 0]

                #print(test_words, '\n', w1, w2, w3, '\n', s1, s2, s3)

            #% COMPUTE DERIVATIVE.
            #%% Expand the target to a sparse 1-of-K vector.
            #expanded_target_batch = expansion_matrix(:, target_batch);
            expanded_target_batch = expansion_matrix[:,
                                                     ravel(target_batch) - 1]
            #if m==5: print(expansion_matrix.shape, target_batch.shape, expanded_target_batch.shape)
            #%% Compute derivative of cross-entropy loss function.
            # dE/dZout
            error_deriv = output_layer_state - expanded_target_batch

            #% MEASURE LOSS FUNCTION.
            #CE = -sum(sum(...
            #  expanded_target_batch .* log(output_layer_state + tiny))) / batchsize;
            CE = -(expanded_target_batch *
                   log(output_layer_state + tiny)).sum() / batchsize
            count = count + 1
            this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count
            trainset_CE = trainset_CE + (CE - trainset_CE) / (m + 1)
            #fprintf(1, '\rBatch %d Train CE %.3f', m, this_chunk_CE);
            print('\rBatch %d Train CE %.3f' % (m + 1, this_chunk_CE))

            #if mod(m, show_training_CE_after) == 0
            if (m + 1) % show_training_CE_after == 0:
                #fprintf(1, '\n');
                print('\n')
                count = 0
                this_chunk_CE = 0
            #end
            #if OctaveMode
            #  fflush(1);
            #end

            #% BACK PROPAGATE.
            #%% OUTPUT LAYER.
            #hid_to_output_weights_gradient =  hidden_layer_state * error_deriv';
            # dE/dWho
            hid_to_output_weights_gradient = dot(hidden_layer_state,
                                                 error_deriv.T)
            #output_bias_gradient = sum(error_deriv, 2);
            # use reshape to force 2D array from sum
            output_bias_gradient = error_deriv.sum(axis=1).reshape(-1, 1)
            #back_propagated_deriv_1 = (hid_to_output_weights * error_deriv) ...
            # dE/dYh
            back_propagated_deriv_1 = dot(
                hid_to_output_weights,
                error_deriv) * hidden_layer_state * (1 - hidden_layer_state)
            #%% HIDDEN LAYER.
            #% FILL IN CODE. Replace the line below by one of the options.
            #embed_to_hid_weights_gradient = zeros(numhid1 * numwords, numhid2);
            #% Options:
            #% (a) embed_to_hid_weights_gradient = back_propagated_deriv_1' * embedding_layer_state;
            #% (b) embed_to_hid_weights_gradient = embedding_layer_state * back_propagated_deriv_1';
            embed_to_hid_weights_gradient = dot(embedding_layer_state,
                                                back_propagated_deriv_1.T)
            #% (c) embed_to_hid_weights_gradient = back_propagated_deriv_1;
            #% (d) embed_to_hid_weights_gradient = embedding_layer_state;

            #% FILL IN CODE. Replace the line below by one of the options.
            #hid_bias_gradient = zeros(numhid2, 1);
            #% Options
            #% (a) hid_bias_gradient = sum(back_propagated_deriv_1, 2);
            # use reshape to force 2D array from sum
            hid_bias_gradient = back_propagated_deriv_1.sum(axis=1).reshape(
                -1, 1)
            #% (b) hid_bias_gradient = sum(back_propagated_deriv_1, 1);
            #% (c) hid_bias_gradient = back_propagated_deriv_1;
            #% (d) hid_bias_gradient = back_propagated_deriv_1';

            #% FILL IN CODE. Replace the line below by one of the options.
            #back_propagated_deriv_2 = zeros(numhid2, batchsize);
            #% Options
            #% (a) back_propagated_deriv_2 = embed_to_hid_weights * back_propagated_deriv_1;
            # dE/dZe
            back_propagated_deriv_2 = dot(embed_to_hid_weights,
                                          back_propagated_deriv_1)
            #% (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights;
            #% (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights;
            #% (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights';

            #word_embedding_weights_gradient(:) = 0;
            word_embedding_weights_gradient.fill(0)
            #%% EMBEDDING LAYER.
            #for w = 1:numwords
            for w in range(numwords):
                #word_embedding_weights_gradient = word_embedding_weights_gradient + ...
                #  expansion_matrix(:, input_batch(w, :)) * ...
                #  (back_propagated_deriv_2(1 + (w - 1) * numhid1 : w * numhid1, :)');
                word_embedding_weights_gradient = (
                    word_embedding_weights_gradient +
                    dot(expansion_matrix[:, ravel(input_batch[w, :]) - 1],
                        (back_propagated_deriv_2[w * numhid1:(w + 1) *
                                                 numhid1, :].T)))
            #end

            #% UPDATE WEIGHTS AND BIASES.
            #word_embedding_weights_delta = ...
            #  momentum .* word_embedding_weights_delta + ...
            #  word_embedding_weights_gradient ./ batchsize;
            word_embedding_weights_delta = (
                momentum * word_embedding_weights_delta +
                word_embedding_weights_gradient / batchsize)
            #word_embedding_weights = word_embedding_weights...
            #  - learning_rate * word_embedding_weights_delta;
            word_embedding_weights = (
                word_embedding_weights -
                learning_rate * word_embedding_weights_delta)

            #embed_to_hid_weights_delta = ...
            #  momentum .* embed_to_hid_weights_delta + ...
            #  embed_to_hid_weights_gradient ./ batchsize;
            embed_to_hid_weights_delta = (
                momentum * embed_to_hid_weights_delta +
                embed_to_hid_weights_gradient / batchsize)
            #embed_to_hid_weights = embed_to_hid_weights...
            #  - learning_rate * embed_to_hid_weights_delta;
            embed_to_hid_weights = (embed_to_hid_weights -
                                    learning_rate * embed_to_hid_weights_delta)

            #hid_to_output_weights_delta = ...
            #  momentum .* hid_to_output_weights_delta + ...
            #  hid_to_output_weights_gradient ./ batchsize;
            hid_to_output_weights_delta = (
                momentum * hid_to_output_weights_delta +
                hid_to_output_weights_gradient / batchsize)
            #hid_to_output_weights = hid_to_output_weights...
            #  - learning_rate * hid_to_output_weights_delta;
            hid_to_output_weights = (
                hid_to_output_weights -
                learning_rate * hid_to_output_weights_delta)

            #hid_bias_delta = momentum .* hid_bias_delta + ...
            #  hid_bias_gradient ./ batchsize;
            #print(hid_bias_delta.shape, hid_bias_gradient.shape)
            hid_bias_delta = (momentum * hid_bias_delta +
                              hid_bias_gradient / batchsize)
            #hid_bias = hid_bias - learning_rate * hid_bias_delta;
            hid_bias = hid_bias - learning_rate * hid_bias_delta

            #output_bias_delta = momentum .* output_bias_delta + ...
            #  output_bias_gradient ./ batchsize;
            output_bias_delta = (momentum * output_bias_delta +
                                 output_bias_gradient / batchsize)
            #output_bias = output_bias - learning_rate * output_bias_delta;
            output_bias = output_bias - learning_rate * output_bias_delta

            #% VALIDATE.
            #if mod(m, show_validation_CE_after) == 0
            if (m + 1) % show_validation_CE_after == 0:
                #fprintf(1, '\rRunning validation ...');
                print('\rRunning validation ...')
                #if OctaveMode
                #fflush(1);
                #end
                #[embedding_layer_state, hidden_layer_state, output_layer_state] = ...
                #  fprop(valid_input, word_embedding_weights, embed_to_hid_weights,...
                #        hid_to_output_weights, hid_bias, output_bias);
                embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
                    valid_input, word_embedding_weights, embed_to_hid_weights,
                    hid_to_output_weights, hid_bias, output_bias)
                #datasetsize = size(valid_input, 2);
                datasetsize = valid_input.shape[1]
                #expanded_valid_target = expansion_matrix(:, valid_target);
                expanded_valid_target = expansion_matrix[:,
                                                         ravel(valid_target) -
                                                         1]
                #CE = -sum(sum(...
                #  expanded_valid_target .* log(output_layer_state + tiny))) /datasetsize;
                CE = -(expanded_valid_target *
                       log(output_layer_state + tiny)).sum() / datasetsize
                #fprintf(1, ' Validation CE %.3f\n', CE);
                print(' Validation CE %.3f\n' % (CE))
#if OctaveMode
#  fflush(1);
#end
#end
#end
#fprintf(1, '\rAverage Training CE %.3f\n', trainset_CE);
        print('\rAverage Training CE %.3f\n' % (trainset_CE))


#end

#fprintf(1, 'Finished Training.\n');
    print('Finished Training.\n')
    #if OctaveMode
    #  fflush(1);
    #end
    #fprintf(1, 'Final Training CE %.3f\n', trainset_CE);
    print('Final Training CE %.3f\n' % (trainset_CE))

    #% EVALUATE ON VALIDATION SET.
    #fprintf(1, '\rRunning validation ...');
    print('\rRunning validation ...')
    #if OctaveMode
    #  fflush(1);
    #end
    #[embedding_layer_state, hidden_layer_state, output_layer_state] = ...
    #  fprop(valid_input, word_embedding_weights, embed_to_hid_weights,...
    #        hid_to_output_weights, hid_bias, output_bias);
    print('Validation input shape: {}'.format(valid_input.shape))
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        valid_input, word_embedding_weights, embed_to_hid_weights,
        hid_to_output_weights, hid_bias, output_bias)
    #datasetsize = size(valid_input, 2);
    datasetsize = valid_input.shape[1]
    #expanded_valid_target = expansion_matrix(:, valid_target);
    expanded_valid_target = expansion_matrix[:, ravel(valid_target) - 1]
    #CE = -sum(sum(...
    #  expanded_valid_target .* log(output_layer_state + tiny))) / datasetsize;
    CE = -(expanded_valid_target *
           log(output_layer_state + tiny)).sum() / datasetsize
    #fprintf(1, '\rFinal Validation CE %.3f\n', CE);
    print('\rFinal Validation CE %.3f\n' % (CE))
    #if OctaveMode
    #  fflush(1);
    #end

    # reset states to avoid running out of memory on raspberry pi!
    embedding_layer_state, hidden_layer_state, output_layer_state = 0, 0, 0
    #% EVALUATE ON TEST SET.
    #fprintf(1, '\rRunning test ...');
    print('\rRunning test ...')
    #if OctaveMode
    #  fflush(1);
    #end
    #[embedding_layer_state, hidden_layer_state, output_layer_state] = ...
    #  fprop(test_input, word_embedding_weights, embed_to_hid_weights,...
    #        hid_to_output_weights, hid_bias, output_bias);
    print('Test input shape: {}'.format(test_input.shape))
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        test_input, word_embedding_weights, embed_to_hid_weights,
        hid_to_output_weights, hid_bias, output_bias)
    #datasetsize = size(test_input, 2);
    datasetsize = test_input.shape[1]
    #expanded_test_target = expansion_matrix(:, test_target);
    expanded_test_target = expansion_matrix[:, ravel(test_target) - 1]
    #CE = -sum(sum(...
    #  expanded_test_target .* log(output_layer_state + tiny))) / datasetsize;
    CE = -(expanded_test_target *
           log(output_layer_state + tiny)).sum() / datasetsize
    #fprintf(1, '\rFinal Test CE %.3f\n', CE);
    print('\rFinal Test CE %.3f\n' % (CE))
    #if OctaveMode
    #  fflush(1);
    #end

    #% Package the learned weights, biases and vocabulary so the model can be
    #% passed to predict_next_word, as in the other train() examples.
    model = {}
    model['word_embedding_weights'] = word_embedding_weights
    model['embed_to_hid_weights'] = embed_to_hid_weights
    model['hid_to_output_weights'] = hid_to_output_weights
    model['hid_bias'] = hid_bias
    model['output_bias'] = output_bias
    model['vocab'] = vocab
    return model
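All of the train() examples build their 1-of-K targets by picking columns out of an identity matrix (expansion_matrix). A tiny standalone illustration of that trick, with a toy vocabulary size and illustrative word ids:

import numpy as np

vocab_size = 5
expansion_matrix = np.eye(vocab_size)

target_batch = np.array([2, 0, 4])                  # word ids for three training cases
expanded_target_batch = expansion_matrix[:, target_batch]

print(expanded_target_batch.shape)                  # (5, 3): one one-hot column per case
print(expanded_target_batch[:, 0])                  # [0. 0. 1. 0. 0.] -> word id 2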
    """
def train(epochs=1):
    """ This function trains a neural network language model.
        Inputs:
          epochs: Number of epochs to run.
        Output:
          model: A struct containing the learned weights and biases and vocabulary.
    """
    start_time = time()

    # SET HYPERPARAMETERS HERE.
    batchsize = 100  # Mini-batch size.
    learning_rate = 0.1  # Learning rate, default = 0.1.
    momentum = 0.9  # Momentum, default = 0.9.
    numhid1 = 50  # Dimensionality of embedding space, default = 50.
    numhid2 = 200  # Number of units in hidden layer, default = 200.
    init_wt = 0.01  # Standard deviation of the normal distribution which is sampled to get the initial weights, default = 0.01

    # VARIABLES FOR TRACKING TRAINING PROGRESS.
    show_training_CE_after = 100
    show_validation_CE_after = 1000

    # LOAD DATA.
    train_input, train_target, valid_input, valid_target, test_input, test_target, vocab = load_data(
        batchsize)
    numwords, batchsize, numbatches = train_input.shape  # 3, 100, 3725
    vocab_size = vocab.shape[0]  # 250   vocab_size = size(vocab, 2);

    # INITIALIZE WEIGHTS AND BIASES.
    word_embedding_weights = init_wt * np.random.randn(vocab_size,
                                                       numhid1)  # (250, 50)
    embed_to_hid_weights = init_wt * np.random.randn(numwords * numhid1,
                                                     numhid2)  # (150, 200)
    hid_to_output_weights = init_wt * np.random.randn(numhid2,
                                                      vocab_size)  # (200, 250)
    hid_bias = np.zeros((numhid2, 1))  # (200, 1)
    output_bias = np.zeros((vocab_size, 1))  # (250, 1)

    word_embedding_weights_delta = np.zeros((vocab_size, numhid1))  # (250, 50)
    word_embedding_weights_gradient = np.zeros(
        (vocab_size, numhid1))  # (250, 50)
    embed_to_hid_weights_delta = np.zeros(
        (numwords * numhid1, numhid2))  # (150, 200)
    hid_to_output_weights_delta = np.zeros((numhid2, vocab_size))  # (200, 250)
    hid_bias_delta = np.zeros((numhid2, 1))  # (200, 1)
    output_bias_delta = np.zeros((vocab_size, 1))  # (250, 1)
    expansion_matrix = np.eye((vocab_size))  # (250, 250)
    count = 0
    tiny = np.exp(-30)

    # TRAIN.
    for epoch in range(1, epochs + 1):
        print('Epoch {}'.format(epoch))
        this_chunk_CE = 0
        trainset_CE = 0
        # LOOP OVER MINI-BATCHES.
        for m in range(1, numbatches + 1):
            input_batch = train_input[:, :, m - 1]  # (3, 100)
            target_batch = train_target[:, :, m - 1]  # (1, 100)
            #print("input_batch:", input_batch.shape)
            #print("target_batch:", target_batch.shape)

            # FORWARD PROPAGATE.
            # Compute the state of each layer in the network given the input batch
            # and all weights and biases
            embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
                input_batch, word_embedding_weights, embed_to_hid_weights,
                hid_to_output_weights, hid_bias, output_bias)

            # COMPUTE DERIVATIVE.
            ## Expand the target to a sparse 1-of-K vector.
            expanded_target_batch = expansion_matrix[:,
                                                     target_batch]  # (250, 1, 100)
            expanded_target_batch = expanded_target_batch.reshape(
                vocab_size, -1)  # (250, 100)
            #print("expanded_target_batch:", expanded_target_batch.shape)
            ## Compute derivative of cross-entropy loss function.
            error_deriv = output_layer_state - expanded_target_batch  # (250, 100)
            #print("error_deriv:", error_deriv.shape)

            # MEASURE LOSS FUNCTION.
            CE = -np.sum(
                np.sum(expanded_target_batch *
                       np.log(output_layer_state + tiny))) / batchsize
            count = count + 1
            this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count
            trainset_CE = trainset_CE + (CE - trainset_CE) / m
            if (np.mod(m, show_training_CE_after) == 0):
                print('Batch {} Train CE {:.3f}'.format(m, this_chunk_CE))
                count = 0
                this_chunk_CE = 0
            sys.stdout.flush()

            # BACK PROPAGATE.
            ## OUTPUT LAYER.
            hid_to_output_weights_gradient = np.dot(
                hidden_layer_state, error_deriv.T)  # (200, 250)
            output_bias_gradient = np.sum(error_deriv, axis=1)  # (250,)
            output_bias_gradient = output_bias_gradient[:,
                                                        np.newaxis]  # (250, 1)
            #output_bias_gradient = output_bias_gradient.reshape(output_bias_gradient.shape[0], 1)                                      # (250, 1)
            back_propagated_deriv_1 = np.dot(
                hid_to_output_weights, error_deriv) * hidden_layer_state * (
                    1.0 - hidden_layer_state)  # (200, 100)
            #print("hid_to_output_weights_gradient:", hid_to_output_weights_gradient.shape)
            #print("output_bias_gradient:", output_bias_gradient.shape)
            #print("back_propagated_deriv_1:", back_propagated_deriv_1.shape)

            ## HIDDEN LAYER.
            # FILL IN CODE. Replace the line below by one of the options.
            #embed_to_hid_weights_gradient = np.zeros((numhid1 * numwords, numhid2))
            # Options:
            # (a) embed_to_hid_weights_gradient = np.dot(back_propagated_deriv_1.T, embedding_layer_state)
            # (b) embed_to_hid_weights_gradient = np.dot(embedding_layer_state, back_propagated_deriv_1.T)
            # (c) embed_to_hid_weights_gradient = back_propagated_deriv_1
            # (d) embed_to_hid_weights_gradient = embedding_layer_state
            #print("embed_to_hid_weights_gradient:", embed_to_hid_weights_gradient.shape)                   # (150, 200)

            # FILL IN CODE. Replace the line below by one of the options.
            #hid_bias_gradient = np.zeros((numhid2, 1))
            # Options
            # (a) hid_bias_gradient = np.sum(back_propagated_deriv_1, 1)
            # (b) hid_bias_gradient = np.sum(back_propagated_deriv_1, 0)
            # (c) hid_bias_gradient = back_propagated_deriv_1
            # (d) hid_bias_gradient = back_propagated_deriv_1.T
            hid_bias_gradient = np.sum(back_propagated_deriv_1, axis=1)  # option (a); shape is (200,)
            hid_bias_gradient = np.expand_dims(hid_bias_gradient,
                                               axis=1)  # (200, 1)
            #hid_bias_gradient = hid_bias_gradient.reshape(hid_bias_gradient.shape[0], 1)                   # (200, 1)
            #print("hid_bias_gradient:", hid_bias_gradient.shape)

            # FILL IN CODE. Replace the line below by one of the options.
            #back_propagated_deriv_2 = np.zeros((numhid2, batchsize))                                       # (200, 100)
            # Options
            # (a) back_propagated_deriv_2 = np.dot(embed_to_hid_weights, back_propagated_deriv_1)
            # (b) back_propagated_deriv_2 = np.dot(back_propagated_deriv_1, embed_to_hid_weights)
            # (c) back_propagated_deriv_2 = np.dot(back_propagated_deriv_1.T, embed_to_hid_weights)
            # (d) back_propagated_deriv_2 = np.dot(back_propagated_deriv_1, embed_to_hid_weights.T)
            #print("back_propagated_deriv_2:", back_propagated_deriv_2.shape)                   # (150,100)

            word_embedding_weights_gradient[:] = 0

            ## EMBEDDING LAYER.
            for w in range(1, numwords + 1):
                #print(expansion_matrix[:, input_batch[w - 1, :]].shape)                        # (250, 100)
                #print(back_propagated_deriv_2[0 + (w - 1) * numhid1 : w * numhid1, :].shape)   # (50, 100)
                word_embedding_weights_gradient = word_embedding_weights_gradient + \
                    np.dot(expansion_matrix[:, input_batch[w - 1, :]], back_propagated_deriv_2[(w - 1) * numhid1 : w * numhid1, :].T)
            #print("word_embedding_weights_gradient:", word_embedding_weights_gradient.shape)   # (250, 50)

            # UPDATE WEIGHTS AND BIASES.
            word_embedding_weights_delta = momentum * word_embedding_weights_delta + word_embedding_weights_gradient / batchsize
            word_embedding_weights = word_embedding_weights - learning_rate * word_embedding_weights_delta

            embed_to_hid_weights_delta = momentum * embed_to_hid_weights_delta + embed_to_hid_weights_gradient / batchsize
            embed_to_hid_weights = embed_to_hid_weights - learning_rate * embed_to_hid_weights_delta

            hid_to_output_weights_delta = momentum * hid_to_output_weights_delta + hid_to_output_weights_gradient / batchsize
            hid_to_output_weights = hid_to_output_weights - learning_rate * hid_to_output_weights_delta

            hid_bias_delta = momentum * hid_bias_delta + hid_bias_gradient / batchsize  # (200, 1)
            hid_bias = hid_bias - learning_rate * hid_bias_delta  # (200, 1)

            output_bias_delta = momentum * output_bias_delta + output_bias_gradient / batchsize  # (250, 1)
            output_bias = output_bias - learning_rate * output_bias_delta  # (250, 1)

            # VALIDATE.
            if (np.mod(m, show_validation_CE_after) == 0):
                print('Running validation ...')
                sys.stdout.flush()

                embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
                    valid_input, word_embedding_weights, embed_to_hid_weights,
                    hid_to_output_weights, hid_bias, output_bias)
                datasetsize = valid_input.shape[1]
                expanded_valid_target = expansion_matrix[:, np.ravel(valid_target)]
                CE = -np.sum(
                    np.sum(expanded_valid_target *
                           np.log(output_layer_state + tiny))) / datasetsize
                print(' Validation CE {:.3f}'.format(CE))
                sys.stdout.flush()

        print('  Average Training CE {:.3f}\n'.format(trainset_CE))

    print('Finished Training.')
    sys.stdout.flush()
    print('Final Training CE {:.3f}'.format(trainset_CE))

    # EVALUATE ON VALIDATION SET.
    print('\nRunning validation ...')
    sys.stdout.flush()
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        valid_input, word_embedding_weights, embed_to_hid_weights,
        hid_to_output_weights, hid_bias, output_bias)
    datasetsize = valid_input.shape[1]
    expanded_valid_target = expansion_matrix[:, np.ravel(valid_target)]
    CE = -np.sum(
        np.sum(expanded_valid_target *
               np.log(output_layer_state + tiny))) / datasetsize
    print('Final Validation CE {:.3f}'.format(CE))
    sys.stdout.flush()

    # EVALUATE ON TEST SET.
    print('\nRunning test ...')
    sys.stdout.flush()
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        test_input, word_embedding_weights, embed_to_hid_weights,
        hid_to_output_weights, hid_bias, output_bias)
    datasetsize = test_input.shape[1]
    expanded_test_target = expansion_matrix[:, np.ravel(test_target)]
    CE = -np.sum(
        np.sum(expanded_test_target *
               np.log(output_layer_state + tiny))) / datasetsize
    print('Final Test CE {:.3f}'.format(CE))
    sys.stdout.flush()

    #model = [word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias, vocab]
    model = {
        "word_embedding_weights": word_embedding_weights,
        "embed_to_hid_weights": embed_to_hid_weights,
        "hid_to_output_weights": hid_to_output_weights,
        "hid_bias": hid_bias,
        "output_bias": output_bias,
        "vocab": vocab
    }

    #model.word_embedding_weights = word_embedding_weights
    #model.embed_to_hid_weights = embed_to_hid_weights
    #model.hid_to_output_weights = hid_to_output_weights
    #model.hid_bias = hid_bias
    #model.output_bias = output_bias
    #model.vocab = vocab

    end_time = time()
    diff = end_time - start_time
    print("\nTraining took {:.3f} seconds\n".format(diff))

    return model
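Every weight matrix and bias in these train() examples is updated with the same momentum rule: delta = momentum * delta + gradient / batchsize, then param = param - learning_rate * delta. A compact restatement as a helper; the name momentum_step is hypothetical and not part of the original code:

def momentum_step(param, delta, gradient, batchsize, momentum=0.9, learning_rate=0.1):
    # Accumulate a momentum-smoothed, per-case-averaged gradient ...
    delta = momentum * delta + gradient / batchsize
    # ... and take a step against it.
    param = param - learning_rate * delta
    return param, delta

# For example (hypothetical usage):
# hid_bias, hid_bias_delta = momentum_step(hid_bias, hid_bias_delta,
#                                          hid_bias_gradient, batchsize)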