Esempio n. 1
0
 def ff(self, x, phase):
     """Feed-forward activation: elementwise tanh of `x`.

     `phase` is accepted for interface compatibility but is unused here.
     """
     return ele.tanh(x)
Esempio n. 2
0
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):
    """Train an LSTM language model by full-sentence BPTT with plain SGD.

    For each sentence, the forward pass predicts word t from word t-1 and
    accumulates the softmax cross-entropy error; the backward pass
    propagates the error through the gates and applies updates scaled by
    `learning_rate / Tau` (Tau = sentence length).

    Args:
        model: holds the gate weights (``{ig,fg,og,ff}_weight_{data,prev,
            cell,bias}``), ``decoder_weights``/``decoder_bias`` and the
            embedding table ``emb_weight``; ``Layers`` gives layer sizes.
        sents: sentences, each a sequence of word ids.
        words: total word count, used to normalize the epoch entropy.
        learning_rate: SGD step size, divided by sentence length per update.
        EPOCH: number of passes over ``sents``.
        tanhC_version: if truthy, Hout = og * tanh(C) (standard LSTM);
            otherwise Hout = og * C.

    Returns:
        (model, learning_rate): the updated model and the unchanged rate.
    """
    # Constants
    N = model.Layers[1]  # number of hidden units
    K = model.Layers[2]  # vocabulary size

    last_time = time.time()
    # For each epoch
    for epoch_id in range(1, EPOCH + 1):
        epoch_ll = 0  # epoch log-likelihood (base 2)
        # For each sentence
        for sent_id, sent in enumerate(sents):
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # sentence log likelihood

            data = [None] * Tau  # input embedding per step

            Hout = [None] * Tau  # hidden outputs
            Hout[0] = owl.zeros([N, 1])

            act_ig = [None] * Tau  # input-gate activations
            act_fg = [None] * Tau  # forget-gate activations
            act_og = [None] * Tau  # output-gate activations
            act_ff = [None] * Tau  # candidate cell-input activations

            C = [None] * Tau  # cell states
            C[0] = owl.zeros([N, 1])
            dY = [None] * Tau  # softmax error per step

            dBd = owl.zeros([model.Layers[2], 1])  # decoder bias gradient
            dWd = owl.zeros([model.Layers[2], model.Layers[1]])  # decoder weight gradient
            dHout = [None] * Tau  # gradient w.r.t. Hout[t]
            dEmb = [None] * Tau  # gradient w.r.t. the input embedding at t

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                # predict the t'th word from the (t-1)'th word
                data[t] = model.emb_weight[sent[t - 1]]
                NVector = np.zeros((K, 1))
                NVector[sent[t]] = 1  # one-hot target
                target = owl.from_numpy(NVector).trans()

                act_ig[t] = (model.ig_weight_data * data[t]
                             + model.ig_weight_prev * Hout[t - 1]
                             + model.ig_weight_cell * C[t - 1]
                             + model.ig_weight_bias)
                act_ig[t] = ele.sigm(act_ig[t])

                act_fg[t] = (model.fg_weight_data * data[t]
                             + model.fg_weight_prev * Hout[t - 1]
                             + model.fg_weight_cell * C[t - 1]
                             + model.fg_weight_bias)
                act_fg[t] = ele.sigm(act_fg[t])

                act_ff[t] = (model.ff_weight_data * data[t]
                             + model.ff_weight_prev * Hout[t - 1]
                             + model.ff_weight_bias)
                act_ff[t] = ele.tanh(act_ff[t])

                # c_t = i_t * g_t + f_t * c_{t-1}
                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(
                    act_fg[t], C[t - 1])

                act_og[t] = (model.og_weight_data * data[t]
                             + model.og_weight_prev * Hout[t - 1]
                             + model.og_weight_cell * C[t]
                             + model.og_weight_bias)
                act_og[t] = ele.sigm(act_og[t])

                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])

                Y = softmax(model.decoder_weights * Hout[t] +
                            model.decoder_bias)

                # BP to Hout (softmax + cross-entropy gradient is Y - target)
                dY[t] = Y - target
                dBd += dY[t]
                dWd += dY[t] * Hout[t].trans()
                dHout[t] = model.decoder_weights.trans() * dY[t]

                # evaluation: log2-likelihood of the target word (floored to
                # avoid log(0))
                output = Y.to_numpy()
                sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)

            ##### Initialize gradient accumulators #####
            weight_update_ig_data = owl.zeros(
                [model.Layers[1], model.Layers[0]])
            weight_update_ig_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_ig_cell = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])

            weight_update_fg_data = owl.zeros(
                [model.Layers[1], model.Layers[0]])
            weight_update_fg_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_fg_cell = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])

            weight_update_og_data = owl.zeros(
                [model.Layers[1], model.Layers[0]])
            weight_update_og_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_og_cell = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])

            weight_update_ff_data = owl.zeros(
                [model.Layers[1], model.Layers[0]])
            weight_update_ff_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])

            dC = [None] * Tau
            # range, not xrange: xrange is Python-2-only syntax
            for t in range(Tau):
                dC[t] = owl.zeros(C[t].shape)

            ##### Backward pass: accumulate the error #####
            for t in reversed(range(1, Tau)):
                # BP through the og-controlled product and into og
                if tanhC_version:
                    tanhC = ele.tanh(C[t])
                    dTanhC = ele.mult(dHout[t], act_og[t])
                    sen_og = ele.mult(dHout[t], tanhC)
                    # tanh'(c) = 1 - tanh(c)^2
                    dC[t] += ele.mult((1 - ele.mult(tanhC, tanhC)), dTanhC)
                else:
                    sen_og = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                # BP through the sigmoid of og: s'(x) = s(x)(1 - s(x))
                sen_og = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])),
                                  sen_og)
                dHout[t - 1] = model.og_weight_prev.trans() * sen_og
                dC[t] += model.og_weight_cell.trans() * sen_og
                dEmb[t] = model.og_weight_data.trans() * sen_og

                # BP from fg-controlled gate: dC[t-1] += f_t * dC[t]
                sen_fg = ele.mult(C[t - 1], dC[t])
                dC[t - 1] += ele.mult(act_fg[t], dC[t])

                # BP from ig-controlled gate
                sen_ig = ele.mult(act_ff[t], dC[t])
                sen_ff = ele.mult(act_ig[t], dC[t])
                # through the tanh of the candidate input
                sen_ff = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff)
                dEmb[t] += model.ff_weight_data.trans() * sen_ff

                # BP through the sigmoid of fg
                sen_fg = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])),
                                  sen_fg)
                dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg
                dC[t - 1] += model.fg_weight_cell.trans() * sen_fg
                dEmb[t] += model.fg_weight_data.trans() * sen_fg

                # BP through the sigmoid of ig
                sen_ig = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])),
                                  sen_ig)
                dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig
                dC[t - 1] += model.ig_weight_cell.trans() * sen_ig
                dEmb[t] += model.ig_weight_data.trans() * sen_ig

                # accumulate weight-matrix and bias gradients
                weight_update_ig_data += sen_ig * data[t].trans()
                weight_update_ig_prev += sen_ig * Hout[t - 1].trans()
                weight_update_ig_cell += sen_ig * C[t - 1].trans()
                weight_update_ig_bias += sen_ig

                weight_update_fg_data += sen_fg * data[t].trans()
                weight_update_fg_prev += sen_fg * Hout[t - 1].trans()
                weight_update_fg_cell += sen_fg * C[t - 1].trans()
                weight_update_fg_bias += sen_fg

                weight_update_og_data += sen_og * data[t].trans()
                weight_update_og_prev += sen_og * Hout[t - 1].trans()
                weight_update_og_cell += sen_og * C[t].trans()
                weight_update_og_bias += sen_og

                weight_update_ff_data += sen_ff * data[t].trans()
                weight_update_ff_prev += sen_ff * Hout[t - 1].trans()
                weight_update_ff_bias += sen_ff

            # normalize the gradients by the sentence length
            rate = learning_rate / Tau

            # SGD weight update
            model.ig_weight_prev -= rate * weight_update_ig_prev
            model.ig_weight_data -= rate * weight_update_ig_data
            model.ig_weight_cell -= rate * weight_update_ig_cell
            model.ig_weight_bias -= rate * weight_update_ig_bias

            model.fg_weight_prev -= rate * weight_update_fg_prev
            model.fg_weight_data -= rate * weight_update_fg_data
            model.fg_weight_cell -= rate * weight_update_fg_cell
            model.fg_weight_bias -= rate * weight_update_fg_bias

            model.og_weight_prev -= rate * weight_update_og_prev
            model.og_weight_data -= rate * weight_update_og_data
            model.og_weight_cell -= rate * weight_update_og_cell
            model.og_weight_bias -= rate * weight_update_og_bias

            model.ff_weight_prev -= rate * weight_update_ff_prev
            model.ff_weight_data -= rate * weight_update_ff_data
            model.ff_weight_bias -= rate * weight_update_ff_bias

            model.decoder_weights -= rate * dWd
            model.decoder_bias -= rate * dBd

            # update the embeddings of the words actually seen
            for t in range(1, Tau):
                model.emb_weight[sent[t - 1]] -= rate * dEmb[t]

            epoch_ll += sent_ll

        # perplexity = 2^(average negative log2-likelihood per word)
        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 2**epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" %
              (epoch_id, learning_rate, epoch_ppl))
        # print function call, not a Python-2 print statement
        print("  time consumed:", cur_time - last_time)
        last_time = cur_time

    return model, learning_rate
Esempio n. 3
0
def LSTM_test(model, sents, words, tanhC_version=1):
    """Evaluate an LSTM language model and print its test perplexity.

    Runs the forward pass only (no gradients) over every sentence,
    accumulating the base-2 log-likelihood of each target word.

    Args:
        model: same layout as in ``LSTM_train`` (gate weights, decoder
            weights/bias, embedding table, ``Layers`` sizes).
        sents: sentences, each a sequence of word ids.
        words: total word count, used to normalize the test entropy.
        tanhC_version: if truthy, Hout = og * tanh(C); otherwise og * C.
    """
    N = model.Layers[1]  # number of hidden units
    K = model.Layers[2]  # vocabulary size

    test_ll = 0  # total log2-likelihood over the test set
    # For each sentence
    for sent_id, sent in enumerate(sents):
        ##### Initialize activations #####
        Tau = len(sent)
        sent_ll = 0  # sentence log likelihood

        data = [None] * Tau

        Hout = [None] * Tau
        Hout[0] = owl.zeros([N, 1])

        act_ig = [None] * Tau
        act_fg = [None] * Tau
        act_og = [None] * Tau
        act_ff = [None] * Tau

        C = [None] * Tau
        C[0] = owl.zeros([N, 1])

        ##### Forward pass #####
        # For each time step
        for t in range(1, Tau):
            # predict the t'th word from the (t-1)'th word
            data[t] = model.emb_weight[sent[t - 1]]

            act_ig[t] = (model.ig_weight_data * data[t]
                         + model.ig_weight_prev * Hout[t - 1]
                         + model.ig_weight_cell * C[t - 1]
                         + model.ig_weight_bias)
            act_ig[t] = ele.sigm(act_ig[t])

            act_fg[t] = (model.fg_weight_data * data[t]
                         + model.fg_weight_prev * Hout[t - 1]
                         + model.fg_weight_cell * C[t - 1]
                         + model.fg_weight_bias)
            act_fg[t] = ele.sigm(act_fg[t])

            act_ff[t] = (model.ff_weight_data * data[t]
                         + model.ff_weight_prev * Hout[t - 1]
                         + model.ff_weight_bias)
            act_ff[t] = ele.tanh(act_ff[t])

            # c_t = i_t * g_t + f_t * c_{t-1}
            C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(
                act_fg[t], C[t - 1])

            act_og[t] = (model.og_weight_data * data[t]
                         + model.og_weight_prev * Hout[t - 1]
                         + model.og_weight_cell * C[t]
                         + model.og_weight_bias)
            act_og[t] = ele.sigm(act_og[t])

            if tanhC_version:
                Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
            else:
                Hout[t] = ele.mult(act_og[t], C[t])

            Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

            # accumulate the log2-likelihood of the target word (floored
            # to avoid log(0))
            output = Y.to_numpy()
            sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)

        test_ll += sent_ll

    # perplexity = 2^(average negative log2-likelihood per word)
    test_ent = test_ll * (-1) / words
    test_ppl = 2**test_ent

    # print function call, not a Python-2 print statement
    print("Test PPL =", test_ppl)
Esempio n. 4
0
def LSTM_train(model,
               sents,
               vocab_size,
               words,
               NUM_EPOCHS=100,
               tanhC_version=1):
    """Train a one-hot-input LSTM language model by BPTT.

    This variant feeds one-hot word vectors directly (no embedding table)
    and uses transposed weight matrices throughout. The learning rate is
    halved whenever the epoch log-likelihood fails to improve.

    Args:
        model: holds gate weights (``{ig,fg,og,ff}_weight_{data,prev,bias}``),
            ``decoder_weights``/``decoder_bias`` and ``Layers`` sizes.
        sents: sentences, each a sequence of word ids.
        vocab_size: vocabulary size K (one-hot dimension).
        words: total word count, used to normalize the epoch entropy.
        NUM_EPOCHS: number of passes over ``sents``.
        tanhC_version: if truthy, Hout = og * tanh(C); otherwise og * C.
    """
    # Constants
    N = 10  # number of hidden units
    learning_rate = 1

    K = vocab_size  # vocabulary size

    last_ll = 1e99  # previous epoch's log-likelihood, for LR decay
    last_time = time.time()
    # For each epoch
    for epoch_id in range(1, NUM_EPOCHS + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # sentence log likelihood
            batch_size = Tau

            data = [None] * Tau  # one-hot input per step
            prev = [None] * Tau  # previous hidden output per step
            embed = np.zeros((K, 1))
            embed[sent[0]] = 1
            data[0] = owl.from_numpy(embed).trans()

            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])

            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau

            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            Ym = [None] * Tau  # softmax outputs
            dY = [None] * Tau  # output error

            dBd = owl.zeros([model.Layers[2], 1])  # decoder bias gradient
            dWd = owl.zeros([model.Layers[1],
                             model.Layers[2]])  # decoder weight gradient
            dHout = [None] * Tau  # gradient w.r.t. Hout[t]

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                prev[t] = Hout[t - 1]
                embed = np.zeros((K, 1))
                embed[sent[t]] = 1
                data[t] = owl.from_numpy(embed).trans()

                act_ig[t] = model.ig_weight_data.trans() * data[
                    t - 1] + model.ig_weight_prev.trans(
                    ) * prev[t] + model.ig_weight_bias
                act_fg[t] = model.fg_weight_data.trans() * data[
                    t - 1] + model.fg_weight_prev.trans(
                    ) * prev[t] + model.fg_weight_bias
                act_og[t] = model.og_weight_data.trans() * data[
                    t - 1] + model.og_weight_prev.trans(
                    ) * prev[t] + model.og_weight_bias
                act_ff[t] = model.ff_weight_data.trans() * data[
                    t - 1] + model.ff_weight_prev.trans(
                    ) * prev[t] + model.ff_weight_bias

                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = ele.sigm(act_fg[t])
                act_og[t] = ele.sigm(act_og[t])
                act_ff[t] = ele.tanh(act_ff[t])

                # c_t = i_t * g_t + f_t * c_{t-1}
                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(
                    act_fg[t], C[t - 1])

                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] +
                                model.decoder_bias)

                # error is (target - prediction): updates below are additive
                dY[t] = data[t] - Ym[t]
                dBd += dY[t] / batch_size
                dWd += Hout[t] * dY[t].trans() / batch_size
                dHout[t] = model.decoder_weights * dY[t]

            # sentence log10-likelihood: dot the prediction with the one-hot
            # target (floored to avoid log(0))
            for t in range(1, Tau):
                output = Ym[t].trans() * data[t]
                sent_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20))

            ##### Initialize gradient accumulators #####
            sen_ig = [None] * Tau
            sen_fg = [None] * Tau
            sen_og = [None] * Tau
            sen_ff = [None] * Tau

            weight_update_ig_data = owl.zeros(
                [model.Layers[0], model.Layers[1]])
            weight_update_ig_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])

            weight_update_fg_data = owl.zeros(
                [model.Layers[0], model.Layers[1]])
            weight_update_fg_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])

            weight_update_og_data = owl.zeros(
                [model.Layers[0], model.Layers[1]])
            weight_update_og_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])

            weight_update_ff_data = owl.zeros(
                [model.Layers[0], model.Layers[1]])
            weight_update_ff_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])

            dC = [None] * Tau
            # range, not xrange: xrange is Python-2-only syntax
            for t in range(Tau):
                dC[t] = owl.zeros(C[t].shape)

            ##### Backward pass: accumulate the error #####
            for t in reversed(range(1, len(sent))):
                # BP through the og-controlled product and into og
                if tanhC_version:
                    tanhCt = ele.tanh(C[t])
                    sen_og[t] = ele.mult(tanhCt, dHout[t])
                    # tanh'(c) = 1 - tanh(c)^2
                    dC[t] += ele.mult((1 - ele.mult(tanhCt, tanhCt)),
                                      ele.mult(act_og[t], dHout[t]))
                else:
                    sen_og[t] = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                sen_fg[t] = owl.zeros([model.Layers[1], 1])
                if t > 0:  # always true here (t >= 1); kept for clarity
                    sen_fg[t] = ele.mult(C[t - 1], dC[t])
                    # BUG FIX: the cell-state recurrence is gated by the
                    # FORGET gate, dC[t-1] += f_t * dC[t] (was act_og[t])
                    dC[t - 1] += ele.mult(act_fg[t], dC[t])
                sen_ig[t] = ele.mult(act_ff[t], dC[t])
                sen_ff[t] = ele.mult(act_ig[t], dC[t])

                # backprop through the activation functions
                sen_ff[t] = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])),
                                     sen_ff[t])
                sen_ig[t] = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])),
                                     sen_ig[t])
                sen_fg[t] = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])),
                                     sen_fg[t])
                sen_og[t] = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])),
                                     sen_og[t])

                # backprop through the matrix multiplies
                weight_update_ig_data += data[t] * sen_ig[t].trans()
                weight_update_ig_prev += prev[t] * sen_ig[t].trans()
                # BUG FIX: accumulate the ig sensitivity into the ig bias
                # (was weight_update_fg_bias, a copy-paste slip)
                weight_update_ig_bias += sen_ig[t]

                weight_update_fg_data += data[t] * sen_fg[t].trans()
                weight_update_fg_prev += prev[t] * sen_fg[t].trans()
                weight_update_fg_bias += sen_fg[t]

                weight_update_og_data += data[t] * sen_og[t].trans()
                weight_update_og_prev += prev[t] * sen_og[t].trans()
                weight_update_og_bias += sen_og[t]

                weight_update_ff_data += data[t] * sen_ff[t].trans()
                weight_update_ff_prev += prev[t] * sen_ff[t].trans()
                weight_update_ff_bias += sen_ff[t]

                # propagate to the previous hidden output
                if t > 1:
                    dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig[t]
                    dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg[t]
                    dHout[t - 1] += model.og_weight_prev.trans() * sen_og[t]
                    dHout[t - 1] += model.ff_weight_prev.trans() * sen_ff[t]

            # normalize the gradients by the sentence length
            weight_update_ig_prev /= batch_size
            weight_update_ig_data /= batch_size
            weight_update_ig_bias /= batch_size

            weight_update_fg_prev /= batch_size
            weight_update_fg_data /= batch_size
            weight_update_fg_bias /= batch_size

            weight_update_og_prev /= batch_size
            weight_update_og_data /= batch_size
            weight_update_og_bias /= batch_size

            weight_update_ff_prev /= batch_size
            weight_update_ff_data /= batch_size
            weight_update_ff_bias /= batch_size

            # weight update (ascent: dY was target - prediction)
            model.ig_weight_prev += learning_rate * weight_update_ig_prev
            model.ig_weight_data += learning_rate * weight_update_ig_data
            model.ig_weight_bias += learning_rate * weight_update_ig_bias

            model.fg_weight_prev += learning_rate * weight_update_fg_prev
            model.fg_weight_data += learning_rate * weight_update_fg_data
            model.fg_weight_bias += learning_rate * weight_update_fg_bias

            model.og_weight_prev += learning_rate * weight_update_og_prev
            model.og_weight_data += learning_rate * weight_update_og_data
            model.og_weight_bias += learning_rate * weight_update_og_bias

            model.ff_weight_prev += learning_rate * weight_update_ff_prev
            model.ff_weight_data += learning_rate * weight_update_ff_data
            model.ff_weight_bias += learning_rate * weight_update_ff_bias

            model.decoder_weights += learning_rate * dWd
            model.decoder_bias += learning_rate * dBd

            epoch_ll += sent_ll

        # perplexity = 10^(average negative log10-likelihood per word)
        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 10**epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" %
              (epoch_id, learning_rate, epoch_ppl))
        # print function call, not a Python-2 print statement
        print("  time consumed:", cur_time - last_time)
        # halve the learning rate when the epoch likelihood did not improve
        if last_ll > epoch_ll:
            learning_rate /= 2.0
        last_ll = epoch_ll
        last_time = cur_time
Esempio n. 5
0
def LSTM_test(model, sents, vocab_size, words, tanhC_version=1):
    """Evaluate the one-hot-input LSTM and print its test perplexity.

    Forward pass only; accumulates base-10 log-likelihood of each target
    word and reports ``10 ** (-LL / words)``.

    Args:
        model: same layout as in the one-hot ``LSTM_train`` variant.
        sents: sentences, each a sequence of word ids.
        vocab_size: vocabulary size K (one-hot dimension).
        words: total word count, used to normalize the test entropy.
        tanhC_version: if truthy, Hout = og * tanh(C); otherwise og * C.
    """
    N = 10  # number of hidden units (matches the training variant)
    K = vocab_size

    test_ll = 0  # total log10-likelihood over the test set
    # For each sentence
    for sent_id, sent in enumerate(sents):
        ##### Initialize activations #####
        Tau = len(sent)
        sent_ll = 0  # sentence log likelihood
        batch_size = Tau

        data = [None] * Tau  # one-hot input per step
        prev = [None] * Tau  # previous hidden output per step
        embed = np.zeros((K, 1))
        embed[sent[0]] = 1
        data[0] = owl.from_numpy(embed).trans()

        Hout = [None] * Tau
        Hout[0] = owl.zeros([N, 1])

        act_ig = [None] * Tau
        act_fg = [None] * Tau
        act_og = [None] * Tau
        act_ff = [None] * Tau

        C = [None] * Tau
        C[0] = owl.zeros([N, 1])
        Ym = [None] * Tau  # softmax outputs

        ##### Forward pass #####
        # For each time step
        for t in range(1, Tau):
            prev[t] = Hout[t - 1]
            embed = np.zeros((K, 1))
            embed[sent[t]] = 1
            data[t] = owl.from_numpy(embed).trans()

            act_ig[t] = model.ig_weight_data.trans() * data[
                t - 1] + model.ig_weight_prev.trans(
                ) * prev[t] + model.ig_weight_bias
            act_fg[t] = model.fg_weight_data.trans() * data[
                t - 1] + model.fg_weight_prev.trans(
                ) * prev[t] + model.fg_weight_bias
            act_og[t] = model.og_weight_data.trans() * data[
                t - 1] + model.og_weight_prev.trans(
                ) * prev[t] + model.og_weight_bias
            act_ff[t] = model.ff_weight_data.trans() * data[
                t - 1] + model.ff_weight_prev.trans(
                ) * prev[t] + model.ff_weight_bias

            act_ig[t] = ele.sigm(act_ig[t])
            act_fg[t] = ele.sigm(act_fg[t])
            act_og[t] = ele.sigm(act_og[t])
            act_ff[t] = ele.tanh(act_ff[t])

            # c_t = i_t * g_t + f_t * c_{t-1}
            C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(
                act_fg[t], C[t - 1])

            if tanhC_version:
                Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
            else:
                Hout[t] = ele.mult(act_og[t], C[t])
            Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] +
                            model.decoder_bias)

            # dot the prediction with the one-hot target (floored to avoid
            # log(0))
            output = Ym[t].trans() * data[t]
            test_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20))

    # print function calls, not Python-2 print statements
    print(test_ll)
    # perplexity = 10^(average negative log10-likelihood per word)
    test_ent = test_ll * (-1) / words
    test_ppl = 10**test_ent

    print("Test PPL = %f" % (test_ppl))
Esempio n. 6
0
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):
    """Train an LSTM language model by full-sentence BPTT with plain SGD.

    Same algorithm as the other embedding-based training variant in this
    file: the forward pass predicts word t from word t-1 and accumulates
    the softmax cross-entropy error; the backward pass propagates the
    error through the gates and applies updates scaled by
    ``learning_rate / Tau``.

    Args:
        model: holds gate weights (``{ig,fg,og,ff}_weight_{data,prev,cell,
            bias}``), ``decoder_weights``/``decoder_bias`` and the
            embedding table ``emb_weight``; ``Layers`` gives layer sizes.
        sents: sentences, each a sequence of word ids.
        words: total word count, used to normalize the epoch entropy.
        learning_rate: SGD step size, divided by sentence length per update.
        EPOCH: number of passes over ``sents``.
        tanhC_version: if truthy, Hout = og * tanh(C); otherwise og * C.

    Returns:
        (model, learning_rate)
    """
    # Constants
    N = model.Layers[1]  # number of hidden units
    K = model.Layers[2]  # vocabulary size

    last_time = time.time()
    # For each epoch
    for epoch_id in range(1, EPOCH + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # sentence log likelihood

            data = [None] * Tau

            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])

            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau

            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            dY = [None] * Tau

            dBd = owl.zeros([model.Layers[2], 1])  # decoder bias gradient
            dWd = owl.zeros([model.Layers[2], model.Layers[1]])
            dHout = [None] * Tau  # gradient w.r.t. Hout[t]
            dEmb = [None] * Tau  # gradient w.r.t. the input embedding at t

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                # predict the t'th word from the (t-1)'th word
                data[t] = model.emb_weight[sent[t - 1]]
                NVector = np.zeros((K, 1))
                NVector[sent[t]] = 1  # one-hot target
                target = owl.from_numpy(NVector).trans()

                act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])

                act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
                act_fg[t] = ele.sigm(act_fg[t])

                act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
                act_ff[t] = ele.tanh(act_ff[t])

                # c_t = i_t * g_t + f_t * c_{t-1}
                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])

                act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
                act_og[t] = ele.sigm(act_og[t])

                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])

                Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

                # BP to Hout (softmax + cross-entropy gradient is Y - target)
                dY[t] = Y - target
                dBd += dY[t]
                dWd += dY[t] * Hout[t].trans()
                dHout[t] = model.decoder_weights.trans() * dY[t]

                # evaluation: log2-likelihood of the target word (floored
                # to avoid log(0))
                output = Y.to_numpy()
                sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)

            ##### Initialize gradient accumulators #####
            weight_update_ig_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])

            weight_update_fg_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])

            weight_update_og_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])

            weight_update_ff_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])

            dC = [None] * Tau
            # range, not xrange: xrange is Python-2-only syntax
            for t in range(Tau):
                dC[t] = owl.zeros(C[t].shape)

            ##### Backward pass: accumulate the error #####
            for t in reversed(range(1, Tau)):
                # BP through the og-controlled product and into og
                if tanhC_version:
                    tanhC = ele.tanh(C[t])
                    dTanhC = ele.mult(dHout[t], act_og[t])
                    sen_og = ele.mult(dHout[t], tanhC)
                    # tanh'(c) = 1 - tanh(c)^2
                    dC[t] += ele.mult((1 - ele.mult(tanhC, tanhC)), dTanhC)
                else:
                    sen_og = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                # BP through the sigmoid of og: s'(x) = s(x)(1 - s(x))
                sen_og = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og)
                dHout[t - 1] = model.og_weight_prev.trans() * sen_og
                dC[t] += model.og_weight_cell.trans() * sen_og
                dEmb[t] = model.og_weight_data.trans() * sen_og

                # BP from fg-controlled gate: dC[t-1] += f_t * dC[t]
                sen_fg = ele.mult(C[t - 1], dC[t])
                dC[t - 1] += ele.mult(act_fg[t], dC[t])

                # BP from ig-controlled gate
                sen_ig = ele.mult(act_ff[t], dC[t])
                sen_ff = ele.mult(act_ig[t], dC[t])
                # through the tanh of the candidate input
                sen_ff = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff)
                dEmb[t] += model.ff_weight_data.trans() * sen_ff

                # BP through the sigmoid of fg
                sen_fg = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg)
                dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg
                dC[t - 1] += model.fg_weight_cell.trans() * sen_fg
                dEmb[t] += model.fg_weight_data.trans() * sen_fg

                # BP through the sigmoid of ig
                sen_ig = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig)
                dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig
                dC[t - 1] += model.ig_weight_cell.trans() * sen_ig
                dEmb[t] += model.ig_weight_data.trans() * sen_ig

                # accumulate weight-matrix and bias gradients
                weight_update_ig_data += sen_ig * data[t].trans()
                weight_update_ig_prev += sen_ig * Hout[t - 1].trans()
                weight_update_ig_cell += sen_ig * C[t - 1].trans()
                weight_update_ig_bias += sen_ig

                weight_update_fg_data += sen_fg * data[t].trans()
                weight_update_fg_prev += sen_fg * Hout[t - 1].trans()
                weight_update_fg_cell += sen_fg * C[t - 1].trans()
                weight_update_fg_bias += sen_fg

                weight_update_og_data += sen_og * data[t].trans()
                weight_update_og_prev += sen_og * Hout[t - 1].trans()
                weight_update_og_cell += sen_og * C[t].trans()
                weight_update_og_bias += sen_og

                weight_update_ff_data += sen_ff * data[t].trans()
                weight_update_ff_prev += sen_ff * Hout[t - 1].trans()
                weight_update_ff_bias += sen_ff

            # normalize the gradients by the sentence length
            rate = learning_rate / Tau

            # SGD weight update
            model.ig_weight_prev -= rate * weight_update_ig_prev
            model.ig_weight_data -= rate * weight_update_ig_data
            model.ig_weight_cell -= rate * weight_update_ig_cell
            model.ig_weight_bias -= rate * weight_update_ig_bias

            model.fg_weight_prev -= rate * weight_update_fg_prev
            model.fg_weight_data -= rate * weight_update_fg_data
            model.fg_weight_cell -= rate * weight_update_fg_cell
            model.fg_weight_bias -= rate * weight_update_fg_bias

            model.og_weight_prev -= rate * weight_update_og_prev
            model.og_weight_data -= rate * weight_update_og_data
            model.og_weight_cell -= rate * weight_update_og_cell
            model.og_weight_bias -= rate * weight_update_og_bias

            model.ff_weight_prev -= rate * weight_update_ff_prev
            model.ff_weight_data -= rate * weight_update_ff_data
            model.ff_weight_bias -= rate * weight_update_ff_bias

            model.decoder_weights -= rate * dWd
            model.decoder_bias -= rate * dBd

            # update the embeddings of the words actually seen
            for t in range(1, Tau):
                model.emb_weight[sent[t - 1]] -= rate * dEmb[t]

            epoch_ll += sent_ll

        # perplexity = 2^(average negative log2-likelihood per word)
        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 2 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        # print function call, not a Python-2 print statement
        print("  time consumed:", cur_time - last_time)
        last_time = cur_time

    return model, learning_rate
Esempio n. 7
0
def LSTM_test(model, sents, words, tanhC_version = 1):

	N = model.Layers[1]
	K = model.Layers[2]

	test_ll = 0
	# For each sentence
	for sent_id, sent in enumerate(sents):
		#print sent_id
		#print "sent", sent
		#print "sents", sents
		##### Initialize activations #####

		Tau = len(sent)
		sent_ll = 0 # Sentence log likelihood

		data = [None] * Tau

		Hout = [None] * Tau
		Hout[0] = owl.zeros([N, 1])

		act_ig = [None] * Tau
		act_fg = [None] * Tau
		act_og = [None] * Tau
		act_ff = [None] * Tau

		C = [None] * Tau
		C[0] = owl.zeros([N, 1])

		##### Forward pass #####
		# For each time step

		for t in range(1, Tau):
			# predict the (t+1)'th word from the t'th word
			data[t] = model.emb_weight[sent[t - 1]]

			act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
			act_ig[t] = ele.sigm(act_ig[t])

			act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
			act_fg[t] = ele.sigm(act_fg[t])

			act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
			act_ff[t] = ele.tanh(act_ff[t])

			C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])

			act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
			act_og[t] = ele.sigm(act_og[t])

			if tanhC_version:
				Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
			else:
				Hout[t] = ele.mult(act_og[t], C[t])

			Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

			# evaluation
			output = Y.to_numpy()			# Can directly get a single element from Y
			# print output[0, sent[t]]
			sent_ll += math.log(max(output[0, sent[t]],1e-20), 2)

		test_ll += sent_ll

	test_ent = test_ll * (-1) / words
	test_ppl = 2 ** test_ent

	print "Test PPL =", test_ppl
Esempio n. 8
0
 def ff(self, x, phase):
     """Feed-forward activation: element-wise tanh of *x*.

     ``phase`` (presumably train/test mode) is accepted for interface
     uniformity with other layer types but is unused here.
     """
     return ele.tanh(x)
Esempio n. 9
0
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):

    # Constants
    N = model.Layers[1]  # Number of units
    K = model.Layers[0]  # Vocabulary size

    # For each epoch
    last_ll = 1e99
    for epoch_id in range(EPOCH, EPOCH + 10):
        print 'Start epoch #', epoch_id
        last_time = time.time()
        epoch_ll = 0
        tau_sum = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            #print "sent_id",sent_id
            #print "sent", sent
            #print "sents", sents
            ##### Initialize activations #####
            Tau = len(sent)
            tau_sum += Tau
            sent_ll = 0  # Sentence log likelihood
            batch_size = Tau

            data = [None] * Tau
            prev = [None] * Tau
            data[0] = owl.zeros([K, 1])
            # embed = np.zeros((K, 1))
            # embed[sent[0]] = 1
            # data[0] = owl.from_numpy(embed).trans()

            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])

            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau

            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            Ym = [None] * Tau
            dY = [None] * Tau

            dBd = owl.zeros([model.Layers[2], 1])  #dY.sum(0)
            dWd = owl.zeros([model.Layers[1],
                             model.Layers[2]])  #Hout.transpose().dot(dY)
            dHout = [None] * Tau  #dY.dot(model.decoder_weights.transpose())

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                #prev[t] = Hout[t - 1]
                prev[t] = owl.zeros([N, 1])
                data[t] = owl.zeros([K, 1])
                #embed = np.zeros((K, 1))
                #embed[sent[t]] = 1
                #data[t] = owl.from_numpy(embed).trans()

                act_ig[t] = model.ig_weight_data.trans() * data[
                    t - 1] + model.ig_weight_prev.trans(
                    ) * prev[t] + model.ig_weight_bias
                act_fg[t] = model.fg_weight_data.trans() * data[
                    t - 1] + model.fg_weight_prev.trans(
                    ) * prev[t] + model.fg_weight_bias
                act_og[t] = model.og_weight_data.trans() * data[
                    t - 1] + model.og_weight_prev.trans(
                    ) * prev[t] + model.og_weight_bias
                act_ff[t] = model.ff_weight_data.trans() * data[
                    t - 1] + model.ff_weight_prev.trans(
                    ) * prev[t] + model.ff_weight_bias

                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = ele.sigm(act_fg[t])
                act_og[t] = ele.sigm(act_og[t])
                act_ff[t] = ele.tanh(act_ff[t])

                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(
                    act_fg[t], C[t - 1])

                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] +
                                model.decoder_bias)

                dY[t] = data[t] - Ym[t]
                dBd += dY[t] / batch_size
                dWd += Hout[t] * dY[t].trans() / batch_size
                dHout[t] = model.decoder_weights * dY[t]

            #print "Y_0[t]",Y_o[t]
            #print "Y_o[t][sent[t]]",Y_o[t][sent[t]]
            #print np.sum(output.to_numpy())
            # output = Ym[t].trans() * data[t]
            # sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )
            ##### Initialize gradient vectors #####
            #Ym[-1].wait_for_eval()
            for t in range(1, Tau):
                Ym[t].wait_for_eval()
            #output = Ym[t].trans() * data[t]
            #sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )
            if sent_id % 100 == 0:
                cur_time = time.time()
                print 'Finished', sent_id, 'sentences. Time used:', cur_time - last_time, 's. sent/s:', float(
                    sent_id) / (cur_time - last_time), 'tau_sum=', tau_sum
                #print owl.print_profiler_result()
                tau_sum = 0
            continue

            sen_ig = [None] * Tau
            sen_fg = [None] * Tau
            sen_og = [None] * Tau
            sen_ff = [None] * Tau

            weight_update_ig_data = owl.zeros(
                [model.Layers[0], model.Layers[1]])
            weight_update_ig_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])

            weight_update_fg_data = owl.zeros(
                [model.Layers[0], model.Layers[1]])
            weight_update_fg_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])

            weight_update_og_data = owl.zeros(
                [model.Layers[0], model.Layers[1]])
            weight_update_og_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])

            weight_update_ff_data = owl.zeros(
                [model.Layers[0], model.Layers[1]])
            weight_update_ff_prev = owl.zeros(
                [model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])

            dHin = owl.zeros([model.Layers[1], model.Layers[1]])
            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            # Calculate the error and add it
            for t in reversed(range(1, len(sent))):
                #print "sent",sent
                #print "t",t
                if tanhC_version:
                    tanhCt = ele.tanh(C[t])
                    sen_og[t] = ele.mult(tanhCt, dHout[t])
                    dC[t] += ele.mult((1 - ele.mult(tanhCt, tanhCt)),
                                      ele.mult(act_og[t], dHout[t]))
                else:
                    sen_og[t] = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                sen_fg[t] = owl.zeros([model.Layers[1], 1])
                if t > 0:
                    sen_fg[t] = ele.mult(C[t - 1], dC[t])
                    dC[t - 1] += ele.mult(act_fg[t], dC[t])
                sen_ig[t] = ele.mult(act_ff[t], dC[t])
                sen_ff[t] = ele.mult(act_ig[t], dC[t])

                # backprop activation functions
                sen_ff[t] = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])),
                                     sen_ff[t])
                sen_ig[t] = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])),
                                     sen_ig[t])
                sen_fg[t] = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])),
                                     sen_fg[t])
                sen_og[t] = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])),
                                     sen_og[t])

                # backprop matrix multiply
                weight_update_ig_data += data[t] * sen_ig[t].trans()
                weight_update_ig_prev += prev[t] * sen_ig[t].trans()
                weight_update_fg_bias += sen_ig[t]  # sen_ig[t].sum(0 or 1)

                weight_update_fg_data += data[t] * sen_fg[t].trans()
                weight_update_fg_prev += prev[t] * sen_fg[t].trans()
                weight_update_fg_bias += sen_fg[t]

                weight_update_og_data += data[t] * sen_og[t].trans()
                weight_update_og_prev += prev[t] * sen_og[t].trans()
                weight_update_og_bias += sen_og[t]

                weight_update_ff_data += data[t] * sen_ff[t].trans()
                weight_update_ff_prev += prev[t] * sen_ff[t].trans()
                weight_update_ff_bias += sen_ff[t]

                if t > 1:
                    dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig[t]
                    dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg[t]
                    dHout[t - 1] += model.og_weight_prev.trans() * sen_og[t]
                    dHout[t - 1] += model.ff_weight_prev.trans() * sen_ff[t]

            # normalize the gradients
            # weight update
            model.ig_weight_prev += learning_rate / batch_size * weight_update_ig_prev
            model.ig_weight_data += learning_rate / batch_size * weight_update_ig_data
            model.ig_weight_bias += learning_rate / batch_size * weight_update_ig_bias

            model.fg_weight_prev += learning_rate / batch_size * weight_update_fg_prev
            model.fg_weight_data += learning_rate / batch_size * weight_update_fg_data
            model.fg_weight_bias += learning_rate / batch_size * weight_update_fg_bias

            model.og_weight_prev += learning_rate / batch_size * weight_update_og_prev
            model.og_weight_data += learning_rate / batch_size * weight_update_og_data
            model.og_weight_bias += learning_rate / batch_size * weight_update_og_bias

            model.ff_weight_prev += learning_rate / batch_size * weight_update_ff_prev
            model.ff_weight_data += learning_rate / batch_size * weight_update_ff_data
            model.ff_weight_bias += learning_rate / batch_size * weight_update_ff_bias

            model.decoder_weights += learning_rate * dWd
            model.decoder_bias += learning_rate * dBd

            # Print results
            epoch_ll += sent_ll
            # print(" Sentence %d LL: %f" % (sent_id, sent_ll))
        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 10**epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" %
              (epoch_id, learning_rate, epoch_ppl))
        print "  time consumed:", cur_time - last_time
        last_time = cur_time
        if last_ll > epoch_ll:
            learning_rate /= 2.0
        last_ll = epoch_ll

    return model, learning_rate
Esempio n. 10
0
 def ff(self, x):
     """Feed-forward activation: element-wise tanh of *x*."""
     return ele.tanh(x)