def __init__(self, V, D, K, activation):
        self.D = D
        self.f = activation

        # word embedding
        We = init_weight(V, D)

        # linear terms
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)

        # bias
        bh = np.zeros(D)

        # output layer
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        # make them tensorflow variables
        self.We = tf.Variable(We.astype(np.float32))
        self.W1 = tf.Variable(W1.astype(np.float32))
        self.W2 = tf.Variable(W2.astype(np.float32))
        self.bh = tf.Variable(bh.astype(np.float32))
        self.Wo = tf.Variable(Wo.astype(np.float32))
        self.bo = tf.Variable(bo.astype(np.float32))
        self.params = [self.We, self.W1, self.W2, self.Wo]
Example No. 2
 def setUp(self):
     rng = np.random.RandomState(0)
     init_w_e, init_b_e = util.init_weight(rng, self.n_in, self.n_hidden)
     init_w_d, init_b_d = util.init_weight(rng, self.n_hidden, self.n_in)
     self.w_e.set_value(init_w_e, borrow=True)
     self.b_e.set_value(init_b_e, borrow=True)
     self.w_d.set_value(init_w_d, borrow=True)
     self.b_d.set_value(init_b_d, borrow=True)
    def fit(self, X, epochs=500, show_fig=False):
        N = len(X)
        D = self.D
        M = self.M
        V = self.V

        # initial weights
        We = init_weight(V, D).astype(np.float32)
        Wx = init_weight(D, M).astype(np.float32)
        Wh = init_weight(M, M).astype(np.float32)
        bh = np.zeros(M).astype(np.float32)
        h0 = np.zeros(M).astype(np.float32)
        Wo = init_weight(M, V).astype(np.float32)
        bo = np.zeros(V).astype(np.float32)

        # build tensorflow functions
        self.build(We, Wx, Wh, bh, h0, Wo, bo)

        # sentence input:
        # [START, w1, w2, ..., wn]
        # sentence target:
        # [w1,    w2, w3, ..., END]
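        # e.g., for a tokenized sentence [5, 12, 7] (hypothetical word indices):
        #   input_sequence  = [0, 5, 12, 7]    # 0 is assumed to be the START token index
        #   output_sequence = [5, 12, 7, 1]    # 1 is assumed to be the END token index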

        costs = []
        n_total = sum((len(sentence)+1) for sentence in X)
        for i in range(epochs):
            X = shuffle(X)
            n_correct = 0
            cost = 0
            for j in range(N):
                # problem! many words --> END token are overrepresented
                # result: generated lines will be very short
                # we will try to fix in a later iteration
                # BAD! magic numbers 0 and 1...
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]

                # we set 0 to start and 1 to end
                _, c, p = self.session.run(
                    (self.train_op, self.cost, self.predict_op),
                    feed_dict={self.tfX: input_sequence, self.tfY: output_sequence}
                )
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
 def __init__(self, M1, M2, an_id):
     self.id = an_id
     self.M1 = M1
     self.M2 = M2
     W = init_weight(M1, M2)
     b = np.zeros(M2)
     self.W = theano.shared(W, 'W_%s' % self.id)
     self.b = theano.shared(b, 'b_%s' % self.id)
     self.params = [self.W, self.b]
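     # A minimal sketch of the forward pass these parameters would support
     # (an assumption; the class's actual forward method is defined elsewhere), e.g.:
     #     def forward(self, X):
     #         return T.nnet.relu(X.dot(self.W) + self.b)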
Example No. 5
def get_param(name, n_in, n_out, params, rng):
    w_name = "w_" + name
    b_name = "b_" + name
    if params is not None and w_name in params:
        assert b_name in params
        init_w = params[w_name]
        init_b = params[b_name]
    else:
        init_w, init_b = util.init_weight(rng, n_in, n_out)
    w = theano.shared(name=w_name, borrow=True, value=init_w)
    b = theano.shared(name=b_name, borrow=True, value=init_b)
    return w, b
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f  = activation

        # numpy init
        Wxr = init_weight(Mi, Mo)
        Whr = init_weight(Mo, Mo)
        br  = np.zeros(Mo)
        Wxz = init_weight(Mi, Mo)
        Whz = init_weight(Mo, Mo)
        bz  = np.zeros(Mo)
        Wxh = init_weight(Mi, Mo)
        Whh = init_weight(Mo, Mo)
        bh  = np.zeros(Mo)
        h0  = np.zeros(Mo)

        # theano vars
        self.Wxr = theano.shared(Wxr)
        self.Whr = theano.shared(Whr)
        self.br  = theano.shared(br)
        self.Wxz = theano.shared(Wxz)
        self.Whz = theano.shared(Whz)
        self.bz  = theano.shared(bz)
        self.Wxh = theano.shared(Wxh)
        self.Whh = theano.shared(Whh)
        self.bh  = theano.shared(bh)
        self.h0  = theano.shared(h0)
        self.params = [self.Wxr, self.Whr, self.br, self.Wxz, self.Whz, self.bz, self.Wxh, self.Whh, self.bh, self.h0]
Example No. 7
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f = activation

        # x(t) to r(t) gate
        Wxr = init_weight(Mi, Mo)
        # h(t) to r(t) gate
        Whr = init_weight(Mo, Mo)
        # bias to r(t) gate
        br = np.zeros(Mo)
        # x(t) to z(t) gate
        Wxz = init_weight(Mi, Mo)
        # h(t) to z(t) gate
        Whz = init_weight(Mo, Mo)
        # bias to z(t) gate
        bz = np.zeros(Mo)
        # x(t) to h(t) gate
        Wxh = init_weight(Mi, Mo)
        # h(t-1) to h(t) gate
        Whh = init_weight(Mo, Mo)
        # bias to h(t) gate
        bh = np.zeros(Mo)
        # initial hidden state
        h0 = np.zeros(Mo)

        # create theano variables
        self.Wxr = theano.shared(Wxr)
        self.Whr = theano.shared(Whr)
        self.br = theano.shared(br)
        self.Wxz = theano.shared(Wxz)
        self.Whz = theano.shared(Whz)
        self.bz = theano.shared(bz)
        self.Wxh = theano.shared(Wxh)
        self.Whh = theano.shared(Whh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.params = [
            self.Wxr, self.Whr, self.br, self.Wxz, self.Whz, self.bz, self.Wxh,
            self.Whh, self.bh, self.h0
        ]
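    # A minimal sketch of how these parameters would be used in a single GRU step
    # (an assumption -- the class's actual recurrence/output methods are defined
    # elsewhere in this example):
    def _recurrence_sketch(self, x_t, h_t1):
        r = T.nnet.sigmoid(x_t.dot(self.Wxr) + h_t1.dot(self.Whr) + self.br)   # reset gate
        z = T.nnet.sigmoid(x_t.dot(self.Wxz) + h_t1.dot(self.Whz) + self.bz)   # update gate
        hhat = self.f(x_t.dot(self.Wxh) + (r * h_t1).dot(self.Whh) + self.bh)  # candidate state
        return (1 - z) * h_t1 + z * hhat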
Example No. 8
    def fit(self,
            X,
            Y,
            learning_rate=10e-3,
            mu=0.99,
            reg=10e-12,
            eps=10e-10,
            epochs=400,
            batch_sz=20,
            print_period=1,
            show_fig=False):

        # X = X.astype(np.float32)
        Y = Y.astype(np.int32)

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1
        W = init_weight(M1, K)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # for momentum
        dparams = [
            theano.shared(np.zeros(p.get_value().shape)) for p in self.params
        ]

        # for rmsprop
        cache = [
            theano.shared(np.zeros(p.get_value().shape)) for p in self.params
        ]

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY = self.forward(thX)

        rcost = reg * T.sum([(p * p).sum() for p in self.params])
        cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
        prediction = self.predict(thX)
        grads = T.grad(cost, self.params)

        # momentum only
        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
        )

        n_batches = N // batch_sz
        # print "N:", N, "batch_sz:", batch_sz
        # print "n_batches:", n_batches
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

                c, p = train_op(Xbatch, Ybatch)

                if j % print_period == 0:
                    costs.append(c)
                    e = np.mean(Ybatch != p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

        if show_fig:
            plt.plot(costs)
            plt.show()
    def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = tf.Variable(We.astype(np.float32))
        self.W11 = tf.Variable(W11.astype(np.float32))
        self.W22 = tf.Variable(W22.astype(np.float32))
        self.W12 = tf.Variable(W12.astype(np.float32))
        self.W1 = tf.Variable(W1.astype(np.float32))
        self.W2 = tf.Variable(W2.astype(np.float32))
        self.bh = tf.Variable(bh.astype(np.float32))
        self.Wo = tf.Variable(Wo.astype(np.float32))
        self.bo = tf.Variable(bo.astype(np.float32))
        self.weights = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.Wo]


        words = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='words')
        left_children = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='left_children')
        right_children = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='right_children')
        labels = tf.compat.v1.placeholder(tf.int32, shape=(None,), name='labels')

        # save for later
        self.words = words
        self.left = left_children
        self.right = right_children
        self.labels = labels

        def dot1(a, B):
            return tf.tensordot(a, B, axes=[[0], [1]])

        def dot2(B, a):
            return tf.tensordot(B, a, axes=[[1], [0]])
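        # dot1/dot2 express the tensor contractions used below:
        #   dot2(self.W11, h_right)[i, k]            = sum_j W11[i, j, k] * h_right[j]
        #   dot1(h_left, dot2(self.W11, h_right))[i] = sum_{j,k} h_left[k] * W11[i, j, k] * h_right[j]
        # i.e. the bilinear "neural tensor" terms, while dot1(h, W) with a 2-D W is an
        # ordinary matrix-vector product for the linear terms.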

        def recursive_net_transform(hiddens, n):
            h_left = hiddens.read(left_children[n])
            h_right = hiddens.read(right_children[n])
            return self.f(
                dot1(h_left, dot2(self.W11, h_left)) +
                dot1(h_right, dot2(self.W22, h_right)) +
                dot1(h_left, dot2(self.W12, h_right)) +
                dot1(h_left, self.W1) +
                dot1(h_right, self.W2) +
                self.bh
            )


        def recurrence(hiddens, n):
            w = words[n]
            # any non-word will have index -1

            h_n = tf.cond(
                pred=w >= 0,
                true_fn=lambda: tf.nn.embedding_lookup(params=self.We, ids=w),
                false_fn=lambda: recursive_net_transform(hiddens, n)
            )
            hiddens = hiddens.write(n, h_n)
            n = tf.add(n, 1)
            return hiddens, n


        def condition(hiddens, n):
            # loop should continue while n < len(words)
            return tf.less(n, tf.shape(input=words)[0])


        hiddens = tf.TensorArray(
            tf.float32,
            size=0,
            dynamic_size=True,
            clear_after_read=False,
            infer_shape=False
        )

        hiddens, _ = tf.while_loop(
            cond=condition,
            body=recurrence,
            loop_vars=[hiddens, tf.constant(0)],
            parallel_iterations=1
        )
        h = hiddens.stack()
        logits = tf.matmul(h, self.Wo) + self.bo

        prediction_op = tf.argmax(input=logits, axis=1)
        self.prediction_op = prediction_op
        
        rcost = reg*sum(tf.nn.l2_loss(p) for p in self.weights)
        if train_inner_nodes:
            # filter out -1s
            labeled_indices = tf.compat.v1.where(labels >= 0)

            cost_op = tf.reduce_mean(
                input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=tf.gather(logits, labeled_indices),
                    labels=tf.gather(labels, labeled_indices),
                )
            ) + rcost
        else:
            cost_op = tf.reduce_mean(
                input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits[-1],
                    labels=labels[-1],
                )
            ) + rcost

        train_op = tf.compat.v1.train.AdagradOptimizer(learning_rate=8e-3).minimize(cost_op)
        # train_op = tf.train.MomentumOptimizer(learning_rate=8e-3, momentum=0.9).minimize(cost_op)

        # NOTE: If you're using GPU, InteractiveSession breaks
        # AdagradOptimizer and some other optimizers
        # change to tf.Session() if so.
        self.session = tf.compat.v1.Session()
        init_op = tf.compat.v1.global_variables_initializer()
        self.session.run(init_op)


        costs = []
        sequence_indexes = list(range(N))
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            n_total = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words_, left, right, lab = trees[j]
                # print("words_:", words_)
                # print("lab:", lab)
                c, p, _ = self.session.run(
                    (cost_op, prediction_op, train_op),
                    feed_dict={
                        words: words_,
                        left_children: left,
                        right_children: right,
                        labels: lab
                    }
                )
                if np.isnan(c):
                    print("Cost is nan! Let's stop here. \
                        Why don't you try decreasing the learning rate?")
                    for w in self.weights:
                        print(self.session.run(tf.reduce_sum(w)))
                    exit()
                cost += c
                n_correct += (p[-1] == lab[-1])
                n_total += 1

                it += 1
                if it % 10 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                        (it, N, float(n_correct)/n_total, cost)
                    )
                    sys.stdout.flush()


            # calculate the test score
            n_test_correct = 0
            n_test_total = 0
            for words_, left, right, lab in test_trees:
                p = self.session.run(prediction_op, feed_dict={
                    words: words_,
                    left_children: left,
                    right_children: right,
                    labels: lab
                })
                n_test_correct += (p[-1] == lab[-1])
                n_test_total += 1


            print(
                "i:", i, "cost:", cost,
                "train acc:", float(n_correct)/n_total,
                "test acc:", float(n_test_correct)/n_test_total,
                "time for epoch:", (datetime.now() - t0)
            )
            costs.append(cost)

        plt.plot(costs)
        plt.show()
Example No. 10
    def fit(self,
            X,
            learning_rate=0.5,
            mu=0.99,
            epochs=1,
            batch_sz=100,
            show_fig=False):
        # cast to float
        mu = np.float32(mu)
        learning_rate = np.float32(learning_rate)
        X = X.astype(np.float32)

        N, D = X.shape
        n_batches = N // batch_sz

        # define shared (all weights in NN)
        W0 = init_weight((D, self.M))
        self.W = theano.shared(W0, 'W_%s' % self.id)
        self.bh = theano.shared(
            np.zeros(self.M, dtype=np.float32), 'bh_%s' % self.id
        )  # important: to keep the type fixed as TensorType(float32, matrix), remember to cast the np.array's dtype to np.float32
        self.bo = theano.shared(np.zeros(D, dtype=np.float32),
                                'bo_%s' % self.id)
        self.params = [self.W, self.bh,
                       self.bo]  # keep tracking all parameters
        self.forward_params = [self.W, self.bh]

        self.dW = theano.shared(np.zeros(W0.shape, dtype=np.float32),
                                'dW_%s' % self.id)
        self.dbh = theano.shared(np.zeros(self.M, dtype=np.float32),
                                 'dbh_%s' % self.id)
        self.dbo = theano.shared(np.zeros(D, dtype=np.float32),
                                 'dbo_%s' % self.id)
        self.dparams = [self.dW, self.dbh, self.dbo]
        self.forward_dparams = [self.dW, self.dbh]

        # define matrix (training data)
        X_in = T.matrix('X_%s' % self.id, dtype='float32')
        X_hat = self.forward_output(X_in)

        # attach it to the object so it can be used later
        # must be sigmoid because the output is also a sigmoid
        H = T.nnet.sigmoid(
            X_in.dot(self.W) +
            self.bh)  # define a hidden layer operation as a theano function
        # extract the actual hidden-layer values (for plotting / DNN training);
        # this function takes a numpy.array as input and returns a numpy.array
        self.hidden_op = theano.function(
            inputs=[X_in],
            outputs=H,
        )

        # save this for later so we can call it to
        # create reconstructions of input
        self.predict = theano.function(
            inputs=[X_in],
            outputs=X_hat,
        )

        # cost = ((X_in - X_hat) * (X_in - X_hat)).sum() / N  # squared error
        cost = -(X_in * T.log(X_hat) +
                 (1 - X_in) * T.log(1 - X_hat)).sum() / N  # cross entropy
        cost_op = theano.function(inputs=[X_in], outputs=cost)

        # updates = [
        #     (p, p + mu*dp - learning_rate*T.grad(cost, p)) for p, dp in zip(self.params, self.dparams)
        # ] + [
        #     (dp, mu*dp - learning_rate*T.grad(cost, p)) for p, dp, in zip(self.params, self.dparams)
        # ]

        updates = momentum_updates(cost, self.params, mu, learning_rate)
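        # momentum_updates is presumably a helper that builds the same update pairs
        # as the commented-out block above, roughly (an assumption):
        #     grads = T.grad(cost, params)
        #     velocities = [theano.shared(p.get_value() * 0) for p in params]
        #     updates = [(p, p + mu*v - learning_rate*g)
        #                for p, v, g in zip(params, velocities, grads)] + \
        #               [(v, mu*v - learning_rate*g)
        #                for v, g in zip(velocities, grads)]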
        train_op = theano.function(
            inputs=[X_in],
            updates=updates,
        )

        costs = []
        print('training autoencoder: %s' % self.id)
        print('epochs to do:', epochs)
        for i in range(epochs):
            print('epoch:', i)
            X = shuffle(X)
            for j in range(n_batches):
                batch = X[j * batch_sz:(j * batch_sz + batch_sz), ]
                train_op(batch)
                the_cost = cost_op(batch)
                costs.append(the_cost)
                if j % 10 == 0:
                    print('j / n_batches', j, '/', n_batches, 'cost:',
                          the_cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example No. 11
    def fit(self, X, Y, learning_rate=10e-1, mu=.99, reg=1.0, activation=T.tanh, batch_sz=100, epochs=100,
            show_fig=False):
        D = X[0].shape[1]
        K = len(set(Y.flatten()))
        N = len(Y)
        M = self.M
        self.f = activation

        # initial weights
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.Wo, self.bo, self.h0]

        thX = T.fmatrix('X')
        thY = T.ivector('Y')
        thStartPoints = T.ivector('startPoints')

        XW = thX.dot(self.Wx)

        def recurrence(xw_t, is_start, h_t1, h0):
            # return h(t)
            h_t = T.switch(
                T.eq(is_start, 1),
                self.f(xw_t + h0.dot(self.Wh) + self.bh),
                self.f(xw_t + h_t1.dot(self.Wh) + self.bh)
            )
            return h_t

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0],
            sequences=[XW, thStartPoints],
            non_sequences=[self.h0],
            n_steps=XW.shape[0],
            # mode="DebugMode"
        )

        # py_x = y[:, 0, :]
        py_x = T.nnet.softmax(h.dot(self.Wo) + self.bo)
        prediction = T.argmax(py_x, axis=1)

        ## Notes
        # py_x[T.arange(thY.shape[0]), thY] ==> advanced indexing
        # eg:
        #   thY = [1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0]
        #   py_x = array([
        #           [0.21464644, 0.78535356], [0.53838035, 0.46161965], [0.53524101, 0.46475899], [0.4911481 , 0.5088519 ],
        #           [0.49989071, 0.50010929], [0.53311029, 0.46688971], [0.49294333, 0.50705667], [0.49984173, 0.50015827],
        #           [0.49985361, 0.50014639], [0.49982706, 0.50017294], [0.53299261, 0.46700739], [0.49291816, 0.50708184]
        #         ])
        #  py_x[T.arange(thY.shape[0]), thY] ==> py_x[[0,1,2,3,4,5,6,7,8,9,10,11], [1,1,1,0,1,1,0,1,0,1,1,0]]
        #                                    ==> [ 0.78535356, 0.46161965, 0.46475899, 0.4911481 ,
        #                                          0.50010929, 0.46688971, 0.49294333, 0.50015827,
        #                                          0.49985361, 0.50017294, 0.46700739, 0.49291816]
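        # the same advanced indexing in plain numpy (illustrative values only):
        #   a = np.array([[0.2, 0.8], [0.5, 0.5], [0.9, 0.1]])
        #   a[np.arange(3), [1, 0, 1]]  # -> array([0.8, 0.5, 0.1])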
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        # self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(
            inputs=[thX, thY, thStartPoints],
            outputs=[cost, prediction, py_x],
            updates=updates
            # mode="DebugMode"
        )

        costs = []
        n_batches = N // batch_sz
        sequenceLength = X.shape[1]

        startPoints = np.zeros(sequenceLength*batch_sz, dtype=np.int32)
        for b in range(batch_sz):
            startPoints[b*sequenceLength] = 1
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j+1)*batch_sz].reshape(sequenceLength*batch_sz, D)
                Ybatch = Y[j*batch_sz:(j+1)*batch_sz].reshape(sequenceLength*batch_sz).astype(np.int32)
                c, p, rout = self.train_op(Xbatch, Ybatch, startPoints)
                cost += c

                for b in range(batch_sz):
                    idx = sequenceLength*(b + 1) - 1
                    if p[idx] == Ybatch[idx]:
                        n_correct += 1
            print("shape y:", rout.shape)
            print("i:", i, "cost:", cost, "Classification rate:", (float(n_correct) / N))
            costs.append(cost)
            # if n_correct == N:
            #     break
        if show_fig:
            plt.plot(costs)
            plt.show()
Example No. 12
    def fit(self,
            X,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=500,
            show_fig=False):
        N = len(X)
        D = self.D
        M = self.M
        V = self.V

        # initial weights
        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        # z  = np.ones(M)
        Wxz = init_weight(D, M)
        Whz = init_weight(M, M)
        bz = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)

        thX, thY, py_x, prediction = self.set(We, Wx, Wh, bh, h0, Wxz, Whz, bz,
                                              Wo, bo, activation)

        lr = T.scalar('lr')

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - lr * g)
                   for p, dp, g in zip(self.params, dparams, grads)] + [
                       (dp, mu * dp - lr * g) for dp, g in zip(dparams, grads)
                   ]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY, lr],
                                        outputs=[cost, prediction],
                                        updates=updates)

        costs = []
        for i in range(epochs):
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0
            for j in range(N):
                if np.random.random() < 0.1:
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)

                # we set 0 to start and 1 to end
                c, p = self.train_op(input_sequence, output_sequence,
                                     learning_rate)
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total))
            if (i + 1) % 500 == 0:
                learning_rate /= 2
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Example No. 13
    def fit(self,
            X,
            learning_rate=1e-4,
            mu=0.99,
            epochs=10,
            batch_sz=100,
            show_fig=True,
            activation=T.nnet.relu,
            RecurrentUnit=LSTM):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')  # will represent multiple batches concatenated
        thY = T.ivector('Y')  # represents next word
        thStartPoints = T.ivector('start_points')

        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z, thStartPoints)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        # self.predict_op = theano.function(inputs=[thX, thStartPoints], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY, thStartPoints],
                                        outputs=[cost, prediction],
                                        updates=updates)

        costs = []
        n_batches = N // batch_sz
        for i in range(epochs):
            t0 = datetime.now()
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0

            for j in range(n_batches):
                # construct input sequence and output sequence as
                # concatenation of multiple input sequences and output sequences
                # input X should be a list of 2-D arrays or one 3-D array
                # N x T(n) x D - batch size x sequence length x num features
                # sequence length can be variable
                sequenceLengths = []
                input_sequence = []
                output_sequence = []
                for k in range(j * batch_sz, (j + 1) * batch_sz):
                    # don't always add the end token
                    if np.random.random() < 0.01 or len(X[k]) <= 1:
                        input_sequence += [0] + X[k]
                        output_sequence += X[k] + [1]
                        sequenceLengths.append(len(X[k]) + 1)
                    else:
                        input_sequence += [0] + X[k][:-1]
                        output_sequence += X[k]
                        sequenceLengths.append(len(X[k]))
                n_total += len(output_sequence)

                startPoints = np.zeros(len(output_sequence), dtype=np.int32)
                last = 0
                for length in sequenceLengths:
                    startPoints[last] = 1
                    last += length
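                # e.g., if this batch concatenates two sequences of lengths 3 and 2
                # (hypothetical), startPoints = [1, 0, 0, 1, 0]; the recurrent units
                # use these flags to reset the hidden state to h0 at sequence boundaries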

                c, p = self.train_op(input_sequence, output_sequence,
                                     startPoints)
                cost += c
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
                if j % 1 == 0:
                    sys.stdout.write(
                        "j/n_batches: %d/%d correct rate so far: %f\r" %
                        (j, n_batches, float(n_correct) / n_total))
                    sys.stdout.flush()
            print("i:", i, "cost:", cost, "correct rate:",
                  (float(n_correct) / n_total), "time for epoch:",
                  (datetime.now() - t0))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f  = activation

        # numpy init
        Wxi = init_weight(Mi, Mo)
        Whi = init_weight(Mo, Mo)
        Wci = init_weight(Mo, Mo)
        bi  = np.zeros(Mo)
        Wxf = init_weight(Mi, Mo)
        Whf = init_weight(Mo, Mo)
        Wcf = init_weight(Mo, Mo)
        bf  = np.zeros(Mo)
        Wxc = init_weight(Mi, Mo)
        Whc = init_weight(Mo, Mo)
        bc  = np.zeros(Mo)
        Wxo = init_weight(Mi, Mo)
        Who = init_weight(Mo, Mo)
        Wco = init_weight(Mo, Mo)
        bo  = np.zeros(Mo)
        c0  = np.zeros(Mo)
        h0  = np.zeros(Mo)

        # theano vars
        self.Wxi = theano.shared(Wxi)
        self.Whi = theano.shared(Whi)
        self.Wci = theano.shared(Wci)
        self.bi  = theano.shared(bi)
        self.Wxf = theano.shared(Wxf)
        self.Whf = theano.shared(Whf)
        self.Wcf = theano.shared(Wcf)
        self.bf  = theano.shared(bf)
        self.Wxc = theano.shared(Wxc)
        self.Whc = theano.shared(Whc)
        self.bc  = theano.shared(bc)
        self.Wxo = theano.shared(Wxo)
        self.Who = theano.shared(Who)
        self.Wco = theano.shared(Wco)
        self.bo  = theano.shared(bo)
        self.c0  = theano.shared(c0)
        self.h0  = theano.shared(h0)
        self.params = [
            self.Wxi,
            self.Whi,
            self.Wci,
            self.bi,
            self.Wxf,
            self.Whf,
            self.Wcf,
            self.bf,
            self.Wxc,
            self.Whc,
            self.bc,
            self.Wxo,
            self.Who,
            self.Wco,
            self.bo,
            self.c0,
            self.h0,
        ]
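        # A minimal sketch of the peephole-LSTM step these parameters would support
        # (an assumption -- the class's actual recurrence/output methods are defined
        # elsewhere in this example):
        #     i_t = sigmoid(x_t.dot(Wxi) + h_t1.dot(Whi) + c_t1.dot(Wci) + bi)   # input gate
        #     f_t = sigmoid(x_t.dot(Wxf) + h_t1.dot(Whf) + c_t1.dot(Wcf) + bf)   # forget gate
        #     c_t = f_t * c_t1 + i_t * tanh(x_t.dot(Wxc) + h_t1.dot(Whc) + bc)   # cell state
        #     o_t = sigmoid(x_t.dot(Wxo) + h_t1.dot(Who) + c_t.dot(Wco) + bo)    # output gate
        #     h_t = o_t * tanh(c_t)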
Example No. 15
  def fit(self, X, Y, batch_sz=20, learning_rate=10e-1, mu=0.99, activation=tf.nn.sigmoid, epochs=100, show_fig=False):
    N, T, D = X.shape # X is of size N x T(n) x D
    K = len(set(Y.flatten()))
    M = self.M
    self.f = activation

    # initial weights
    # note: Wx, Wh, bh are all part of the RNN unit and will be created
    #       by BasicRNNCell
    Wo = init_weight(M, K).astype(np.float32)
    bo = np.zeros(K, dtype=np.float32)

    # make them tf variables
    self.Wo = tf.Variable(Wo)
    self.bo = tf.Variable(bo)

    # tf Graph input
    tfX = tf.placeholder(tf.float32, shape=(batch_sz, T, D), name='inputs')
    tfY = tf.placeholder(tf.int64, shape=(batch_sz, T), name='targets')

    # turn tfX into a sequence, e.g. T tensors all of size (batch_sz, D)
    sequenceX = x2sequence(tfX, T, D, batch_sz)
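    # x2sequence is presumably a helper that unstacks tfX into a length-T list of
    # (batch_sz, D) tensors, roughly (an assumption):
    #     def x2sequence(x, T, D, batch_sz):
    #         x = tf.transpose(x, (1, 0, 2))        # (T, batch_sz, D)
    #         x = tf.reshape(x, (T * batch_sz, D))
    #         return tf.split(x, T)                 # list of T tensors, each (batch_sz, D)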

    # create the simple rnn unit
    rnn_unit = BasicRNNCell(num_units=self.M, activation=self.f)

    # Get rnn cell output
    # outputs, states = rnn_module.rnn(rnn_unit, sequenceX, dtype=tf.float32)
    outputs, states = get_rnn_output(rnn_unit, sequenceX, dtype=tf.float32)

    # outputs are now of size (T, batch_sz, M)
    # so make it (batch_sz, T, M)
    outputs = tf.transpose(outputs, (1, 0, 2))
    outputs = tf.reshape(outputs, (T*batch_sz, M))

    # Linear activation, using rnn inner loop last output
    logits = tf.matmul(outputs, self.Wo) + self.bo
    predict_op = tf.argmax(logits, 1)
    targets = tf.reshape(tfY, (T*batch_sz,))

    cost_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=targets))
    train_op = tf.train.MomentumOptimizer(learning_rate, momentum=mu).minimize(cost_op)

    costs = []
    n_batches = N // batch_sz
    
    init = tf.initialize_all_variables()
    with tf.Session() as session:
      session.run(init)
      for i in range(epochs):
        X, Y = shuffle(X, Y)
        n_correct = 0
        cost = 0
        for j in range(n_batches):
          Xbatch = X[j*batch_sz:(j+1)*batch_sz]
          Ybatch = Y[j*batch_sz:(j+1)*batch_sz]
          
          _, c, p = session.run([train_op, cost_op, predict_op], feed_dict={tfX: Xbatch, tfY: Ybatch})
          cost += c
          for b in range(batch_sz):
            idx = (b + 1)*T - 1
            n_correct += (p[idx] == Ybatch[b][-1])
        if i % 10 == 0:
          print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
        if n_correct == N:
          print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
          break
        costs.append(cost)

    if show_fig:
      plt.plot(costs)
      plt.show()
    def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = tf.Variable(We.astype(np.float32))
        self.W11 = tf.Variable(W11.astype(np.float32))
        self.W22 = tf.Variable(W22.astype(np.float32))
        self.W12 = tf.Variable(W12.astype(np.float32))
        self.W1 = tf.Variable(W1.astype(np.float32))
        self.W2 = tf.Variable(W2.astype(np.float32))
        self.bh = tf.Variable(bh.astype(np.float32))
        self.Wo = tf.Variable(Wo.astype(np.float32))
        self.bo = tf.Variable(bo.astype(np.float32))
        self.weights = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.Wo]


        words = tf.placeholder(tf.int32, shape=(None,), name='words')
        left_children = tf.placeholder(tf.int32, shape=(None,), name='left_children')
        right_children = tf.placeholder(tf.int32, shape=(None,), name='right_children')
        labels = tf.placeholder(tf.int32, shape=(None,), name='labels')

        # save for later
        self.words = words
        self.left = left_children
        self.right = right_children
        self.labels = labels

        def dot1(a, B):
            return tf.tensordot(a, B, axes=[[0], [1]])

        def dot2(B, a):
            return tf.tensordot(B, a, axes=[[1], [0]])

        def recursive_net_transform(hiddens, n):
            h_left = hiddens.read(left_children[n])
            h_right = hiddens.read(right_children[n])
            return self.f(
                dot1(h_left, dot2(self.W11, h_left)) +
                dot1(h_right, dot2(self.W22, h_right)) +
                dot1(h_left, dot2(self.W12, h_right)) +
                dot1(h_left, self.W1) +
                dot1(h_right, self.W2) +
                self.bh
            )


        def recurrence(hiddens, n):
            w = words[n]
            # any non-word will have index -1

            h_n = tf.cond(
                w >= 0,
                lambda: tf.nn.embedding_lookup(self.We, w),
                lambda: recursive_net_transform(hiddens, n)
            )
            hiddens = hiddens.write(n, h_n)
            n = tf.add(n, 1)
            return hiddens, n


        def condition(hiddens, n):
            # loop should continue while n < len(words)
            return tf.less(n, tf.shape(words)[0])


        hiddens = tf.TensorArray(
            tf.float32,
            size=0,
            dynamic_size=True,
            clear_after_read=False,
            infer_shape=False
        )

        hiddens, _ = tf.while_loop(
            condition,
            recurrence,
            [hiddens, tf.constant(0)],
            parallel_iterations=1
        )
        h = hiddens.stack()
        logits = tf.matmul(h, self.Wo) + self.bo

        prediction_op = tf.argmax(logits, axis=1)
        self.prediction_op = prediction_op
        
        rcost = reg*sum(tf.nn.l2_loss(p) for p in self.weights)
        if train_inner_nodes:
            # filter out -1s
            labeled_indices = tf.where(labels >= 0)

            cost_op = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=tf.gather(logits, labeled_indices),
                    labels=tf.gather(labels, labeled_indices),
                )
            ) + rcost
        else:
            cost_op = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits[-1],
                    labels=labels[-1],
                )
            ) + rcost

        train_op = tf.train.AdagradOptimizer(learning_rate=8e-3).minimize(cost_op)
        # train_op = tf.train.MomentumOptimizer(learning_rate=8e-3, momentum=0.9).minimize(cost_op)

        # NOTE: If you're using GPU, InteractiveSession breaks
        # AdagradOptimizer and some other optimizers
        # change to tf.Session() if so.
        self.session = tf.Session()
        init_op = tf.global_variables_initializer()
        self.session.run(init_op)


        costs = []
        sequence_indexes = list(range(N))
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            n_total = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words_, left, right, lab = trees[j]
                # print("words_:", words_)
                # print("lab:", lab)
                c, p, _ = self.session.run(
                    (cost_op, prediction_op, train_op),
                    feed_dict={
                        words: words_,
                        left_children: left,
                        right_children: right,
                        labels: lab
                    }
                )
                if np.isnan(c):
                    print("Cost is nan! Let's stop here. \
                        Why don't you try decreasing the learning rate?")
                    for w in self.weights:
                        print(self.session.run(tf.reduce_sum(w)))
                    exit()
                cost += c
                n_correct += (p[-1] == lab[-1])
                n_total += 1

                it += 1
                if it % 10 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                        (it, N, float(n_correct)/n_total, cost)
                    )
                    sys.stdout.flush()


            # calculate the test score
            n_test_correct = 0
            n_test_total = 0
            for words_, left, right, lab in test_trees:
                p = self.session.run(prediction_op, feed_dict={
                    words: words_,
                    left_children: left,
                    right_children: right,
                    labels: lab
                })
                n_test_correct += (p[-1] == lab[-1])
                n_test_total += 1


            print(
                "i:", i, "cost:", cost,
                "train acc:", float(n_correct)/n_total,
                "test acc:", float(n_test_correct)/n_test_total,
                "time for epoch:", (datetime.now() - t0)
            )
            costs.append(cost)

        plt.plot(costs)
        plt.show()
    def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):
        N = len(X)
        D = self.D
        M = self.M
        V = self.V
        self.f = activation

        # initial weights
        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)

        # make them theano shared
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.ivector('X')
        Ei = self.We[thX] # will be a TxD matrix
        thY = T.ivector('Y')

        # sentence input:
        # [START, w1, w2, ..., wn]
        # sentence target:
        # [w1,    w2, w3, ..., END]

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=Ei,
            n_steps=Ei.shape[0],
        )

        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        n_total = sum((len(sentence)+1) for sentence in X)
        for i in range(epochs):
            X = shuffle(X)
            n_correct = 0
            cost = 0
            for j in range(N):
                # problem! many words --> END token are overrepresented
                # result: generated lines will be very short
                # we will try to fix in a later iteration
                # BAD! magic numbers 0 and 1...
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]

                # we set 0 to start and 1 to end
                c, p = self.train_op(input_sequence, output_sequence)
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
    def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):
        N = len(X)
        D = self.D
        M = self.M
        V = self.V

        # initial weights
        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        # z  = np.ones(M)
        Wxz = init_weight(D, M)
        Whz = init_weight(M, M)
        bz  = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)

        thX, thY, py_x, prediction = self.set(We, Wx, Wh, bh, h0, Wxz, Whz, bz, Wo, bo, activation)

        lr = T.scalar('lr')

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - lr*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - lr*g) for dp, g in zip(dparams, grads)
        ]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(
            inputs=[thX, thY, lr],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        for i in range(epochs):
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0
            for j in range(N):
                if np.random.random() < 0.1:
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)

                # we set 0 to start and 1 to end
                c, p = self.train_op(input_sequence, output_sequence, learning_rate)
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total))
            if (i + 1) % 500 == 0:
                learning_rate /= 2
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
	def fit(self, X, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):
		N = len(X)
		D = self.D
		M = self.M
		V = self.V
		self.f = activation

		# initial weights
		We = init_weight(V, D)
		Wx = init_weight(D, M)
		Wh = init_weight(M, M)
		bh = np.zeros(M)
		h0 = np.zeros(M)
		Wo = init_weight(M, V)
		bo = np.zeros(V)

		self.We = theano.shared(We)
		self.Wx = theano.shared(Wx)
		self.Wh = theano.shared(Wh)
		self.bh = theano.shared(bh)
		self.h0 = theano.shared(h0)
		self.Wo = theano.shared(Wo)
		self.bo = theano.shared(bo)
		self.params = [self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

		thX = T.ivector('X')
		Ei = self.We[thX] # T x D
		thY = T.ivector('Y')

		def recurrence(x_t, h_t1):
			# returns h(t), y(t)
			h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
			y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
			return h_t, y_t

		[h, y], _ = theano.scan(
			fn=recurrence,
			outputs_info=[self.h0, None],
			sequences=Ei,
			n_steps=Ei.shape[0],
		)

		py_x = y[:, 0, :]
		prediction = T.argmax(py_x, axis=1)

		cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
		grads = T.grad(cost, self.params)
		dparams = [theano.shared(p.get_value()*0) for p in self.params]

		updates = [
			(p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
		] + [
			(dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
		]

		self.predict_op = theano.function(inputs=[thX], outputs=prediction)
		self.train_op = theano.function(
			inputs=[thX, thY],
			outputs=[cost, prediction],
			updates=updates,
		)

		costs = []
		n_total = sum((len(sentence) + 1) for sentence in X)
		for i in range(epochs):
			X = shuffle(X)
			n_correct = 0
			cost = 0
			for j in range(N):
				input_sequence = [0] + X[j]
				output_sequence = X[j] + [1]

				c, p = self.train_op(input_sequence, output_sequence)
				cost += c
				for pj, xj in zip(p, output_sequence):
					if pj == xj:
						n_correct += 1
			print("i:", i, "cost:", cost, "correct rate:", (float(n_correct) / n_total))
			costs.append(cost)

		if show_fig:
			plt.plot(costs)
			plt.show()
    def fit(self, X, Y, learning_rate=1e-2, mu=0.99, reg=1e-12, epochs=400, batch_sz=20, print_period=1, show_fig=False):

        # X = X.astype(np.float32)
        Y = Y.astype(np.int32)

        # initialize hidden layers
        N, D = X.shape
        K = len(set(Y))
        self.hidden_layers = []
        M1 = D
        count = 0
        for M2 in self.hidden_layer_sizes:
            h = HiddenLayer(M1, M2, count)
            self.hidden_layers.append(h)
            M1 = M2
            count += 1
        W = init_weight(M1, K)
        b = np.zeros(K)
        self.W = theano.shared(W, 'W_logreg')
        self.b = theano.shared(b, 'b_logreg')

        # collect params for later use
        self.params = [self.W, self.b]
        for h in self.hidden_layers:
            self.params += h.params

        # for momentum
        dparams = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]

        # for rmsprop
        cache = [theano.shared(np.zeros(p.get_value().shape)) for p in self.params]

        # set up theano functions and variables
        thX = T.matrix('X')
        thY = T.ivector('Y')
        pY = self.forward(thX)

        rcost = reg*T.sum([(p*p).sum() for p in self.params])
        cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
        prediction = self.predict(thX)
        grads = T.grad(cost, self.params)

        # momentum only
        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
        )

        n_batches = N // batch_sz
        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
                Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

                c, p = train_op(Xbatch, Ybatch)

                if j % print_period == 0:
                    costs.append(c)
                    e = np.mean(Ybatch != p)
                    print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)
        
        if show_fig:
            plt.plot(costs)
            plt.show()
Example No. 21
    def fit(self,
            trees,
            test_trees,
            reg=1e-3,
            epochs=8,
            train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.W11 = theano.shared(W11)
        self.W22 = theano.shared(W22)
        self.W12 = theano.shared(W12)
        self.W1 = theano.shared(W1)
        self.W2 = theano.shared(W2)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [
            self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh,
            self.Wo, self.bo
        ]

        lr = T.scalar('learning_rate')
        words = T.ivector('words')
        left_children = T.ivector('left_children')
        right_children = T.ivector('right_children')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, left, right):
            w = words[n]
            # any non-word will have index -1
            hiddens = T.switch(
                T.ge(w, 0), T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(
                    hiddens[n],
                    self.
                    f(hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) +
                      hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) +
                      hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) +
                      hiddens[left[n]].dot(self.W1) +
                      hiddens[right[n]].dot(self.W2) + self.bh)))
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, left_children, right_children],
        )

        py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)

        rcost = reg * T.sum([(p * p).sum() for p in self.params])
        if train_inner_nodes:
            relevant_labels = labels[labels >= 0]
            cost = -T.mean(T.log(py_x[labels >= 0, relevant_labels])) + rcost
        else:
            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost

        updates = adagrad(cost, self.params, lr)
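        # adagrad is presumably a helper that returns Theano update pairs, roughly
        # (an assumption):
        #     grads = T.grad(cost, params)
        #     caches = [theano.shared(p.get_value() * 0) for p in params]
        #     updates = []
        #     for p, c, g in zip(params, caches, grads):
        #         new_c = c + g * g
        #         updates += [(c, new_c), (p, p - lr * g / T.sqrt(new_c + 1e-10))]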

        self.cost_predict_op = theano.function(
            inputs=[words, left_children, right_children, labels],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[words, left_children, right_children, labels, lr],
            outputs=[cost, prediction],
            updates=updates)

        lr_ = 8e-3  # initial learning rate
        costs = []
        sequence_indexes = range(N)
        # if train_inner_nodes:
        #     n_total = sum(len(words) for words, _, _, _ in trees)
        # else:
        #     n_total = N
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            n_total = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words, left, right, lab = trees[j]
                c, p = self.train_op(words, left, right, lab, lr_)
                if np.isnan(c):
                    print("Cost is nan! Let's stop here. \
                        Why don't you try decreasing the learning rate?")
                    for p in self.params:
                        print(p.get_value().sum())
                    exit()
                cost += c
                n_correct += (p[-1] == lab[-1])
                n_total += 1
                it += 1
                if it % 10 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                        % (it, N, float(n_correct) / n_total, cost))
                    sys.stdout.flush()

            # calculate the test score
            n_test_correct = 0
            n_test_total = 0
            for words, left, right, lab in test_trees:
                _, p = self.cost_predict_op(words, left, right, lab)
                n_test_correct += (p[-1] == lab[-1])
                n_test_total += 1

            print("i:", i, "cost:", cost, "train acc:",
                  float(n_correct) / n_total, "test acc:",
                  float(n_test_correct) / n_test_total, "time for epoch:",
                  (datetime.now() - t0))
            costs.append(cost)

        plt.plot(costs)
        plt.show()
Example No. 22
    def fit(self, X, Y, learning_rate=0.01, mu=0.99, epochs=30, batch_sz=100):
        N, D = X.shape
        K = len(set(Y))

        self.hidden_layers = []
        mi = D
        for mo in self.hidden_layer_sizes:
            h = HiddenLayer(mi, mo)
            self.hidden_layers.append(h)
            mi = mo

        # initialize logistic regression layer
        W = init_weight(mo, K)
        b = np.zeros(K)
        self.W = theano.shared(W)
        self.b = theano.shared(b)

        self.params = [self.W, self.b]
        self.allWs = []
        for h in self.hidden_layers:
            self.params += h.params
            self.allWs.append(h.W)
        self.allWs.append(self.W)

        X_in = T.matrix('X_in')
        targets = T.ivector('Targets')
        pY = self.forward(X_in)

        cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))
        prediction = self.predict(X_in)
        # cost_predict_op = theano.function(
        #     inputs=[X_in, targets],
        #     outputs=[cost, prediction],
        # )

        dparams = [theano.shared(p.get_value() * 0) for p in self.params]
        grads = T.grad(cost, self.params)

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]
        train_op = theano.function(
            inputs=[X_in, targets],
            outputs=[cost, prediction],
            updates=updates,
        )

        n_batches = N // batch_sz
        costs = []
        lastWs = [W.get_value() for W in self.allWs]
        W_changes = []
        print "supervised training..."
        for i in xrange(epochs):
            print "epoch:", i
            X, Y = shuffle(X, Y)
            for j in xrange(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]
                c, p = train_op(Xbatch, Ybatch)
                if j % 100 == 0:
                    print "j / n_batches:", j, "/", n_batches, "cost:", c, "error:", error_rate(
                        p, Ybatch)
                costs.append(c)

                # log changes in all Ws
                W_change = [
                    np.abs(W.get_value() - lastW).mean()
                    for W, lastW in zip(self.allWs, lastWs)
                ]
                W_changes.append(W_change)
                lastWs = [W.get_value() for W in self.allWs]

        W_changes = np.array(W_changes)
        plt.subplot(2, 1, 1)
        for i in range(W_changes.shape[1]):
            plt.plot(W_changes[:, i], label='layer %s' % i)
        plt.legend()
        # plt.show()

        plt.subplot(2, 1, 2)
        plt.plot(costs)
        plt.show()
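# The loop above reports error_rate(p, Ybatch), a helper that is not shown
# in this excerpt. A minimal sketch of the assumed behaviour: the fraction
# of predictions that differ from the targets.
import numpy as np

def error_rate(predictions, targets):
    return np.mean(predictions != targets)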
    def fit(self, X, Y, batch_sz=20, learning_rate=1.0, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False):
        D = X[0].shape[1] # X is of size N x T(n) x D
        K = len(set(Y.flatten()))
        N = len(Y)
        M = self.M
        self.f = activation

        # initial weights
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        # make them theano shared
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.fmatrix('X') # will represent multiple batches concatenated
        thY = T.ivector('Y')
        thStartPoints = T.ivector('start_points')

        XW = thX.dot(self.Wx)

        # startPoints will contain 1 where a sequence starts and 0 otherwise
        # Ex. if I have 3 sequences: [[1,2,3], [4,5], [6,7,8]]
        # Then I will concatenate these into one X: [1,2,3,4,5,6,7,8]
        # And startPoints will be [1,0,0,1,0,1,0,0]

        # One possible solution: loop through index
        # def recurrence(t, h_t1, XW, h0, startPoints):
        #     # returns h(t)

        #     # if at a boundary, state should be h0
        #     h_t = T.switch(
        #         T.eq(startPoints[t], 1),
        #         self.f(XW[t] + h0.dot(self.Wh) + self.bh),
        #         self.f(XW[t] + h_t1.dot(self.Wh) + self.bh)
        #     )
        #     return h_t

        # h, _ = theano.scan(
        #     fn=recurrence,
        #     outputs_info=[self.h0],
        #     sequences=T.arange(XW.shape[0]),
        #     non_sequences=[XW, self.h0, thStartPoints],
        #     n_steps=XW.shape[0],
        # )

        # other solution - loop through all sequences simultaneously
        def recurrence(xw_t, is_start, h_t1, h0):
            # if at a boundary, state should be h0
            h_t = T.switch(
                T.eq(is_start, 1),
                self.f(xw_t + h0.dot(self.Wh) + self.bh),
                self.f(xw_t + h_t1.dot(self.Wh) + self.bh)
            )
            return h_t

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0],
            sequences=[XW, thStartPoints],
            non_sequences=[self.h0],
            n_steps=XW.shape[0],
        )

        # h is of shape (T*batch_sz, M)
        py_x = T.nnet.softmax(h.dot(self.Wo) + self.bo)
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        # self.predict_op = theano.function(inputs=[thX, thStartPoints], outputs=prediction)
        self.train_op = theano.function(
            inputs=[thX, thY, thStartPoints],
            outputs=[cost, prediction, py_x],
            updates=updates
        )

        costs = []
        n_batches = N // batch_sz
        sequenceLength = X.shape[1]

        # if each sequence was of variable length, we would need to
        # initialize this inside the loop for every new batch
        startPoints = np.zeros(sequenceLength*batch_sz, dtype=np.int32)
        for b in range(batch_sz):
            startPoints[b*sequenceLength] = 1
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(n_batches):
                Xbatch = X[j*batch_sz:(j+1)*batch_sz].reshape(sequenceLength*batch_sz, D)
                Ybatch = Y[j*batch_sz:(j+1)*batch_sz].reshape(sequenceLength*batch_sz).astype(np.int32)
                c, p, rout = self.train_op(Xbatch, Ybatch, startPoints)
                # print "p:", p
                cost += c
                # P = p.reshape(batch_sz, sequenceLength)
                for b in range(batch_sz):
                    idx = sequenceLength*(b + 1) - 1
                    if p[idx] == Ybatch[idx]:
                        n_correct += 1
                    # else:
                        # print "pred:", p[idx], "actual:", Ybatch[idx]
            if i % 10 == 0:
                print("shape y:", rout.shape)
                print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
            if n_correct == N:
                print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
                break
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
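# A small numpy illustration of the startPoints encoding described in the
# comments inside the fit above: variable-length sequences are concatenated
# into one long input, and startPoints marks where each sequence begins so
# the scan can reset the hidden state to h0 at those positions.
import numpy as np

sequences = [[1, 2, 3], [4, 5], [6, 7, 8]]
X_concat = np.concatenate([np.array(s) for s in sequences])
startPoints = np.zeros(len(X_concat), dtype=np.int32)
pos = 0
for s in sequences:
    startPoints[pos] = 1
    pos += len(s)
print(X_concat)     # [1 2 3 4 5 6 7 8]
print(startPoints)  # [1 0 0 1 0 1 0 0]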
Ejemplo n.º 24
    def fit(self, X, learning_rate=1e-5, mu=0.99, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=True):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')
        thY = T.ivector('Y')

        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        # let's return py_x too so we can draw a sample instead
        self.predict_op = theano.function(
            inputs=[thX],
            outputs=[py_x, prediction],
            allow_input_downcast=True,
        )
        
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        dWe = theano.shared(self.We.get_value()*0)
        gWe = T.grad(cost, self.We)
        dWe_update = mu*dWe - learning_rate*gWe
        We_update = self.We + dWe_update
        if normalize:
            We_update /= We_update.norm(2)

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ] + [
            (self.We, We_update), (dWe, dWe_update)
        ]

        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        for i in range(epochs):
            t0 = datetime.now()
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0
            for j in range(N):
                if np.random.random() < 0.01 or len(X[j]) <= 1:
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)

                # test:
                
                try:
                    # we set 0 to start and 1 to end
                    c, p = self.train_op(input_sequence, output_sequence)
                except Exception as e:
                    PYX, pred = self.predict_op(input_sequence)
                    print("input_sequence len:", len(input_sequence))
                    print("PYX.shape:",PYX.shape)
                    print("pred.shape:", pred.shape)
                    raise e
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
                if j % 200 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f\r" % (j, N, float(n_correct)/n_total))
                    sys.stdout.flush()
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
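# predict_op above returns py_x as well as the argmax so that the next word
# can be sampled from the distribution instead of always taking the mode.
# A minimal sketch of that sampling step, assuming `rnn` is a trained
# instance of the class this fit method belongs to:
import numpy as np

def sample_next_word(rnn, input_sequence):
    py_x, _ = rnn.predict_op(input_sequence)
    p_last = py_x[-1]  # distribution over the V words at the last position
    return np.random.choice(len(p_last), p=p_last)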
    def fit(self, trees, learning_rate=10e-4, mu=0.5, reg=10e-3, eps=10e-3, epochs=20, activation=T.tanh, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        self.f = activation
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.W11 = theano.shared(W11)
        self.W22 = theano.shared(W22)
        self.W12 = theano.shared(W12)
        self.W1 = theano.shared(W1)
        self.W2 = theano.shared(W2)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo]

        words = T.ivector('words')
        left_children = T.ivector('left_children')
        right_children = T.ivector('right_children')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, left, right):
            w = words[n]
            # any non-word will have index -1
            hiddens = T.switch(
                T.ge(w, 0),
                T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(hiddens[n],
                    self.f(
                        hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) +
                        hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) +
                        hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) +
                        hiddens[left[n]].dot(self.W1) +
                        hiddens[right[n]].dot(self.W2) +
                        self.bh
                    )
                )
            )
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, left_children, right_children],
        )

        py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        
        rcost = reg*T.mean([(p*p).sum() for p in self.params])
        if train_inner_nodes:
            cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost
        else:
            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
        grads = T.grad(cost, self.params)
        # dparams = [theano.shared(p.get_value()*0) for p in self.params]
        cache = [theano.shared(p.get_value()*0) for p in self.params]

        # momentum
        # updates = [
        #     (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        # ] + [
        #     (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        # ]
        updates = [
            (c, c + g*g) for c, g in zip(cache, grads)
        ] + [
            (p, p - learning_rate*g / T.sqrt(c + eps)) for p, c, g in zip(self.params, cache, grads)
        ]

        self.cost_predict_op = theano.function(
            inputs=[words, left_children, right_children, labels],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[words, left_children, right_children, labels],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        sequence_indexes = range(N)
        if train_inner_nodes:
            n_total = sum(len(words) for words, _, _, _ in trees)
        else:
            n_total = N
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words, left, right, lab = trees[j]
                c, p = self.train_op(words, left, right, lab)
                if np.isnan(c):
                    print "Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?"
                    exit()
                cost += c
                if train_inner_nodes:
                    n_correct += np.sum(p == lab)
                else:
                    n_correct += (p[-1] == lab[-1])
                it += 1
                if it % 1 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct)/n_total, cost))
                    sys.stdout.flush()
            print "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)
            costs.append(cost)

        plt.plot(costs)
        plt.show()
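# Sketch of how a single parse tree is encoded for the recursive net above.
# Nodes are listed children-before-parent with the root last (the code reads
# the root prediction from h[-1] and the root label from labels[-1]); the
# left/right entries are only read for non-word nodes. Word indices below
# are hypothetical; internal (non-word) nodes use index -1.
import numpy as np

words          = np.array([17, 342, -1], dtype=np.int32)  # leaf, leaf, internal root
left_children  = np.array([-1,  -1,  0], dtype=np.int32)  # the root's left child is node 0
right_children = np.array([-1,  -1,  1], dtype=np.int32)  # the root's right child is node 1
labels         = np.array([-1,  -1,  4], dtype=np.int32)  # only the root is labeled in this sketch
# model.train_op(words, left_children, right_children, labels)  # hypothetical trained model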
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f = activation

        # input gate
        Wxi = init_weight(Mi, Mo)
        Whi = init_weight(Mo, Mo)
        Wci = init_weight(Mo, Mo)
        bi = np.zeros(Mo)

        # forget gate
        Wxf = init_weight(Mi, Mo)
        Whf = init_weight(Mo, Mo)
        Wcf = init_weight(Mo, Mo)
        bf = np.zeros(Mo)

        # candidate cell
        Wxc = init_weight(Mi, Mo)
        Whc = init_weight(Mo, Mo)
        bc = np.zeros(Mo)

        # output gate
        Wxo = init_weight(Mi, Mo)
        Who = init_weight(Mo, Mo)
        Wco = init_weight(Mo, Mo)
        bo = np.zeros(Mo)

        # initial state of h and c
        h0 = np.zeros(Mo)
        c0 = np.zeros(Mo)  # is this the right size? (yes: h and c both have Mo units)

        # initialize in theano
        # input gate
        self.Wxi = theano.shared(Wxi)
        self.Whi = theano.shared(Whi)
        self.Wci = theano.shared(Wci)
        self.bi = theano.shared(bi)

        # forget gate
        self.Wxf = theano.shared(Wxf)
        self.Whf = theano.shared(Whf)
        self.Wcf = theano.shared(Wcf)
        self.bf = theano.shared(bf)

        # candidate gate
        self.Wxc = theano.shared(Wxc)
        self.Whc = theano.shared(Whc)
        self.bc = theano.shared(bc)

        # output gate
        self.Wxo = theano.shared(Wxo)
        self.Who = theano.shared(Who)
        self.Wco = theano.shared(Wco)
        self.bo = theano.shared(bo)

        # initial states
        self.h0 = theano.shared(h0)
        self.c0 = theano.shared(c0)

        # list for grad update
        self.params = [
            self.Wxi, self.Whi, self.Wci, self.bi, self.Wxf, self.Whf,
            self.Wcf, self.bf, self.Wxc, self.Whc, self.bc, self.Wxo, self.Who,
            self.Wco, self.bo, self.h0, self.c0
        ]
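# The parameters initialized above are the ones a peephole-style LSTM step
# consumes. A minimal sketch of that recurrence (one time step), written as
# a method against the shared variables above; the surrounding class is
# assumed to wire it up with theano.scan.
import theano.tensor as T

def lstm_step(self, x_t, h_t1, c_t1):
    i_t = T.nnet.sigmoid(x_t.dot(self.Wxi) + h_t1.dot(self.Whi) + c_t1.dot(self.Wci) + self.bi)
    f_t = T.nnet.sigmoid(x_t.dot(self.Wxf) + h_t1.dot(self.Whf) + c_t1.dot(self.Wcf) + self.bf)
    c_t = f_t * c_t1 + i_t * T.tanh(x_t.dot(self.Wxc) + h_t1.dot(self.Whc) + self.bc)
    o_t = T.nnet.sigmoid(x_t.dot(self.Wxo) + h_t1.dot(self.Who) + c_t.dot(self.Wco) + self.bo)
    h_t = o_t * T.tanh(c_t)
    return h_t, c_t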
    def fit(self, trees, test_trees, reg=1e-3, epochs=8, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.W11 = theano.shared(W11)
        self.W22 = theano.shared(W22)
        self.W12 = theano.shared(W12)
        self.W1 = theano.shared(W1)
        self.W2 = theano.shared(W2)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo]

        lr = T.scalar('learning_rate')
        words = T.ivector('words')
        left_children = T.ivector('left_children')
        right_children = T.ivector('right_children')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, left, right):
            w = words[n]
            # any non-word will have index -1
            hiddens = T.switch(
                T.ge(w, 0),
                T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(hiddens[n],
                    self.f(
                        hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) +
                        hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) +
                        hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) +
                        hiddens[left[n]].dot(self.W1) +
                        hiddens[right[n]].dot(self.W2) +
                        self.bh
                    )
                )
            )
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, left_children, right_children],
        )

        py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        
        rcost = reg*T.sum([(p*p).sum() for p in self.params])
        if train_inner_nodes:
            relevant_labels = labels[labels >= 0]
            cost = -T.mean(T.log(py_x[labels >= 0, relevant_labels])) + rcost
        else:
            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
        

        updates = adagrad(cost, self.params, lr)

        self.cost_predict_op = theano.function(
            inputs=[words, left_children, right_children, labels],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[words, left_children, right_children, labels, lr],
            outputs=[cost, prediction],
            updates=updates
        )

        lr_ = 8e-3 # initial learning rate
        costs = []
        sequence_indexes = range(N)
        # if train_inner_nodes:
        #     n_total = sum(len(words) for words, _, _, _ in trees)
        # else:
        #     n_total = N
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            n_total = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words, left, right, lab = trees[j]
                c, p = self.train_op(words, left, right, lab, lr_)
                if np.isnan(c):
                    print("Cost is nan! Let's stop here. \
                        Why don't you try decreasing the learning rate?")
                    for p in self.params:
                        print(p.get_value().sum())
                    exit()
                cost += c
                n_correct += (p[-1] == lab[-1])
                n_total += 1
                it += 1
                if it % 10 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                        (it, N, float(n_correct)/n_total, cost)
                    )
                    sys.stdout.flush()

            # calculate the test score
            n_test_correct = 0
            n_test_total = 0
            for words, left, right, lab in test_trees:
                _, p = self.cost_predict_op(words, left, right, lab)
                n_test_correct += (p[-1] == lab[-1])
                n_test_total += 1

            print(
                "i:", i, "cost:", cost,
                "train acc:", float(n_correct)/n_total,
                "test acc:", float(n_test_correct)/n_test_total,
                "time for epoch:", (datetime.now() - t0)
            )
            costs.append(cost)

        plt.plot(costs)
        plt.show()
    def fit(self,
            X,
            Y,
            learning_rate=1.0,
            mu=0.99,
            reg=1.0,
            activation=tf.tanh,
            epochs=100,
            show_fig=False):
        N, T, D = X.shape
        K = len(set(Y.flatten()))
        M = self.M
        self.f = activation

        # initial weights, pay attention to the shape!
        Wx = init_weight(D, M).astype(np.float32)
        Wh = init_weight(M, M).astype(np.float32)
        bh = np.zeros(M, dtype=np.float32)
        h0 = np.zeros(M, dtype=np.float32)
        Wo = init_weight(M, K).astype(np.float32)
        bo = np.zeros(K, dtype=np.float32)

        self.Wx = tf.Variable(Wx)
        self.Wh = tf.Variable(Wh)
        self.bh = tf.Variable(bh)
        self.h0 = tf.Variable(h0)
        self.Wo = tf.Variable(Wo)
        self.bo = tf.Variable(bo)

        tfX = tf.placeholder(tf.float32, shape=(T, D), name='X')
        tfY = tf.placeholder(tf.int32, shape=(T, ), name='Y')

        XWx = tf.matmul(tfX, self.Wx)

        def recurrence(h_t1, xw_t):
            # matmul() only works with 2-D objects
            # we want to return a 1-D object of size M
            # so that the final result is T x M, not T x 1 x M!
            h_t = self.f(xw_t + tf.matmul(tf.reshape(h_t1, (1, M)), self.Wh) +
                         self.bh)
            return tf.reshape(h_t, (M, ))

        h = tf.scan(
            fn=recurrence,
            elems=XWx,
            initializer=self.h0,
        )

        logits = tf.matmul(h, self.Wo) + self.bo

        cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tfY,
                logits=logits,
            ))

        predict_op = tf.argmax(logits, 1)
        train_op = tf.train.AdamOptimizer(1e-2).minimize(cost)

        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)

            costs = []
            for i in range(epochs):
                X, Y = shuffle(X, Y)
                n_correct = 0
                batch_cost = 0
                for j in range(N):
                    _, c, p = session.run([train_op, cost, predict_op],
                                          feed_dict={
                                              tfX: X[j].reshape(T, D),
                                              tfY: Y[j]
                                          })
                    batch_cost += c
                    if p[-1] == Y[j, -1]:
                        n_correct += 1
                print("i:", i, "cost:", batch_cost, "classification rate:",
                      (float(n_correct) / N))
                costs.append(batch_cost)
                if n_correct == N:
                    break

        if show_fig:
            plt.plot(costs)
            plt.show()
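# tf.scan, used above to unroll the recurrence over time, applies a function
# along the first axis of `elems` while threading an accumulator. A toy
# illustration with a running sum, using the same TF1-style session API as
# the code above:
import numpy as np
import tensorflow as tf

elems = tf.constant(np.array([1., 2., 3., 4.], dtype=np.float32))
running_sum = tf.scan(fn=lambda acc, x: acc + x, elems=elems, initializer=tf.constant(0.0))
with tf.Session() as session:
    print(session.run(running_sum))  # [ 1.  3.  6. 10.]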
Ejemplo n.º 29
    def fit(self,
            X,
            Y,
            Xtest,
            Ytest,
            pretrain=True,
            learning_rate=0.1,
            mu=0.99,
            reg=0.0,
            epochs=1,
            batch_sz=100):
        # cast to float32
        learning_rate = np.float32(learning_rate)
        mu = np.float32(mu)
        reg = np.float32(reg)

        # choose whether to pretrain the AutoEncoder objects
        pretrain_epochs = 2
        if not pretrain:
            pretrain_epochs = 0  # with 0 epochs we only initialize the weights, no training happens

        # greedy layer-wise training of the AutoEncoder objects
        current_input = X
        for ae in self.hidden_layers:
            ae.fit(current_input, epochs=pretrain_epochs)
            current_input = ae.hidden_op(current_input)

        # initialize the logistic regression layer (the final layer)
        N = len(Y)
        K = len(set(Y))
        W0 = init_weight((self.hidden_layers[-1].M, K))
        self.W = theano.shared(W0, 'W_logreg')
        self.b = theano.shared(np.zeros(K, dtype=np.float32), 'b_logreg')
        self.params = [self.W, self.b]
        for ae in self.hidden_layers:
            # self.params.append(ae.forward_params)
            self.params += ae.forward_params

        self.dW = theano.shared(np.zeros(W0.shape, dtype=np.float32),
                                'dW_logreg')
        self.db = theano.shared(np.zeros(K, dtype=np.float32), 'db_logreg')
        self.dparams = [self.dW, self.db]
        for ae in self.hidden_layers:
            # self.dparams.append(ae.forward_dparams)
            self.dparams += ae.forward_dparams

        X_in = T.matrix('X_in', dtype='float32')
        targets = T.ivector('Targets')  # note: declare the integer dtype (ivector) when creating the vector; otherwise the default is a float type
        pY = self.forward(X_in)

        reg_cost = T.sum([(p * p).sum() for p in self.params])
        cost = -T.mean(T.log(pY[T.arange(pY.shape[0]),
                                targets])) + reg * reg_cost
        updates = [
            (p, p + mu * dp - learning_rate * T.grad(cost, p))
            for p, dp in zip(self.params, self.dparams)
        ] + [
            (dp, mu * dp - learning_rate * T.grad(cost, p))
            for p, dp, in zip(self.params, self.dparams)
        ]  # every layer, including the autoencoder-pretrained ones, gets updated during fine-tuning
        train_op = theano.function(inputs=[X_in, targets], updates=updates)

        prediction = self.predict(X_in)
        cost_predict_op = theano.function(inputs=[X_in, targets],
                                          outputs=[cost, prediction])

        n_batches = N // batch_sz
        costs = []
        print("supervised training...")
        for i in range(epochs):
            print("epoch:", i)
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
                Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]
                train_op(Xbatch, Ybatch)
                the_cost, the_prediction = cost_predict_op(Xtest, Ytest)
                error = error_rate(the_prediction, Ytest)
                print("j / n_batches:", j, "/", n_batches, "cost:", the_cost,
                      "error:", error)
                costs.append(the_cost)
        plt.plot(costs)
        plt.show()
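# The fine-tuning code above assumes the class provides symbolic forward()
# and predict() methods. A minimal sketch of what they are assumed to look
# like; `forward_hidden` is a hypothetical name for the symbolic hidden
# mapping each pretrained autoencoder would expose (not confirmed by this
# excerpt).
import theano.tensor as T

def forward(self, X):
    Z = X
    for ae in self.hidden_layers:
        Z = ae.forward_hidden(Z)  # hidden representation of each pretrained layer
    return T.nnet.softmax(Z.dot(self.W) + self.b)

def predict(self, X):
    pY = self.forward(X)
    return T.argmax(pY, axis=1)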
Ejemplo n.º 30
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f = activation

        # numpy init
        Wxi = init_weight(Mi, Mo)  # input to input gate
        Whi = init_weight(Mo, Mo)
        Wci = init_weight(Mo, Mo)
        bi = np.zeros(Mo)
        Wxf = init_weight(Mi, Mo)  # input to forget gate
        Whf = init_weight(Mo, Mo)
        Wcf = init_weight(Mo, Mo)
        bf = np.zeros(Mo)
        Wxc = init_weight(Mi, Mo)  # input to cell
        Whc = init_weight(Mo, Mo)
        bc = np.zeros(Mo)
        Wxo = init_weight(Mi, Mo)  # input to output gate
        Who = init_weight(Mo, Mo)
        Wco = init_weight(Mo, Mo)
        bo = np.zeros(Mo)
        c0 = np.zeros(Mo)
        h0 = np.zeros(Mo)

        # theano vars
        self.Wxi = theano.shared(Wxi)
        self.Whi = theano.shared(Whi)
        self.Wci = theano.shared(Wci)
        self.bi = theano.shared(bi)
        self.Wxf = theano.shared(Wxf)
        self.Whf = theano.shared(Whf)
        self.Wcf = theano.shared(Wcf)
        self.bf = theano.shared(bf)
        self.Wxc = theano.shared(Wxc)
        self.Whc = theano.shared(Whc)
        self.bc = theano.shared(bc)
        self.Wxo = theano.shared(Wxo)
        self.Who = theano.shared(Who)
        self.Wco = theano.shared(Wco)
        self.bo = theano.shared(bo)
        self.c0 = theano.shared(c0)
        self.h0 = theano.shared(h0)
        self.params = [
            self.Wxi,
            self.Whi,
            self.Wci,
            self.bi,
            self.Wxf,
            self.Whf,
            self.Wcf,
            self.bf,
            self.Wxc,
            self.Whc,
            self.bc,
            self.Wxo,
            self.Who,
            self.Wco,
            self.bo,
            self.c0,
            self.h0,
        ]
    def fit(self, trees, learning_rate=1e-3, mu=0.5, reg=1e-2, eps=1e-2, epochs=20, activation=T.tanh, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        self.f = activation
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.W11 = theano.shared(W11)
        self.W22 = theano.shared(W22)
        self.W12 = theano.shared(W12)
        self.W1 = theano.shared(W1)
        self.W2 = theano.shared(W2)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.bh, self.Wo, self.bo]

        words = T.ivector('words')
        left_children = T.ivector('left_children')
        right_children = T.ivector('right_children')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, left, right):
            w = words[n]
            # any non-word will have index -1
            hiddens = T.switch(
                T.ge(w, 0),
                T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(hiddens[n],
                    self.f(
                        hiddens[left[n]].dot(self.W11).dot(hiddens[left[n]]) +
                        hiddens[right[n]].dot(self.W22).dot(hiddens[right[n]]) +
                        hiddens[left[n]].dot(self.W12).dot(hiddens[right[n]]) +
                        hiddens[left[n]].dot(self.W1) +
                        hiddens[right[n]].dot(self.W2) +
                        self.bh
                    )
                )
            )
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, left_children, right_children],
        )

        py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        
        rcost = reg*T.mean([(p*p).sum() for p in self.params])
        if train_inner_nodes:
            cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost
        else:
            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
        grads = T.grad(cost, self.params)
        # dparams = [theano.shared(p.get_value()*0) for p in self.params]
        cache = [theano.shared(p.get_value()*0) for p in self.params]

        # momentum
        # updates = [
        #     (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        # ] + [
        #     (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        # ]
        updates = [
            (c, c + g*g) for c, g in zip(cache, grads)
        ] + [
            (p, p - learning_rate*g / T.sqrt(c + eps)) for p, c, g in zip(self.params, cache, grads)
        ]

        self.cost_predict_op = theano.function(
            inputs=[words, left_children, right_children, labels],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[words, left_children, right_children, labels],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        sequence_indexes = range(N)
        if train_inner_nodes:
            n_total = sum(len(words) for words, _, _, _ in trees)
        else:
            n_total = N
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words, left, right, lab = trees[j]
                c, p = self.train_op(words, left, right, lab)
                if np.isnan(c):
                    print("Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?")
                    exit()
                cost += c
                if train_inner_nodes:
                    n_correct += np.sum(p == lab)
                else:
                    n_correct += (p[-1] == lab[-1])
                it += 1
                if it % 1 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct)/n_total, cost))
                    sys.stdout.flush()
            print("i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0))
            costs.append(cost)

        plt.plot(costs)
        plt.show()
    def fit(self, X, Y, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False):
        D = X[0].shape[1]
        K = len(set(Y.flatten()))
        N = len(Y)
        M = self.M
        self.f = activation

        # initialize weights
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.fmatrix('X')
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=thX,
            n_steps=thX.shape[0],
        )

        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction, y],
            updates=updates,
        )

        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(N):
                c, p, rout = self.train_op(X[j], Y[j])
                cost += c
                if p[-1] == Y[j,-1]:
                    n_correct += 1
            print("shape y:", rout.shape)
            print("i:", i, "cost:", cost, "classification rate:", (float(n_correct) / N))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Ejemplo n.º 33
    def fit(self,
            X,
            Y,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=100,
            show_fig=False):
        D = X[0].shape[1]  # X is of size N x T(n) x D
        K = len(set(Y.flatten()))
        N = len(Y)
        M = self.M
        self.f = activation

        # initial weights
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        # make them theano shared
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.fmatrix('X')
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=thX,
            n_steps=thX.shape[0],
        )

        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=[cost, prediction, y],
                                        updates=updates)

        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(N):
                c, p, rout = self.train_op(X[j], Y[j])
                # print "p:", p
                cost += c
                if p[-1] == Y[j, -1]:
                    n_correct += 1
            print("shape y:", rout.shape)
            print("i:", i, "cost:", cost, "classification rate:",
                  (float(n_correct) / N))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Ejemplo n.º 34
    def fit(self, X, Y, learning_rate=1e-4, mu=0.99, epochs=30, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=False):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, self.K)
        bo = np.zeros(self.K)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')
        thY = T.ivector('Y')

        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

        testf = theano.function(
            inputs=[thX],
            outputs=py_x,
        )
        testout = testf(X[0])
        print("py_x.shape:", testout.shape)

        prediction = T.argmax(py_x, axis=1)
        
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        dWe = theano.shared(self.We.get_value()*0)
        gWe = T.grad(cost, self.We)
        dWe_update = mu*dWe - learning_rate*gWe
        We_update = self.We + dWe_update
        if normalize:
            We_update /= We_update.norm(2)

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ] + [
            (self.We, We_update), (dWe, dWe_update)
        ]

        self.cost_predict_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        sequence_indexes = range(N)
        n_total = sum(len(y) for y in Y)
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                c, p = self.train_op(X[j], Y[j])
                cost += c
                n_correct += np.sum(p == Y[j])
                it += 1
                if it % 200 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r" %
                        (it, N, float(n_correct)/n_total, cost)
                    )
                    sys.stdout.flush()
            print(
                "i:", i, "cost:", cost,
                "correct rate:", (float(n_correct)/n_total),
                "time for epoch:", (datetime.now() - t0)
            )
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
    def fit(self,
            X,
            Y,
            learning_rate=1.0,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=500,
            show_fig=False):
        M = self.M
        V = self.V
        K = len(set(Y))
        print("V:", V)

        X, Y = shuffle(X, Y)
        Nvalid = 10
        Xvalid, Yvalid = X[-Nvalid:], Y[-Nvalid:]
        X, Y = X[:-Nvalid], Y[:-Nvalid]
        N = len(X)

        # initial weights
        Wx = init_weight(V, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        thX, thY, py_x, prediction = self.set(Wx, Wh, bh, h0, Wo, bo,
                                              activation)

        cost = -T.mean(T.log(py_x[thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]
        lr = T.scalar('learning_rate')

        updates = [(p, p + mu * dp - lr * g)
                   for p, dp, g in zip(self.params, dparams, grads)] + [
                       (dp, mu * dp - lr * g) for dp, g in zip(dparams, grads)
                   ]

        self.train_op = theano.function(
            inputs=[thX, thY, lr],
            outputs=[cost, prediction],
            updates=updates,
            allow_input_downcast=True,
        )

        costs = []
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(N):
                # we set 0 to start and 1 to end
                # print "X[%d]:" % j, X[j], "len:", len(X[j])
                c, p = self.train_op(X[j], Y[j], learning_rate)
                # print "p:", p, "y:", Y[j]
                cost += c
                if p == Y[j]:
                    n_correct += 1
            # update the learning rate
            learning_rate *= 0.9999

            # calculate validation accuracy
            n_correct_valid = 0
            for j in range(Nvalid):
                p = self.predict_op(Xvalid[j])
                if p == Yvalid[j]:
                    n_correct_valid += 1
            print("i:",
                  i,
                  "cost:",
                  cost,
                  "correct rate:", (float(n_correct) / N),
                  end=" ")
            print("validation correct rate:",
                  (float(n_correct_valid) / Nvalid))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Ejemplo n.º 36
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f = activation

        Wxi = init_weight(Mi, Mo)
        Whi = init_weight(Mo, Mo)
        Wci = init_weight(Mo, Mo)
        bi = np.zeros(Mo)

        Wxf = init_weight(Mi, Mo)
        Whf = init_weight(Mo, Mo)
        Wcf = init_weight(Mo, Mo)
        bf = np.zeros(Mo)

        Wxc = init_weight(Mi, Mo)
        Whc = init_weight(Mo, Mo)
        bc = np.zeros(Mo)

        Wxo = init_weight(Mi, Mo)
        Who = init_weight(Mo, Mo)
        Wco = init_weight(Mo, Mo)
        bo = np.zeros(Mo)

        # initial hidden state
        c0 = np.zeros(Mo)
        h0 = np.zeros(Mo)

        self.Wxi = theano.shared(Wxi)
        self.Whi = theano.shared(Whi)
        self.Wci = theano.shared(Wci)
        self.bi = theano.shared(bi)
        self.Wxf = theano.shared(Wxf)
        self.Whf = theano.shared(Whf)
        self.Wcf = theano.shared(Wcf)
        self.bf = theano.shared(bf)
        self.Wxc = theano.shared(Wxc)
        self.Whc = theano.shared(Whc)
        self.bc = theano.shared(bc)
        self.Wxo = theano.shared(Wxo)
        self.Who = theano.shared(Who)
        self.Wco = theano.shared(Wco)
        self.bo = theano.shared(bo)
        self.c0 = theano.shared(c0)
        self.h0 = theano.shared(h0)

        self.params = [
            self.Wxi, self.Whi, self.Wci, self.bi, self.Wxf, self.Whf,
            self.Wcf, self.bf, self.Wxc, self.Whc, self.bc, self.Wxo, self.Who,
            self.Wco, self.bo, self.c0, self.h0
        ]
    def fit(self,
            X,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=500,
            show_fig=False):
        N = len(X)
        D = self.D
        M = self.M
        V = self.V
        self.f = activation

        # initial weights
        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)

        # make them theano shared
        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [
            self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo
        ]

        thX = T.ivector('X')
        Ei = self.We[thX]  # will be a TxD matrix
        thY = T.ivector('Y')

        # sentence input:
        # [START, w1, w2, ..., wn]
        # sentence target:
        # [w1,    w2, w3, ..., END]

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0, None],
            sequences=Ei,
            n_steps=Ei.shape[0],
        )

        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=[cost, prediction],
                                        updates=updates)

        self.costs = []
        self.correct_rates = []
        n_total = sum((len(sentence) + 1) for sentence in X)
        for i in range(epochs):
            X = shuffle(X)
            n_correct = 0
            cost = 0
            for j in range(N):
                # problem! many words --> END token are overrepresented
                # result: generated lines will be very short
                # we will try to fix in a later iteration
                input_sequence = [SimpleRNN.SENTENCE_START] + X[j]
                output_sequence = X[j] + [SimpleRNN.SENTENCE_END]

                # we set 0 to start and 1 to end
                c, p = self.train_op(input_sequence, output_sequence)
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1

            correct_rate = n_correct / n_total
            if (i + 1) % 10 == 0:
                print("i:", i + 1, "cost:", cost, "correct rate:",
                      correct_rate)
            self.costs.append(cost)
            self.correct_rates.append(correct_rate)

        if show_fig:
            plt.plot(self.costs)
            plt.show()
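# Illustration of the input/target construction described in the comments
# inside the fit above: the input prepends a START token and the target
# appends an END token, so the network learns to predict each next word.
# SENTENCE_START and SENTENCE_END are assumed to map to indices 0 and 1,
# as the other language-model examples in this file do.
sentence = [42, 7, 19]  # hypothetical word indices for one line of text
SENTENCE_START, SENTENCE_END = 0, 1
input_sequence = [SENTENCE_START] + sentence   # [0, 42, 7, 19]
output_sequence = sentence + [SENTENCE_END]    # [42, 7, 19, 1]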
    def fit(self, X, Y, learning_rate=1.0, mu=0.99, reg=1.0, activation=tf.tanh, epochs=100, show_fig=False):
        N, T, D = X.shape
        K = len(set(Y.flatten()))
        M = self.M
        self.f = activation

        # initial weights
        Wx = init_weight(D, M).astype(np.float32)
        Wh = init_weight(M, M).astype(np.float32)
        bh = np.zeros(M, dtype=np.float32)
        h0 = np.zeros(M, dtype=np.float32)
        Wo = init_weight(M, K).astype(np.float32)
        bo = np.zeros(K, dtype=np.float32)

        # make them theano shared
        self.Wx = tf.Variable(Wx)
        self.Wh = tf.Variable(Wh)
        self.bh = tf.Variable(bh)
        self.h0 = tf.Variable(h0)
        self.Wo = tf.Variable(Wo)
        self.bo = tf.Variable(bo)

        tfX = tf.placeholder(tf.float32, shape=(T, D), name='X')
        tfY = tf.placeholder(tf.int32, shape=(T,), name='Y')

        XWx = tf.matmul(tfX, self.Wx)

        def recurrence(h_t1, xw_t):
            # matmul() only works with 2-D objects
            # we want to return a 1-D object of size M
            # so that the final result is T x M
            # not T x 1 x M
            h_t = self.f(xw_t + tf.matmul(tf.reshape(h_t1, (1, M)), self.Wh) + self.bh)
            return tf.reshape(h_t, (M,))

        h = tf.scan(
            fn=recurrence,
            elems=XWx,
            initializer=self.h0,
        )

        logits = tf.matmul(h, self.Wo) + self.bo

        cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tfY,
                logits=logits,
            )
        )

        predict_op = tf.argmax(logits, 1)
        train_op = tf.train.AdamOptimizer(1e-2).minimize(cost)

        init = tf.global_variables_initializer()
        with tf.Session() as session:
            session.run(init)

            costs = []
            for i in range(epochs):
                X, Y = shuffle(X, Y)
                n_correct = 0
                batch_cost = 0
                for j in range(N):
                    _, c, p = session.run([train_op, cost, predict_op], feed_dict={tfX: X[j].reshape(T, D), tfY: Y[j]})
                    batch_cost += c
                    if p[-1] == Y[j,-1]:
                        n_correct += 1
                print("i:", i, "cost:", batch_cost, "classification rate:", (float(n_correct)/N))
                costs.append(batch_cost)
                if n_correct == N:
                    break

        if show_fig:
            plt.plot(costs)
            plt.show()
    def fit(self, trees, learning_rate=3*10e-4, mu=0.99, reg=10e-5, epochs=15, activation=T.nnet.relu, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        self.f = activation
        N = len(trees)

        We = init_weight(V, D)
        Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo]

        words = T.ivector('words')
        parents = T.ivector('parents')
        relations = T.ivector('relations')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, parents, relations):
            w = words[n]
            # any non-word will have index -1
            # if T.ge(w, 0):
            #     hiddens = T.set_subtensor(hiddens[n], self.We[w])
            # else:
            #     hiddens = T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh))
            hiddens = T.switch(
                T.ge(w, 0),
                T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh))
            )

            r = relations[n] # 0 = is_left, 1 = is_right
            p = parents[n] # parent idx
            # if T.ge(p, 0):
            #     # root will have parent -1
            #     hiddens = T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r]))
            hiddens = T.switch(
                T.ge(p, 0),
                T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r])),
                hiddens
            )
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, parents, relations],
        )

        # shape of h that is returned by scan is TxTxD
        # because hiddens is TxD, and it does the recurrence T times
        # technically this stores T times too much data
        py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        
        rcost = reg*T.mean([(p*p).sum() for p in self.params])
        if train_inner_nodes:
            # won't work for binary classification
            cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]), labels])) + rcost
        else:
            # print "K is:", K
            # premean = T.log(py_x[-1])
            # target = T.zeros(K)
            # target = T.set_subtensor(target[labels[-1]], 1)            
            # cost = -T.mean(target * premean)

            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        self.cost_predict_op = theano.function(
            inputs=[words, parents, relations, labels],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[words, parents, relations, labels],
            outputs=[h, cost, prediction],
            updates=updates
        )

        costs = []
        sequence_indexes = list(range(N))
        if train_inner_nodes:
            n_total = sum(len(words) for words, _, _, _ in trees)
        else:
            n_total = N
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words, par, rel, lab = trees[j]
                # print "len(words):", len(words)
                _, c, p = self.train_op(words, par, rel, lab)
                # if h.shape[0] < 10:
                #     print h
                # print "py_x.shape:", y.shape
                # print "pre-mean shape:", pm.shape
                # print "target shape:", t.shape
                # exit()
                if np.isnan(c):
                    print("Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?")
                    exit()
                cost += c
                if train_inner_nodes:
                    n_correct += np.sum(p == lab)
                else:
                    n_correct += (p[-1] == lab[-1])
                it += 1
                if it % 1 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f, cost so far: %f\r" % (it, N, float(n_correct)/n_total, cost))
                    sys.stdout.flush()
            print "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)
            costs.append(cost)

        plt.plot(costs)
        plt.show()
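
The fit above consumes each tree as four parallel integer vectors. As an illustration of that encoding (word indices and labels here are made up, not from any dataset): nodes are listed children-before-parents, leaves carry a word index while internal nodes get -1, each node stores its parent's position (-1 for the root), and relations records whether a node is its parent's left (0) or right (1) child.

import numpy as np

# hypothetical tree:        node 4 (root)
#                           /        \
#                      node 2       node 3 (leaf)
#                      /    \
#              node 0 (leaf) node 1 (leaf)
words     = np.array([10, 11, -1, 12, -1], dtype=np.int32)  # -1 marks internal nodes
parents   = np.array([ 2,  2,  4,  4, -1], dtype=np.int32)  # root has parent -1
relations = np.array([ 0,  1,  0,  1, -1], dtype=np.int32)  # 0 = left child, 1 = right child
labels    = np.array([ 1,  1,  2,  0,  2], dtype=np.int32)  # one (made-up) label per node
# a tuple like (words, parents, relations, labels) is what train_op expects
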
Ejemplo n.º 40
0
  def fit(self, X, Y, batch_sz=20, learning_rate=0.1, mu=0.9, activation=tf.nn.sigmoid, epochs=100, show_fig=False):
    N, T, D = X.shape # X is of size N x T(n) x D
    K = len(set(Y.flatten()))
    M = self.M
    self.f = activation

    # initial weights
    # note: Wx, Wh, bh are all part of the RNN unit and will be created
    #       by BasicRNNCell
    Wo = init_weight(M, K).astype(np.float32)
    bo = np.zeros(K, dtype=np.float32)

    # make them tf variables
    self.Wo = tf.Variable(Wo)
    self.bo = tf.Variable(bo)

    # tf Graph input
    tfX = tf.compat.v1.placeholder(tf.float32, shape=(batch_sz, T, D), name='inputs')
    tfY = tf.compat.v1.placeholder(tf.int64, shape=(batch_sz, T), name='targets')

    # turn tfX into a sequence, e.g. T tensors all of size (batch_sz, D)
    sequenceX = x2sequence(tfX, T, D, batch_sz)

    # create the simple rnn unit
    rnn_unit = BasicRNNCell(num_units=self.M, activation=self.f)

    # Get rnn cell output
    # outputs, states = rnn_module.rnn(rnn_unit, sequenceX, dtype=tf.float32)
    outputs, states = get_rnn_output(rnn_unit, sequenceX, dtype=tf.float32)

    # outputs are now of size (T, batch_sz, M)
    # so make it (batch_sz, T, M)
    outputs = tf.transpose(a=outputs, perm=(1, 0, 2))
    outputs = tf.reshape(outputs, (T*batch_sz, M))

    # Linear activation, using rnn inner loop last output
    logits = tf.matmul(outputs, self.Wo) + self.bo
    predict_op = tf.argmax(input=logits, axis=1)
    targets = tf.reshape(tfY, (T*batch_sz,))

    cost_op = tf.reduce_mean(
      input_tensor=tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=targets
      )
    )
    train_op = tf.compat.v1.train.MomentumOptimizer(learning_rate, momentum=mu).minimize(cost_op)

    costs = []
    n_batches = N // batch_sz
    
    init = tf.compat.v1.global_variables_initializer()
    with tf.compat.v1.Session() as session:
      session.run(init)
      for i in range(epochs):
        X, Y = shuffle(X, Y)
        n_correct = 0
        cost = 0
        for j in range(n_batches):
          Xbatch = X[j*batch_sz:(j+1)*batch_sz]
          Ybatch = Y[j*batch_sz:(j+1)*batch_sz]
          
          _, c, p = session.run([train_op, cost_op, predict_op], feed_dict={tfX: Xbatch, tfY: Ybatch})
          cost += c
          for b in range(batch_sz):
            idx = (b + 1)*T - 1
            n_correct += (p[idx] == Ybatch[b][-1])
        if i % 10 == 0:
          print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
        if n_correct == N:
          print("i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N))
          break
        costs.append(cost)

    if show_fig:
      plt.plot(costs)
      plt.show()
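
x2sequence is referenced above but not defined in this excerpt. The static RNN helper used here expects a Python list of T tensors, each of shape (batch_sz, D), so a plausible implementation consistent with that (an assumption, not the original code) would be:

import tensorflow as tf

def x2sequence(x, T, D, batch_sz):
    # (batch_sz, T, D) -> time-major (T, batch_sz, D) -> flat (T*batch_sz, D)
    x = tf.transpose(x, (1, 0, 2))
    x = tf.reshape(x, (T * batch_sz, D))
    # split along axis 0 into T pieces of shape (batch_sz, D), one per time step
    return tf.split(x, T)
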
	def fit(self, X, Y, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=500, show_fig=False):
		M = self.M
		V = self.V
		K = len(set(Y))
		print "V:", V

		X, Y = shuffle(X, Y)
		Nvalid = 10
		Xvalid, Yvalid = X[-Nvalid:], Y[-Nvalid:]
		X, Y = X[:-Nvalid], Y[:-Nvalid]
		N = len(X)

		# initial weights
		Wx = init_weight(V, M)
		Wh = init_weight(M, M)
		bh = np.zeros(M)
		h0 = np.zeros(M)
		Wo = init_weight(M, K)
		bo = np.zeros(K)

		thX, thY, py_x, prediction = self.set(Wx, Wh, bh, h0, Wo, bo, activation)

		cost = -T.mean(T.log(py_x[thY]))
		grads = T.grad(cost, self.params)
		dparams = [theano.shared(p.get_value()*0) for p in self.params]
		lr = T.scalar('learning_rate')

		updates = [
			(p, p + mu*dp - lr*g) for p, dp, g in zip(self.params, dparams, grads)
		] + [
			(dp, mu*dp - lr*g) for dp, g in zip(dparams, grads)
		]

		self.train_op = theano.function(
			inputs=[thX, thY, lr],
			outputs=[cost, prediction],
			updates=updates,
			allow_input_downcast=True,
		)

		costs = []
		for i in xrange(epochs):
			X, Y = shuffle(X, Y)
			n_correct = 0
			cost = 0
			for j in xrange(N):
				c, p = self.train_op(X[j], Y[j], learning_rate)
				cost += c
				if p == Y[j]:
					n_correct += 1
			learning_rate *= 0.9999

			n_correct_valid = 0
			for j in xrange(Nvalid):
				p = self.predict_op(Xvalid[j])
				if p == Yvalid[j]:
					n_correct_valid += 1

			print "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/N),
			print "validation correct rate:", (float(n_correct_valid)/Nvalid)
			costs.append(cost)

		if show_fig:
			plt.plot(costs)
			plt.show()
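
All of these training loops share the same classical-momentum update, written as Theano update pairs: the velocity dp becomes mu*dp - lr*g, and the parameter moves by that new velocity. A tiny NumPy sketch of just that rule, on a toy quadratic loss (illustrative only):

import numpy as np

def momentum_step(param, velocity, grad, lr=0.01, mu=0.99):
    # mirrors the update pairs (dp, mu*dp - lr*g) and (p, p + mu*dp - lr*g)
    velocity = mu * velocity - lr * grad
    return param + velocity, velocity

w = np.zeros(3)
v = np.zeros(3)
target = np.array([1.0, 2.0, 3.0])
for _ in range(5000):
    g = 2 * (w - target)        # gradient of ||w - target||^2
    w, v = momentum_step(w, v, g)
print(w)                        # close to [1. 2. 3.]
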
Ejemplo n.º 42
0
 def __init__(self, Mi, Mo):
     W = init_weight(Mi, Mo)
     b = np.zeros(Mo)
     self.W = theano.shared(W)
     self.b = theano.shared(b)
     self.params = [self.W, self.b]
Ejemplo n.º 43
0
class RNN:
    def __init__(self, D, hidden_layer_sizes, V):
        self.hidden_layer_sizes = hidden_layer_sizes
        self.D = D
        self.V = V

    def fit(self, X, learning_rate=1e-4, mu=0.99, epochs=10, batch_sz=100, show_fig=True, activation=T.nnet.relu, RecurrentUnit=LSTM):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wo, self.bo]
        for ru in self.hidden_layers:
Ejemplo n.º 44
0
Ytest  = tf.keras.preprocessing.sequence.pad_sequences(Ytest,  maxlen=sequence_length)
print("Xtrain.shape:", Xtrain.shape)
print("Ytrain.shape:", Ytrain.shape)



# inputs
inputs = tf.placeholder(tf.int32, shape=(None, sequence_length))
targets = tf.placeholder(tf.int32, shape=(None, sequence_length))
num_samples = tf.shape(inputs)[0] # useful for later

# embedding
We = np.random.randn(V, embedding_dim).astype(np.float32)

# output layer
Wo = init_weight(hidden_layer_size, K).astype(np.float32)
bo = np.zeros(K).astype(np.float32)

# make them tensorflow variables
tfWe = tf.Variable(We)
tfWo = tf.Variable(Wo)
tfbo = tf.Variable(bo)

# make the rnn unit
rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu)


# get the output
x = tf.nn.embedding_lookup(tfWe, inputs)

# converts x from a tensor of shape N x T x M
Ejemplo n.º 45
0
    def fit(self, X, learning_rate=1e-5, mu=0.99, epochs=10, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU, normalize=True):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')
        thY = T.ivector('Y')

        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        # let's return py_x too so we can draw a sample instead
        self.predict_op = theano.function(
            inputs=[thX],
            outputs=[py_x, prediction],
            allow_input_downcast=True,
        )
        
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        dWe = theano.shared(self.We.get_value()*0)
        gWe = T.grad(cost, self.We)
        dWe_update = mu*dWe - learning_rate*gWe
        We_update = self.We + dWe_update
        if normalize:
            We_update /= We_update.norm(2)

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ] + [
            (self.We, We_update), (dWe, dWe_update)
        ]

        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
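
The line Z = self.We[thX] above is plain integer indexing: a vector of T word IDs selects T rows of the V x D embedding matrix, giving a T x D sequence of embeddings. The same operation in NumPy with made-up sizes:

import numpy as np

V, D = 5, 3                      # vocabulary size, embedding dimension
We = np.random.randn(V, D)
sentence = np.array([2, 0, 4])   # word indices
Z = We[sentence]                 # shape (3, 3): one embedding row per word
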
Ejemplo n.º 46
0
    def fit(self,
            X,
            learning_rate=10e-5,
            mu=0.99,
            epochs=10,
            show_fig=True,
            activation=T.nnet.relu,
            RecurrentUnit=GRU,
            normalize=True):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')
        thY = T.ivector('Y')

        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)
        # let's return py_x too so we can draw a sample instead
        self.predict_op = theano.function(
            inputs=[thX],
            outputs=[py_x, prediction],
            allow_input_downcast=True,
        )

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        dWe = theano.shared(self.We.get_value() * 0)
        gWe = T.grad(cost, self.We)
        dWe_update = mu * dWe - learning_rate * gWe
        We_update = self.We + dWe_update
        if normalize:
            We_update /= We_update.norm(2)

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)] + [
                       (dp, mu * dp - learning_rate * g)
                       for dp, g in zip(dparams, grads)
                   ] + [(self.We, We_update), (dWe, dWe_update)]

        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=[cost, prediction],
                                        updates=updates)

        costs = []
        for i in range(epochs):
            t0 = datetime.now()
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0
            for j in range(N):
                if np.random.random() < 0.01 or len(X[j]) <= 1:
                    input_sequence = [0] + X[j]
                    output_sequence = X[j] + [1]
                else:
                    input_sequence = [0] + X[j][:-1]
                    output_sequence = X[j]
                n_total += len(output_sequence)

                # test:

                try:
                    # we set 0 to start and 1 to end
                    c, p = self.train_op(input_sequence, output_sequence)
                except Exception as e:
                    PYX, pred = self.predict_op(input_sequence)
                    print "input_sequence len:", len(input_sequence)
                    print "PYX.shape:", PYX.shape
                    print "pred.shape:", pred.shape
                    raise e
                # print "p:", p
                cost += c
                # print "j:", j, "c:", c/len(X[j]+1)
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
                if j % 200 == 0:
                    sys.stdout.write("j/N: %d/%d correct rate so far: %f\r" %
                                     (j, N, float(n_correct) / n_total))
                    sys.stdout.flush()
            print "i:", i, "cost:", cost, "correct rate:", (
                float(n_correct) /
                n_total), "time for epoch:", (datetime.now() - t0)
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
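
With normalize=True, the embedding matrix gets one extra step after its momentum update: the whole matrix is divided by its L2 (Frobenius) norm, keeping We from drifting in scale while the other parameters follow the ordinary momentum rule. A rough NumPy picture of that extra update (sizes and gradient are stand-ins):

import numpy as np

We = np.random.randn(100, 16)
dWe = np.zeros_like(We)
mu, lr = 0.99, 1e-4

gWe = np.random.randn(*We.shape)   # stand-in for the real gradient of the cost w.r.t. We
dWe = mu * dWe - lr * gWe
We = We + dWe
We = We / np.linalg.norm(We)       # same effect as We_update / We_update.norm(2)
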
    def fit(self,
            X,
            Y,
            batch_sz=20,
            learning_rate=1.0,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=100,
            show_fig=False):
        D = X[0].shape[1]  # X is of size N x T(n) x D
        K = len(set(Y.flatten()))
        N = len(Y)
        M = self.M
        self.f = activation

        # initial weights
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        # make them theano shared
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        thX = T.fmatrix('X')  # will represent multiple batches concatenated
        thY = T.ivector('Y')
        thStartPoints = T.ivector('start_points')

        XW = thX.dot(self.Wx)

        # startPoints will contain 1 where a sequence starts and 0 otherwise
        # Ex. if I have 3 sequences: [[1,2,3], [4,5], [6,7,8]]
        # Then I will concatenate these into one X: [1,2,3,4,5,6,7,8]
        # And startPoints will be [1,0,0,1,0,1,0,0]

        # One possible solution: loop through index
        # def recurrence(t, h_t1, XW, h0, startPoints):
        #     # returns h(t)

        #     # if at a boundary, state should be h0
        #     h_t = T.switch(
        #         T.eq(startPoints[t], 1),
        #         self.f(XW[t] + h0.dot(self.Wh) + self.bh),
        #         self.f(XW[t] + h_t1.dot(self.Wh) + self.bh)
        #     )
        #     return h_t

        # h, _ = theano.scan(
        #     fn=recurrence,
        #     outputs_info=[self.h0],
        #     sequences=T.arange(XW.shape[0]),
        #     non_sequences=[XW, self.h0, thStartPoints],
        #     n_steps=XW.shape[0],
        # )

        # other solution - loop through all sequences simultaneously
        def recurrence(xw_t, is_start, h_t1, h0):
            # if at a boundary, state should be h0
            h_t = T.switch(T.eq(is_start, 1),
                           self.f(xw_t + h0.dot(self.Wh) + self.bh),
                           self.f(xw_t + h_t1.dot(self.Wh) + self.bh))
            return h_t

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[self.h0],
            sequences=[XW, thStartPoints],
            non_sequences=[self.h0],
            n_steps=XW.shape[0],
        )

        # h is of shape (T*batch_sz, M)
        py_x = T.nnet.softmax(h.dot(self.Wo) + self.bo)
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        # self.predict_op = theano.function(inputs=[thX, thStartPoints], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY, thStartPoints],
                                        outputs=[cost, prediction, py_x],
                                        updates=updates)

        costs = []
        n_batches = N // batch_sz
        sequenceLength = X.shape[1]

        # if each sequence was of variable length, we would need to
        # initialize this inside the loop for every new batch
        startPoints = np.zeros(sequenceLength * batch_sz, dtype=np.int32)
        for b in range(batch_sz):
            startPoints[b * sequenceLength] = 1
        for i in range(epochs):
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j + 1) * batch_sz].reshape(
                    sequenceLength * batch_sz, D)
                Ybatch = Y[j * batch_sz:(j + 1) * batch_sz].reshape(
                    sequenceLength * batch_sz).astype(np.int32)
                c, p, rout = self.train_op(Xbatch, Ybatch, startPoints)
                # print "p:", p
                cost += c
                # P = p.reshape(batch_sz, sequenceLength)
                for b in range(batch_sz):
                    idx = sequenceLength * (b + 1) - 1
                    if p[idx] == Ybatch[idx]:
                        n_correct += 1
                    # else:
                    # print "pred:", p[idx], "actual:", Ybatch[idx]
            if i % 10 == 0:
                print("shape y:", rout.shape)
                print("i:", i, "cost:", cost, "classification rate:",
                      (float(n_correct) / N))
            if n_correct == N:
                print("i:", i, "cost:", cost, "classification rate:",
                      (float(n_correct) / N))
                break
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
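
The startPoints idea in the comments above extends directly to variable-length sequences: concatenate them into one long stream and put a 1 wherever a new sequence begins, so the recurrence knows when to reset to h0. A small helper sketch (the function name is illustrative):

import numpy as np

def build_start_points(sequences):
    # e.g. [[1,2,3], [4,5], [6,7,8]] -> flat [1,2,3,4,5,6,7,8],
    #      start points              ->      [1,0,0,1,0,1,0,0]
    flat = [x for seq in sequences for x in seq]
    starts = np.zeros(len(flat), dtype=np.int32)
    pos = 0
    for seq in sequences:
        starts[pos] = 1
        pos += len(seq)
    return np.array(flat), starts

flat, starts = build_start_points([[1, 2, 3], [4, 5], [6, 7, 8]])
print(flat)    # [1 2 3 4 5 6 7 8]
print(starts)  # [1 0 0 1 0 1 0 0]
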
    def fit(self,
            X,
            Y,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=100,
            show_fig=False):

        # define all the sizes
        D = X[0].shape[1]
        K = len(set(Y.flatten()))
        N = len(Y)
        M = self.M
        self.f = activation

        # initialize weights
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, K)
        bo = np.zeros(K)

        # turn to theano shared variables
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo]

        # define theano inputs outputs
        thX = T.fmatrix("X")
        thY = T.ivector("Y")

        # define recurrence
        def recurrence(x_t, h_t1):
            # x_t: current x
            # h_t1: previous h
            # returns h(t) and y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        # theano scan function
        [h, y], _ = theano.scan(fn=recurrence,
                                outputs_info=[self.h0, None],
                                sequences=thX,
                                n_steps=thX.shape[0])

        # define output
        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        # updates
        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        # theano function
        predict_op = theano.function(
            inputs=[thX],
            outputs=prediction,
        )

        train_op = theano.function(inputs=[thX, thY],
                                   outputs=[cost, prediction, y],
                                   updates=updates)

        # main training loop
        costs = []
        for i in range(epochs):
            print("epoch:", i)
            X, Y = shuffle(X, Y)
            n_correct = 0
            cost = 0
            for j in range(N):
                c, p, rout = train_op(X[j], Y[j])
                cost += c
                if p[-1] == Y[j, -1]:
                    n_correct += 1
            print("shape y:", Y.shape)
            print("i:", i, "cost:", cost, "classification rate:",
                  (float(n_correct)) / N)

            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
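
The slice py_x = y[:, 0, :] is needed because T.nnet.softmax always returns a 2-D matrix, so every scan step yields a (1, K) row and the stacked y comes out (T, 1, K). A NumPy illustration of that shape bookkeeping (softmax here is a plain helper standing in for the Theano op):

import numpy as np

def softmax(a):
    e = np.exp(a - a.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

T_steps, K = 4, 3
steps = [softmax(np.random.randn(1, K)) for _ in range(T_steps)]  # one (1, K) row per step
y = np.stack(steps)          # (T, 1, K), like the tensor theano.scan returns
py_x = y[:, 0, :]            # (T, K): one distribution per time step
print(y.shape, py_x.shape)   # (4, 1, 3) (4, 3)
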
Ejemplo n.º 49
0
class RecursiveNN:
    def __init__(self, V, D, K, activation=T.tanh):
        self.V = V
        self.D = D
        self.K = K
        self.f = activation

    def fit(self, trees, reg=1e-3, epochs=8, train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3*D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.W11 = theano.shared(W11)
        self.W22 = theano.shared(W22)
        self.W12 = theano.shared(W12)
        self.W1 = theano.shared(W1)
        self.W2 = theano.shared(W2)
    def fit(self, X, learning_rate=10e-5, mu=0.99, epochs=10, batch_sz=100, show_fig=True, activation=T.nnet.relu, RecurrentUnit=GRU):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X') # will represent multiple batches concatenated
        thY = T.ivector('Y') # represents next word
        thStartPoints = T.ivector('start_points')

        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z, thStartPoints)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value()*0) for p in self.params]

        updates = [
            (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        ] + [
            (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        ]

        # self.predict_op = theano.function(inputs=[thX, thStartPoints], outputs=prediction)
        self.train_op = theano.function(
            inputs=[thX, thY, thStartPoints],
            outputs=[cost, prediction],
            updates=updates
        )

        costs = []
        n_batches = N // batch_sz
        for i in range(epochs):
            t0 = datetime.now()
            X = shuffle(X)
            n_correct = 0
            n_total = 0
            cost = 0

            for j in range(n_batches):
                # construct input sequence and output sequence as
                # concatenation of multiple input sequences and output sequences
                # input X should be a list of 2-D arrays or one 3-D array
                # N x T(n) x D - batch size x sequence length x num features
                # sequence length can be variable
                sequenceLengths = []
                input_sequence = []
                output_sequence = []
                for k in range(j*batch_sz, (j+1)*batch_sz):
                    # don't always add the end token
                    if np.random.random() < 0.01 or len(X[k]) <= 1:
                        input_sequence += [0] + X[k]
                        output_sequence += X[k] + [1]
                        sequenceLengths.append(len(X[k]) + 1)
                    else:
                        input_sequence += [0] + X[k][:-1]
                        output_sequence += X[k]
                        sequenceLengths.append(len(X[k]))
                n_total += len(output_sequence)

                startPoints = np.zeros(len(output_sequence), dtype=np.int32)
                last = 0
                for length in sequenceLengths:
                  startPoints[last] = 1
                  last += length

                c, p = self.train_op(input_sequence, output_sequence, startPoints)
                cost += c
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1
                if j % 1 == 0:
                    sys.stdout.write("j/n_batches: %d/%d correct rate so far: %f\r" % (j, n_batches, float(n_correct)/n_total))
                    sys.stdout.flush()
            print "i:", i, "cost:", cost, "correct rate:", (float(n_correct)/n_total), "time for epoch:", (datetime.now() - t0)
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Ejemplo n.º 51
0
    def fit(self, X, Y, learning_rate=.01, mu=.99, epochs=30, batch_sz=100):
        N, D = X.shape
        K = len(set(Y))

        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            h = HiddenLayer(Mi, Mo)
            self.hidden_layers.append(h)
            Mi = Mo

        W = init_weight(Mi, K)
        b = np.zeros(K)
        self.W = theano.shared(W)
        self.b = theano.shared(b)

        self.params = [self.W, self.b]
        self.allWs = []
        for h in self.hidden_layers:
            self.params += h.params
            self.allWs.append(h.W)
        self.allWs.append(self.W)

        X_in = T.matrix('X_in')
        targets = T.ivector('Targets')
        pY = self.forward(X_in)

        cost = -T.mean(T.log(pY[T.arange(pY.shape[0]), targets]))
        prediction = self.predict(X_in)

        dparams = [theano.shared(p.get_value() * 0) for p in self.params]
        grads = T.grad(cost, self.params)

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]
        # N, D = X.shape
        # K = len(set(Y))

        # self.hidden_layers = []
        # mi = D
        # for mo in self.hidden_layer_sizes:
        #     h = HiddenLayer(mi, mo)
        #     self.hidden_layers.append(h)
        #     mi = mo

        # # initialize logistic regression layer
        # W = init_weight(*(mo, K))
        # b = np.zeros(K)
        # self.W = theano.shared(W)
        # self.b = theano.shared(b)

        # self.params = [self.W, self.b]
        # self.allWs = []
        # for h in self.hidden_layers:
        #     self.params += h.params
        #     self.allWs.append(h.W)
        # self.allWs.append(self.W)

        # X_in = T.matrix('X_in')
        # targets = T.ivector('Targets')
        # pY = self.forward(X_in)

        # cost = -T.mean( T.log(pY[T.arange(pY.shape[0]), targets]) )
        # prediction = self.predict(X_in)

        # dparams = [theano.shared(p.get_value()*0) for p in self.params]
        # grads = T.grad(cost, self.params)

        # updates = [
        #     (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        # ] + [
        #     (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        # ]

        train_op = theano.function(
            inputs=[X_in, targets],
            outputs=[cost, prediction],
            updates=updates,
        )

        n_batches = N // batch_sz
        costs = []
        lastWs = [W.get_value() for W in self.allWs]

        W_changes = []
        for i in range(epochs):
            print("epoch", i)
            X, Y = shuffle(X, Y)
            for j in range(n_batches):
                Xbatch = X[j * batch_sz:(j + 1) * batch_sz, :]
                Ybatch = Y[j * batch_sz:(j + 1) * batch_sz]

                c, Yhat = train_op(Xbatch, Ybatch)

                if j % 100 == 0:
                    error = error_rate(Ybatch, Yhat)
                    print "i:%d\tj:%d\tnb:%d\tcost:%.6f\terror:%.3f\t" % (
                        i, j, n_batches, c, error)
                costs.append(c)
                W_change = [
                    np.abs(W.get_value() - lastW).mean()
                    for W, lastW in zip(self.allWs, lastWs)
                ]
                W_changes.append(W_change)
                lastWs = [W.get_value() for W in self.allWs]

        W_changes = np.array(W_changes)
        plt.subplot(2, 1, 1)
        for i in range(W_changes.shape[1]):
            plt.plot(W_changes[:, i], label='layer %d' % i)

        plt.legend()

        plt.subplot(2, 1, 2)
        plt.plot(costs)
        plt.show()
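
error_rate is used above but not defined in this excerpt; a plausible definition consistent with how it is printed (fraction of misclassified samples in the batch) would be:

import numpy as np

def error_rate(targets, predictions):
    # fraction of labels the model got wrong
    return np.mean(targets != predictions)
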
Ejemplo n.º 52
0
    def fit(self,
            X,
            learning_rate=10e-1,
            mu=0.99,
            reg=1.0,
            activation=T.tanh,
            epochs=500,
            show_fig=False):
        N = len(X)  # Number of training samples.
        D = self.D
        M = self.M
        V = self.V
        self.f = activation

        # initial weights
        We = init_weight(V, D)
        Wx = init_weight(D, M)
        Wh = init_weight(M, M)
        bh = np.zeros(M)
        h0 = np.zeros(M)
        Wo = init_weight(M, V)
        bo = np.zeros(V)

        self.We = theano.shared(We)
        self.Wx = theano.shared(Wx)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [
            self.We, self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo
        ]

        thX = T.ivector('X')  # sequence of indexes.
        Ei = self.We[thX]  # returns a TxD matrix
        thY = T.ivector('Y')

        def recurrence(x_t, h_t1):
            # returns h(t), y(t)
            h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh)
            y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo)
            return h_t, y_t

        [h, y], _ = theano.scan(fn=recurrence,
                                outputs_info=[self.h0, None],
                                sequences=Ei,
                                n_steps=Ei.shape[0])

        print(f'y.shape: {y.shape}')
        py_x = y[:, 0, :]
        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))
        grads = T.grad(cost,
                       self.params)  # returns gradient of cost with all params
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]
        self.predict_op = theano.function(inputs=[thX], outputs=prediction)
        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=[cost, prediction],
                                        updates=updates)

        costs = []
        n_total = sum((len(sentence) + 1) for sentence in X)

        for i in range(epochs):
            X = shuffle(X)
            n_correct = 0
            cost = 0
            for j in range(N):
                input_sequence = [0] + X[j]
                output_sequence = X[j] + [1]

                c, p = self.train_op(input_sequence, output_sequence)
                cost += c
                for pj, xj in zip(p, output_sequence):
                    if pj == xj:
                        n_correct += 1

            print(
                f'epoch: {i}, cost: {cost}, correct_rate: {float(n_correct) / n_total}'
            )
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
Ejemplo n.º 53
0
    def fit(self,
            trees,
            learning_rate=3 * 10e-4,
            mu=0.99,
            reg=10e-5,
            epochs=15,
            activation=T.nnet.relu,
            train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        self.f = activation
        N = len(trees)

        We = init_weight(V, D)
        Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo]

        words = T.ivector('words')
        parents = T.ivector('parents')
        relations = T.ivector('relations')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, parents, relations):
            w = words[n]
            # any non-word will have index -1
            # if T.ge(w, 0):
            #     hiddens = T.set_subtensor(hiddens[n], self.We[w])
            # else:
            #     hiddens = T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh))
            hiddens = T.switch(
                T.ge(w, 0), T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh)))

            r = relations[n]  # 0 = is_left, 1 = is_right
            p = parents[n]  # parent idx
            # root will have parent -1; a plain Python `if` on the symbolic T.ge()
            # does not branch at runtime, so use T.switch as in the word lookup above
            hiddens = T.switch(
                T.ge(p, 0),
                T.set_subtensor(hiddens[p], hiddens[p] + hiddens[n].dot(self.Wh[r])),
                hiddens
            )
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, parents, relations],
        )

        py_x = T.nnet.softmax(h[:, 0, :].dot(self.Wo) + self.bo)

        prediction = T.argmax(py_x, axis=1)

        rcost = reg * T.mean([(p * p).sum() for p in self.params])
        if train_inner_nodes:
            cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]),
                                      labels])) + rcost
        else:
            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost
        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)
                   ] + [(dp, mu * dp - learning_rate * g)
                        for dp, g in zip(dparams, grads)]

        self.cost_predict_op = theano.function(
            inputs=[words, parents, relations, labels],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[words, parents, relations, labels],
            outputs=[cost, prediction],
            updates=updates)

        costs = []
        sequence_indexes = list(range(N))
        if train_inner_nodes:
            n_total = sum(len(words) for words, _, _, _ in trees)
        else:
            n_total = N
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words, par, rel, lab = trees[j]
                c, p = self.train_op(words, par, rel, lab)
                if np.isnan(c):
                    print "Cost is nan! Let's stop here. Why don't you try decreasing the learning rate?"
                    exit()
                cost += c
                if train_inner_nodes:
                    n_correct += np.sum(p == lab)
                else:
                    n_correct += (p[-1] == lab[-1])
                it += 1
                if it % 1 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                        % (it, N, float(n_correct) / n_total, cost))
                    sys.stdout.flush()
            print "i:", i, "cost:", cost, "correct rate:", (
                float(n_correct) /
                n_total), "time for epoch:", (datetime.now() - t0)
            costs.append(cost)

        plt.plot(costs)
        plt.show()