def __init__(self, V, D, K, activation):
        self.D = D
        self.f = activation

        # word embedding:
        We = init_weight(V, D)

        # linear terms:
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)

        # bias
        bh = np.zeros(D)

        # output layer
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        # make the updatable tensorflow variables
        self.We = tf.Variable(We.astype(np.float32))
        self.W1 = tf.Variable(W1.astype(np.float32))
        self.W2 = tf.Variable(W2.astype(np.float32))
        self.bh = tf.Variable(bh.astype(np.float32))
        self.Wo = tf.Variable(Wo.astype(np.float32))
        self.bo = tf.Variable(bo.astype(np.float32))
        self.params = [self.We, self.W1, self.W2, self.Wo]  # weight matrices only; the biases are presumably left out on purpose
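
Every snippet on this page calls an init_weight helper that is not included here. A minimal sketch, assuming the usual scaled random-normal initialization (the original helper may differ):

import numpy as np

def init_weight(Mi, Mo):
    # assumption: scaled Gaussian init; the original helper is not shown
    return np.random.randn(Mi, Mo) / np.sqrt(Mi + Mo)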
Example #2
    def __init__(self, Mi, Mo, activation):
        self.Mi = Mi
        self.Mo = Mo
        self.f = activation

        # numpy init
        Wxr = init_weight(Mi, Mo)  # reset gate: input -> hidden
        Whr = init_weight(Mo, Mo)  # reset gate: hidden -> hidden
        br = np.zeros(Mo)
        Wxz = init_weight(Mi, Mo)  # update gate: input -> hidden
        Whz = init_weight(Mo, Mo)  # update gate: hidden -> hidden
        bz = np.zeros(Mo)
        Wxh = init_weight(Mi, Mo)  # candidate state: input -> hidden
        Whh = init_weight(Mo, Mo)  # candidate state: hidden -> hidden
        bh = np.zeros(Mo)
        h0 = np.zeros(Mo)  # initial hidden state

        # theano vars
        self.Wxr = theano.shared(Wxr)
        self.Whr = theano.shared(Whr)
        self.br = theano.shared(br)
        self.Wxz = theano.shared(Wxz)
        self.Whz = theano.shared(Whz)
        self.bz = theano.shared(bz)
        self.Wxh = theano.shared(Wxh)
        self.Whh = theano.shared(Whh)
        self.bh = theano.shared(bh)
        self.h0 = theano.shared(h0)
        self.params = [
            self.Wxr, self.Whr, self.br, self.Wxz, self.Whz, self.bz, self.Wxh,
            self.Whh, self.bh, self.h0
        ]
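
The fit() methods further down call ru.output(Z) on this unit, but the method itself is not part of the snippet. A minimal sketch of a standard GRU forward pass over a sequence, assuming the parameter names defined above and the usual theano / theano.tensor as T imports:

    def recurrence(self, x_t, h_t1):
        # standard GRU step: reset gate, update gate, candidate state
        r = T.nnet.sigmoid(x_t.dot(self.Wxr) + h_t1.dot(self.Whr) + self.br)
        z = T.nnet.sigmoid(x_t.dot(self.Wxz) + h_t1.dot(self.Whz) + self.bz)
        hhat = self.f(x_t.dot(self.Wxh) + (r * h_t1).dot(self.Whh) + self.bh)
        return (1 - z) * h_t1 + z * hhat

    def output(self, x):
        # x: sequence of input vectors (T_seq x Mi); returns the T_seq x Mo hidden states
        h, _ = theano.scan(
            fn=self.recurrence,
            sequences=x,
            outputs_info=[self.h0],
            n_steps=x.shape[0],
        )
        return h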
Example #3
    def fit(self,
            trees,
            test_trees,
            reg=1e-3,
            epochs=8,
            train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        N = len(trees)

        We = init_weight(V, D)
        W11 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W22 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W12 = np.random.randn(D, D, D) / np.sqrt(3 * D)
        W1 = init_weight(D, D)
        W2 = init_weight(D, D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = tf.Variable(We.astype(np.float32))
        self.W11 = tf.Variable(W11.astype(np.float32))
        self.W22 = tf.Variable(W22.astype(np.float32))
        self.W12 = tf.Variable(W12.astype(np.float32))
        self.W1 = tf.Variable(W1.astype(np.float32))
        self.W2 = tf.Variable(W2.astype(np.float32))
        self.bh = tf.Variable(bh.astype(np.float32))
        self.Wo = tf.Variable(Wo.astype(np.float32))
        self.bo = tf.Variable(bo.astype(np.float32))
        self.weights = [
            self.We, self.W11, self.W22, self.W12, self.W1, self.W2, self.Wo
        ]

        words = tf.placeholder(tf.int32, shape=(None, ), name='words')
        left_children = tf.placeholder(tf.int32,
                                       shape=(None, ),
                                       name='left_children')
        right_children = tf.placeholder(tf.int32,
                                        shape=(None, ),
                                        name='right_children')
        labels = tf.placeholder(tf.int32, shape=(None, ), name='labels')

        # save for later
        self.words = words
        self.left = left_children
        self.right = right_children
        self.labels = labels

        def dot1(a, B):
            return tf.tensordot(a, B, axes=[[0], [1]])

        def dot2(B, a):
            return tf.tensordot(B, a, axes=[[1], [0]])

        def recursive_net_transform(hiddens, n):
            h_left = hiddens.read(left_children[n])
            h_right = hiddens.read(right_children[n])
            return self.f(
                dot1(h_left, dot2(self.W11, h_left)) +
                dot1(h_right, dot2(self.W22, h_right)) +
                dot1(h_left, dot2(self.W12, h_right)) + dot1(h_left, self.W1) +
                dot1(h_right, self.W2) + self.bh)

        def recurrence(hiddens, n):
            w = words[n]
            # any non-word (internal) node has word index -1
            h_n = tf.cond(w >= 0, lambda: tf.nn.embedding_lookup(self.We, w),
                          lambda: recursive_net_transform(hiddens, n))
            hiddens = hiddens.write(n, h_n)
            n = tf.add(n, 1)
            return hiddens, n

        def condition(hiddens, n):
            # loop should continue while n < len(words)
            return tf.less(n, tf.shape(words)[0])

        hiddens = tf.TensorArray(
            tf.float32,
            size=0,
            dynamic_size=True,
            clear_after_read=False,
            infer_shape=False,
        )

        hiddens, _ = tf.while_loop(condition,
                                   recurrence,
                                   [hiddens, tf.constant(0)],
                                   parallel_iterations=1)

        h = hiddens.stack()
        logits = tf.matmul(h, self.Wo) + self.bo

        prediction_op = tf.argmax(logits, axis=1)
        self.prediction_op = prediction_op

        rcost = reg * sum(tf.nn.l2_loss(p) for p in self.weights)
        if train_inner_nodes:
            labeled_indices = tf.where(labels >= 0)

            cost_op = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=tf.gather(logits, labeled_indices),
                    labels=tf.gather(labels, labeled_indices),
                )) + rcost
        else:
            cost_op = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits[-1],
                    labels=labels[-1],
                )) + rcost

        # might have to swap out for the momentum optimizer if using a GPU
        train_op = tf.train.AdagradOptimizer(
            learning_rate=1e-4).minimize(cost_op)

        self.session = tf.InteractiveSession()
        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

        costs = []
        sequence_indexes = list(range(N))
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            n_total = 0
            cost = 0
            it = 0
            # Use a single example per update - stochastic gradient descent.
            for j in sequence_indexes:
                words_, left, right, lab = trees[j]
                # print("words_:", words_)
                # print("lab:", lab)

                c, p, _ = self.session.run(
                    (cost_op, prediction_op, train_op),
                    feed_dict={
                        words: words_,
                        left_children: left,
                        right_children: right,
                        labels: lab
                    })
                if np.isnan(c):
                    print("Cost is nan! try decreasing the learning rate.")
                    for w in self.weights:
                        print(self.session.run(w).sum())
                    exit()
                cost += c
                n_correct += (p[-1] == lab[-1])
                n_total += 1

                it += 1
                if it % 10 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                        % (it, N, float(n_correct) / n_total, cost))
                    sys.stdout.flush()

            # calculate the test score once per epoch:
            n_test_correct = 0
            n_test_total = 0
            for words_, left, right, lab in test_trees:
                p = self.session.run(prediction_op,
                                     feed_dict={
                                         words: words_,
                                         left_children: left,
                                         right_children: right,
                                         labels: lab
                                     })
                n_test_correct += (p[-1] == lab[-1])
                n_test_total += 1

            print("i:", i, "cost:", cost, "train acc:",
                  float(n_correct) / n_total, "test acc:",
                  float(n_test_correct) / n_test_total, "time for epoch:",
                  (datetime.now() - t0))
            costs.append(cost)

        print("costs: ", costs)
        plt.plot(costs)
        plt.show()
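
The while-loop recursion above assumes each tree is flattened into four parallel arrays in bottom-up order: leaves carry a word index >= 0, internal nodes carry -1 and reference earlier positions through left_children / right_children, and the root comes last. A hypothetical two-word example consistent with that reading (indices and labels are made up, and "model" stands for an already constructed instance of this class):

# hypothetical tree encoding, not taken from the original data loader
words_ = [12, 47, -1]   # two leaf words, then the root (not a word)
left   = [-1, -1,  0]   # children are only read for nodes with word index -1
right  = [-1, -1,  1]
labels = [-1, -1,  4]   # only the root is labeled (class 4); -1 = unlabeled
model.fit([(words_, left, right, labels)], [(words_, left, right, labels)])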
    def fit(self,
            X,
            Y,
            learning_rate=1e-4,
            mu=0.99,
            epochs=20,
            show_fig=True,
            activation=T.nnet.relu,
            RecurrentUnit=GRU,
            normalize=False):
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_sizes:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, self.K)
        bo = np.zeros(self.K)

        self.We = theano.shared(We)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')
        thY = T.ivector('Y')

        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

        testf = theano.function(
            inputs=[thX],
            outputs=py_x,
        )
        testout = testf(X[0])
        print "py_x.shape:", testout.shape

        prediction = T.argmax(py_x, axis=1)

        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))

        dWe = theano.shared(self.We.get_value() * 0)
        gWe = T.grad(cost, self.We)
        dWe_update = mu * dWe - learning_rate * gWe
        We_update = self.We + dWe_update
        if normalize:
            We_update /= We_update.norm(2)

        updates = [
            update for param in self.params
            for update in rmsprop_updates(cost, param, learning_rate, mu)
        ] + [(self.We, We_update), (dWe, dWe_update)]

        self.cost_predict_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(inputs=[thX, thY],
                                        outputs=[cost, prediction],
                                        updates=updates)

        costs = []
        sequence_indexes = list(range(N))
        n_total = sum(len(y) for y in Y)
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                c, p = self.train_op(X[j], Y[j])
                cost += c
                n_correct += np.sum(p == Y[j])
                it += 1
                if it % 200 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                        % (it, N, float(n_correct) / n_total, cost))
                    sys.stdout.flush()
            print("i:", i + 1, "cost:", cost, "correct rate:",
                  (float(n_correct) / n_total), "time for epoch:",
                  (datetime.now() - t0))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
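
The updates list above relies on an rmsprop_updates helper that is not shown. A minimal sketch, assuming it returns a list of (shared_variable, new_value) pairs for one parameter (RMSprop cache plus a momentum term):

def rmsprop_updates(cost, param, learning_rate, mu, decay=0.999, eps=1e-10):
    # assumption: not the original helper
    g = T.grad(cost, param)
    cache = theano.shared(param.get_value() * 0)
    velocity = theano.shared(param.get_value() * 0)
    new_cache = decay * cache + (1 - decay) * g * g
    new_velocity = mu * velocity - learning_rate * g / T.sqrt(new_cache + eps)
    return [(cache, new_cache), (velocity, new_velocity), (param, param + new_velocity)]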
Example #5
# ================= prepare the parameters and components the model needs ======================
# inputs
inputs = tf.placeholder(
    tf.int32,
    shape=(None, sequence_length))  # input is a tensor of shape N x T
targets = tf.placeholder(
    tf.int32,
    shape=(None, sequence_length))  # target is a tensor of shape N x T
num_sample = tf.shape(inputs)[0]  # useful for later

# embedding
We = np.random.randn(V, embedding_dim).astype(np.float32)

# output layer
Wo = init_weight(hidden_layer_size, K).astype(np.float32)
bo = np.zeros(K).astype(np.float32)

# make them tensorflow variables
tfWe = tf.Variable(We)
tfWo = tf.Variable(Wo)
tfbo = tf.Variable(bo)

# make the rnn unit
rnn_unit = GRUCell(num_units=hidden_layer_size, activation=tf.nn.relu)

# ================ model + cost + solver  =======================
# get the output from the embedding layer
x = tf.nn.embedding_lookup(tfWe, inputs)  # x is a tensor of shape  N x T x M

# converts x from a tensor of shape N x T x M
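
The snippet is cut off at this point. A plausible continuation (an assumption, not recovered from the original) unstacks x along the time axis, runs the GRU with tf.nn.static_rnn, and applies the output layer per time step:

# assumption: not part of the original snippet
x = tf.unstack(x, sequence_length, 1)  # list of T tensors, each of shape N x M
outputs, states = tf.nn.static_rnn(rnn_unit, x, dtype=tf.float32)
outputs = tf.reshape(tf.stack(outputs, axis=1), (-1, hidden_layer_size))  # (N*T) x M
logits = tf.matmul(outputs, tfWo) + tfbo  # (N*T) x K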
Example #6
    def fit(self,
            X,
            Y,
            learning_rate=1e-4,
            mu=0.99,
            epochs=30,
            show_fig=True,
            activation=T.nnet.relu,
            RecurrentUnit=GRU,
            normalize=False):
        ## ========== first prepare every weight matrix the model needs, plus the initial hidden value (h0) ===========
        D = self.D
        V = self.V
        N = len(X)

        We = init_weight(V, D)  # embedding matrix; note this layer has no bias
        self.hidden_layers = []
        Mi = D
        for Mo in self.hidden_layer_size:
            ru = RecurrentUnit(Mi, Mo, activation)
            self.hidden_layers.append(ru)
            Mi = Mo

        Wo = init_weight(Mi, self.K)
        bo = np.zeros(self.K)

        self.We = theano.shared(
            We)  # We is kept out of self.params because its weight update is handled separately below
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.Wo, self.bo]
        for ru in self.hidden_layers:
            self.params += ru.params

        thX = T.ivector('X')
        thY = T.ivector('Y')

        ## =========== step1 model ============================
        Z = self.We[thX]
        for ru in self.hidden_layers:
            Z = ru.output(Z)
        py_x = T.nnet.softmax(Z.dot(self.Wo) + self.bo)

        testf = theano.function(  # small sanity-check function just to inspect py_x.shape
            inputs=[thX],
            outputs=py_x,
        )
        print("py_x.shape:", testf(X[0]).shape)

        prediction = T.argmax(py_x, axis=1)

        ## ========== step2,3 cost and solver ===================
        cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY]))

        # update rule for We
        gWe = T.grad(cost, self.We)
        dWe = theano.shared(self.We.get_value() * 0)
        dWe_update = mu * dWe - learning_rate * gWe
        We_update = self.We + dWe_update
        if normalize:
            We_update /= We_update.norm(2)

        grads = T.grad(cost, self.params)
        dparams = [theano.shared(p.get_value() * 0) for p in self.params]

        # the update rules for all weight parameters are collected in the updates list
        updates = [(p, p + mu * dp - learning_rate * g)
                   for p, dp, g in zip(self.params, dparams, grads)] + [
                       (dp, mu * dp - learning_rate * g)
                       for dp, g in zip(dparams, grads)
                   ] + [(self.We, We_update), (dWe, dWe_update)]

        self.cost_predict_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[thX, thY],
            outputs=[cost, prediction],
            updates=updates,
        )

        # =========== Training Process  ==========
        costs = []
        sequence_indexes = list(range(N))
        n_total = sum(len(y) for y in Y)  # used to compute the accuracy
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0  # keeps track of which j we are on
            for j in sequence_indexes:
                c, p = self.train_op(X[j], Y[j])
                cost += c
                n_correct += np.sum(p == Y[j])
                it += 1
                if it % 200 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                        % (it, N, float(n_correct) / n_total, cost))
                    sys.stdout.flush()
            print("i:", i, "cost:", cost, "correct rate:",
                  (float(n_correct) / n_total), "time for epoch:",
                  (datetime.now() - t0))
            costs.append(cost)

        if show_fig:
            plt.plot(costs)
            plt.show()
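
A hedged usage sketch for the fit() above; only the fit() signature comes from the snippet, the constructor call is hypothetical. X is a list of integer word-index sequences and Y a list of equally long integer label sequences (e.g. POS tags):

# hypothetical usage, assuming a matching constructor exists
# model = RNN(D=10, hidden_layer_size=[20], V=vocab_size, K=n_classes)
# model.fit(X_train, Y_train, epochs=30, activation=T.nnet.relu, RecurrentUnit=GRU)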
    def fit(self,
            trees,
            learning_rate=3 * 1e-3,
            mu=0.99,
            reg=1e-4,
            epochs=15,
            activation=T.nnet.relu,
            train_inner_nodes=False):
        D = self.D
        V = self.V
        K = self.K
        self.f = activation
        N = len(trees)

        We = init_weight(V, D)
        Wh = np.random.randn(2, D, D) / np.sqrt(2 + D + D)
        bh = np.zeros(D)
        Wo = init_weight(D, K)
        bo = np.zeros(K)

        self.We = theano.shared(We)
        self.Wh = theano.shared(Wh)
        self.bh = theano.shared(bh)
        self.Wo = theano.shared(Wo)
        self.bo = theano.shared(bo)
        self.params = [self.We, self.Wh, self.bh, self.Wo, self.bo]

        words = T.ivector('words')
        parents = T.ivector('parents')
        relations = T.ivector('relations')
        labels = T.ivector('labels')

        def recurrence(n, hiddens, words, parents, relations):
            w = words[n]
            hiddens = T.switch(
                T.ge(w, 0), T.set_subtensor(hiddens[n], self.We[w]),
                T.set_subtensor(hiddens[n], self.f(hiddens[n] + self.bh)))

            r = relations[n]
            p = parents[n]
            hiddens = T.switch(
                T.ge(p, 0),
                T.set_subtensor(hiddens[p],
                                hiddens[p] + hiddens[n].dot(self.Wh[r])),
                hiddens)
            return hiddens

        hiddens = T.zeros((words.shape[0], D))

        h, _ = theano.scan(
            fn=recurrence,
            outputs_info=[hiddens],
            n_steps=words.shape[0],
            sequences=T.arange(words.shape[0]),
            non_sequences=[words, parents, relations],
        )

        py_x = T.nnet.softmax(h[-1].dot(self.Wo) + self.bo)
        prediction = T.argmax(py_x, axis=1)

        rcost = reg * T.mean([(p * p).sum() for p in self.params])
        if train_inner_nodes:
            cost = -T.mean(T.log(py_x[T.arange(labels.shape[0]),
                                      labels])) + rcost
        else:
            cost = -T.mean(T.log(py_x[-1, labels[-1]])) + rcost

        # grads = T.grad(cost, self.params)
        # dparams = [theano.shared(p.get_value()*0) for p in self.params]
        #
        # updates = [
        #     (p, p * mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads)
        # ] + [
        #     (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads)
        # ]

        updates = adagrad(cost, self.params, lr=1e-4)

        self.cost_predict_op = theano.function(
            inputs=[words, parents, relations, labels],
            outputs=[cost, prediction],
            allow_input_downcast=True,
        )

        self.train_op = theano.function(
            inputs=[words, parents, relations, labels],
            outputs=[h, cost, prediction],
            updates=updates)

        costs = []
        sequence_indexes = list(range(N))
        if train_inner_nodes:
            n_total = sum(len(words) for words, _, _, _ in trees)
        else:
            n_total = N
        for i in range(epochs):
            t0 = datetime.now()
            sequence_indexes = shuffle(sequence_indexes)
            n_correct = 0
            cost = 0
            it = 0
            for j in sequence_indexes:
                words, par, rel, lab = trees[j]
                _, c, p = self.train_op(words, par, rel, lab)
                cost += c
                if train_inner_nodes:
                    n_correct += np.sum(p == lab)
                else:
                    n_correct += (p[-1] == lab[-1])
                it += 1
                if it % 1 == 0:
                    sys.stdout.write(
                        "j/N: %d/%d correct rate so far: %f, cost so far: %f\r"
                        % (it, N, float(n_correct) / n_total, cost))
                    sys.stdout.flush()
            print("i:", i, "cost:", cost, "correct rate:",
                  (float(n_correct) / n_total), "time for epoch:",
                  (datetime.now() - t0))
            costs.append(cost)

        print('costs:', costs)
        plt.plot(costs)
        plt.show()
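
The adagrad helper used for the updates above is likewise not included in the snippet. A minimal sketch of the standard Adagrad rule it presumably implements (accumulated squared gradients per parameter):

def adagrad(cost, params, lr=1e-4, eps=1e-10):
    # assumption: not the original helper
    grads = T.grad(cost, params)
    caches = [theano.shared(p.get_value() * 0) for p in params]
    new_caches = [c + g * g for c, g in zip(caches, grads)]
    return [(c, new_c) for c, new_c in zip(caches, new_caches)] + [
        (p, p - lr * g / T.sqrt(new_c + eps))
        for p, g, new_c in zip(params, grads, new_caches)
    ]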