def training(self, data_set, correct_output, n=0.2, epochs=1000):
        """
        Trains the NeuralNetwork with the data set.
        Args:
            data_set: matrix whose rows are the input vectors.
            correct_output: the expected output for each training sample.
            n: the learning rate.
            epochs: number of training iterations; each iteration updates the
                weights with one randomly chosen sample.
        """

        # File to write error history
        f = open("graphics/error_output.txt", "w")
        data_set = self.insert_bias(data_set)
        last_errors = []

        for epoch in range(epochs):
            if epoch % 1000 == 0:
                print("Epoch: {}".format(epoch))
            random_index = np.random.randint(data_set.shape[0])

            # layer_data starts with the bias-augmented input row and will
            # accumulate each layer's activations
            layer_data = [data_set[random_index]]

            # Calculate output for hidden layers
            for layer in range(len(self.weights)):
                dot_value = np.dot(layer_data[layer], self.weights[layer])
                activation = utils.tanh(dot_value)
                layer_data.append(activation)
            # layer_data now contains: [ [outputs from input_layer(inputs)],
            # [outputs from hidden layer(s)], [output from output layer] ]

            # Calculate the error for output layer
            error = correct_output[random_index] - layer_data[-1]
            average_error = abs(np.average(error))
            last_errors.append(average_error)
            if len(last_errors) == 10:
                last_errors_average = np.average(last_errors)
                f.write("{} {}\n".format(epoch, last_errors_average))
                if last_errors_average < 0.001:
                    print(last_errors_average)
                    break
                last_errors = []
            deltas = [error * utils.dtanh(layer_data[-1])]

            # Calculate Deltas
            for l in range(len(layer_data) - 2, 0, -1):
                deltas.append(
                    deltas[-1].dot(self.weights[l].T)*utils.dtanh(layer_data[l])
                )
            deltas.reverse()

            # Backpropagate. Update the weights for all the layers
            for i in range(len(self.weights)):
                layer = np.atleast_2d(layer_data[i])
                delta = np.atleast_2d(deltas[i])
                self.weights[i] += n * layer.T.dot(delta)

        f.close()
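The snippet relies on a `utils` module that is not shown. A minimal sketch of the two helpers it calls, assuming the usual convention that the derivative is written in terms of the tanh output (which is what `utils.dtanh(layer_data[-1])` implies), could be:

# utils.py (hypothetical sketch)
import numpy as np

def tanh(x):
    """Hyperbolic tangent activation."""
    return np.tanh(x)

def dtanh(y):
    """Derivative of tanh, written in terms of the tanh output y."""
    return 1.0 - y ** 2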
Example #2
    def bptt(self, x, y):
        # The total number of time steps
        t_steps = len(x)
        self.null_deltas()
        f, i, o, c, c_curr, h, y_ = self.forward(x)

        # For softmax + cross-entropy, the gradient w.r.t. the output scores is
        # the predicted distribution with 1 subtracted at the correct word's index.
        delta_y_ = np.copy(y_)
        delta_y_[np.arange(len(y)), y] -= 1.
        delta_h = np.zeros(h.shape)
        delta_c = np.zeros(c.shape)
        delta_f = np.zeros(f.shape)
        delta_i = np.zeros(i.shape)
        delta_o = np.zeros(o.shape)
        delta_c_curr = np.zeros(c_curr.shape)

        # For each output backwards...
        for t in np.arange(t_steps)[::-1]:
            # one hot encoding
            x_t = np.zeros((self.word_dim, 1))
            x_t[x[t]] = 1

            delta_h[t] = np.dot(self.w_v.T, delta_y_[t]) + delta_h[t + 1]
            delta_c[t] = delta_c[t + 1] * f[t + 1] + delta_h[t] * o[t] * dtanh(
                c[t])
            delta_f[t] = delta_c[t] * c[t - 1] * dsigmoid(f[t])
            delta_i[t] = delta_c[t] * c_curr[t] * dsigmoid(i[t])
            delta_o[t] = delta_h[t] * dsigmoid(o[t]) * np.tanh(c[t])
            delta_c_curr[t] += delta_c[t] * i[t] * dtanh(c_curr[t])

            # W_v, b_v
            self.dLdWv += np.outer(delta_y_[t], h[t].T)
            self.dLdBv += delta_y_[t]

            # W_fx, W_fh, b_f
            self.dLdWfx += np.dot(delta_f[t], x_t.T)
            self.dLdWfh += np.dot(delta_f[t], h[t - 1].T)
            self.dLdBf += delta_f[t]

            # W_ix, W_ih, b_i
            self.dLdWix += np.dot(delta_i[t], x_t.T)
            self.dLdWih += np.dot(delta_i[t], h[t - 1].T)
            self.dLdBi += delta_i[t]

            # W_cx, W_ch, b_c
            self.dLdWcx += np.dot(delta_c_curr[t], x_t.T)
            self.dLdWch += np.dot(delta_c_curr[t], h[t - 1].T)
            self.dLdBc += delta_c_curr[t]

            # W_ox, W_oh, b_o
            self.dLdWox += np.dot(delta_o[t], x_t.T)
            self.dLdWoh += np.dot(delta_o[t], h[t - 1].T)
            self.dLdBo += delta_o[t]

        self.clip_gradients()
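`self.clip_gradients()` is not shown in this snippet. A plausible sketch, assuming element-wise clipping of every accumulated gradient to a fixed range (the attribute list and the threshold of 5.0 are assumptions, not the project's actual values):

    def clip_gradients(self, threshold=5.0):
        # Clip each accumulated gradient in place to keep BPTT numerically stable.
        for name in ("dLdWv", "dLdBv",
                     "dLdWfx", "dLdWfh", "dLdBf",
                     "dLdWix", "dLdWih", "dLdBi",
                     "dLdWcx", "dLdWch", "dLdBc",
                     "dLdWox", "dLdWoh", "dLdBo"):
            grad = getattr(self, name)
            np.clip(grad, -threshold, threshold, out=grad)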
Example #3
    def compute_gradients(self, input, labels):
        """
        Backpropagation through time for a GRU-style layer: returns the
        gradients of the prediction error w.r.t. every weight and bias,
        averaged over the L time steps of the input sequence.
        """
        [a1, az, ar, ahhat, ah, a2] = self.predict(input)
        error = (labels - a2)

        L = np.shape(input)[0]
        H = self.Nhidden
        dz = np.zeros((L, H))
        dr = np.zeros((L, H))
        dh = np.zeros((L, H))
        d1 = np.zeros((L, H))

        # this is ah from the previous timestep
        ahm1 = np.concatenate((np.zeros((1, H)), ah[:-1, :]))

        d2 = error * dtanh(a2)
        e2 = np.dot(error, self.w2.T)
        dh_next = np.zeros((1, self.Nhidden))
        for i in range(L - 1, -1, -1):
            err = e2[i, :] + dh_next
            dz[i, :] = (err * ahhat[i, :] - err * ahm1[i, :]) * dsigm(az[i, :])
            dh[i, :] = err * az[i, :] * dtanh(ahhat[i, :])
            dr[i, :] = np.dot(dh[i, :], self.wh[:H, :].T) * ahm1[i, :] * dsigm(
                ar[i, :])
            dh_next = err * (1 - az[i, :]) + np.dot(
                dh[i, :], self.wh[:H, :].T) * ar[i, :] + np.dot(
                    dz[i, :], self.wz[:H, :].T) + np.dot(
                        dr[i, :], self.wr[:H, :].T)
            d1[i, :] = np.dot(dh[i, :], self.wh[H:, :].T) + np.dot(
                dz[i, :], self.wz[H:, :].T) + np.dot(dr[i, :],
                                                     self.wr[H:, :].T)
        d1 = d1 * dtanh(a1)
        # all the deltas are computed, now compute the gradients
        gw2 = 1.0 / L * np.dot(ah.T, d2)
        gb2 = 1.0 / L * np.sum(d2, 0)
        x = np.concatenate((ahm1, a1), 1)
        gwz = 1.0 / L * np.dot(x.T, dz)
        gbz = 1.0 / L * np.sum(dz, 0)
        gwr = 1.0 / L * np.dot(x.T, dr)
        gbr = 1.0 / L * np.sum(dr, 0)
        x = np.concatenate((ar * ahm1, a1), 1)
        gwh = 1.0 / L * np.dot(x.T, dh)
        gbh = 1.0 / L * np.sum(dh, 0)
        gw1 = 1.0 / L * np.dot(input.T, d1)
        gb1 = 1.0 / L * np.sum(d1, 0)
        weight_grads = [gw1, gwr, gwz, gwh, gw2]
        bias_grads = [gb1, gbr, gbz, gbh, gb2]

        return weight_grads, bias_grads
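`compute_gradients` only returns the gradients; applying them is left to the caller. A minimal vanilla-gradient-descent step over the returned lists might look like the sketch below (the parameter attributes `self.w1`, `self.b1`, `self.br`, ... and the learning rate are assumptions inferred from the gradient names):

    def apply_gradients(self, weight_grads, bias_grads, lr=0.01):
        # Plain gradient descent; list order matches compute_gradients:
        # [w1, wr, wz, wh, w2] and [b1, br, bz, bh, b2].
        params = [self.w1, self.wr, self.wz, self.wh, self.w2]
        biases = [self.b1, self.br, self.bz, self.bh, self.b2]
        for p, g in zip(params, weight_grads):
            p -= lr * g   # in-place update keeps the bound arrays
        for b, g in zip(biases, bias_grads):
            b -= lr * g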
Example #4
    def backward(self, target, dh_next, dC_next, C_prev, z, f, i, C_bar, C, o,
                 h, v, y):
        # Gradient of the cross-entropy loss w.r.t. the output scores:
        # copy the softmax output and subtract 1 at the target index.
        dv = np.copy(y)
        dv[target] -= 1

        self.W_v.d += np.dot(dv, h.T)
        self.b_v.d += dv

        # Hidden-state gradient: contribution from the output layer plus the
        # gradient carried back from the next time step.
        dh = np.dot(self.W_v.v.T, dv)
        dh += dh_next
        do = dh * utils.tanh(C)
        do = utils.dsigmoid(o) * do
        self.W_o.d += np.dot(do, z.T)
        self.b_o.d += do

        # Cell-state gradient: carried over from the next step plus the path
        # through the output gate and the tanh of C.
        dC = np.copy(dC_next)
        dC += dh * o * utils.dtanh(utils.tanh(C))
        dC_bar = dC * i
        dC_bar = utils.dtanh(C_bar) * dC_bar
        self.W_C.d += np.dot(dC_bar, z.T)
        self.b_C.d += dC_bar

        di = dC * C_bar
        di = utils.dsigmoid(i) * di
        self.W_i.d += np.dot(di, z.T)
        self.b_i.d += di

        df = dC * C_prev
        df = utils.dsigmoid(f) * df
        self.W_f.d += np.dot(df, z.T)
        self.b_f.d += df

        # z is the stacked [h_prev; x] input, so its gradient splits into
        # dh_prev (top h_size rows) and the gradient w.r.t. the raw input.
        dz = (np.dot(self.W_f.v.T, df) + np.dot(self.W_i.v.T, di) +
              np.dot(self.W_C.v.T, dC_bar) + np.dot(self.W_o.v.T, do))
        dh_prev = dz[:self.h_size, :]
        dC_prev = f * dC

        return dh_prev, dC_prev
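`backward` covers a single time step. Over a sequence it would normally be driven from the last step to the first, feeding the returned `dh_prev` and `dC_prev` back in as `dh_next` and `dC_next`. A sketch of such a driver, assuming the per-step forward values are cached in a list (the cache layout is an assumption):

    def backward_through_time(self, targets, caches):
        # caches[t] is assumed to hold (z, f, i, C_bar, C, o, h, v, y) for step t.
        dh_next = np.zeros((self.h_size, 1))
        dC_next = np.zeros((self.h_size, 1))
        for t in reversed(range(len(targets))):
            z, f, i, C_bar, C, o, h, v, y = caches[t]
            C_prev = caches[t - 1][4] if t > 0 else np.zeros_like(C)
            dh_next, dC_next = self.backward(targets[t], dh_next, dC_next,
                                             C_prev, z, f, i, C_bar, C, o,
                                             h, v, y)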
Example #5
    def back_propagate(self, t, y):
        """
        Compute this output layer's residual from the target vector, update its
        parameters, and back-propagate the residual to the previous layer.

        :param t: the target class vector for the current input
        :param y: the output of this layer for the current input
        """
        # delta of this layer
        self.delta = (y - t)*(y - y*y)
        # compute gradient
        partial_theta = numpy.dot(self.delta, self.x.transpose())
        partial_b = self.delta
        # compute residuals of the previous layer, i.e., the mlp layer
        self.prev_layer.delta = numpy.dot(self.theta.transpose(), self.delta)*dtanh(self.x)
        # update
        self.theta -= self.learning_rate*partial_theta
        self.b -= self.learning_rate*partial_b
        # continue back propagating
        self.prev_layer.back_propagate()
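The factor `y - y*y` equals `y * (1 - y)`, the derivative of the logistic sigmoid written in terms of its output, so `self.delta` is consistent with the gradient of a squared-error loss taken through a sigmoid output unit. A quick finite-difference check of that identity (purely illustrative, independent of the class above):

import numpy

def sigmoid(z):
    return 1.0 / (1.0 + numpy.exp(-z))

z = numpy.array([0.3, -1.2, 2.0])
t = numpy.array([1.0, 0.0, 1.0])
y = sigmoid(z)
eps = 1e-6
# directional derivative of E = 0.5 * sum((sigmoid(z) - t)**2) along (1, 1, 1)
numeric = (0.5 * numpy.sum((sigmoid(z + eps) - t) ** 2)
           - 0.5 * numpy.sum((sigmoid(z - eps) - t) ** 2)) / (2 * eps)
analytic = numpy.sum((y - t) * (y - y * y))
print(numeric, analytic)   # the two values should agree closely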
Example #6
    def back_propagate(self):
        """
        Refine the parameters of this layer with the residuals from the next layer
        """
        # gradient of each convolution kernel and of the per-map bias
        partial_theta = numpy.asarray([
            numpy.rot90(conv2d(
                sum(self.x_imgs[x] for x in self.connections[i]),
                numpy.rot90(self.delta[i], 2)
            ), 2)
            for i in range(len(self.connections))
        ])
        partial_b = numpy.asarray([numpy.sum(d) for d in self.delta])
        # if the previous layer is the input layer, there is nothing to propagate
        if isinstance(self.prev_layer, InputLayer):
            return
        # compute residuals of the previous pooling layer; build the inverse
        # connection table on first use
        if not self.prev_layer.connections:
            self.prev_layer.connections = [[] for _ in range(len(self.x_imgs))]
            for i in range(len(self.connections)):
                for c in self.connections[i]:
                    self.prev_layer.connections[c].append(i)
        conv_full_res = numpy.asarray([
            conv2d(self.delta[i], numpy.rot90(self.theta[i], 2),
                   border_mode='full')
            for i in range(len(self.theta))
        ])
        self.prev_layer.delta = numpy.asarray([
            dtanh(self.x_imgs[i]) * sum(conv_full_res[x]
                                        for x in self.prev_layer.connections[i])
            for i in range(len(self.x_imgs))
        ])
        # update weights and bias
        self.theta -= self.learning_rate * partial_theta
        self.b -= self.learning_rate * partial_b
        # continue back propagating
        self.prev_layer.back_propagate()
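The block that fills `self.prev_layer.connections` simply inverts the connection table: if output map `i` of this layer reads from the input maps listed in `self.connections[i]`, each input map ends up knowing which output maps consume it. A standalone illustration of that inversion, with made-up indices:

connections = [[0, 1], [1, 2], [0, 2, 3]]   # output map i <- these input maps
n_input_maps = 4
inverse = [[] for _ in range(n_input_maps)]
for i, sources in enumerate(connections):
    for c in sources:
        inverse[c].append(i)
print(inverse)   # [[0, 2], [0, 1], [1, 2], [2]]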