def training(self, data_set, correct_output, n=0.2, epochs=1000):
    """ Trains the NeuralNetwork with the data set.

    Args:
        data_set: matrix whose rows are the input vectors.
        correct_output: the expected output for each training example.
        n: the learning rate.
        epochs: number of training iterations; each iteration trains on one
            randomly chosen example from the data set.
    """
    # File to write the error history
    f = open("graphics/error_output.txt", "w")
    data_set = self.insert_bias(data_set)
    last_errors = []
    for epoch in range(epochs):
        if epoch % 1000 == 0:
            print("Epoch: {}".format(epoch))
        random_index = np.random.randint(data_set.shape[0])
        # layer_data: [w0, w1, w2, output]
        layer_data = [data_set[random_index]]
        # Forward pass: calculate the output of every layer
        for layer in range(len(self.weights)):
            dot_value = np.dot(layer_data[layer], self.weights[layer])
            activation = utils.tanh(dot_value)
            layer_data.append(activation)
        # layer_data now contains: [ [outputs from input layer (inputs)],
        #   [outputs from hidden layer(s)], [output from output layer] ]
        # Calculate the error of the output layer
        error = correct_output[random_index] - layer_data[-1]
        average_error = abs(np.average(error))
        last_errors.append(average_error)
        if len(last_errors) == 10:
            last_errors_average = np.average(last_errors)
            f.write("{} {}\n".format(epoch, last_errors_average))
            if last_errors_average < 0.001:
                print(last_errors_average)
                break
            last_errors = []
        deltas = [error * utils.dtanh(layer_data[-1])]
        # Calculate the deltas for the hidden layers
        for l in range(len(layer_data) - 2, 0, -1):
            deltas.append(
                deltas[-1].dot(self.weights[l].T) * utils.dtanh(layer_data[l])
            )
        deltas.reverse()
        # Backpropagate: update the weights of all the layers
        for i in range(len(self.weights)):
            layer = np.atleast_2d(layer_data[i])
            delta = np.atleast_2d(deltas[i])
            self.weights[i] += n * layer.T.dot(delta)
    f.close()
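# `training` above relies on a `utils` module and an `insert_bias` helper that
# are not shown. A minimal sketch of what they could look like, assuming that
# `dtanh` takes the tanh *output* (it is applied to `layer_data` entries, which
# are already activations) and that `insert_bias` prepends a bias column of
# ones; these names and shapes are assumptions, not taken from the original.
import numpy as np

def tanh(x):
    # Element-wise hyperbolic tangent activation.
    return np.tanh(x)

def dtanh(y):
    # Derivative of tanh expressed in terms of its output y = tanh(x).
    return 1.0 - y ** 2

def insert_bias(data_set):
    # Prepend a column of ones so the first weight of each neuron acts as a bias.
    ones = np.ones((data_set.shape[0], 1))
    return np.concatenate((ones, data_set), axis=1)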
def bptt(self, x, y):
    # The total number of time steps
    t_steps = len(x)
    self.null_deltas()
    f, i, o, c, c_curr, h, y_ = self.forward(x)
    # delta_y_ = y_ - 1 at the correct word, since 1 is the target
    # probability of choosing the correct word
    delta_y_ = y_
    delta_y_[np.arange(len(y)), y] -= 1.
    delta_h = np.zeros(h.shape)
    delta_c = np.zeros(c.shape)
    delta_f = np.zeros(f.shape)
    delta_i = np.zeros(i.shape)
    delta_o = np.zeros(o.shape)
    delta_c_curr = np.zeros(c_curr.shape)
    # For each output, going backwards in time...
    for t in np.arange(t_steps)[::-1]:
        # One-hot encoding of the input word at time step t
        x_t = np.zeros((self.word_dim, 1))
        x_t[x[t]] = 1
        delta_h[t] = np.dot(self.w_v.T, delta_y_[t]) + delta_h[t + 1]
        delta_c[t] = delta_c[t + 1] * f[t + 1] + delta_h[t] * o[t] * dtanh(c[t])
        delta_f[t] = delta_c[t] * c[t - 1] * dsigmoid(f[t])
        delta_i[t] = delta_c[t] * c_curr[t] * dsigmoid(i[t])
        delta_o[t] = delta_h[t] * dsigmoid(o[t]) * np.tanh(c[t])
        delta_c_curr[t] += delta_c[t] * i[t] * dtanh(c_curr[t])
        # W_v, b_v
        self.dLdWv += np.outer(delta_y_[t], h[t].T)
        self.dLdBv += delta_y_[t]
        # W_fx, W_fh, b_f
        self.dLdWfx += np.dot(delta_f[t], x_t.T)
        self.dLdWfh += np.dot(delta_f[t], h[t - 1].T)
        self.dLdBf += delta_f[t]
        # W_ix, W_ih, b_i
        self.dLdWix += np.dot(delta_i[t], x_t.T)
        self.dLdWih += np.dot(delta_i[t], h[t - 1].T)
        self.dLdBi += delta_i[t]
        # W_cx, W_ch, b_c
        self.dLdWcx += np.dot(delta_c_curr[t], x_t.T)
        self.dLdWch += np.dot(delta_c_curr[t], h[t - 1].T)
        self.dLdBc += delta_c_curr[t]
        # W_ox, W_oh, b_o
        self.dLdWox += np.dot(delta_o[t], x_t.T)
        self.dLdWoh += np.dot(delta_o[t], h[t - 1].T)
        self.dLdBo += delta_o[t]
    self.clip_gradients()
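# `bptt` accumulates gradients into the self.dLd* attributes and calls
# `null_deltas()` and `clip_gradients()`, which are not shown. A minimal sketch
# of what those two methods could look like, assuming the accumulators are
# NumPy arrays and that gradients are clipped element-wise to a fixed range;
# the clipping threshold of 5.0 is an assumption, not taken from the original.
import numpy as np

_GRAD_NAMES = ("dLdWv", "dLdBv",
               "dLdWfx", "dLdWfh", "dLdBf",
               "dLdWix", "dLdWih", "dLdBi",
               "dLdWcx", "dLdWch", "dLdBc",
               "dLdWox", "dLdWoh", "dLdBo")

def null_deltas(self):
    # Reset every accumulated gradient to zero before a new backward pass.
    for name in _GRAD_NAMES:
        getattr(self, name).fill(0.0)

def clip_gradients(self, threshold=5.0):
    # Clip each accumulated gradient element-wise to limit exploding gradients.
    for name in _GRAD_NAMES:
        grad = getattr(self, name)
        np.clip(grad, -threshold, threshold, out=grad)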
def compute_gradients(self, input, labels):
    [a1, az, ar, ahhat, ah, a2] = self.predict(input)
    error = (labels - a2)
    L = np.shape(input)[0]
    H = self.Nhidden
    dz = np.zeros((L, H))
    dr = np.zeros((L, H))
    dh = np.zeros((L, H))
    d1 = np.zeros((L, H))
    # ahm1 is ah from the previous time step
    ahm1 = np.concatenate((np.zeros((1, H)), ah[:-1, :]))
    d2 = error * dtanh(a2)
    # Backpropagate the output delta (not the raw error) through w2
    e2 = np.dot(d2, self.w2.T)
    dh_next = np.zeros((1, self.Nhidden))
    for i in range(L - 1, -1, -1):
        err = e2[i, :] + dh_next
        dz[i, :] = (err * ahhat[i, :] - err * ahm1[i, :]) * dsigm(az[i, :])
        dh[i, :] = err * az[i, :] * dtanh(ahhat[i, :])
        dr[i, :] = np.dot(dh[i, :], self.wh[:H, :].T) * ahm1[i, :] * dsigm(ar[i, :])
        dh_next = (err * (1 - az[i, :])
                   + np.dot(dh[i, :], self.wh[:H, :].T) * ar[i, :]
                   + np.dot(dz[i, :], self.wz[:H, :].T)
                   + np.dot(dr[i, :], self.wr[:H, :].T))
        d1[i, :] = (np.dot(dh[i, :], self.wh[H:, :].T)
                    + np.dot(dz[i, :], self.wz[H:, :].T)
                    + np.dot(dr[i, :], self.wr[H:, :].T))
    d1 = d1 * dtanh(a1)
    # All the deltas are computed; now compute the gradients
    gw2 = 1.0 / L * np.dot(ah.T, d2)
    gb2 = 1.0 / L * np.sum(d2, 0)
    x = np.concatenate((ahm1, a1), 1)
    gwz = 1.0 / L * np.dot(x.T, dz)
    gbz = 1.0 / L * np.sum(dz, 0)
    gwr = 1.0 / L * np.dot(x.T, dr)
    gbr = 1.0 / L * np.sum(dr, 0)
    x = np.concatenate((ar * ahm1, a1), 1)
    gwh = 1.0 / L * np.dot(x.T, dh)
    gbh = 1.0 / L * np.sum(dh, 0)
    gw1 = 1.0 / L * np.dot(input.T, d1)
    gb1 = 1.0 / L * np.sum(d1, 0)
    weight_grads = [gw1, gwr, gwz, gwh, gw2]
    bias_grads = [gb1, gbr, gbz, gbh, gb2]
    return weight_grads, bias_grads
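# A minimal sketch of how the returned gradients could be applied, assuming
# parameter lists named `self.weights` / `self.biases` ordered the same way as
# `weight_grads` / `bias_grads` (these list names are assumptions, not part of
# the original class). Because the error is defined as (labels - a2), the
# returned arrays already point in the descent direction, so they are added to
# the parameters rather than subtracted.
def apply_gradients(self, weight_grads, bias_grads, learning_rate=0.1):
    for w, gw in zip(self.weights, weight_grads):
        w += learning_rate * gw   # in-place update of each weight matrix
    for b, gb in zip(self.biases, bias_grads):
        b += learning_rate * gb   # in-place update of each bias vector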
def backward(self, target, dh_next, dC_next, C_prev,
             z, f, i, C_bar, C, o, h, v, y):
    # Gradient of the softmax + cross-entropy loss w.r.t. the output scores
    dv = np.copy(y)
    dv[target] -= 1

    # Output projection
    self.W_v.d += np.dot(dv, h.T)
    self.b_v.d += dv

    # Hidden state: gradient from the output plus the one flowing back in time
    dh = np.dot(self.W_v.v.T, dv)
    dh += dh_next

    # Output gate
    do = dh * utils.tanh(C)
    do = utils.dsigmoid(o) * do
    self.W_o.d += np.dot(do, z.T)
    self.b_o.d += do

    # Cell state and candidate cell state
    dC = np.copy(dC_next)
    dC += dh * o * utils.dtanh(utils.tanh(C))
    dC_bar = dC * i
    dC_bar = utils.dtanh(C_bar) * dC_bar
    self.W_C.d += np.dot(dC_bar, z.T)
    self.b_C.d += dC_bar

    # Input gate
    di = dC * C_bar
    di = utils.dsigmoid(i) * di
    self.W_i.d += np.dot(di, z.T)
    self.b_i.d += di

    # Forget gate
    df = dC * C_prev
    df = utils.dsigmoid(f) * df
    self.W_f.d += np.dot(df, z.T)
    self.b_f.d += df

    # Backpropagate into the concatenated input z = [h_prev; x]
    dz = (np.dot(self.W_f.v.T, df)
          + np.dot(self.W_i.v.T, di)
          + np.dot(self.W_C.v.T, dC_bar)
          + np.dot(self.W_o.v.T, do))
    dh_prev = dz[:self.h_size, :]
    dC_prev = f * dC
    return dh_prev, dC_prev
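# `backward` above reads parameter values through `.v` and accumulates
# gradients through `.d`, which suggests a small parameter container like the
# sketch below. The class name `Param`, the zero-initialised gradient, and the
# example shapes are assumptions, not taken from the original code.
import numpy as np

class Param:
    def __init__(self, value):
        self.v = value                   # parameter value
        self.d = np.zeros_like(value)    # accumulated gradient, same shape

# Example: an output projection from h_size hidden units to vocab_size scores.
# W_v = Param(np.random.randn(vocab_size, h_size) * 0.01)
# b_v = Param(np.zeros((vocab_size, 1)))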
def back_propagate(self, t, y):
    """ Refine the parameters of this output layer from the prediction error
    and propagate the residual to the previous layer

    :param t: the class vector of the current input
    :param y: the predicted output (activation) of this layer
    """
    # Delta of this layer
    self.delta = (y - t) * (y - y * y)
    # Compute the gradients
    partial_theta = numpy.dot(self.delta, self.x.transpose())
    partial_b = self.delta
    # Compute the residuals of the previous layer, i.e. the MLP layer
    self.prev_layer.delta = numpy.dot(self.theta.transpose(),
                                      self.delta) * dtanh(self.x)
    # Update the parameters
    self.theta -= self.learning_rate * partial_theta
    self.b -= self.learning_rate * partial_b
    # Continue back-propagating
    self.prev_layer.back_propagate()
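# The delta above multiplies the error by y * (1 - y), the derivative of a
# sigmoid output, so the matching forward pass presumably looks like the
# sketch below. The method name `forward` is an assumption; only `self.theta`,
# `self.b` and `self.x` appear in the original code.
import numpy

def forward(self, x):
    # Cache the input so back_propagate can use it as self.x,
    # then apply an affine map followed by a sigmoid.
    self.x = x
    z = numpy.dot(self.theta, self.x) + self.b
    return 1.0 / (1.0 + numpy.exp(-z))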
def back_propagate(self):
    """ Refine the parameters of this layer with residuals from the next layer """
    # Compute the gradients of the kernels and biases
    partial_theta = numpy.asarray([
        numpy.rot90(conv2d(
            sum(self.x_imgs[x] for x in self.connections[i]),
            numpy.rot90(self.delta[i], 2)
        ), 2)
        for i in range(len(self.connections))
    ])
    partial_b = numpy.asarray([numpy.sum(d) for d in self.delta])

    # If the previous layer is the input layer, there is nothing to propagate
    if isinstance(self.prev_layer, InputLayer):
        return

    # Compute the residuals of the previous (pooling) layer, building its
    # reverse connection map on the first pass
    if not self.prev_layer.connections:
        self.prev_layer.connections = [[] for _ in range(len(self.x_imgs))]
        for i in range(len(self.connections)):
            for c in self.connections[i]:
                self.prev_layer.connections[c].append(i)
    conv_full_res = numpy.asarray([
        conv2d(self.delta[i], numpy.rot90(self.theta[i], 2), border_mode='full')
        for i in range(len(self.theta))
    ])
    self.prev_layer.delta = numpy.asarray([
        dtanh(self.x_imgs[i]) * sum(conv_full_res[x]
                                    for x in self.prev_layer.connections[i])
        for i in range(len(self.x_imgs))
    ])

    # Update the weights and bias
    self.theta -= self.learning_rate * partial_theta
    self.b -= self.learning_rate * partial_b
    # Continue back-propagating
    self.prev_layer.back_propagate()
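# The layer above calls a `conv2d(image, kernel, border_mode=...)` helper that
# is not shown; the keyword matches Theano-style 2-D convolution. A minimal
# stand-in built on SciPy, assuming a true convolution and 'valid' as the
# default border mode (both assumptions, not confirmed by the original code).
from scipy.signal import convolve2d

def conv2d(image, kernel, border_mode='valid'):
    # 'valid' shrinks the output to where the kernel fully overlaps the image;
    # 'full' pads it, as used above when propagating deltas to the previous layer.
    return convolve2d(image, kernel, mode=border_mode)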