def execute_backward_pass(self, d_next_layer):
    """
    Calculates the gradient of the cost with respect to the input of this layer
    and updates the filter weights.

    Arguments:
    - d_next_layer: gradient of the cost with respect to the output of this
      layer, as propagated back from the next layer.

    Returns:
    - the gradient of the cost with respect to the input of this layer.
    """
    # Backpropagate through the ReLU non-linearity first.
    d_next_layer = d_next_layer * relu_derivative(self.output)

    doutput = np.zeros(self.inputs.shape)    # gradient w.r.t. this layer's inputs
    dfilters = np.zeros(self.filters.shape)  # gradient w.r.t. the filter weights
    filter_index = 0
    for input_index in range(self.inputs.shape[0]):
        for y in range(self.filter_size[1]):
            for x in range(self.filter_size[0]):
                # Gradient flowing back into the input patch covered by the filter.
                doutput[input_index,
                        y:y + self.filter_size[1],
                        x:x + self.filter_size[0]] += (self.filters[filter_index] *
                                                       d_next_layer[filter_index, y, x])
                # Gradient w.r.t. the filter is the input patch scaled by the
                # corresponding output gradient.
                dfilters[filter_index, :, :] += (self.inputs[input_index,
                                                             y:y + self.filter_size[1],
                                                             x:x + self.filter_size[0]] *
                                                 d_next_layer[filter_index, y, x])
        # Cycle through the filters, one per input feature map.
        if filter_index < self.n_filters - 1:
            filter_index += 1
        else:
            filter_index = 0

    # Gradient descent step on the filters.
    self.filters -= self.learning_rate * dfilters
    return doutput
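# The layer above assumes a free-standing `relu_derivative` helper that is not
# shown here. A minimal sketch of such a helper, assuming ReLU is applied
# elementwise to NumPy arrays (an assumption, not part of the original source):
import numpy as np

def relu_derivative(x):
    # 1 where the activation is positive, 0 elsewhere.
    return (x > 0).astype(x.dtype)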
def backpropagate(self, x, y):
    gradients_b = [np.zeros(b.shape) for b in self.biases]
    gradients_w = [np.zeros(w.shape) for w in self.weights]

    # Feedforward
    a = x
    a_list = [x]
    z_list = []
    for b, w in zip(self.biases[0:-1], self.weights[0:-1]):
        z = np.dot(w, a) + b
        a = utils.relu(z)
        a_list.append(a)
        z_list.append(z)
    z = np.dot(self.weights[-1], a_list[-1]) + self.biases[-1]
    a = utils.softmax(z)
    a_list.append(a)
    z_list.append(z)

    # Backward pass
    # For a softmax/cross-entropy output layer: delta in the last layer = prediction - ground truth
    delta = a_list[-1] - y

    # Gradients of b and w for the last layer L
    gradients_b[-1] = delta
    gradients_w[-1] = np.dot(delta, a_list[-2].transpose())

    # Gradients of b and w for the remaining layers L-1, L-2, ...
    for l in range(2, self.num_layers):
        z = z_list[-l]  # weighted input of the l-th layer from the end
        r_derivative = utils.relu_derivative(z)
        # delta(l) = transpose of w(l+1) * delta(l+1), scaled by the ReLU derivative
        delta = np.dot(self.weights[-l + 1].transpose(), delta) * r_derivative
        gradients_b[-l] = delta
        gradients_w[-l] = np.dot(delta, a_list[-l - 1].transpose())
    return (gradients_b, gradients_w)
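# A hedged usage sketch: applying the gradients returned above in a plain SGD
# step. The wrapper method and the learning rate `eta` are illustrative
# assumptions, not part of the original class.
def update_from_example(self, x, y, eta=0.01):
    gradients_b, gradients_w = self.backpropagate(x, y)
    self.weights = [w - eta * gw for w, gw in zip(self.weights, gradients_w)]
    self.biases = [b - eta * gb for b, gb in zip(self.biases, gradients_b)]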
def back_step(self, xc, grad, state, h_prev, cell_prev):
    # Softmax output layer
    self.params.wk_diff = np.dot(state.h, grad.T)
    self.params.bk_diff = grad

    # ReLU layer
    dr_before = util.relu_derivative(np.dot(self.params.wk, grad))
    self.params.wr_diff += np.dot(dr_before.T,
                                  np.dot(state.h, self.params.wr) + self.params.br)
    self.params.br_diff += dr_before

    # Gradient w.r.t. the hidden state
    state.diff_h_values = np.dot(dr_before, self.params.wr)
    state.diff_h_values += state.h

    # Output gate
    do = np.multiply(state.diff_h_values, util.tanh_normal(state.cell_values))
    db_o = np.multiply(do, np.multiply(state.output_values, (1 - state.output_values)))
    self.params.wo_diff += np.dot(db_o, xc.T)
    self.params.bo_diff += db_o

    # Cell state
    dc = np.multiply(state.diff_h_values,
                     np.multiply(state.output_values,
                                 (1 - util.tanh_normal(state.cell_values) ** 2)))
    dc += state.cell_values
    dc_temp = np.multiply(dc, state.input_values)
    db_c = np.multiply(dc_temp, (1 - state.cell_temp_values ** 2))
    self.params.wg_diff += np.dot(db_c, xc.T)
    self.params.bg_diff += db_c

    # Input gate
    di = np.multiply(dc, state.cell_temp_values)
    db_i = np.multiply(di, np.multiply(state.input_values, (1 - state.input_values)))
    self.params.wi_diff += np.dot(db_i, xc.T)
    self.params.bi_diff += db_i

    # Forget gate
    df = np.multiply(dc, cell_prev)
    db_f = np.multiply(df, np.multiply(state.forget_values, (1 - state.forget_values)))
    self.params.wf_diff += np.dot(db_f, xc.T)
    self.params.bf_diff += db_f

    # Gradient w.r.t. the concatenated input xc, returned so the caller can
    # propagate it to the previous time step / layer.
    dxc = (np.dot(self.params.wf.T, db_f) + np.dot(self.params.wi.T, db_i) +
           np.dot(self.params.wg.T, db_c) + np.dot(self.params.wo.T, db_o))
    return dxc
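# The LSTM step above relies on `util.tanh_normal` and stores gate activations
# (sigmoid outputs), so gate derivatives use v * (1 - v). A minimal sketch of
# the assumed helper, presuming it is just the elementwise tanh (an assumption
# about the missing `util` module):
import numpy as np

def tanh_normal(x):
    return np.tanh(x)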
def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmoid):
    """[Figure 18.23] The back-propagation algorithm for multilayer networks"""
    # Initialise weights
    for layer in net:
        for node in layer:
            node.weights = random_weights(min_value=-0.5, max_value=0.5,
                                          num_weights=len(node.weights))

    examples = dataset.examples
    '''
    As of now dataset.target gives an int instead of a list;
    changing the dataset class would affect all the learners.
    Will be taken care of later.
    '''
    o_nodes = net[-1]
    i_nodes = net[0]
    o_units = len(o_nodes)
    idx_t = dataset.target
    idx_i = dataset.inputs
    n_layers = len(net)

    inputs, targets = init_examples(examples, idx_i, idx_t, o_units)

    for epoch in range(epochs):
        # Iterate over each example
        for e in range(len(examples)):
            i_val = inputs[e]
            t_val = targets[e]

            # Activate input layer
            for v, n in zip(i_val, i_nodes):
                n.value = v

            # Forward pass
            for layer in net[1:]:
                for node in layer:
                    inc = [n.value for n in node.inputs]
                    in_val = dotproduct(inc, node.weights)
                    node.value = node.activation(in_val)

            # Initialize delta
            delta = [[] for _ in range(n_layers)]

            # Compute outer layer delta

            # Error for the MSE cost function
            err = [t_val[i] - o_nodes[i].value for i in range(o_units)]

            # Output-layer delta depends on the chosen activation function
            if activation == sigmoid:
                delta[-1] = [sigmoid_derivative(o_nodes[i].value) * err[i]
                             for i in range(o_units)]
            elif activation == relu:
                delta[-1] = [relu_derivative(o_nodes[i].value) * err[i]
                             for i in range(o_units)]
            elif activation == tanh:
                delta[-1] = [tanh_derivative(o_nodes[i].value) * err[i]
                             for i in range(o_units)]
            elif activation == elu:
                delta[-1] = [elu_derivative(o_nodes[i].value) * err[i]
                             for i in range(o_units)]
            else:
                delta[-1] = [leaky_relu_derivative(o_nodes[i].value) * err[i]
                             for i in range(o_units)]

            # Backward pass
            h_layers = n_layers - 2
            for i in range(h_layers, 0, -1):
                layer = net[i]
                h_units = len(layer)
                nx_layer = net[i + 1]

                # Weights from each ith layer node to each (i + 1)th layer node
                w = [[node.weights[k] for node in nx_layer] for k in range(h_units)]

                if activation == sigmoid:
                    delta[i] = [sigmoid_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == relu:
                    delta[i] = [relu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == tanh:
                    delta[i] = [tanh_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == elu:
                    delta[i] = [elu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]
                else:
                    delta[i] = [leaky_relu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]

            # Update weights
            for i in range(1, n_layers):
                layer = net[i]
                inc = [node.value for node in net[i - 1]]
                units = len(layer)
                for j in range(units):
                    layer[j].weights = vector_add(layer[j].weights,
                                                  scalar_vector_product(learning_rate * delta[i][j], inc))

    return net
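# A hypothetical driver for the learner above; `DataSet` and `network` are
# assumed to come from the enclosing library, and their exact signatures are
# not verified here.
iris = DataSet(name='iris')
net = network(input_units=4, hidden_layer_sizes=[3], output_units=3, activation=relu)
trained_net = BackPropagationLearner(iris, net, learning_rate=0.05, epochs=100, activation=relu)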
def BackPropagationLearner(dataset, net, learning_rate, epochs, activation=sigmoid,
                           momentum=False, beta=0.903):
    """[Figure 18.23] The back-propagation algorithm for multilayer networks"""
    # Initialise weights
    for layer in net:
        for node in layer:
            node.weights = random_weights(min_value=-0.5, max_value=0.5,
                                          num_weights=len(node.weights))

    examples = dataset.examples
    '''
    As of now dataset.target gives an int instead of a list;
    changing the dataset class would affect all the learners.
    Will be taken care of later.
    '''
    o_nodes = net[-1]
    i_nodes = net[0]
    o_units = len(o_nodes)
    idx_t = dataset.target
    idx_i = dataset.inputs
    n_layers = len(net)

    inputs, targets = init_examples(examples, idx_i, idx_t, o_units)

    for epoch in range(epochs):
        # Iterate over each example
        for e in range(len(examples)):
            i_val = inputs[e]
            t_val = targets[e]

            # Activate input layer
            for v, n in zip(i_val, i_nodes):
                n.value = v

            # Forward propagation: compute the value of every node
            for layer in net[1:]:
                for node in layer:
                    inc = [n.value for n in node.inputs]
                    in_val = dotproduct(inc, node.weights)
                    node.value = node.activation(in_val)

            # Initialize delta, which stores the gradient for each activation unit
            delta = [[] for _ in range(n_layers)]

            # Initialize the velocity term for momentum
            if momentum:
                v_dw = [[0] * len(layer) for layer in net]

            # Compute outer layer delta

            # Error for the MSE cost function
            err = [t_val[i] - o_nodes[i].value for i in range(o_units)]

            # Output-layer delta depends on the chosen activation function
            if activation == sigmoid:
                delta[-1] = [sigmoid_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            elif activation == relu:
                delta[-1] = [relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            elif activation == tanh:
                delta[-1] = [tanh_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            elif activation == elu:
                delta[-1] = [elu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]
            else:
                delta[-1] = [leaky_relu_derivative(o_nodes[i].value) * err[i] for i in range(o_units)]

            # Propagate backward, finding the gradients for each hidden layer
            h_layers = n_layers - 2
            for i in range(h_layers, 0, -1):
                layer = net[i]
                h_units = len(layer)
                nx_layer = net[i + 1]

                # Weights from each ith layer node to each (i + 1)th layer node
                w = [[node.weights[k] for node in nx_layer] for k in range(h_units)]

                if activation == sigmoid:
                    delta[i] = [sigmoid_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == relu:
                    delta[i] = [relu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == tanh:
                    delta[i] = [tanh_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]
                elif activation == elu:
                    delta[i] = [elu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]
                else:
                    delta[i] = [leaky_relu_derivative(layer[j].value) * dotproduct(w[j], delta[i + 1])
                                for j in range(h_units)]

            # Momentum: exponentially weighted average of the gradients with bias correction
            t_ = epoch + 1
            if momentum:
                if epoch == 0:
                    for i in range(len(delta)):
                        for j in range(len(delta[i])):
                            v_dw[i][j] = ((1 - beta) * delta[i][j]) / (1 - beta ** (t_ + 1))
                else:
                    for i in range(len(delta)):
                        for j in range(len(delta[i])):
                            v_dw[i][j] = (beta * v_dw[i][j] + (1 - beta) * delta[i][j]) / (1 - beta ** (t_ + 1))

            # Update weights with plain gradient descent
            if not momentum:
                for i in range(1, n_layers):
                    layer = net[i]
                    inc = [node.value for node in net[i - 1]]
                    units = len(layer)
                    for j in range(units):
                        layer[j].weights = vector_add(layer[j].weights,
                                                      scalar_vector_product(learning_rate * delta[i][j], inc))

            # Update weights with the velocity term (momentum) in gradient descent
            else:
                for i in range(1, n_layers):
                    layer = net[i]
                    inc = [node.value for node in net[i - 1]]
                    units = len(layer)
                    for j in range(units):
                        layer[j].weights = vector_add(layer[j].weights,
                                                      scalar_vector_product(learning_rate * v_dw[i][j], inc))

    return net
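# For comparison, a minimal standalone sketch of bias-corrected momentum (the
# technique the momentum branch above approximates), in scalar form with `t`
# counting updates from 1; the function and its names are illustrative only.
# Note that the standard form keeps the uncorrected average `v` and divides
# only the value used for the update, whereas the code above stores the
# corrected value back into v_dw.
def momentum_step(v, grad, t, beta=0.903):
    v = beta * v + (1 - beta) * grad  # exponential moving average of the gradient
    v_hat = v / (1 - beta ** t)       # bias-corrected estimate used for the update
    return v, v_hat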