def bptt(self, x, y):
    # The total number of time steps
    t_steps = len(x)
    self.null_deltas()
    f, i, o, c, c_curr, h, y_ = self.forward(x)
    # y_ - 1, since 1 should be the probability of choosing the correct word
    delta_y_ = y_
    delta_y_[np.arange(len(y)), y] -= 1.
    delta_h = np.zeros(h.shape)
    delta_c = np.zeros(c.shape)
    delta_f = np.zeros(f.shape)
    delta_i = np.zeros(i.shape)
    delta_o = np.zeros(o.shape)
    delta_c_curr = np.zeros(c_curr.shape)
    # For each output, backwards...
    for t in np.arange(t_steps)[::-1]:
        # one-hot encoding of the input word
        x_t = np.zeros((self.word_dim, 1))
        x_t[x[t]] = 1
        delta_h[t] = np.dot(self.w_v.T, delta_y_[t]) + delta_h[t + 1]
        delta_c[t] = delta_c[t + 1] * f[t + 1] + delta_h[t] * o[t] * dtanh(c[t])
        delta_f[t] = delta_c[t] * c[t - 1] * dsigmoid(f[t])
        delta_i[t] = delta_c[t] * c_curr[t] * dsigmoid(i[t])
        delta_o[t] = delta_h[t] * dsigmoid(o[t]) * np.tanh(c[t])
        delta_c_curr[t] += delta_c[t] * i[t] * dtanh(c_curr[t])
        # W_v, b_v
        self.dLdWv += np.outer(delta_y_[t], h[t].T)
        self.dLdBv += delta_y_[t]
        # W_fx, W_fh, b_f
        self.dLdWfx += np.dot(delta_f[t], x_t.T)
        self.dLdWfh += np.dot(delta_f[t], h[t - 1].T)
        self.dLdBf += delta_f[t]
        # W_ix, W_ih, b_i
        self.dLdWix += np.dot(delta_i[t], x_t.T)
        self.dLdWih += np.dot(delta_i[t], h[t - 1].T)
        self.dLdBi += delta_i[t]
        # W_cx, W_ch, b_c
        self.dLdWcx += np.dot(delta_c_curr[t], x_t.T)
        self.dLdWch += np.dot(delta_c_curr[t], h[t - 1].T)
        self.dLdBc += delta_c_curr[t]
        # W_ox, W_oh, b_o
        self.dLdWox += np.dot(delta_o[t], x_t.T)
        self.dLdWoh += np.dot(delta_o[t], h[t - 1].T)
        self.dLdBo += delta_o[t]
    self.clip_gradients()
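# The snippet above calls dsigmoid and dtanh without defining them. Below is
# a minimal sketch of those helpers, assuming (as calls like dsigmoid(f[t])
# on the already-sigmoided gate value suggest) that they take activated
# values and express the derivative in terms of them. Under that convention,
# dtanh(c[t]) on the raw cell state would need a np.tanh applied first, as
# the later backward() implementation does with utils.dtanh(utils.tanh(C)).
import numpy as np

def dsigmoid(y):
    # derivative of the sigmoid, given y = sigmoid(x): y * (1 - y)
    return y * (1. - y)

def dtanh(y):
    # derivative of tanh, given y = tanh(x): 1 - y**2
    return 1. - y ** 2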
def _backwards(self, X, y, i):
    # The output-layer error is the difference between the targets and the
    # network output; its delta is that error times the derivative of the
    # logistic (sigmoid) function of the output layer array.
    # The hidden-layer error is the matrix product of the output delta and
    # the transpose of w2; its delta is that error times the sigmoid
    # derivative of the hidden layer array.
    # Finally, update the biases and weights.
    hidden, output = self._forward(X)
    d_o = y - output
    if (i % 10000) == 0:
        print(np.mean(np.abs(d_o)))
    d_o = d_o * utils.dsigmoid(output)
    d_h = np.dot(d_o, self.w2.T) * utils.dsigmoid(hidden)
    self.w2 += self.learning_rate * np.dot(hidden.T, d_o)
    self.b2 += self.learning_rate * np.sum(d_o, axis=0, keepdims=True)
    self.w += self.learning_rate * np.dot(X.T, d_h)
    self.b += self.learning_rate * np.sum(d_h, axis=0, keepdims=True)
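# A self-contained sketch of how a two-layer network with exactly these
# update rules might be trained on XOR. All names and hyperparameters here
# (layer sizes, learning rate, epoch count) are illustrative assumptions,
# not part of the original class.
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def dsigmoid(y):
    # takes the activated value, as _backwards does
    return y * (1. - y)

rng = np.random.default_rng(0)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=float)
y = np.array([[0], [1], [1], [0]], dtype=float)

w, b = rng.normal(size=(2, 4)), np.zeros((1, 4))
w2, b2 = rng.normal(size=(4, 1)), np.zeros((1, 1))
lr = 0.5

for epoch in range(10000):
    hidden = sigmoid(np.dot(X, w) + b)
    output = sigmoid(np.dot(hidden, w2) + b2)
    d_o = (y - output) * dsigmoid(output)       # output delta
    d_h = np.dot(d_o, w2.T) * dsigmoid(hidden)  # hidden delta
    w2 += lr * np.dot(hidden.T, d_o)
    b2 += lr * np.sum(d_o, axis=0, keepdims=True)
    w += lr * np.dot(X.T, d_h)
    b += lr * np.sum(d_h, axis=0, keepdims=True)

print(np.round(output.ravel(), 2))  # should approach [0, 1, 1, 0]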
def backward(self, target, dh_next, dC_next, C_prev, z, f, i, C_bar, C,
             o, h, v, y):
    # output layer: cross-entropy gradient w.r.t. the softmax output
    dv = np.copy(y)
    dv[target] -= 1
    self.W_v.d += np.dot(dv, h.T)
    self.b_v.d += dv

    # hidden state: gradient from the output plus the next time step
    dh = np.dot(self.W_v.v.T, dv)
    dh += dh_next

    # output gate
    do = dh * utils.tanh(C)
    do = utils.dsigmoid(o) * do
    self.W_o.d += np.dot(do, z.T)
    self.b_o.d += do

    # cell state
    dC = np.copy(dC_next)
    dC += dh * o * utils.dtanh(utils.tanh(C))

    # candidate cell state
    dC_bar = dC * i
    dC_bar = utils.dtanh(C_bar) * dC_bar
    self.W_C.d += np.dot(dC_bar, z.T)
    self.b_C.d += dC_bar

    # input gate
    di = dC * C_bar
    di = utils.dsigmoid(i) * di
    self.W_i.d += np.dot(di, z.T)
    self.b_i.d += di

    # forget gate
    df = dC * C_prev
    df = utils.dsigmoid(f) * df
    self.W_f.d += np.dot(df, z.T)
    self.b_f.d += df

    # gradient w.r.t. the concatenated input z = [h_prev; x]
    dz = (np.dot(self.W_f.v.T, df)
          + np.dot(self.W_i.v.T, di)
          + np.dot(self.W_C.v.T, dC_bar)
          + np.dot(self.W_o.v.T, do))
    dh_prev = dz[:self.h_size, :]
    dC_prev = f * dC
    return dh_prev, dC_prev
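# Analytic backward passes like the ones above are usually validated with a
# finite-difference check. A generic, self-contained sketch on a single
# sigmoid layer (not code from the original repositories): compare the
# analytic gradient against central differences of the loss.
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

rng = np.random.default_rng(0)
W = rng.normal(size=(3, 4))
x = rng.normal(size=(4, 1))
t = rng.normal(size=(3, 1))

def loss(W):
    y = sigmoid(np.dot(W, x))
    return 0.5 * np.sum((y - t) ** 2)

# analytic gradient: dL/dW = ((y - t) * y * (1 - y)) @ x.T
y = sigmoid(np.dot(W, x))
dW = np.dot((y - t) * y * (1 - y), x.T)

# numerical gradient via central differences
eps, dW_num = 1e-5, np.zeros_like(W)
for idx in np.ndindex(*W.shape):
    W_p, W_m = W.copy(), W.copy()
    W_p[idx] += eps
    W_m[idx] -= eps
    dW_num[idx] = (loss(W_p) - loss(W_m)) / (2 * eps)

print(np.max(np.abs(dW - dW_num)))  # should be ~1e-9 or smaller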
def _compute_dJdis(self, acts, y):
    dJdis = [0] * self.nlayers
    # No error in layer 0, so dJdis[0] stays 0.
    dJdis[-1] = acts[-1][1:] - y
    for l in range(self.nlayers - 2, 0, -1):
        # Derivative of the error with respect to the output of the current
        # layer, computed from dJdi of the next layer by backpropagating
        # through the weighted arcs. The bias row is dropped because bias
        # units have no entering arcs.
        dJdo = np.transpose(np.dot(np.transpose(self.W[l]), dJdis[l + 1]))[1:]
        # Derivative of the output with respect to the inputs of the
        # current layer.
        dodi = dsigmoid(np.dot(self.W[l - 1], acts[l - 1]))
        dJdis[l] = dJdo * dodi
    return dJdis
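# A shape sketch of the bias-stripping step above, with illustrative sizes.
# Activations carry a leading bias unit, so W[l] maps (n_l + 1) inputs to
# n_{l+1} outputs; backpropagating through W[l].T produces a component for
# the bias unit, which [1:] discards since nothing feeds into a bias.
import numpy as np

n_l, n_next = 4, 3
W_l = np.zeros((n_next, n_l + 1))    # +1 column for the bias unit
dJdi_next = np.zeros((n_next,))

dJdo = np.dot(W_l.T, dJdi_next)[1:]  # (n_l + 1,) -> (n_l,)
print(dJdo.shape)                    # (4,)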
def calculate_errors(self, correct, outputs):
    errors = [[0 for neuron in range(0, self.neurons_per_layer[layer])]
              for layer in range(0, self.layers)]
    # calculate output layer errors
    for i in range(0, len(outputs[-1])):
        o = outputs[-1][i]
        errors[-1][i] = u.dsigmoid(o) * (correct[i] - o)
    # calculate hidden layer errors
    for layer in reversed(range(0, self.layers - 1)):
        for neuron in range(0, self.neurons_per_layer[layer]):
            neuron_output = outputs[layer][neuron]
            error_caused = 0
            for i in range(0, self.neurons_per_layer[layer + 1]):
                unit = self.model[layer + 1][i]
                error_caused += unit.W[0][neuron] \
                    * errors[layer + 1][i] \
                    * unit.d_func(neuron_output)
            errors[layer][neuron] = error_caused
    return errors
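# The per-neuron hidden-layer loop above computes the standard delta rule
# delta_l = d_func(outputs_l) * (W_next.T @ errors_next). A hedged
# vectorized equivalent, assuming every unit in layer l+1 shares the same
# activation derivative d_func and that W_next[i, j] equals
# model[layer + 1][i].W[0][j]; the numbers are illustrative.
import numpy as np

def d_func(y):
    # assumed sigmoid derivative applied to activated outputs
    return y * (1. - y)

W_next = np.array([[0.2, -0.5], [0.7, 0.1], [-0.3, 0.4]])  # 3 units x 2 inputs
errors_next = np.array([0.1, -0.2, 0.05])
outputs_l = np.array([0.6, 0.9])

errors_l = d_func(outputs_l) * np.dot(W_next.T, errors_next)
print(errors_l)  # hidden-layer errors, same values the loop would produce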
def error(self, wt, error, z):
    # Hidden-layer delta: backpropagate the next layer's error through the
    # weights and scale by the sigmoid derivative of the pre-activation z.
    return np.multiply(np.matmul(wt.transpose(), error),
                       utils.dsigmoid(z))
def result_error(self, correct, inputs):
    flz, slz, rz = self.feed_forward(inputs)
    cost = utils.ncost(correct, utils.sigmoid(rz))
    # self.learning_speed = cost
    return np.multiply(utils.dcost(correct, utils.sigmoid(rz)),
                       utils.dsigmoid(rz))
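# A hedged sketch of how the two helpers above might chain across a
# three-layer backward pass. The standalone functions below mirror their
# logic; the pre-activation names (flz, slz, rz) follow the forward pass
# above, but the weight names (w1, w2, w3) and the assumption that
# utils.dcost(t, a) returns a - t (a squared-error-style derivative) are
# illustrative, not confirmed by the source.
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def dsigmoid(z):
    # takes the pre-activation z, as error() and result_error() do
    s = sigmoid(z)
    return s * (1. - s)

rng = np.random.default_rng(0)
x = rng.normal(size=(4, 1))
t = rng.normal(size=(2, 1))
w1, w2, w3 = (rng.normal(size=s) for s in [(5, 4), (3, 5), (2, 3)])

# forward pass, keeping the pre-activations the helpers expect
flz = np.dot(w1, x)
slz = np.dot(w2, sigmoid(flz))
rz = np.dot(w3, sigmoid(slz))

# output delta, as in result_error: dcost(...) * dsigmoid(rz)
d_r = (sigmoid(rz) - t) * dsigmoid(rz)
# hidden deltas, as in error: (W.T @ delta) * dsigmoid(z)
d_s = np.multiply(np.matmul(w3.T, d_r), dsigmoid(slz))
d_f = np.multiply(np.matmul(w2.T, d_s), dsigmoid(flz))
print(d_f.shape)  # (5, 1)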