def backward(self, y_pred, y_train, cache):
    X, h1_cache, h2_cache, score_cache, nl_cache1, nl_cache2, \
        u1, u2, bn1_cache, bn2_cache = cache

    # Output layer
    grad_y = self.dloss_funs[self.loss](y_pred, y_train)

    # Third layer
    dh2, dW3, db3 = l.fc_backward(grad_y, score_cache)
    dW3 += reg.dl2_reg(self.model['W3'], self.lam)
    dh2 = self.backward_nonlin(dh2, nl_cache2)
    dh2 = l.dropout_backward(dh2, u2)
    dh2, dgamma2, dbeta2 = l.bn_backward(dh2, bn2_cache)

    # Second layer
    dh1, dW2, db2 = l.fc_backward(dh2, h2_cache)
    dW2 += reg.dl2_reg(self.model['W2'], self.lam)
    dh1 = self.backward_nonlin(dh1, nl_cache1)
    dh1 = l.dropout_backward(dh1, u1)
    dh1, dgamma1, dbeta1 = l.bn_backward(dh1, bn1_cache)

    # First layer
    _, dW1, db1 = l.fc_backward(dh1, h1_cache)
    dW1 += reg.dl2_reg(self.model['W1'], self.lam)

    grad = dict(
        W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3,
        gamma1=dgamma1, gamma2=dgamma2, beta1=dbeta1, beta2=dbeta2
    )

    return grad
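# --- Illustration only (not part of the class above) ---
# The gamma/beta gradients above come out of l.bn_backward. For reference, the
# following is a self-contained sketch of a standard batch-norm backward pass;
# the cache layout (X, X_norm, mu, var, gamma, eps) is an assumption chosen for
# the sketch and may not match this module's actual cache.
import numpy as np

def bn_backward_sketch(dout, cache):
    X, X_norm, mu, var, gamma, eps = cache
    N = X.shape[0]
    std_inv = 1. / np.sqrt(var + eps)

    dgamma = np.sum(dout * X_norm, axis=0)
    dbeta = np.sum(dout, axis=0)

    dX_norm = dout * gamma
    dvar = np.sum(dX_norm * (X - mu), axis=0) * -.5 * std_inv**3
    dmu = np.sum(dX_norm * -std_inv, axis=0) + dvar * np.mean(-2. * (X - mu), axis=0)
    dX = dX_norm * std_inv + dvar * 2. * (X - mu) / N + dmu / N

    return dX, dgamma, dbeta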
def backward(self, y_pred, y_train, dh_next, cache):
    X, X_prime, h_old, hz, hz_cache, hz_sigm_cache, hr, hr_cache, hr_sigm_cache, \
        hh, hh_cache, hh_tanh_cache, h, y_cache = cache

    # Cross-entropy gradient at the output
    dy = loss_fun.dcross_entropy(y_pred, y_train)

    # Hidden-to-output layer
    dh, dWy, dby = l.fc_backward(dy, y_cache)
    dh += dh_next

    # h = (1 - hz) * h_old + hz * hh
    dhh = hz * dh
    dh_old1 = (1. - hz) * dh
    dhz = hh * dh - h_old * dh

    # Candidate hidden state hh
    dhh = l.tanh_backward(dhh, hh_tanh_cache)
    dX_prime, dWh, dbh = l.fc_backward(dhh, hh_cache)
    dh_prime = dX_prime[:, :self.H]

    # Reset gate hr
    dh_old2 = hr * dh_prime
    dhr = h_old * dh_prime
    dhr = l.sigmoid_backward(dhr, hr_sigm_cache)
    dXr, dWr, dbr = l.fc_backward(dhr, hr_cache)

    # Update gate hz
    dhz = l.sigmoid_backward(dhz, hz_sigm_cache)
    dXz, dWz, dbz = l.fc_backward(dhz, hz_cache)

    # Accumulate every path that reaches the previous hidden state
    dX = dXr + dXz
    dh_old3 = dX[:, :self.H]
    dh_next = dh_old1 + dh_old2 + dh_old3

    grad = dict(Wz=dWz, Wr=dWr, Wh=dWh, Wy=dWy, bz=dbz, br=dbr, bh=dbh, by=dby)

    return grad, dh_next
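# --- Illustration only (not part of the class above) ---
# A hedged sketch of the forward step this backward reverses, inferred from the
# gradients above (e.g. dhh = hz * dh implies h = (1 - hz) * h_old + hz * hh) and
# from the [:, :self.H] slices, which suggest the gate inputs are column-wise
# concatenations with h_old first. Names and layout here are assumptions, not
# the module's actual forward code.
import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

def gru_step_sketch(X, h_old, Wz, bz, Wr, br, Wh, bh):
    X_in = np.column_stack((h_old, X))           # input to the update/reset gates
    hz = sigmoid(X_in @ Wz + bz)                 # update gate
    hr = sigmoid(X_in @ Wr + br)                 # reset gate
    X_prime = np.column_stack((hr * h_old, X))   # input to the candidate state
    hh = np.tanh(X_prime @ Wh + bh)              # candidate hidden state
    h = (1. - hz) * h_old + hz * hh              # interpolated new hidden state
    return h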
def backward(self, y_pred, y_train, cache):
    X, h1_cache, h3_cache, score_cache, hpool_cache, hpool, nl_cache1, nl_cache3 = cache

    # Output layer
    grad_y = self.dloss_funs[self.loss](y_pred, y_train)

    # FC-7
    dh3, dW3, db3 = l.fc_backward(grad_y, score_cache)
    dh3 = self.backward_nonlin(dh3, nl_cache3)

    dh2, dW2, db2 = l.fc_backward(dh3, h3_cache)
    dh2 = dh2.ravel().reshape(hpool.shape)

    # Pool-1
    dpool = l.maxpool_backward(dh2, hpool_cache)

    # Conv-1
    dh1 = self.backward_nonlin(dpool, nl_cache1)
    dX, dW1, db1 = l.conv_backward(dh1, h1_cache)

    grad = dict(
        W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3
    )

    return grad
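# --- Illustration only (not part of the class above) ---
# A self-contained sketch of what a max-pool backward like l.maxpool_backward
# does, assuming 2x2 non-overlapping windows and (N, C, H, W) inputs with even
# H and W; the real helper's window size and cache format may differ.
import numpy as np

def maxpool_forward_2x2(X):
    N, C, H, W = X.shape
    out = X.reshape(N, C, H // 2, 2, W // 2, 2).max(axis=(3, 5))
    return out, (X, out)

def maxpool_backward_2x2(dout, cache):
    X, out = cache
    # Upsample the pooled maxima and gradients back to the input resolution ...
    out_up = np.repeat(np.repeat(out, 2, axis=2), 2, axis=3)
    dout_up = np.repeat(np.repeat(dout, 2, axis=2), 2, axis=3)
    # ... and route each gradient only to the position(s) that held the max.
    return (X == out_up) * dout_up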
def backward(self, y_pred, y_train, cache):
    (X, h1_cache, h2_cache, h4_cache, h5_cache, score_cache,
     hpool1_cache, hpool1, hpool2_cache, hpool2,
     nl_cache1, nl_cache2, nl_cache4, nl_cache5,
     bn4_cache, bn5_cache) = cache

    # Output layer
    grad_y = self.dloss_funs[self.loss](y_pred, y_train)
    dh5, dW6, db6 = l.fc_backward(grad_y, score_cache)

    # FC-2
    dh5 = self.backward_nonlin(dh5, nl_cache5)
    dh5, dgamma5, dbeta5 = l.bn_backward(dh5, bn5_cache)
    dh4, dW5, db5 = l.fc_backward(dh5, h5_cache)

    # FC-1
    dh4 = self.backward_nonlin(dh4, nl_cache4)
    dh4, dgamma4, dbeta4 = l.bn_backward(dh4, bn4_cache)
    dhpool3_, dW4, db4 = l.fc_backward(dh4, h4_cache)

    # Reshape back to the pooled feature-map shape
    dhpool3 = dhpool3_.ravel().reshape(hpool2.shape)

    # Pool-2
    dpool2 = l.maxpool_backward(dhpool3, hpool2_cache)

    # Conv-2
    dh2 = self.backward_nonlin(dpool2, nl_cache2)
    dh1, dW2, db2 = l.conv_backward(dh2, h2_cache)

    # Pool-1
    dpool1 = l.maxpool_backward(dh1, hpool1_cache)

    # Conv-1
    dh1 = self.backward_nonlin(dpool1, nl_cache1)
    dX, dW1, db1 = l.conv_backward(dh1, h1_cache)

    grad = dict(
        W1=dW1, W2=dW2, W4=dW4, W5=dW5, W6=dW6,
        b1=db1, b2=db2, b4=db4, b5=db5, b6=db6,
        gamma4=dgamma4, gamma5=dgamma5, beta4=dbeta4, beta5=dbeta5
    )

    return grad
def backward(self, y_pred, y_train, cache):
    # Cache layout when dropout/batch-norm are enabled (currently disabled):
    # X, h1_cache, h3_cache, score_cache, hpool_cache, hpool, nl_cache1, nl_cache3, \
    #     u1, u2, u3, bn1_cache, pool_cache, bn3_cache = cache
    X, h1_cache, h3_cache, score_cache, hpool_cache, hpool, nl_cache1, nl_cache3 = cache

    # Output layer
    grad_y = self.dloss_funs[self.loss](y_pred, y_train)

    # FC-7
    dh3, dW3, db3 = l.fc_backward(grad_y, score_cache)
    # dW3 += reg.dl2_reg(self.model['W3'], self.lam)
    dh3 = self.backward_nonlin(dh3, nl_cache3)
    # dh3 = l.dropout_backward(dh3, u3)
    # dh3, dgamma3, dbeta3 = l.bn_backward(dh3, bn3_cache)

    dh2, dW2, db2 = l.fc_backward(dh3, h3_cache)
    # dh2 = l.dropout_backward(dh2, u2)
    dh2 = dh2.ravel().reshape(hpool.shape)

    # Pool-1
    # dpool, dgamma2, dbeta2 = l.conv_bn_backward(dh2, pool_cache)
    dpool = l.maxpool_backward(dh2, hpool_cache)

    # Conv-1
    dh1 = self.backward_nonlin(dpool, nl_cache1)
    # dW1 += reg.dl2_reg(self.model['W1'], self.lam)
    # dh1 = l.dropout_backward(dh1, u1)
    # dh1, dgamma1, dbeta1 = l.conv_bn_backward(dh1, bn1_cache)
    dX, dW1, db1 = l.conv_backward(dh1, h1_cache)

    # grad = dict(W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3,
    #             gamma1=dgamma1, beta1=dbeta1, gamma2=dgamma2, beta2=dbeta2,
    #             gamma3=dgamma3, beta3=dbeta3)
    grad = dict(
        W1=dW1, W2=dW2, W3=dW3, b1=db1, b2=db2, b3=db3
    )

    return grad
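# --- Illustration only (not part of the class above) ---
# The disabled lines above rely on l.dropout_backward with a mask u saved during
# the forward pass. A self-contained sketch of the usual inverted-dropout pair
# (an assumption about how that helper behaves, not its actual code):
import numpy as np

def dropout_forward_sketch(X, p_keep):
    u = (np.random.rand(*X.shape) < p_keep) / p_keep  # scaled keep-mask
    return X * u, u

def dropout_backward_sketch(dout, u):
    return dout * u  # gradient flows only through units kept at forward time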
def backward(self, y_pred, y_train, d_next, cache):
    X, hf, hi, ho, hc, hf_cache, hf_sigm_cache, hi_cache, hi_sigm_cache, \
        ho_cache, ho_sigm_cache, hc_cache, hc_tanh_cache, \
        c_old, c, c_tanh_cache, y_cache = cache
    dh_next, dc_next = d_next

    # Cross-entropy gradient at the output
    dy = loss_fun.dcross_entropy(y_pred, y_train)

    # Hidden-to-output layer
    dh, dWy, dby = l.fc_backward(dy, y_cache)
    dh += dh_next

    # Output gate (c here is the tanh-squashed cell state from the forward pass)
    dho = c * dh
    dho = l.sigmoid_backward(dho, ho_sigm_cache)

    # Cell state
    dc = ho * dh
    dc = l.tanh_backward(dc, c_tanh_cache)
    dc = dc + dc_next

    # Forget, input, and candidate gates: the cell was formed as hf * c_old + hi * hc
    dhf = c_old * dc
    dhf = l.sigmoid_backward(dhf, hf_sigm_cache)

    dhi = hc * dc
    dhi = l.sigmoid_backward(dhi, hi_sigm_cache)

    dhc = hi * dc
    dhc = l.tanh_backward(dhc, hc_tanh_cache)

    # Back through the four gate projections
    dXo, dWo, dbo = l.fc_backward(dho, ho_cache)
    dXc, dWc, dbc = l.fc_backward(dhc, hc_cache)
    dXi, dWi, dbi = l.fc_backward(dhi, hi_cache)
    dXf, dWf, dbf = l.fc_backward(dhf, hf_cache)

    # Gradients carried to the previous time step
    dX = dXo + dXc + dXi + dXf
    dh_next = dX[:, :self.H]
    dc_next = hf * dc

    grad = dict(Wf=dWf, Wi=dWi, Wc=dWc, Wo=dWo, Wy=dWy,
                bf=dbf, bi=dbi, bc=dbc, bo=dbo, by=dby)

    return grad, (dh_next, dc_next)
def backward(self, y_pred, y_train, cache, iter):
    num_layers = self.num_layers

    # Output layer
    grad_y = self.dloss_funs[self.loss](y_pred, y_train)

    # Final (classification) layer
    dh, dW, db = l.fc_backward(grad_y, cache['score_cache'])
    grad = dict()
    grad['Wf'] = dW + reg.dl2_reg(self.model['Wf'], self.lam)
    grad['bf'] = db

    # Hidden blocks, from the last layer back to the first
    dprevH = 0
    for i in range(num_layers, 0, -1):
        if self.leapfrog:
            dh, dprevH, dW, db = l.leap_backward(
                dh, dprevH,
                cache['h_cache' + str(i)], cache['nl_cache' + str(i)],
                i == num_layers, self.hypo)
        else:
            dh, dW, db = l.fcrelu_backward(
                dh, cache['h_cache' + str(i)], cache['nl_cache' + str(i)],
                antisymmetric=self.antisymmetric, hypo=self.hypo)
        if not self.antisymmetric and not self.leapfrog:
            dW += reg.dl2_reg(self.model['W' + str(i)], self.lam)
        grad['W' + str(i)] = dW
        grad['b' + str(i)] = db

    if self.doDropout:
        dh = l.dropout_backward(dh, cache['u1'])

    # Input (feature) layer
    dh, dW, db = l.fcrelu_backward(dh, cache['h_caches'], cache['nl_caches'],
                                   antisymmetric=self.antisymmetric, hypo=self.hypo)
    grad['Ws'] = dW + reg.dl2_reg(self.model['Ws'], self.lam)
    grad['bs'] = db

    # dh, dW, db = l.conv_backward(dh, cache['c_cache'])
    # grad['Wc'] = dW
    # grad['bc'] = db

    # Zero out gradients for frozen parameters
    if self.freezeLastLayer or self.freezeClassificationLayer:
        grad['Wf'] = 0
        grad['bf'] = 0
    if self.weights_fixed:
        grad['Wf'] = 0
        grad['bf'] = 0
        grad['Ws'] = 0
        grad['bs'] = 0

    return grad
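# --- Illustration only (not part of the class above) ---
# A self-contained central-difference gradient check, useful for validating a
# backward pass like the one above. It takes a plain parameter array and a
# zero-argument loss callable rather than assuming this class's exact API; note
# that dropout (and batch-norm noise) should be disabled while checking.
import numpy as np

def grad_check(param, analytic_grad, loss_fn, eps=1e-5, n_checks=10):
    """param: ndarray perturbed in place; loss_fn: () -> scalar loss."""
    rel_errors = []
    for _ in range(n_checks):
        idx = tuple(np.random.randint(d) for d in param.shape)
        orig = param[idx]

        param[idx] = orig + eps
        loss_plus = loss_fn()
        param[idx] = orig - eps
        loss_minus = loss_fn()
        param[idx] = orig  # restore

        numeric = (loss_plus - loss_minus) / (2 * eps)
        analytic = analytic_grad[idx]
        rel_errors.append(abs(numeric - analytic) /
                          max(abs(numeric) + abs(analytic), 1e-12))
    return rel_errors  # values around 1e-7 or smaller indicate a correct backward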
def backward(self, y_pred, y_train, dh_next, cache):
    X, Whh, h, hprev, y, h_cache, y_cache = cache

    # Softmax gradient
    dy = loss_fun.dcross_entropy(y_pred, y_train)

    # Hidden-to-output gradient
    dh, dWhy, dby = l.fc_backward(dy, y_cache)
    dh += dh_next
    dby = dby.reshape((1, -1))

    # tanh
    dh = l.tanh_backward(dh, h_cache)

    # Hidden gradient: h = tanh(X @ Wxh + hprev @ Whh + bh)
    dbh = dh
    dWhh = hprev.T @ dh
    dWxh = X.T @ dh
    dh_next = dh @ Whh.T

    grad = dict(Wxh=dWxh, Whh=dWhh, Why=dWhy, bh=dbh, by=dby)

    return grad, dh_next
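# --- Illustration only (not part of the class above) ---
# A hedged sketch of how a per-step backward with the (grad, dh_next) signature
# above is typically driven over a sequence (truncated backprop through time).
# `model.forward`, `model.model`, `model.H`, the forward return order, and the
# initial-state handling are assumptions about the surrounding class, not its
# confirmed API.
import numpy as np

def bptt_sketch(model, X_seq, y_seq, h0):
    # Forward through the whole (truncated) sequence, keeping every cache.
    preds, caches, h = [], [], h0
    for x, y in zip(X_seq, y_seq):
        y_pred, h, cache = model.forward(x, h, train=True)
        preds.append(y_pred)
        caches.append(cache)

    # Backward in reverse time order, carrying the hidden-state gradient and
    # accumulating parameter gradients across time steps.
    grads = {k: np.zeros_like(v) for k, v in model.model.items()}
    dh_next = np.zeros((1, model.H))
    for y_pred, y, cache in reversed(list(zip(preds, y_seq, caches))):
        grad, dh_next = model.backward(y_pred, y, dh_next, cache)
        for k in grads:
            grads[k] += grad[k]

    return grads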