def loss(self, X, y=None):
    """Evaluate loss and gradient for the three-layer convolutional network.

    Forward pass: conv -> relu -> maxpool -> fc -> relu -> fc.
    If y is None, runs in test mode and returns class scores; otherwise
    returns (loss, grads) where grads maps parameter names to gradients.
    """
    W1 = self.params['W1']
    W2, b2 = self.params['W2'], self.params['b2']
    W3, b3 = self.params['W3'], self.params['b3']

    # Configuration for the 2x2 max-pooling layer (stride 2).
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

    # ---- Forward pass ----
    conv_out, conv_cache = layers.conv_forward(X, W1)
    relu1_out, relu1_cache = layers.relu_forward(conv_out)
    pool_out, pool_cache = layers.max_pool_forward(relu1_out, pool_param)
    fc1_out, fc1_cache = layers.fc_forward(pool_out, W2, b2)
    relu2_out, relu2_cache = layers.relu_forward(fc1_out)
    scores, fc2_cache = layers.fc_forward(relu2_out, W3, b3)

    # Test mode: just return the class scores.
    if y is None:
        return scores

    # ---- Backward pass, mirroring the forward chain in reverse ----
    loss, dscores = layers.softmax_loss(scores, y)
    drelu2, dW3, db3 = layers.fc_backward(dscores, fc2_cache)
    dfc1 = layers.relu_backward(drelu2, relu2_cache)
    dpool, dW2, db2 = layers.fc_backward(dfc1, fc1_cache)
    # fc_backward may return a flattened gradient; restore the pooled shape.
    drelu1 = layers.max_pool_backward(dpool.reshape(pool_out.shape), pool_cache)
    dconv = layers.relu_backward(drelu1, relu1_cache)
    _, dW1 = layers.conv_backward(dconv, conv_cache)

    grads = {'W1': dW1, 'W2': dW2, 'b2': db2, 'W3': dW3, 'b3': db3}
    return loss, grads
def backward(self, train_data, y_true):
    """Backpropagate through the network: fc3 -> relu -> fc2 -> relu -> fc1.

    :param train_data: input batch that was fed to the first fc layer
    :param y_true: ground-truth labels for the cross-entropy loss
    :return: scalar loss value
    """
    grads, acts = self.gradients, self.nurons

    loss, grads["y"] = cross_entropy_loss(acts["y"], y_true)

    # Output fc layer.
    grads["W3"], grads["b3"], grads["z3_relu"] = fc_backward(
        grads["y"], self.weights["W3"], acts["z3_relu"])
    grads["z3"] = relu_backward(grads["z3_relu"], acts["z3"])

    # Hidden fc layer.
    grads["W2"], grads["b2"], grads["z2_relu"] = fc_backward(
        grads["z3"], self.weights["W2"], acts["z2_relu"])
    grads["z2"] = relu_backward(grads["z2_relu"], acts["z2"])

    # Input fc layer; the gradient w.r.t. the raw input is discarded.
    grads["W1"], grads["b1"], _ = fc_backward(
        grads["z2"], self.weights["W1"], train_data)

    return loss
def train(self):
    """Fit a single fc layer with SGD until the MSE loss is effectively zero.

    Draws one sample batch per iteration via ``self.next_sample``, runs a
    forward/backward pass, and applies a vanilla gradient-descent update.

    Returns:
        Tuple ``(W1, b1)``: the learned weight matrix and bias vector.
    """
    # Randomly initialize the parameters.
    W1 = np.random.randn(2, 3)
    b1 = np.zeros([3])
    loss = 100.0
    lr = 0.01
    i = 0
    while loss > 1e-15:
        x, y_true = self.next_sample(2)  # fetch the current sample batch
        # Forward pass.
        y = fc_forward(x, W1, b1)
        # Backward pass: loss and gradients.
        loss, dy = mean_squared_loss(y, y_true)
        # BUG FIX: gradients must be taken w.r.t. the locally trained
        # weights W1 — the original passed self.W, which this loop never
        # updates, so dw was computed against stale weights.
        dw, db, _ = fc_backward(dy, W1, x)
        # Gradient-descent update.
        W1 -= lr * dw
        b1 -= lr * db
        # Progress report every 1000 iterations.
        i += 1
        if i % 1000 == 0:
            print("\n迭代{}次,当前loss:{}, 当前权重:{},当前偏置{}".format(i, loss, W1, b1))
    # Print the final result.
    print("\n迭代{}次,当前loss:{}, 当前权重:{},当前偏置{}".format(i, loss, W1, b1))
    return W1, b1
def test_fc_backward(self):
    """FC layer backward: compare analytic gradients to numeric ones."""
    np.random.seed(498)
    x = np.random.randn(10, 6)
    w = np.random.randn(6, 5)
    b = np.random.randn(5)
    dout = np.random.randn(10, 5)

    # Finite-difference reference gradient for each input of fc_forward.
    dx_num = eval_numerical_gradient_array(
        lambda v: layers.fc_forward(v, w, b)[0], x, dout)
    dw_num = eval_numerical_gradient_array(
        lambda v: layers.fc_forward(x, v, b)[0], w, dout)
    db_num = eval_numerical_gradient_array(
        lambda v: layers.fc_forward(x, w, v)[0], b, dout)

    # Analytic gradients from the layer's own backward pass.
    _, cache = layers.fc_forward(x, w, b)
    dx, dw, db = layers.fc_backward(dout, cache)

    # The error should be around 1e-9.
    print('\nTesting fc_backward function:')
    print('dx error: ', rel_error(dx_num, dx))
    print('dw error: ', rel_error(dw_num, dw))
    print('db error: ', rel_error(db_num, db))

    np.testing.assert_allclose(dx, dx_num, atol=1e-8)
    np.testing.assert_allclose(dw, dw_num, atol=1e-8)
    np.testing.assert_allclose(db, db_num, atol=1e-8)
def backward(self, in_gradient):
    """Backpropagate gradients through this fully-connected layer.

    :param in_gradient: gradient arriving from the next layer, [B, out_units]
    :return: gradient passed on to the previous layer, [B, in_units]
    """
    dw, db, dx = fc_backward(in_gradient, self.weight, self.in_features)
    # Record the parameter gradients on the layer.
    for name, grad in (('weight', dw), ('bias', db)):
        self.set_gradient(name, grad)
    return dx
def backward(self, grad_scores, cache):
    """Backward pass of the two-layer net.

    :param grad_scores: upstream gradient of the loss w.r.t. the scores
    :param cache: (fc1_cache, relu_cache, fc2_cache) from the forward pass
    :return: dict of gradients for W1, b1, W2, b2
    """
    fc1_cache, relu_cache, fc2_cache = cache

    # Undo fc2 -> relu -> fc1, in reverse order of the forward pass.
    d_hidden, dW2, db2 = fc_backward(grad_scores, fc2_cache)
    d_pre_relu = relu_backward(d_hidden, relu_cache)
    _, dW1, db1 = fc_backward(d_pre_relu, fc1_cache)

    return {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2}
def loss(self, X, y=None):
    """Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_in)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, a test-time forward pass returning:
    - scores: Array of shape (N, C) of classification scores.

    If y is not None, a training-time forward and backward pass returning:
    - loss: Scalar loss value
    - grads: Dict with the same keys as self.params mapping parameter
      names to gradients of the loss w.r.t. those parameters.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W3, b3 = self.params['W3'], self.params['b3']
    N, d_in = X.shape

    # Forward: fc -> relu -> fc.
    pre_act, fc1_cache = layers.fc_forward(X, W1, b1)
    hidden, relu_cache = layers.relu_forward(pre_act)
    scores, fc2_cache = layers.fc_forward(hidden, W3, b3)

    # Test mode: only the scores are needed.
    if y is None:
        return scores

    # Backward: softmax loss, then unwind the two fc layers and the relu.
    loss, dscores = layers.softmax_loss(scores, y)
    dhidden, dW3, db3 = layers.fc_backward(dscores, fc2_cache)
    dpre_act = layers.relu_backward(dhidden, relu_cache)
    _, dW1, db1 = layers.fc_backward(dpre_act, fc1_cache)

    grads = {'W1': dW1, 'b1': db1, 'W3': dW3, 'b3': db3}
    return loss, grads
def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN to compute loss
    and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T)
      where each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Shift the captions by one: the RNN receives word t and is expected
    # to produce word t+1, so captions_in drops the last word and
    # captions_out drops the first (the START token).
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # Mask out positions where the target word is <NULL>.
    mask = (captions_out != self._null)

    # Image-feature -> initial-hidden-state projection.
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    # Word embedding matrix.
    W_embed = self.params['W_embed']
    # Input-to-hidden, hidden-to-hidden weights and RNN bias.
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    # Hidden-to-vocab projection.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    # ---- Forward pass ----
    # (1) Image features -> initial hidden state, shape (N, H).
    h0, proj_cache = layers.fc_forward(features, W_proj, b_proj)
    # (2) Word indices -> embedding vectors, shape (N, T, W).
    embed, embed_cache = word_embedding_forward(captions_in, W_embed)
    # (3) Vanilla RNN over the sequence; rnn_forward works time-major,
    # so transpose to (T, N, W) in and (T, N, H) out.
    rnn_in = np.transpose(embed, (1, 0, 2))
    h_tmajor, rnn_cache = rnn_forward(rnn_in, h0, Wx, Wh, b)
    h = np.transpose(h_tmajor, (1, 0, 2))
    # (4) Hidden states -> vocabulary scores, shape (N, T, V).
    scores, temp_cache = temporal_fc_forward(h, W_vocab, b_vocab)
    # (5) Masked temporal softmax loss against captions_out.
    loss, dscores = temporal_softmax_loss(scores, captions_out, mask)

    # ---- Backward pass (reverse order) ----
    dh, grads['W_vocab'], grads['b_vocab'] = temporal_fc_backward(
        dscores, temp_cache)
    dh_tmajor = np.transpose(dh, (1, 0, 2))
    drnn_in, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(
        dh_tmajor, rnn_cache)
    dembed = np.transpose(drnn_in, (1, 0, 2))
    grads['W_embed'] = word_embedding_backward(dembed, embed_cache)
    # Gradient w.r.t. the raw image features is not needed.
    _, grads['W_proj'], grads['b_proj'] = layers.fc_backward(dh0, proj_cache)

    return loss, grads
def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN to compute loss
    and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T)
      where each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # The RNN should emit word t+1 after receiving word t, so the input
    # captions drop the final word and the targets drop the first one
    # (the START token).
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # Ignore <NULL> targets in the loss.
    mask = (captions_out != self._null)

    # Unpack all learnable parameters.
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    W_embed = self.params['W_embed']
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    # ---- Forward pass ----
    h0, proj_cache = layers.fc_forward(features, W_proj, b_proj)
    embedded, embed_cache = rnn_layers.word_embedding_forward(
        captions_in, W_embed)
    # rnn_forward expects time-major (T, N, W) input.
    hidden, rnn_cache = rnn_layers.rnn_forward(
        embedded.transpose(1, 0, 2), h0, Wx, Wh, b)
    scores, temp_cache = rnn_layers.temporal_fc_forward(
        hidden.transpose(1, 0, 2), W_vocab, b_vocab)
    loss, dscores = rnn_layers.temporal_softmax_loss(
        scores, captions_out, mask)

    # ---- Backward pass ----
    dhidden, dW_vocab, db_vocab = rnn_layers.temporal_fc_backward(
        dscores, temp_cache)
    dembedded, dh0, dWx, dWh, db = rnn_layers.rnn_backward(
        dhidden.transpose(1, 0, 2), rnn_cache)
    dW_embed = rnn_layers.word_embedding_backward(
        dembedded.transpose(1, 0, 2), embed_cache)
    # Gradient w.r.t. the image features themselves is discarded.
    _, dW_proj, db_proj = layers.fc_backward(dh0, proj_cache)

    grads = {
        'W_embed': dW_embed,
        'W_proj': dW_proj,
        'W_vocab': dW_vocab,
        'Wh': dWh,
        'Wx': dWx,
        'b': db,
        'b_proj': db_proj,
        'b_vocab': db_vocab
    }
    return loss, grads
def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN to compute loss
    and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T)
      where each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Cut captions into two pieces: captions_in has everything but the last
    # word and is fed to the RNN; captions_out has everything but the first
    # word and is what the RNN is expected to generate, since the RNN
    # should produce word (t+1) after receiving word t.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # Mask out <NULL> targets in the loss.
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to the
    # initial hidden state.
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']
    # Word embedding matrix.
    W_embed = self.params['W_embed']
    # Input-to-hidden, hidden-to-hidden weights and bias for the RNN.
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']
    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    # Only vocab_size is actually needed below; the original also unpacked
    # batch/time/embedding/hidden dims that were never used.
    vocab_size = W_vocab.shape[1]

    # ---- Forward pass ----
    # (1) Compute the initial hidden state, shape (N, H).
    h0, cache_h0 = fc_forward(features, W_proj, b_proj)
    # (2) Embed the words in captions_in, shape (N, T, W).
    x, cache_emb = word_embedding_forward(captions_in, W_embed)
    x_trans = np.transpose(x, (1, 0, 2))  # rnn_forward is time-major
    # (3) Hidden state vectors for all timesteps, back to (N, T, H).
    h_trans, cache_h = rnn_forward(x_trans, h0, Wx, Wh, b)
    h = np.transpose(h_trans, (1, 0, 2))
    # (4) Scores over the vocabulary at each timestep, shape (N, T, V).
    out, cache_out = temporal_fc_forward(h, W_vocab, b_vocab)
    # (5) Masked temporal softmax loss against captions_out.
    loss, dout = temporal_softmax_loss(out, captions_out, mask)

    # ---- Backward pass ----
    # (6) Backprop through the temporal fc layer.
    # NOTE(review): this flattens dout to (N*T, V) before the backward
    # call, unlike the sibling implementations that pass (N, T, V) —
    # presumably this project's temporal_fc_backward expects the
    # flattened form; confirm against its signature.
    dout = dout.reshape(-1, vocab_size)
    dh, grads['W_vocab'], grads['b_vocab'] = temporal_fc_backward(
        dout, cache_out)
    dh = np.transpose(dh, (1, 0, 2))
    # (7) Backprop through the RNN.
    dx, dh0, grads['Wx'], grads['Wh'], grads['b'] = rnn_backward(
        dh, cache_h)
    dx = np.transpose(dx, (1, 0, 2))
    # (8) Backprop through the word embedding.
    grads['W_embed'] = word_embedding_backward(dx, cache_emb)
    # (9) Backprop through the feature projection; the gradient w.r.t.
    # the raw features is discarded.
    _, grads['W_proj'], grads['b_proj'] = fc_backward(dh0, cache_h0)

    return loss, grads