def train_loss(*args):
    # Nested helper: `self` and `mode` are assumed to come from the enclosing
    # loss method's scope.
    X = args[0]
    y = args[1]
    res = X
    for l in xrange(self.num_layers):
        prev_res = res
        res = affine_forward(prev_res, args[self.w_idx(l)], args[self.b_idx(l)])
        if l < (self.num_layers - 1):
            if self.use_batchnorm:
                res = batchnorm_forward(res, args[self.bn_ga_idx(l)],
                                        args[self.bn_bt_idx(l)], self.bn_params[l])
            res = relu_forward(res)
            if self.use_dropout:
                res = dropout_forward(res, self.dropout_param)
    scores = res
    if mode == 'test':
        return scores
    # loss, _ = softmax_loss(scores, y)
    loss = svm_loss(scores, y)
    return loss
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    W1, W2 = self.params['W1'], self.params['W2']
    b1, b2 = self.params['b1'], self.params['b2']

    hidden_out, cache1 = affine_relu_forward(X, W1, b1)
    scores, cache2 = affine_forward(hidden_out, W2, b2)
    if y is None:
        return scores

    grads = {}
    loss, dScore = softmax_loss(scores, y)
    loss += .5 * self.reg * (np.sum(W1**2) + np.sum(W2**2))
    dX2, grads['W2'], grads['b2'] = affine_backward(dScore, cache2)
    dX, grads['W1'], grads['b1'] = affine_relu_backward(dX2, cache1)
    grads['W2'] += self.reg * W2
    grads['W1'] += self.reg * W1
    return loss, grads
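# softmax_loss is called throughout these snippets but not defined in this
# section. A minimal sketch of what such a helper typically looks like in this
# style of codebase (the actual implementation in layers.py may differ):
def softmax_loss(scores, y):
    # Numerically stable softmax: subtract the row-wise max before exponentiating.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    N = scores.shape[0]
    loss = -np.sum(log_probs[np.arange(N), y]) / N
    # Gradient of the mean cross-entropy loss with respect to the scores.
    dscores = np.exp(log_probs)
    dscores[np.arange(N), y] -= 1
    dscores /= N
    return loss, dscores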
def loss(self, X, y=None, reg=1e-5):
    print 'start computing loss and grad.............'
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    W3, b3 = self.params['W3'], self.params['b3']

    # pass conv_param to the forward pass for the convolutional layer
    filter_size = W1.shape[2]
    conv_param = {'stride': 1, 'pad': (filter_size - 1) / 2}

    # pass pool_param to the forward pass for the max-pooling layer
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

    # compute the forward pass
    print 'compute the forward pass......'
    print 'compute the w1 conv_relu_pool_forward forward pass......'
    a1, cache1 = layers.conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
    print 'compute the w2 affine_relu_forward forward pass......'
    a2, cache2 = layers.affine_relu_forward(a1, W2, b2)
    print 'compute the w3 affine_forward forward pass......'
    scores, cache3 = layers.affine_forward(a2, W3, b3)

    if y is None:
        return scores

    # compute the backward pass
    print 'compute the backward pass......'
    print 'compute the softmax_loss backward pass......'
    data_loss, dscores = layers.softmax_loss(scores, y)
    print 'compute the dw3 affine_backward backward pass......'
    da2, dW3, db3 = layers.affine_backward(dscores, cache3)
    print 'compute the dw2 affine_relu_backward backward pass......'
    da1, dW2, db2 = layers.affine_relu_backward(da2, cache2)
    print 'compute the dw1 conv_relu_pool_backward backward pass......'
    dX, dW1, db1 = layers.conv_relu_pool_backward(da1, cache1)

    # Add regularization
    dW1 += self.reg * W1
    dW2 += self.reg * W2
    dW3 += self.reg * W3
    reg_loss = 0.5 * self.reg * sum(np.sum(W * W) for W in [W1, W2, W3])
    loss = data_loss + reg_loss

    grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2, 'W3': dW3, 'b3': db3}

    print ' computing loss and grad end !!!!!!!!!!!!!!!!!'
    print 'loss is :', loss
    return loss, grads
def loss(self, X, y=None, reg=1e-5):
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    W3, b3 = self.params['W3'], self.params['b3']

    # pass conv_param to the forward pass for the convolutional layer
    filter_size = W1.shape[2]
    conv_param = {'stride': 1, 'pad': (filter_size - 1) / 2}

    # pass pool_param to the forward pass for the max-pooling layer
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

    # compute the forward pass
    a1, cache1 = layers.conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
    norm_out, norm_cache = layers.spatial_batchnorm_forward(
        a1, 1, 0, bn_param={'mode': 'train'})
    a2, cache2 = layers.affine_relu_forward(norm_out, W2, b2)
    scores, cache3 = layers.affine_forward(a2, W3, b3)

    if y is None:
        return scores

    # compute the backward pass
    data_loss = NUS_loss_test.NUSDataTrain().loss(scores, y)
    dscores = NUS_loss_test.NUSDataTrain().eval_numerical_gradient(
        NUS_loss_test.NUSDataTrain().grad_loss, scores)
    # layers.softmax_loss(scores, y)  # change here
    da2, dW3, db3 = layers.affine_backward(dscores, cache3)
    da1, dW2, db2 = layers.affine_relu_backward(da2, cache2)
    dnorm_out, dgamma, dbeta = layers.spatial_batchnorm_backward(da1, norm_cache)
    dX, dW1, db1 = layers.conv_relu_pool_backward(dnorm_out, cache1)

    # Add regularization
    dW1 += self.reg * W1
    dW2 += self.reg * W2
    dW3 += self.reg * W3
    reg_loss = 0.5 * self.reg * sum(np.sum(W * W) for W in [W1, W2, W3])
    loss = data_loss + reg_loss

    grads = {'W1': dW1, 'b1': db1, 'W2': dW2, 'b2': db2, 'W3': dW3, 'b3': db3}
    return loss, grads
def train_loss(X, y, W1, W2, b1, b2):
    # Nested helper: `self.reg` is assumed to come from the enclosing scope.
    l1 = affine_relu_forward(X, W1, b1)
    l2 = affine_forward(l1, W2, b2)
    scores = l2
    if y is None:
        return scores
    # [TODO]: softmax is not supported yet
    # loss, d_scores = softmax_loss(scores, y)
    loss = svm_loss(scores, y)
    loss_with_reg = loss + np.sum(W1 ** 2) * 0.5 * self.reg + np.sum(W2 ** 2) * 0.5 * self.reg
    return loss_with_reg
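# svm_loss is called above but not defined in this section; as called here it is
# treated as returning just the scalar data loss. A minimal sketch under that
# assumption (the `delta` margin argument is illustrative):
def svm_loss(scores, y, delta=1.0):
    # Multiclass hinge loss with margin `delta`, averaged over the batch.
    # (A training-ready version would also return the gradient on `scores`.)
    N = scores.shape[0]
    correct = scores[np.arange(N), y][:, None]
    margins = np.maximum(0, scores - correct + delta)
    margins[np.arange(N), y] = 0
    return np.sum(margins) / N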
def affine_relu_forward(x, w, b):
    """
    Convenience layer that performs an affine transform followed by a ReLU

    Inputs:
    - x: Input to the affine layer
    - w, b: Weights for the affine layer

    Returns a tuple of:
    - out: Output from the ReLU
    - cache: Object to give to the backward pass
    """
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache
def affine_relu_forward(x, w, b):
    '''
    Convenience layer that performs an affine transform followed by a ReLU

    Inputs:
    - x: input to the affine layer
    - w: weights
    - b: bias

    Returns a tuple of:
    - out: output from the ReLU
    - cache: object to give to the backward pass
    '''
    a, fc_cache = layers.affine_forward(x, w, b)  # a = x.dot(w) + b, fc_cache = (x, w, b)
    out, relu_cache = layers.relu_forward(a)      # out = np.maximum(0, a), relu_cache = a
    cache = (fc_cache, relu_cache)
    return out, cache
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    out1, cache1 = layer_utilities.affine_relu_forward(
        X, self.params['W1'], self.params['b1'])
    out2, cache2 = layers.affine_forward(
        out1, self.params['W2'], self.params['b2'])  # last layer: no ReLU needed
    scores = out2

    if y is None:
        return scores

    # backward
    loss, grads = 0, {}
    loss, d_scores = layers.softmax_loss(scores, y)
    loss = loss + 0.5 * self.reg * (
        np.sum(self.params['W1'] * self.params['W1']) +
        np.sum(self.params['W2'] * self.params['W2']))
    dout1, dW2, db2 = layers.affine_backward(d_scores, cache2)
    dx, dW1, db1 = layer_utilities.affine_relu_backward(dout1, cache1)
    grads['W2'] = dW2 + self.reg * self.params['W2']
    grads['b2'] = db2
    grads['W1'] = dW1 + self.reg * self.params['W1']
    grads['b1'] = db1
    return loss, grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass and
    return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping parameter
      names to gradients of the loss with respect to those parameters.
    """
    scores = None
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']

    ar1_out, ar1_cache = affine_relu_forward(X, W1, b1)
    ar2_out, ar2_cache = affine_forward(ar1_out, W2, b2)
    scores = ar2_out

    if y is None:
        return scores

    loss, grads = 0, {}
    loss, dout = softmax_loss(scores, y)
    loss = loss + 0.5 * self.reg * np.sum(W1 * W1) + 0.5 * self.reg * np.sum(W2 * W2)
    dx2, dw2, db2 = affine_backward(dout, ar2_cache)
    grads['W2'] = dw2 + self.reg * W2
    grads['b2'] = db2
    dx1, dw1, db1 = affine_relu_backward(dx2, ar1_cache)
    grads['W1'] = dw1 + self.reg * W1
    grads['b1'] = db1
    return loss, grads
def loss(self, x, y=None):
    """
    Loss function used is MSE loss
    """
    scores = None
    scores, cache1 = affine_relu_forward(x, self.params['W1'], self.params['b1'])
    scores, cache2 = affine_relu_forward(scores, self.params['W2'], self.params['b2'])
    scores, cache3 = affine_relu_forward(scores, self.params['W3'], self.params['b3'])
    scores, cache4 = affine_forward(scores, self.params['W4'], self.params['b4'])

    if y is None:
        return scores

    loss = mse_loss_forward(scores, y)
    grads = {}
    dup = mse_loss_backward(scores, y)
    dup, grads['W4'], grads['b4'] = affine_backward(dup, cache4)
    dup, grads['W3'], grads['b3'] = affine_relu_backward(dup, cache3)
    dup, grads['W2'], grads['b2'] = affine_relu_backward(dup, cache2)
    dup, grads['W1'], grads['b1'] = affine_relu_backward(dup, cache1)
    return loss, grads
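# mse_loss_forward and mse_loss_backward are not shown in this section; their
# signatures are inferred from the call sites above. A minimal sketch of a
# batch-averaged MSE loss (the 0.5 factor is an assumption for a clean gradient;
# the original helpers may scale differently):
def mse_loss_forward(scores, y):
    # Mean squared error averaged over the N examples in the batch.
    N = scores.shape[0]
    return 0.5 * np.sum((scores - y) ** 2) / N


def mse_loss_backward(scores, y):
    # Gradient of the forward loss above with respect to `scores`.
    N = scores.shape[0]
    return (scores - y) / N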
def predict(self, X):
    """
    Inputs:
    - X: A numpy array of shape (N, D) giving N D-dimensional data points to
      classify.

    Returns:
    - y_pred: A numpy array of shape (N,) giving predicted labels for each of
      the elements of X. For all i, y_pred[i] = c means that X[i] is predicted
      to have class c, where 0 <= c < C.
    """
    y_pred = None
    # h1 = layers.ReLU(np.dot(X, self.params['W1']) + self.params['b1'])
    # scores = np.dot(h1, self.params['W2']) + self.params['b2']
    # y_pred = np.argmax(scores, axis=1)
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    W3, b3 = self.params['W3'], self.params['b3']

    # pass conv_param to the forward pass for the convolutional layer
    filter_size = W1.shape[2]
    conv_param = {'stride': 1, 'pad': (filter_size - 1) / 2}

    # pass pool_param to the forward pass for the max-pooling layer
    pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

    # compute the forward pass
    a1, cache1 = layers.conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
    a2, cache2 = layers.affine_relu_forward(a1, W2, b2)
    scores, cache3 = layers.affine_forward(a2, W3, b3)
    y_pred = np.argmax(scores, axis=1)
    return y_pred
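# A typical way to use predict is to measure accuracy on a held-out split;
# `model`, `X_val`, and `y_val` below are hypothetical placeholders.
# val_acc = np.mean(model.predict(X_val) == y_val)
# print 'validation accuracy:', val_acc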
import layers
import numpy as np

num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3

input_size = num_inputs * np.prod(input_shape)
weight_size = output_dim * np.prod(input_shape)

x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, num=output_dim)

out, _ = layers.affine_forward(x, w, b)
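# affine_forward itself lives in layers.py and is not reproduced in this
# section. A minimal numpy sketch consistent with how it is called here
# (inputs of shape (N, d_1, ..., d_k) are flattened to rows; the actual
# implementation may differ):
def affine_forward(x, w, b):
    # Flatten each example to a row vector, then apply x.dot(w) + b.
    N = x.shape[0]
    out = x.reshape(N, -1).dot(w) + b
    cache = (x, w, b)
    return out, cache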
def affine_relu_forward(x, w, b):
    a, fc_cache = affine_forward(x, w, b)
    out, relu_cache = relu_forward(a)
    cache = (fc_cache, relu_cache)
    return out, cache
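# Several of the loss functions above call a matching affine_relu_backward
# that is not shown in this section. A minimal sketch, assuming the standard
# affine_backward / relu_backward helpers and the cache layout produced by
# affine_relu_forward:
def affine_relu_backward(dout, cache):
    # Undo the ReLU first, then the affine transform.
    fc_cache, relu_cache = cache
    da = relu_backward(dout, relu_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db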
###################################################################################
# Test the affine_forward function
num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3

input_size = num_inputs * np.prod(input_shape)
theta_size = output_dim * np.prod(input_shape)

x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
theta = np.linspace(-0.2, 0.3, num=theta_size).reshape(np.prod(input_shape), output_dim)
theta_0 = np.linspace(-0.3, 0.1, num=output_dim)

out, _ = layers.affine_forward(x, theta, theta_0)
correct_out = np.array([[1.49834967, 1.70660132, 1.91485297],
                        [3.25553199, 3.5141327,  3.77273342]])

# Compare your output with ours. The error should be around 1e-9.
if out.any():
    print 'Testing affine_forward function:'
    print 'difference (should be around 1e-9): ', rel_error(out, correct_out)

# Problem 3.1.2
###################################################################################
# Affine layer: backward.                                                         #
###################################################################################
# In the file layers.py implement the affine_backward function.                  #
# Once you are done you can test your implementation using numeric gradient.     #
###################################################################################
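# rel_error is used by the test above (and the ones later in this section) but
# never defined here. A minimal sketch of the usual relative-error helper:
def rel_error(x, y):
    # Maximum relative error, with a small constant to avoid division by zero.
    return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))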
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.dropout_param is not None:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the fully-connected net, computing  #
    # the class scores for X and storing them in the scores variable.          #
    #                                                                          #
    # When using dropout, you'll need to pass self.dropout_param to each       #
    # dropout forward pass.                                                    #
    #                                                                          #
    # When using batch normalization, you'll need to pass self.bn_params[0] to #
    # the forward pass for the first batch normalization layer, pass           #
    # self.bn_params[1] to the forward pass for the second batch normalization #
    # layer, etc.                                                              #
    ############################################################################
    layer_input = X
    ar_cache = {}
    dp_cache = {}

    for lay in xrange(self.num_layers - 1):
        if self.use_batchnorm:
            layer_input, ar_cache[lay] = affine_bn_relu_forward(
                layer_input,
                self.params['W%d' % (lay + 1)], self.params['b%d' % (lay + 1)],
                self.params['gamma%d' % (lay + 1)], self.params['beta%d' % (lay + 1)],
                self.bn_params[lay])
        else:
            layer_input, ar_cache[lay] = affine_relu_forward(
                layer_input,
                self.params['W%d' % (lay + 1)], self.params['b%d' % (lay + 1)])
        if self.use_dropout:
            layer_input, dp_cache[lay] = dropout_forward(layer_input, self.dropout_param)

    ar_out, ar_cache[self.num_layers] = affine_forward(
        layer_input,
        self.params['W%d' % (self.num_layers)], self.params['b%d' % (self.num_layers)])
    scores = ar_out
    # pass
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    ############################################################################
    # TODO: Implement the backward pass for the fully-connected net. Store the #
    # loss in the loss variable and gradients in the grads dictionary. Compute #
    # data loss using softmax, and make sure that grads[k] holds the gradients #
    # for self.params[k]. Don't forget to add L2 regularization!               #
    #                                                                          #
    # When using batch normalization, you don't need to regularize the scale   #
    # and shift parameters.                                                    #
    #                                                                          #
    # NOTE: To ensure that your implementation matches ours and you pass the   #
    # automated tests, make sure that your L2 regularization includes a factor #
    # of 0.5 to simplify the expression for the gradient.                      #
    ############################################################################
    loss, dscores = softmax_loss(scores, y)
    dhout = dscores
    loss = loss + 0.5 * self.reg * np.sum(
        self.params['W%d' % (self.num_layers)] * self.params['W%d' % (self.num_layers)])
    dx, dw, db = affine_backward(dhout, ar_cache[self.num_layers])
    grads['W%d' % (self.num_layers)] = dw + self.reg * self.params['W%d' % (self.num_layers)]
    grads['b%d' % (self.num_layers)] = db
    dhout = dx

    for idx in xrange(self.num_layers - 1):
        lay = self.num_layers - 1 - idx - 1
        loss = loss + 0.5 * self.reg * np.sum(
            self.params['W%d' % (lay + 1)] * self.params['W%d' % (lay + 1)])
        if self.use_dropout:
            dhout = dropout_backward(dhout, dp_cache[lay])
        if self.use_batchnorm:
            dx, dw, db, dgamma, dbeta = affine_bn_relu_backward(dhout, ar_cache[lay])
        else:
            dx, dw, db = affine_relu_backward(dhout, ar_cache[lay])
        grads['W%d' % (lay + 1)] = dw + self.reg * self.params['W%d' % (lay + 1)]
        grads['b%d' % (lay + 1)] = db
        if self.use_batchnorm:
            grads['gamma%d' % (lay + 1)] = dgamma
            grads['beta%d' % (lay + 1)] = dbeta
        dhout = dx
    # pass
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    return loss, grads
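# affine_bn_relu_forward and affine_bn_relu_backward are used here and in a
# later snippet but are not defined in this section. A minimal sketch of such
# a sandwich layer, assuming the standard affine, batchnorm, and ReLU
# forward/backward helpers:
def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    # Affine -> batch normalization -> ReLU, caching each stage for the backward pass.
    a, fc_cache = affine_forward(x, w, b)
    bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(bn)
    return out, (fc_cache, bn_cache, relu_cache)


def affine_bn_relu_backward(dout, cache):
    # Reverse order: ReLU -> batchnorm -> affine.
    fc_cache, bn_cache, relu_cache = cache
    dbn = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(dbn, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta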
def loss(self, X, y=None):
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    inputi = X
    batch_size = X.shape[0]
    X = np.reshape(X, [batch_size, -1])

    fc_cache_list = []
    relu_cache_list = []
    bn_cache_list = []
    dropout_cache_list = []

    for i in range(self.num_layers - 1):
        fc_act, fc_cache = affine_forward(X, self.params['W' + str(i + 1)],
                                          self.params['b' + str(i + 1)])
        fc_cache_list.append(fc_cache)
        if self.use_batchnorm:
            bn_act, bn_cache = batchnorm_forward(fc_act,
                                                 self.params['gamma' + str(i + 1)],
                                                 self.params['beta' + str(i + 1)],
                                                 self.bn_params[i])
            bn_cache_list.append(bn_cache)
            relu_act, relu_cache = relu_forward(bn_act)
            relu_cache_list.append(relu_cache)
        else:
            relu_act, relu_cache = relu_forward(fc_act)
            relu_cache_list.append(relu_cache)
        if self.use_dropout:
            relu_act, dropout_cache = dropout_forward(relu_act, self.dropout_param)
            dropout_cache_list.append(dropout_cache)
        X = relu_act.copy()

    # last layer
    scores, final_cache = affine_forward(X, self.params['W' + str(self.num_layers)],
                                         self.params['b' + str(self.num_layers)])

    # for layer in range(self.num_layers):
    #     Wi, bi = self.params['W%d' % (layer + 1)], self.params['b%d' % (layer + 1)]
    #     outi, fc_cachei = affine_forward(inputi, Wi, bi)
    #     fc_cache_list.append(fc_cachei)
    #
    #     if self.use_batchnorm and layer != self.num_layers - 1:
    #         gammai, betai = self.params['gamma%d' % (layer + 1)], self.params['beta%d' % (layer + 1)]
    #         outi, bn_cachei = batchnorm_forward(outi, gammai, betai, self.bn_params[layer])
    #         bn_cache_list.append(bn_cachei)
    #         outi, relu_cachei = relu_forward(outi)
    #         relu_cache_list.append(relu_cachei)
    #
    #     if self.use_dropout:
    #         outi, dropout_cachei = dropout_forward(outi, self.dropout_param)
    #         dropout_cache_list.append(dropout_cachei)
    #
    #     inputi = outi
    #
    # scores = outi

    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    loss, dsoft = softmax_loss(scores, y)
    loss += 0.5 * self.reg * (np.sum(np.square(self.params['W' + str(self.num_layers)])))

    # backward pass through the last layer
    dx_last, dw_last, db_last = affine_backward(dsoft, final_cache)
    grads['W' + str(self.num_layers)] = dw_last + self.reg * self.params['W' + str(self.num_layers)]
    grads['b' + str(self.num_layers)] = db_last

    for i in range(self.num_layers - 1, 0, -1):
        if self.use_dropout:
            dx_last = dropout_backward(dx_last, dropout_cache_list[i - 1])
        drelu = relu_backward(dx_last, relu_cache_list[i - 1])
        if self.use_batchnorm:
            dbatchnorm, dgamma, dbeta = batchnorm_backward(drelu, bn_cache_list[i - 1])
            dx_last, dw_last, db_last = affine_backward(dbatchnorm, fc_cache_list[i - 1])
            grads['beta' + str(i)] = dbeta
            grads['gamma' + str(i)] = dgamma
        else:
            dx_last, dw_last, db_last = affine_backward(drelu, fc_cache_list[i - 1])
        grads['W' + str(i)] = dw_last + self.reg * self.params['W' + str(i)]
        grads['b' + str(i)] = db_last
        loss += 0.5 * self.reg * (np.sum(np.square(self.params['W' + str(i)])))

    return loss, grads
#######################################################################################
from layers import affine_forward

num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3

input_size = num_inputs * np.prod(input_shape)
weight_size = output_dim * np.prod(input_shape)

x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
w = np.linspace(-0.2, 0.3, num=weight_size).reshape(np.prod(input_shape), output_dim)
b = np.linspace(-0.3, 0.1, num=output_dim)

out, _ = affine_forward(x, w, b)
correct_out = np.array([[1.49834967, 1.70660132, 1.91485297],
                        [3.25553199, 3.5141327,  3.77273342]])

# The error should be around 1e-9.
print('Testing affine_forward function:')
print('difference: ', rel_error(out, correct_out))
#######################################################################################

#######################################################################################
# Test the affine_backward function
#######################################################################################
from layers import affine_backward

np.random.seed(231)
x = np.random.randn(10, 2, 3)
# Test the affine_forward function
num_inputs = 2
input_shape = (4, 5, 6)
output_dim = 3

input_size = num_inputs * np.prod(input_shape)
theta_size = output_dim * np.prod(input_shape)

x = np.linspace(-0.1, 0.5, num=input_size).reshape(num_inputs, *input_shape)
theta = np.linspace(-0.2, 0.3, num=theta_size).reshape(np.prod(input_shape), output_dim)
theta_0 = np.linspace(-0.3, 0.1, num=output_dim)

out, _ = layers.affine_forward(x, theta, theta_0)
correct_out = np.array([[1.49834967, 1.70660132, 1.91485297],
                        [3.25553199, 3.5141327,  3.77273342]])

# Compare your output with ours. The error should be around 1e-9.
print 'Testing affine_forward function:'
print 'difference (should be around 1e-9): ', rel_error(out, correct_out)

# Problem 3.1.2
##########################################################################
# Affine layer: backward.                                                #
##########################################################################
# In the file layers.py implement the affine_backward function.         #
# Once you are done you can test your implementation using numeric      #
# gradient.                                                              #
##########################################################################
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X = X.astype(self.dtype)
    mode = 'test' if y is None else 'train'

    # Set train/test mode for batchnorm params and dropout param since they
    # behave differently during training and testing.
    if self.use_dropout:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    scores = None
    cache = self.num_layers * [None]
    dropout_cache = (self.num_layers - 1) * [None]

    for i in np.arange(self.num_layers - 1):
        if not self.use_batchnorm:
            scores, cache[i] = affine_relu_forward(
                X if i == 0 else scores,
                self.params['W%d' % (i + 1)], self.params['b%d' % (i + 1)])
        else:
            scores, cache[i] = affine_bn_relu_forward(
                X if i == 0 else scores,
                self.params['W%d' % (i + 1)], self.params['b%d' % (i + 1)],
                self.params['gamma%d' % (i + 1)], self.params['beta%d' % (i + 1)],
                self.bn_params[i])
        if self.use_dropout:
            scores, dropout_cache[i] = dropout_forward(scores, self.dropout_param)

    scores, cache[self.num_layers - 1] = affine_forward(
        scores, self.params['W%d' % self.num_layers], self.params['b%d' % self.num_layers])
    ############################################################################
    # When using batch normalization, you'll need to pass self.bn_params[0] to #
    # the forward pass for the first batch normalization layer, pass           #
    # self.bn_params[1] to the forward pass for the second batch normalization #
    # layer, etc.                                                              #
    ############################################################################

    # If test mode return early
    if mode == 'test':
        return scores

    loss, grads = 0.0, {}
    loss, dscore = softmax_loss(scores, y)
    dx, grads['W%d' % self.num_layers], grads['b%d' % self.num_layers] = affine_backward(
        dscore, cache[self.num_layers - 1])

    for i in reversed(np.arange(self.num_layers - 1)):
        if self.use_dropout:
            dx = dropout_backward(dx, dropout_cache[i])
        if not self.use_batchnorm:
            dx, grads['W%d' % (i + 1)], grads['b%d' % (i + 1)] = affine_relu_backward(dx, cache[i])
        else:
            dx, grads['W%d' % (i+1)], grads['b%d' % (i+1)], grads['gamma%d' % (i+1)], grads['beta%d' % (i+1)] \
                = affine_bn_relu_backward(dx, cache[i])

    for i in np.arange(self.num_layers):
        loss += .5 * self.reg * np.sum(np.square(self.params['W%d' % (i + 1)]))
        grads['W%d' % (i + 1)] += self.reg * self.params['W%d' % (i + 1)]
    ############################################################################
    # When using batch normalization, you don't need to regularize the scale   #
    # and shift parameters.                                                    #
    ############################################################################
    return loss, grads
def build_sampler(self, max_len=20):
    """
    Input:
    - max_len: max length for generating captions

    Place Holder:
    - features: input image features of shape (N, L, D)

    Returns
    - sampled_words: sampled word indices
    - alphas: sampled alpha weights
    """
    # place holder
    features = self.features

    # parameters
    params = self.params

    # hyper parameters
    hyper_params = {
        'batch_size': self.N,
        'spacial_size': self.L,
        'dim_feature': self.D,
        'n_time_step': self.T,
        'dim_hidden': self.H,
        'vocab_size': self.V
    }

    # generate initial hidden state using cnn features
    mean_features = tf.reduce_mean(features, 1)
    prev_h = affine_tanh_forward(mean_features, params['W_init_h'], params['b_init_h'])  # (N, H)
    prev_c = affine_tanh_forward(mean_features, params['W_init_c'], params['b_init_c'])  # (N, H)

    sampled_word_list = []
    alpha_list = []

    for t in range(max_len):
        # embed the previously generated word
        if t == 0:
            x = tf.zeros([self.N, self.M])  # what about assigning a word vector for the '<START>' token?
        else:
            x = word_embedding_forward(sampled_word, params['W_embed'])  # (N, M)

        # lstm forward
        if self.cell_type == 'rnn':
            h, alpha = rnn_step_forward_with_attention(
                x, features, prev_h, params, hyper_params)  # (N, H), (N, L)
        else:
            h, c, alpha = lstm_step_forward_with_attention(
                x, features, prev_h, prev_c, params, hyper_params)  # (N, H), (N, H), (N, L)
            prev_c = c

        # prepare for next time step
        prev_h = h

        # save alpha weights
        alpha_list.append(alpha)

        # generate scores (logits) from the current hidden state
        logits = affine_forward(h, params['W_vocab'], params['b_vocab'])  # (N, V)

        # sample word indices from the logits
        sampled_word = tf.argmax(logits, 1)  # (N,) with values in the range [0, V)
        sampled_word_list.append(sampled_word)  # TensorFlow doesn't provide item assignment

    alphas = tf.transpose(tf.pack(alpha_list), (1, 0, 2))  # (N, T, L)
    sampled_captions = tf.transpose(tf.pack(sampled_word_list), (1, 0))  # (N, max_len)

    return alphas, sampled_captions
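# The affine_forward and affine_tanh_forward used by build_sampler are
# TensorFlow-side helpers that are not shown in this section. A minimal sketch
# of both, assuming plain dense layers over 2-D tensors (TF 1.x style, matching
# the tf.pack call above; the project's own helpers may differ):
def affine_forward(x, w, b):
    # Dense layer: x (N, D) times w (D, H) plus bias b (H,).
    return tf.matmul(x, w) + b


def affine_tanh_forward(x, w, b):
    # Dense layer followed by tanh, e.g. for initializing the LSTM hidden/cell state.
    return tf.tanh(tf.matmul(x, w) + b)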