def main(args):
    # Create model.
    model = TwoLayerNet(args)
    for k, v in model.param_configs.items():
        model.params[k] = np.zeros(v['shape'])
    img = np.zeros((args.batch_size, 784))
    label = np.zeros((args.batch_size,))
    for l in range(args.num_loops):
        # The first num_cold loops are warm-up and excluded from timing.
        if l == num_cold:
            start = time.time()

        def loss_func(*params):
            f = model.forward(img, 'train')
            return model.loss(f, label)

        if args.only_forward:
            loss = loss_func()
            loss.asnumpy()
        else:
            param_arrays = list(model.params.values())
            param_keys = list(model.params.keys())
            grad_and_loss_func = core.grad_and_loss(
                loss_func, argnum=range(len(param_arrays)))
            grad_arrays, loss = grad_and_loss_func(*param_arrays)
            for g in grad_arrays:
                g.get_data(minpy.array_variants.ArrayType.MXNET).wait_to_read()
    dur = time.time() - start
    print('Per Loop Time: %.6f' % (dur / (args.num_loops - num_cold)))
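# All of the snippets in this file follow one pattern, distilled below as a
# minimal sketch (assuming, as elsewhere in this file, that np is minpy.numpy
# and core is minpy.core): core.grad_and_loss(f, argnum) wraps a scalar-loss
# function f and returns a function that, called with the same arguments,
# yields the pair (gradients, loss) for the argument positions named by
# argnum. The toy loss below is mine, for illustration only.
def example_loss(w, x):
    return np.sum(np.dot(x, w) ** 2)

example_backprop = core.grad_and_loss(example_loss, argnum=0)
# dw, loss = example_backprop(w, x)  # dw has the same shape as w.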
def _step(self, batch):
    """
    Make a single gradient update. This is called by train() and should
    not be called manually.
    """

    # Compute loss and gradient.
    def loss_func(*params):  # pylint: disable=unused-argument
        """Calculate the loss.

        The params are not used inside the forward function, but passing
        model.params as arguments tells grad_and_loss which arrays to
        differentiate with respect to.
        """
        predict = self.model.forward_batch(batch, mode='train')
        return self.model.loss_batch(batch, predict)

    param_arrays = list(self.model.params.values())
    param_keys = list(self.model.params.keys())
    grad_and_loss_func = core.grad_and_loss(
        loss_func, argnum=range(len(param_arrays)))
    grad_arrays, loss = grad_and_loss_func(*param_arrays)
    grads = dict(zip(param_keys, grad_arrays))
    self.loss_history.append(loss.asnumpy())

    # Perform a parameter update.
    for p, w in self.model.params.items():
        dw = grads[p]
        config = self.optim_configs[p]
        next_w, next_config = self.update_rule(w, dw, config)
        self.model.params[p] = next_w
        self.optim_configs[p] = next_config
def _step(self):
    """
    Make a single gradient update. This is called by train() and should
    not be called manually.
    """
    # Make a minibatch of training data.
    num_train = self.X_train.shape[0]
    batch_mask = np.random.choice(num_train, self.batch_size)
    X_batch = self.X_train[batch_mask]
    y_batch = self.y_train[batch_mask]

    # Compute loss and gradient.
    def loss_func(*params):
        # The params are not used inside forward, but passing model.params
        # as arguments tells grad_and_loss what to differentiate against.
        predict = self.model.forward(X_batch)
        return self.model.loss(predict, y_batch)

    param_arrays = list(self.model.params.values())
    param_keys = list(self.model.params.keys())
    grad_and_loss_func = core.grad_and_loss(
        loss_func, argnum=range(len(param_arrays)))
    grad_arrays, loss = grad_and_loss_func(*param_arrays)
    grads = dict(zip(param_keys, grad_arrays))
    self.loss_history.append(loss.asnumpy())

    # Perform a parameter update.
    for p, w in self.model.params.items():
        dw = grads[p]
        config = self.optim_configs[p]
        next_w, next_config = self.update_rule(w, dw, config)
        self.model.params[p] = next_w
        self.optim_configs[p] = next_config
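# The update_rule used by the solvers above is expected to have the signature
# update_rule(w, dw, config) -> (next_w, next_config). A minimal sketch of
# such a rule, assuming plain SGD (this implementation is an assumption for
# illustration, not taken from the original code):
def sgd(w, dw, config=None):
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    next_w = w - config['learning_rate'] * dw
    return next_w, config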
def train_on_batch(self, tokens, oracle_actions):
    """
    Make a single gradient update. This is called by train() and should
    not be called manually.
    """

    # Compute loss and gradient.
    def loss_func(*params):
        """Calculate the loss.

        The params are not used inside parse, but passing model.params as
        arguments tells grad_and_loss what to differentiate against.
        """
        return self.model.parse(tokens, oracle_actions=oracle_actions)

    param_arrays = list(self.model.params.values())
    param_keys = list(self.model.params.keys())
    grad_and_loss_func = core.grad_and_loss(
        loss_func, argnum=range(len(param_arrays)))
    grad_arrays, loss = grad_and_loss_func(*param_arrays)
    grads = dict(zip(param_keys, grad_arrays))

    # Perform a parameter update.
    for p, w in self.model.params.items():
        dw = grads[p]
        config = self.optim_configs[p]
        next_w, next_config = self.update_rule(w, dw, config)
        self.model.params[p] = next_w
        self.optim_configs[p] = next_config
    return loss
def main(args):
    # Create model.
    model = RNNNet(args)
    for k, v in model.param_configs.items():
        model.params[k] = np.zeros(v['shape'])
    # Data of only one time step.
    data = np.zeros((args.batch_size, args.input_size))
    label = np.zeros((args.batch_size,))
    for l in range(args.num_loops):
        if l == num_cold:
            start = time.time()

        def loss_func(*params):
            f = model.forward(data, 'train')
            return model.loss(f, label)

        if args.only_forward:
            loss = loss_func()
            loss.wait_to_read()
        else:
            param_arrays = list(model.params.values())
            param_keys = list(model.params.keys())
            grad_and_loss_func = core.grad_and_loss(
                loss_func, argnum=range(len(param_arrays)))
            grad_arrays, loss = grad_and_loss_func(*param_arrays)
            for g in grad_arrays:
                g.wait_to_read()
    dur = time.time() - start
    print('Per Loop Time: %.6f' % (dur / (args.num_loops - num_cold)))
def main(args):
    # Create model.
    model = RNNNet(args)
    for k, v in model.param_configs.items():
        model.params[k] = np.zeros(v['shape'])
    # Data of only one time step.
    data = np.zeros((args.batch_size, args.input_size))
    label = np.zeros((args.batch_size,), dtype=int)
    for l in range(args.num_loops):
        if l == num_cold:
            start = time.time()

        def loss_func(*params):
            f = model.forward(data, 'train')
            return model.loss(f, label)

        if args.only_forward:
            loss = loss_func()
            loss.asnumpy()
        else:
            param_arrays = list(model.params.values())
            param_keys = list(model.params.keys())
            grad_and_loss_func = core.grad_and_loss(
                loss_func, argnum=range(len(param_arrays)))
            grad_arrays, loss = grad_and_loss_func(*param_arrays)
            # Note: unlike the variant above, this one never waits on
            # grad_arrays, so the timing only covers asynchronous dispatch.
    dur = time.time() - start
    print('Per Loop Time: %.6f' % (dur / (args.num_loops - num_cold)))
def loss(self, X, y=None):
    if y is None:
        return self._forward(X, *self.param)
    else:
        # Differentiate with respect to the parameters, which sit at
        # argument positions 2 .. len(self.param) + 1 (after X and y).
        backprop = grad_and_loss(
            self._softmax_loss, range(2, len(self.param) + 2))
        return backprop(X, y, *self.param)
def loss_and_derivative(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    X_plain = np.reshape(X, (X.shape[0], -1))
    mode = 'test' if y is None else 'train'
    if self.dropout_param is not None:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode
    params_array = self.pack_params()

    def train_loss(*args):
        X = args[0]
        y = args[1]
        res = X
        for l in range(self.num_layers):
            prev_res = res
            res = affine_forward(prev_res, args[self.w_idx(l)],
                                 args[self.b_idx(l)])
            if l < (self.num_layers - 1):
                if self.use_batchnorm:
                    res = batchnorm_forward(res, args[self.bn_ga_idx(l)],
                                            args[self.bn_bt_idx(l)],
                                            self.bn_params[l])
                res = relu_forward(res)
                if self.use_dropout:
                    res = dropout_forward(res, self.dropout_param)
        scores = res
        if mode == 'test':
            return scores
        #loss, _ = softmax_loss(scores, y)
        loss = svm_loss(scores, y)
        return loss

    if y is None:
        return train_loss(X_plain, y, *params_array)

    grad_function = grad_and_loss(
        train_loss,
        range(self.data_target_cnt, self.data_target_cnt + len(params_array)))
    grads_array, loss = grad_function(X_plain, y, *params_array)

    grads = {}
    for i, grad in enumerate(grads_array):
        grads[self.param_keys[i]] = grad
    return loss, grads
def loss_and_derivative(self, X, y=None):
    """
    Compute loss and gradient for a minibatch of data.

    Inputs:
    - X: Array of input data of shape (N, d_1, ..., d_k)
    - y: Array of labels, of shape (N,). y[i] gives the label for X[i].

    Returns:
    If y is None, then run a test-time forward pass of the model and
    return:
    - scores: Array of shape (N, C) giving classification scores, where
      scores[i, c] is the classification score for X[i] and class c.

    If y is not None, then run a training-time forward and backward pass
    and return a tuple of:
    - loss: Scalar value giving the loss
    - grads: Dictionary with the same keys as self.params, mapping
      parameter names to gradients of the loss with respect to those
      parameters.
    """

    # Note: types of X, y are mxnet.ndarray
    def train_loss(X, y, W1, W2, b1, b2):
        l1 = affine_relu_forward(X, W1, b1)
        l2 = affine_forward(l1, W2, b2)
        scores = l2
        if y is None:
            return scores
        #[TODO]: softmax is not supported yet
        # loss, d_scores = softmax_loss(scores, y)
        loss = svm_loss(scores, y)
        loss_with_reg = loss + np.sum(W1 ** 2) * 0.5 * self.reg \
                             + np.sum(W2 ** 2) * 0.5 * self.reg
        return loss_with_reg

    self.params_array = []
    params_list_name = ['W1', 'W2', 'b1', 'b2']
    for param_name in params_list_name:
        self.params_array.append(self.params[param_name])
    X_plain = np.reshape(X, (X.shape[0], -1))
    if y is None:
        return train_loss(X_plain, y, *self.params_array)

    # Differentiate with respect to W1, W2, b1, b2 (positions 2..5).
    grad_function = grad_and_loss(train_loss, range(2, 6))
    grads_array, loss = grad_function(X_plain, y, *self.params_array)

    grads = {}
    for i in range(len(params_list_name)):
        grads[params_list_name[i]] = grads_array[i]
    return loss, grads
def _forward_backward(self, loss_func):
    param_arrays = list(self.params.values())
    param_keys = list(self.params.keys())
    grad_and_loss_func = core.grad_and_loss(
        loss_func, argnum=range(len(param_arrays)))
    grad_arrays, loss = grad_and_loss_func(*param_arrays)
    grads = dict(zip(param_keys, grad_arrays))
    if self.config.grad_clip:
        # Clip each gradient element-wise to +/- clip_magnitude.
        for k, v in grads.items():
            grads[k] = numpy.clip(v, -self.config.clip_magnitude,
                                  self.config.clip_magnitude)
    return grads
def loss(self, X, y=None):
    """
    Compute loss and gradient for the fully-connected net.

    Input / output: Same as TwoLayerNet above.
    """
    mode = 'test' if y is None else 'train'

    if self.dropout_param is not None:
        self.dropout_param['mode'] = mode
    if self.use_batchnorm:
        for bn_param in self.bn_params:
            bn_param['mode'] = mode

    # TODO: add bn_options and dropout option
    assert not (self.use_batchnorm or self.use_dropout)

    # args is [X, Y, W[0], ..., W[n-1], b[0], ..., b[n-1]];
    # type of args is list.
    def train_loss(*args):
        last_layer_output = args[0]
        for l in range(self.num_layers):
            if l < (self.num_layers - 1):
                # TODO: last_layer_output is mutated in this code
                # TODO: rewrite last_layer_output
                last_layer_output, _ = affine_relu_forward(
                    last_layer_output, args[2 + l],
                    args[2 + self.num_layers + l])
            else:
                last_layer_output, _ = affine_forward(
                    last_layer_output, args[2 + l],
                    args[2 + self.num_layers + l])
        scores = last_layer_output
        if mode == 'test':
            return scores
        loss, _ = softmax_loss(scores, y)
        return loss

    grad_function = grad_and_loss(train_loss,
                                  range(2, 2 + 2 * self.num_layers))
    # TODO: define self.WeightAndBiasArray
    # grad_and_loss returns (grads, loss), in that order.
    grads_array, loss = grad_function(X, y, *self.WeightAndBiasArray)
    grads = {}
    for l in range(self.num_layers - 1, -1, -1):
        grads[self.GetWeightName(l)] = grads_array[l]
        grads[self.GetBiasName(l)] = grads_array[l + self.num_layers]
    return loss, grads
def test_policy():
    @minpy.wrap_policy(minpy.OnlyNumPyPolicy())
    def gaussian_cluster_generator(num_samples=10000, num_features=500,
                                   num_classes=5):
        # with minpy.OnlyNumPyPolicy():
        mu = np.random.rand(num_classes, num_features)
        sigma = np.ones((num_classes, num_features)) * 0.1
        # Integer division: the result is used as a slice index.
        num_cls_samples = num_samples // num_classes
        x = np.zeros((num_samples, num_features))
        y = np.zeros((num_samples, num_classes))
        for i in range(num_classes):
            cls_samples = np.random.normal(mu[i, :], sigma[i, :],
                                           (num_cls_samples, num_features))
            x[i * num_cls_samples:(i + 1) * num_cls_samples] = cls_samples
            y[i * num_cls_samples:(i + 1) * num_cls_samples, i] = 1
        return x, y

    def predict(w, x):
        a = np.exp(np.dot(x, w))
        a_sum = np.sum(a, axis=1, keepdims=True)
        prob = a / a_sum
        return prob

    def train_loss(w, x):
        prob = predict(w, x)
        loss = -np.sum(label * np.log(prob)) / num_samples
        return loss

    # Use MinPy's auto-grad to derive a gradient function from the loss.
    grad_function = grad_and_loss(train_loss)

    # Use gradient descent to fit the correct classes.
    def train(w, x, loops):
        for i in range(loops):
            dw, loss = grad_function(w, x)
            if i % 10 == 0:
                print('Iter {}, training loss {}'.format(i, loss))
            # gradient descent
            w -= 0.1 * dw

    # Initialize training data.
    num_samples = 10000
    num_features = 500
    num_classes = 5
    data, label = gaussian_cluster_generator(num_samples, num_features,
                                             num_classes)

    # Initialize training weights and train.
    weight = random.randn(num_features, num_classes)
    train(weight, data, 100)
def loss_and_derivative(self, X, y=None):
    # The symbol's init function takes the input size.
    if self.symbol_func is None:
        self.set_mxnet_symbol(X)

    params_array = self.pack_params()

    # TODO(Haoran): isolate this part out for the user; if so,
    # loss_and_derivative should be inherited from the super MXNet model
    # class.
    def train_loss(*args):
        inputs = args[0]
        softmax_label = args[1]
        probs = self.symbol_func(**self.make_mxnet_weight_dict(
            inputs, softmax_label, args[self.data_target_cnt:len(args)]))
        if softmax_label is None:
            return probs
        samples_num = X.shape[0]
        targets = np.zeros((samples_num, self.num_classes))
        targets[np.arange(samples_num), softmax_label] = 1
        loss = -np.sum(targets * np.log(probs)) / samples_num
        for i in self.get_index_reg_weight():
            loss = loss + np.sum(0.5 * args[i] ** 2 * self.reg)
        return loss

    if y is None:
        return train_loss(X, y, *params_array)

    grad_function = core.grad_and_loss(
        train_loss,
        range(self.data_target_cnt, self.data_target_cnt + len(params_array)))
    grads_array, loss = grad_function(X, y, *params_array)

    grads = {}
    for i, grad in enumerate(grads_array):
        grads[self.param_keys[i]] = grad
    return loss, grads
mx.nd.array(numpy.array([1, 2, 3]))
type(numpy.array([1, 2, 3]))
type(c)
np.ones((2, 3))
np.ones([2, 3])
mx.nd.ones([2, 3])
mx.nd.ones([2, 3]).asnumpy()

def foo(x):
    return 5 * (x ** 2) + 3 * x + 2

print(foo(4))
d_foo = grad(foo)
d_l_foo = grad_and_loss(foo)
d_foo(4)
d_l_foo(4)

# Symbol
a = mx.sym.Variable('a')
b = mx.sym.Variable('b')
c = a + b
# element-wise multiplication
d = a * b
# matrix multiplication
e = mx.sym.dot(a, b)
f = mx.sym.Reshape(d + e, shape=(1, 4))
# broadcast
g = mx.sym.broadcast_to(f, shape=(2, 4))
mx.viz.plot_network(symbol=g)
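# Quick arithmetic check of the calls above (my own worked example, not part
# of the original session): foo(x) = 5x^2 + 3x + 2 has derivative
# foo'(x) = 10x + 3, so foo(4) = 94 and d_foo(4) should give 43, while
# d_l_foo(4) should return the (gradient, loss) pair (43, 94).
assert foo(4) == 94
assert 10 * 4 + 3 == 43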
def predict(inputs, fc_weight, fc_bias, conv_weight, conv_bias):
    #return f( data=[('x', inputs)], weight=[('fc_weight', weights)], ctx=mx.cpu())
    return f(x=inputs, fc_0_weight=fc_weight, fc_1_bias=fc_bias,
             conv_0_weight=conv_weight, conv_1_bias=conv_bias)

def training_loss(inputs, targets, fc_weight, fc_bias, conv_weight, conv_bias):
    preds = predict(inputs, fc_weight, fc_bias, conv_weight, conv_bias)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.sum(np.log(label_probabilities))

# Differentiate with respect to the four parameter arrays (positions 2..5).
training_gradient_fun = core.grad_and_loss(training_loss, range(2, 6))

lr = 1e-5
for i in range(100):
    grads, loss = training_gradient_fun(inputs, targets, fc_weight, fc_bias,
                                        conv_weight, conv_bias)
    #print('Training gradient: {}'.format(gr))
    fc_weight -= grads[0] * lr
    fc_bias -= grads[1] * lr
    conv_weight -= grads[2] * lr
    conv_bias -= grads[3] * lr
    if i % 10 == 0:
        print('Trained loss: {}'.format(loss))
def train(self):
    """Trains the model for `num_episodes` iterations.

    On each iteration, runs an episode (see `.run_episode()`) to generate
    three matrices of observations, labels and rewards (xs, ys, rs)
    containing data for the _entire_ episode. Then the parameter gradients
    are found using these episode matrices. Specifically, auto-grad is
    performed on `loss_func`, which does a single forward pass with the
    episode's observations `xs`, then computes the loss using the output
    of the forward pass and the episode's labels `ys` and discounted
    rewards `rs`.

    This two-step approach of generating episode data and then doing a
    single forward/backward pass is done to conserve memory during the
    auto-grad computation.
    """
    # Accumulate gradients since updates are only performed every
    # `update_every` iterations.
    grad_buffer = self._init_grad_buffer()

    for episode_number in range(1, self.num_episodes):
        episode_start = time.time()

        # Generate an episode of training data.
        xs, ys, rs = self.run_episode()

        # Performs a forward pass and computes loss using an entire
        # episode's data.
        def loss_func(*params):
            ps = self.model.forward(xs)
            return self.model.loss(ps, ys, rs)

        # Compute gradients with auto-grad on `loss_func` (duplicated from
        # `Solver`).
        param_arrays = list(self.model.params.values())
        param_keys = list(self.model.params.keys())
        grad_and_loss_func = core.grad_and_loss(
            loss_func, argnum=range(len(param_arrays)))
        backward_start = time.time()
        grad_arrays, loss = grad_and_loss_func(*param_arrays)
        backward_time = time.time() - backward_start
        grads = dict(zip(param_keys, grad_arrays))

        # Accumulate gradients until an update is performed.
        for k, v in grads.items():
            grad_buffer[k] += v

        # Misc. diagnostic info.
        self.loss_history.append(loss.asnumpy())
        episode_time = time.time() - episode_start
        if self.verbose:
            print('Backward pass complete (%.2fs)' % backward_time)
        if self.verbose or episode_number % self.print_every == 0:
            print('Episode %d complete (%.2fs), loss: %s, reward: %s, '
                  'running reward: %s' %
                  (episode_number, episode_time, loss, self.episode_reward,
                   self.running_reward))

        # Perform parameter update and reset the `grad_buffer` when
        # appropriate.
        if episode_number % self.update_every == 0:
            for p, w in self.model.params.items():
                dw = grad_buffer[p]
                config = self.optim_configs[p]
                next_w, next_config = self.update_rule(w, dw, config)
                self.model.params[p] = next_w
                self.optim_configs[p] = next_config
                grad_buffer[p] = np.zeros_like(w)

        # Save model parameters to `save_dir` when appropriate.
        if episode_number % self.save_every == 0:
            if self.verbose:
                print('Saving model parameters...')
            file_name = os.path.join(self.save_dir,
                                     'params_%d.p' % episode_number)
            # Open in binary mode: pickle writes bytes.
            with open(file_name, 'wb') as f:
                pickle.dump(
                    {k: v.asnumpy() for k, v in self.model.params.items()}, f)
            if self.verbose:
                print('Wrote parameter file %s' % file_name)
def test_autograd():
    @convert_args
    def minpy_rnn_step_forward(x, prev_h, Wx, Wh, b):
        next_h = mp.tanh(x.dot(Wx) + prev_h.dot(Wh) + b)
        return next_h

    def rel_error(x, y):
        """Returns relative error."""
        return np.max(np.abs(x - y) /
                      (np.maximum(1e-8, np.abs(x) + np.abs(y))))

    def rnn_step_forward(x, prev_h, Wx, Wh, b):
        next_h = np.tanh(prev_h.dot(Wh) + x.dot(Wx) + b)
        cache = next_h, prev_h, x, Wx, Wh
        return next_h, cache

    def rnn_step_backward(dnext_h, cache):
        dx, dprev_h, dWx, dWh, db = None, None, None, None, None
        # Load values from rnn_step_forward
        next_h, prev_h, x, Wx, Wh = cache
        # Gradients of loss wrt tanh
        dtanh = dnext_h * (1 - next_h * next_h)  # (N, H)
        # Gradients of loss wrt x
        dx = dtanh.dot(Wx.T)
        # Gradients of loss wrt prev_h
        dprev_h = dtanh.dot(Wh.T)
        # Gradients of loss wrt Wx
        dWx = x.T.dot(dtanh)  # (D, H)
        # Gradients of loss wrt Wh
        dWh = prev_h.T.dot(dtanh)
        # Gradients of loss wrt b. Note we broadcast b in practice, so the
        # result of the matrix ops is just a sum over columns.
        db = dtanh.sum(axis=0)  # == np.ones([N, 1]).T.dot(dtanh)[0, :]
        return dx, dprev_h, dWx, dWh, db

    # preparation
    N, D, H = 4, 5, 6
    x = np.random.randn(N, D)
    h = np.random.randn(N, H)
    Wx = np.random.randn(D, H)
    Wh = np.random.randn(H, H)
    b = np.random.randn(H)
    out, cache = rnn_step_forward(x, h, Wx, Wh, b)
    dnext_h = np.random.randn(*out.shape)

    # test MinPy
    start = time.time()
    rnn_step_forward_loss = lambda x, h, Wx, Wh, b, dnext_h: \
        minpy_rnn_step_forward(x, h, Wx, Wh, b) * nm(dnext_h)
    grad_loss_function = return_numpy(
        grad_and_loss(rnn_step_forward_loss, range(5)))
    grad_arrays = grad_loss_function(x, h, Wx, Wh, b, dnext_h)[0]
    end = time.time()
    print("MinPy total time elapsed:", end - start)

    # test NumPy
    start = time.time()
    out, cache = rnn_step_forward(x, h, Wx, Wh, b)
    dx, dprev_h, dWx, dWh, db = rnn_step_backward(dnext_h, cache)
    out *= dnext_h  # to agree with MinPy calculation
    end = time.time()
    print("NumPy total time elapsed:", end - start)

    print()
    print("Result Check:")
    print('dx error: ', rel_error(dx, grad_arrays[0]))
    print('dprev_h error: ', rel_error(dprev_h, grad_arrays[1]))
    print('dWx error: ', rel_error(dWx, grad_arrays[2]))
    print('dWh error: ', rel_error(dWh, grad_arrays[3]))
    print('db error: ', rel_error(db, grad_arrays[4]))
def loss(self, features, captions):
    """
    Compute training-time loss for the RNN. We input image features and
    ground-truth captions for those images, and use an RNN (or LSTM) to
    compute loss and gradients on all parameters.

    Inputs:
    - features: Input image features, of shape (N, D)
    - captions: Ground-truth captions; an integer array of shape (N, T)
      where each element is in the range 0 <= y[i, t] < V

    Returns a tuple of:
    - loss: Scalar loss
    - grads: Dictionary of gradients parallel to self.params
    """
    # Cut captions into two pieces: captions_in has everything but the
    # last word and will be input to the RNN; captions_out has everything
    # but the first word and is what we expect the RNN to generate. These
    # are offset by one relative to each other because the RNN should
    # produce word (t+1) after receiving word t. The first element of
    # captions_in will be the START token, and the first element of
    # captions_out will be the first word.
    captions_in = captions[:, :-1]
    captions_out = captions[:, 1:]

    # You'll need this
    mask = (captions_out != self._null)

    # Weight and bias for the affine transform from image features to
    # initial hidden state
    W_proj, b_proj = self.params['W_proj'], self.params['b_proj']

    # Word embedding matrix
    W_embed = self.params['W_embed']

    # Input-to-hidden, hidden-to-hidden, and biases for the RNN
    Wx, Wh, b = self.params['Wx'], self.params['Wh'], self.params['b']

    # Weight and bias for the hidden-to-vocab transformation.
    W_vocab, b_vocab = self.params['W_vocab'], self.params['b_vocab']

    loss, grads = 0.0, {}

    grad_function = grad_and_loss(self.rnnNet, range(8))
    grad_array, loss = grad_function(W_proj, b_proj, W_embed, Wx, Wh, b,
                                     W_vocab, b_vocab, features,
                                     captions_in, captions_out, mask)

    # The backward pass computes the gradient of the loss with respect to
    # all model parameters; grads[k] gives the gradients for
    # self.params[k]. The entries of grad_array follow the argument order
    # passed to grad_function above.
    (grads['W_proj'], grads['b_proj'], grads['W_embed'], grads['Wx'],
     grads['Wh'], grads['b'], grads['W_vocab'], grads['b_vocab']) = grad_array

    return loss, grads
def training_accuracy(weights, inputs):
    preds = predict(weights, inputs)
    error = np.count_nonzero(
        np.argmax(preds, axis=1) - np.argmax(targets, axis=1))
    return (256 - error) * 100 / 256.0

xshape = (256, 500)
wshape = (500, 250)
tshape = (256, 250)
inputs = random.rand(*xshape) - 0.5
targets = np.zeros(tshape)
truth = random.randint(0, 250, 256)
targets[np.arange(256), truth] = 1
weights = random.rand(*wshape) - 0.5

#training_gradient_fun_0 = grad(training_loss, 0)
grad_arg0 = grad_and_loss(training_loss, 0)
grad, loss = grad_arg0(weights, inputs)
print('1st arg\'s grad by single grad func', grad)

grad_arg1 = grad_and_loss(training_loss, 1)
grad, loss = grad_arg1(weights, inputs)
print('2nd arg\'s grad by single grad func', grad)

grad_args = grad_and_loss(training_loss, [0, 1])
grads, loss = grad_args(weights, inputs)
print('1st arg\'s grad by multi grad func', grads[0])
print('2nd arg\'s grad by multi grad func', grads[1])
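# The fragment above assumes `predict` and `training_loss` were defined
# earlier. A plausible sketch consistent with the shapes used here and with
# the logistic loss in the CNN example above -- an assumption for
# illustration, not the original definitions:
def predict(weights, inputs):
    # Squash the scores into (0, 1).
    return 1.0 / (1.0 + np.exp(-np.dot(inputs, weights)))

def training_loss(weights, inputs):
    preds = predict(weights, inputs)
    # Bernoulli log-likelihood against the one-hot targets.
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.sum(np.log(label_probabilities))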
# The original fragment starts mid-function; the def line below is
# reconstructed from the calls to predict(w, x) further down.
def predict(w, x):
    y = np.dot(x, w)
    prob = softmax(x=y, softmax_label=softmax_label)
    return prob

#util.plot_data(x, t)
#util.plot_data(x, predict(w, x))
'''
for i in range(1):
    prob = predict(w, x)
    #print prob
    dy = t - prob
    dw = np.dot(x.T, dy) / 10000
    w -= 0.1 * dw
    print w
#util.plot_data(x, predict(w, x))
'''

def loss(w, x):
    prob = predict(w, x)
    # Sum the L2 term so the loss stays scalar; the original
    # `0.5 * w * w` produced an array.
    return -np.sum(np.log(prob) * t) / 10000 + 0.5 * np.sum(w * w)

gl = grad_and_loss(loss)
for i in range(10):
    dw, loss = gl(w, x)
    print(loss)
    w -= 0.1 * dw
# set_context(gpu(0))  # set the global context as gpu(0)

# Predict the class using multinomial logistic regression (softmax
# regression).
def predict(w, x):
    a = np.exp(np.dot(x, w))
    a_sum = np.sum(a, axis=1, keepdims=True)
    prob = a / a_sum
    return prob

def train_loss(w, x):
    prob = predict(w, x)
    loss = -np.sum(label * np.log(prob)) / num_samples
    return loss

# Use MinPy's auto-grad to derive a gradient function from the loss.
grad_function = grad_and_loss(train_loss)

# Use gradient descent to fit the correct classes.
def train(w, x, loops):
    for i in range(loops):
        dw, loss = grad_function(w, x)
        if i % 10 == 0:
            print('Iter {}, training loss {}'.format(i, loss))
        # gradient descent
        w -= 0.1 * dw

# Initialize training data.
num_samples = 10000
num_features = 500
num_classes = 5
data, label = make_data(num_samples, num_features, num_classes)
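# `make_data` is not defined in this fragment. It presumably resembles
# gaussian_cluster_generator from test_policy above; a minimal stand-in under
# that assumption (the implementation is mine, for illustration):
def make_data(num_samples, num_features, num_classes):
    mu = np.random.rand(num_classes, num_features)
    x = np.zeros((num_samples, num_features))
    y = np.zeros((num_samples, num_classes))
    per_cls = num_samples // num_classes
    for i in range(num_classes):
        x[i * per_cls:(i + 1) * per_cls] = np.random.normal(
            mu[i, :], 0.1, (per_cls, num_features))
        y[i * per_cls:(i + 1) * per_cls, i] = 1
    return x, y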