def build_loss(self, rot_true, rot_vect_pred, rot_pred, class_true,
               class_pred, mask_true, mask_pred):
    """Assemble the classification, rotation and mask losses.

    ``class_true`` is turned into a per-sample weight by
    ``self.rectify_scores``; that weight is both the classification target
    and the multiplier applied to the rotation and mask terms.

    Returns:
        Tuple ``(cls_loss, rot_loss, mask_loss, rot_error)``.
    """
    weight = self.rectify_scores(class_true)

    # Classification: cross entropy between the rectified scores and the
    # predicted class scores.
    cls_xent = utils.cross_entropy(weight, class_pred)

    # Rotation: cross entropy on the vector encoding of the true rotation,
    # averaged over axis 1 and weighted per sample.
    rot_target = self.rot_to_prob(rot_true)
    rot_xent = utils.cross_entropy(rot_target, rot_vect_pred)
    rot_xent = tf.reduce_mean(rot_xent, axis=1) * weight

    # Weighted mean absolute rotation error; the 1e-6 keeps the denominator
    # non-zero when the weights sum to zero.
    weighted_abs_diff = tf.abs(rot_pred - rot_true) * weight
    rot_error = tf.reduce_sum(weighted_abs_diff) / (1e-6 + tf.reduce_sum(weight))

    # Mask: cross entropy averaged over axes 1 and 2, weighted per sample.
    mask_xent = utils.cross_entropy(mask_true, mask_pred)
    mask_xent = tf.reduce_mean(mask_xent, [1, 2]) * weight

    return (
        tf.reduce_mean(cls_xent),
        tf.reduce_mean(rot_xent),
        tf.reduce_mean(mask_xent),
        rot_error,
    )
def test_cross_entropy(self):
    """Check the ``(loss, dy)`` pair returned by ``utils.cross_entropy``."""
    # Each case: (first argument, second argument, expected loss, expected dy)
    cases = [
        (np.array([[0, 1]]), np.array([0]), [[10]], [[-1, 1]]),
        (np.array([[0, 1]]), np.array([1]), [[0]], [[0, 0]]),
        (np.array([[0.5, 0.5]]), np.array([1]),
         [[-np.log(0.5)]], [[0.5, -0.5]]),
    ]
    for first, second, expected_loss, expected_dy in cases:
        loss, dy = utils.cross_entropy(first, second)
        self.assertTrue(np.allclose(loss, expected_loss))
        self.assertTrue(np.allclose(dy, expected_dy))
def iteration(self, inputs, targets, state):
    """Run one forward and backward sweep over a sequence.

    Returns:
        Tuple ``(grads, loss, state)``: the accumulated gradients clipped to
        ``[-5, 5]`` and keyed like ``self.model``, the summed loss divided by
        ``inputs.shape[0]``, and the state after the last forward step.
    """
    step_caches = []
    loss = 0.0

    # Forward pass: thread the state through every step and sum the loss.
    for step_input, step_target in zip(inputs, targets):
        output, state, step_cache = self.forward_pass(step_input, state)
        loss += u.cross_entropy(u.softmax(output), step_target)
        step_caches.append(step_cache)
    loss /= inputs.shape[0]

    # Backward pass: walk the steps in reverse, carrying d_next along.
    d_next = self.initial_state()
    grads = {name: np.zeros_like(value) for name, value in self.model.items()}
    for step_target, step_cache in reversed(list(zip(targets, step_caches))):
        step_grad, d_next = self.backward_pass(step_target, d_next, step_cache)
        for name in grads:
            grads[name] += step_grad[name]

    # Gradient clipping.
    grads = {name: np.clip(value, -5., 5.) for name, value in grads.items()}
    return grads, loss, state
def loss_func(self):
    """Build the training loss and its running-mean metric.

    Reads ``self.y``, ``self.logits`` and ``self.conf`` (``num_cls``,
    ``loss_type``, ``use_reg``, ``lmbda``). Sets ``self.total_loss``,
    ``self.mean_loss`` and ``self.mean_loss_op``.

    Raises:
        ValueError: if ``self.conf.loss_type`` is neither ``'cross-entropy'``
            nor ``'dice'``. Previously this surfaced later as an unbound
            local ``loss``.
    """
    with tf.name_scope('Loss'):
        y_one_hot = tf.one_hot(self.y, depth=self.conf.num_cls, axis=4,
                               name='y_one_hot')
        if self.conf.loss_type == 'cross-entropy':
            with tf.name_scope('cross_entropy'):
                loss = cross_entropy(y_one_hot, self.logits,
                                     self.conf.num_cls)
        elif self.conf.loss_type == 'dice':
            with tf.name_scope('dice_coefficient'):
                loss = dice_coeff(y_one_hot, self.logits)
        else:
            raise ValueError(
                "Unsupported loss_type %r; expected 'cross-entropy' or 'dice'"
                % (self.conf.loss_type,))
        with tf.name_scope('total'):
            if self.conf.use_reg:
                with tf.name_scope('L2_loss'):
                    # lmbda-scaled L2 penalty summed over every variable in
                    # the 'weights' collection.
                    l2_loss = tf.reduce_sum(self.conf.lmbda * tf.stack([
                        tf.nn.l2_loss(v)
                        for v in tf.get_collection('weights')
                    ]))
                    self.total_loss = loss + l2_loss
            else:
                self.total_loss = loss
            self.mean_loss, self.mean_loss_op = tf.metrics.mean(
                self.total_loss)
def create_optimizer(self, optimizer=None):
    """Register the loss/accuracy metrics and the training op.

    Args:
        optimizer: optional optimizer factory, called with the
            ``learning_rate`` entry of the ``'inputs'`` collection. When
            ``None``, ``tf.train.GradientDescentOptimizer`` is used.
    """
    model = get_collection('model')
    inputs = get_collection('inputs')
    x = inputs['x']
    y = inputs['y']
    logits = model['logits']
    probabilities = model['probabilities']
    predictions = model['predictions']

    with tf.name_scope('metrics'):
        xe = cross_entropy(self.n_classes, logits=logits, labels=y)
        loss = tf.reduce_mean(xe, name='loss')
        targets = tf.argmax(y, axis=1, name='targets')
        is_correct = tf.equal(predictions, targets)
        accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32),
                                  name='accuracy')
        add_to_collection('metrics', loss, accuracy)

    with tf.name_scope('training'):
        optimizer_factory = (tf.train.GradientDescentOptimizer
                             if optimizer is None else optimizer)
        opt = optimizer_factory(inputs['learning_rate'])
        # Run the ops collected under UPDATE_OPS before each training step.
        pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(pending_updates):
            training_op = opt.minimize(loss)
        add_to_collection('training', training_op)
def _compute_loss(self):
    """Return the cross-entropy loss plus an L2 penalty on W1, W2 and W3.

    Each weight matrix contributes ``reg_lam / 2 * ||W||**2``.
    """
    loss = cross_entropy(logits=self.pred, labels=self.Y)
    half_lam = self.reg_lam / 2
    for weight in (self.W1, self.W2, self.W3):
        loss += half_lam * np.linalg.norm(weight)**2
    return loss
def forward(self, sample):
    """Reconstruct ``sample`` slot by slot and, in training, compute losses.

    Returns:
        When ``self.train`` is falsy: the detached 4-tuple
        ``(reconstruction_image, logmasks, history_logsk, recon_images)``.
        Otherwise the same four values (not detached) followed by
        ``loss1, loss2, loss3, loss``.
    """
    # NOTE(review): if this class derives from torch.nn.Module and never
    # assigns ``self.train``, then ``self.train`` is the bound method
    # ``Module.train`` and is always truthy; the mode flag is
    # ``self.training``. Confirm against the class definition.
    logmasks, history_logsk = self.atten_net(sample)
    reconstruction_image = torch.zeros(sample.shape).to(sample.device)
    if self.train:
        loss1 = 0
        loss2 = 0

    mask_logit_parts = []
    recon_parts = []
    for slot in range(logmasks.shape[1]):
        slot_logmask = logmasks[:, slot, :, :].unsqueeze(1)
        logvar, mu, recon_img, logitrecon_mask = self.comp_net(sample, slot_logmask)
        mask = torch.exp(slot_logmask)
        if self.train:
            # Masked squared reconstruction error, scaled by 1 / 0.0225.
            loss1 += torch.sum((mask * sample - mask * recon_img) ** 2 / 0.0225)
            loss2 += utils.normal_KL_div_loss(logvar, mu)
        mask_logit_parts.append(logitrecon_mask)
        recon_parts.append(recon_img.unsqueeze(4))
        reconstruction_image += recon_img * mask

    # Mask logits are concatenated along dim 1, per-slot images along dim 4.
    logrecon_masks = torch.nn.functional.log_softmax(
        torch.cat(mask_logit_parts, 1), dim=1)
    recon_images = torch.cat(recon_parts, 4)

    if not self.train:
        return (reconstruction_image.detach(), logmasks.detach(),
                history_logsk.detach(), recon_images.detach())

    loss3 = utils.cross_entropy(logmasks, logrecon_masks)
    loss = loss1 + 0.25 * loss2 + 0.025 * loss3
    return (reconstruction_image, logmasks, history_logsk, recon_images,
            loss1, loss2, loss3, loss)
def train_model(xtrain, ytrain, args):
    """Fit a softmax-regression weight matrix with mini-batch gradient steps.

    Args:
        xtrain: 2-D array; ``xtrain.shape[1]`` is taken as the feature count.
        ytrain: 2-D array; ``ytrain.shape[1]`` is taken as the class count.
        args: object providing ``iterations``, ``lr``, ``lamb`` and
            ``batch_size``.

    Returns:
        The weight matrix ``W`` of shape ``(features + 1, class_num)``; its
        last row multiplies the constant-one column appended to each batch.
    """
    report_every = 1000  # batches between progress reports

    iters = args.iterations
    lr = args.lr
    lamb = args.lamb
    features = xtrain.shape[1]
    class_num = ytrain.shape[1]

    # Randomly initialise W; the extra row is the bias.
    W = np.random.rand(features + 1, class_num)

    for epoch in range(iters):
        running_loss = 0.0
        batches = data_loader(xtrain, ytrain, args.batch_size)
        for i, (x, y) in enumerate(batches):
            ones = np.ones((x.shape[0], 1))
            x = np.append(x, ones, 1)
            y_ = softmax(np.dot(x, W))
            # The step ascends the log-likelihood (x.T (y - y_)), so the L2
            # penalty must be subtracted. The previous "+ lamb * W" pushed
            # the weights away from zero.
            W += (np.dot(x.T, y - y_) - lamb * W) / x.shape[0] * lr
            running_loss += cross_entropy(y, y_)
            if i % report_every == report_every - 1:
                # Average over the number of batches actually accumulated
                # (the old code divided a 1000-batch sum by 2000).
                print('[%d, %5d] loss: %.6f' %
                      (epoch + 1, i + 1, running_loss / report_every))
                running_loss = 0.0
    return W
def create_optimizer(self, optimizer=None):
    """Register metrics and the training op for the penalised model.

    The training objective is the class-weighted cross entropy plus
    ``self.alpha`` times the elastic-net penalty on ``theta``.

    Args:
        optimizer: optional optimizer factory, called with the
            ``learning_rate`` input. When ``None``,
            ``tf.train.GradientDescentOptimizer`` is used. This argument was
            previously accepted but ignored.
    """
    model = get_collection('model')
    inputs = get_collection('inputs')
    alpha, l1_ratio = self.alpha, self.l1_ratio
    (x, y, class_weights, learning_rate,
     theta, logits, probabilities, predictions) = (
        inputs['x'], inputs['y'], inputs['class_weights'],
        inputs['learning_rate'],
        model['theta'], model['logits'], model['probabilities'],
        model['predictions'])

    with tf.name_scope('metrics'):
        xe = cross_entropy(self.n_classes, logits=logits, labels=y)
        loss = tf.reduce_mean(xe, name='loss')
        # Per-sample weight: the class weights selected by y via a
        # row-wise product and sum.
        weights = tf.reduce_sum(class_weights * y, axis=1)
        weighted_loss = tf.reduce_mean(xe * weights, name='weighted_loss')
        penalty = elastic_net(theta, l1_ratio=l1_ratio)
        penalized_loss = tf.add(weighted_loss, alpha * penalty,
                                name='penalized_loss')
        targets = tf.argmax(y, axis=1, name='targets')
        match = tf.cast(tf.equal(predictions, targets), tf.float32)
        accuracy = tf.reduce_mean(match, name='accuracy')
        add_to_collection('metrics', loss, penalized_loss, accuracy)

    with tf.name_scope('training'):
        if optimizer is None:
            optimizer = tf.train.GradientDescentOptimizer
        opt = optimizer(learning_rate)
        training_op = opt.minimize(penalized_loss)
        add_to_collection('training', training_op)
def loss_and_gradients(x, y, params):
    """
    Compute the loss and the gradients of a one-hidden-layer tanh network.

    params: a list of the form [W, b, U, b_tag]

    returns:
        loss,[gW, gb, gU, gb_tag]

    loss: scalar
    gW: matrix, gradients of W
    gb: vector, gradients of b
    gU: matrix, gradients of U
    gb_tag: vector, gradients of b_tag
    """
    W, b, U, b_tag = params
    probs = softmax(classifier_output(x, params))
    loss = cross_entropy(probs, y)
    one_hot = create_1_hot_vec(y, probs)

    # Recompute the hidden activation needed by the backward pass.
    x_arr = np.array(x)
    hidden = np.tanh(x_arr.dot(W) + b)

    # Output layer: the error signal is also the gradient of b_tag.
    gb_tag = probs - one_hot
    gU = gb_tag * hidden.reshape(-1, 1)

    # Hidden layer: back-propagate through U and the tanh derivative.
    gb = (1 - np.power(hidden, 2)) * gb_tag.dot(U.T)
    gW = gb * x_arr.reshape(-1, 1)
    return loss, [gW, gb, gU, gb_tag]
def evaluate(self, data):
    """Return ``(avg_cost, accuracy)`` of the network over ``data``.

    Args:
        data: iterable of ``(x, y)`` pairs; ``np.argmax(y)`` is compared
            with ``np.argmax`` of the network output.

    Each sample is fed forward once (the previous version ran the network
    twice per sample). Accuracy uses true division so it stays a fraction
    under Python 2 as well.

    Raises:
        ZeroDivisionError: if ``data`` is empty.
    """
    costs = []
    hits = 0
    for x, y in data:
        output = self.feedforward(x)
        costs.append(utils.cross_entropy(output, y))
        hits += int(np.argmax(output) == np.argmax(y))
    accuracy = hits / float(len(costs))
    avg_cost = sum(costs) / len(costs)
    return (avg_cost, accuracy)
def forward(self):
    """Apply softmax to the logits, then compute the cross-entropy loss.

    Stores the loss in ``self.out`` and the softmax output in ``self.pred``.

    Returns:
        ``self.out``.
    """
    raw_scores = self.logits.get_data()
    targets = self.labels.get_data()
    probabilities = softmax(raw_scores)
    # cross_entropy receives the softmax output through its ``logits``
    # keyword.
    loss_value = cross_entropy(logits=probabilities, labels=targets)
    self.out.set_data_(loss_value)
    self.pred.set_data_(probabilities)
    return self.out
def train(train_x, train_target, model):
    """Return the cross-entropy loss of ``model`` plus an L1 penalty.

    The penalty is ``lambda1`` (read from the enclosing scope) times the
    accumulated ``np.abs`` of every value yielded by
    ``model.parameters(flatten=True)``.
    """
    data_loss = utils.cross_entropy(model.predict(train_x), train_target)

    # NOTE(review): np.abs(value) is added without a reduction, so this is
    # only a scalar if the parameters are yielded as scalars. Confirm.
    abs_total = 0.0
    for value in model.parameters(flatten=True):
        abs_total += np.abs(value)

    return data_loss + lambda1 * abs_total
def main():
    """Train a one-class-vs-rest logistic regression and plot its curves.

    Loads the facial-expression data, prepends a constant-one column,
    relabels the targets with ``class1detect`` for class ``klass``, runs
    150 full-batch gradient steps, then plots the cross entropy and the
    error rate per iteration.
    """
    X, T = get_facialexpression(balance_ones=True)
    # X, T = np.shuffle(X,T)
    label_map = [
        'Anger', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'
    ]
    # klass =3 error_rate=0.0
    # klass =4 error_rate=0.0
    # klass =5 error_rate=0.0
    # klass =0
    klass = 4

    N, D = X.shape
    X = np.concatenate((np.ones((N, 1)), X), axis=1)
    T = T.astype(np.int32)
    X = X.astype(np.float32)
    # Fix for forecasting on one image
    T = class1detect(T, detect=klass)
    D += 1  # account for the constant-one column

    # params
    lr = 5e-7
    max_iteration = 150
    W = np.random.randn(D) / np.sqrt(D)

    cost = []
    error = []
    # range / print(...) with one pre-formatted string behave identically
    # on Python 2 and Python 3.
    for i in range(max_iteration):
        Y = forward(W, X)
        cost.append(cross_entropy(T, Y))
        error.append(error_rate(T, Y))
        W += lr * X.T.dot(T - Y)
        # Progress is reported once per matching iteration; the second,
        # identical report that used to follow was removed.
        if i % 5 == 0:
            print("i=%d\tcost=%.3f\terror=%.3f" % (i, cost[-1], error[-1]))

    print("Final weight: %s" % (W,))
    print(T)
    print(np.round(Y))

    for ylabel, series in (('cross entropy', cost), ('error rate', error)):
        plt.title('logistic regression ' + label_map[klass])
        plt.xlabel('iterations')
        plt.ylabel(ylabel)
        plt.plot(series)
        plt.show()
def fit(self, X, Y, learning_rate=1e-6, reg=0, epochs=12000, show_figure=False):
    """Train the multi-class weights with per-sample perceptron-style updates.

    The last 1000 shuffled samples are held out for validation. Every 20
    epochs the validation cost and error are written to stdout.

    Args:
        X: 2-D feature array.
        Y: integer class labels (used as column indices into ``self.W``).
        learning_rate: accepted for interface compatibility; the update
            below does not use it.
        reg: coefficient of the ``reg * W`` term added to each update.
        epochs: number of passes over the training set.
        show_figure: when true, plot the recorded validation costs.
    """
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:, :], Y[-1000:]
    X, Y = X[:-1000, :], Y[:-1000]

    K = len(set(Y))
    N, D = X.shape
    # One-hot validation targets for the cross-entropy report.
    Yind_valid = np.zeros((1000, K), dtype=np.int32)
    Yind_valid[np.arange(1000), Yvalid] = 1

    self.W = np.random.randn(D, K) / np.sqrt(D + K)
    self.b = 0

    costs = []
    best_validation_error = 1
    for i in range(epochs):
        for j in range(N):
            xj = X[j, :].T
            yj = Y[j]
            yp = np.argmax((self.W.T).dot(xj), axis=0)
            # Update step: add to the true class column, subtract from the
            # predicted class column.
            self.W[:, yj] += (xj + reg * self.W[:, yj])
            self.W[:, yp] -= (xj + reg * self.W[:, yp])
            # self.b -= learning_rate *((pY-Y).sum() + reg*self.b)

        if i % 20 == 0:
            # The leftover code.interact() debugging hook that paused
            # training here was removed.
            pYvalid = self.forward(Xvalid)
            # c = sigmoid_cost(Yvalid, pYvalid)
            c = cross_entropy(Yind_valid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, pYvalid)
            sys.stdout.write("i:%s\tcost:%.4f\terror:%.4f\t\r" %
                             (format(i, '04d'), c, e))
            sys.stdout.flush()
            if e < best_validation_error:
                best_validation_error = e

    print("best_validation_error: %s" % (best_validation_error,))

    if show_figure:
        plt.plot(costs)
        plt.show()
def compute_rnn_loss(yhat, y):
    """Sum the per-step cross-entropy loss and collect the per-step gradients.

    For each step ``t`` in ``range(len(y))`` the softmax of ``yhat[t]`` is
    passed to ``utils.cross_entropy`` together with ``y[t]``.

    Returns:
        Tuple ``(loss, dy)``: the total of ``np.sum`` over every step's loss,
        and the list of per-step gradients.
    """
    total = 0
    step_grads = []
    for t, target in enumerate(y):
        probs = utils.softmax(yhat[t])
        step_loss, step_grad = utils.cross_entropy(probs, target)
        step_grads.append(step_grad)
        total += np.sum(step_loss)
    return total, step_grads
def loss_fn(params):
    """Return the masked cross-entropy loss divided by its normalizer.

    ``inputs``, ``model`` and ``dropout_rng`` come from the enclosing scope.
    ``"labels"`` is popped from ``inputs``, so that mapping is mutated and
    no longer holds the labels afterwards.
    """
    labels = inputs.pop("labels")
    outputs = model(**inputs, train=True, dropout_rng=dropout_rng, params=params)
    logits = outputs[0]
    # Positions whose label is not greater than zero get weight 0.
    weights = jnp.where(labels > 0, 1.0, 0.0)
    total, normalizing_factor = cross_entropy(logits, labels, weights)
    return total / normalizing_factor
def test_backward(self):
    """Smoke-test ``RNN.backward`` on a two-step one-hot input sequence."""
    config = {
        'dim_hidden': 10,
        'len': 2
    }
    l = RNN(config)
    l.accept([26])

    # Build a separate array per time step. ``[np.zeros([26])] * 2`` would
    # repeat one shared array, so both assignments below would land in the
    # same vector and every step would see a two-hot input.
    x = [np.zeros([26]) for _ in range(2)]
    x[0][0] = 1.0
    x[1][1] = 1.0

    y = l.forward(x)
    dy = [None] * 2
    loss, dy[0] = utils.cross_entropy(utils.softmax(y[0]), np.array([0]))
    loss, dy[1] = utils.cross_entropy(utils.softmax(y[1]), np.array([1]))
    dW, dU, dV = l.backward(dy)
def fit(self, X, Y, learning_rate=1e-8, reg=1e-12, epochs=10000, show_fig=False):
    """Train the one-hidden-layer network with full-batch gradient descent.

    The last 1000 shuffled samples are held out; every 10 epochs the
    validation cost and error are printed.

    Args:
        X: 2-D feature array.
        Y: class labels.
        learning_rate: gradient step size.
        reg: L2 regularisation coefficient.
        epochs: number of full-batch updates.
        show_fig: when true, plot the recorded validation costs.
    """
    D = X.shape[1]  # number of features
    K = len(set(Y))  # number of classes

    X, Y = shuffle(X, Y)
    X_valid, Y_valid = X[-1000:], Y[-1000:]
    T_valid = one_hot_encoder(Y_valid)
    X, Y = X[:-1000], Y[:-1000]
    T = one_hot_encoder(Y)

    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for epoch in range(epochs):
        Y_hat, Z = self.forward(X)

        # Weight updates ----------------------
        Y_hat_T = Y_hat - T
        # Back-propagate through W2 *before* W2 is modified, so the hidden
        # delta uses the same weights as the forward pass.
        val = Y_hat_T.dot(self.W2.T) * (1 - Z * Z)  # tanh derivative

        self.W2 -= learning_rate * (Z.T.dot(Y_hat_T) + reg * self.W2)
        # Bias gradients are summed over the sample axis only. A full
        # ``.sum()`` collapses to one scalar shared by every unit.
        self.b2 -= learning_rate * (Y_hat_T.sum(axis=0) + reg * self.b2)
        self.W1 -= learning_rate * (X.T.dot(val) + reg * self.W1)
        self.b1 -= learning_rate * (val.sum(axis=0) + reg * self.b1)
        # -------------------------------------

        if epoch % 10 == 0:
            Y_hat_valid, _ = self.forward(X_valid)
            c = cross_entropy(T_valid, Y_hat_valid)
            costs.append(c)
            e = error_rate(Y_valid, np.argmax(Y_hat_valid, axis=1))
            print("epoch:", epoch, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.title('Validation cost')
        plt.show()

    # NOTE(review): Y_hat is the forward output from the start of the last
    # epoch (before that epoch's update); confirm that predict()/score()
    # expect these arguments.
    print("Final train classification_rate:", self.score(Y, self.predict(Y_hat)))
def loss(self, g_gen, p_gen, x_logits, x, g_inf, p_inf, p_inf_x, M_prev):
    """Return the individual loss terms as a list so they can be reweighted.

    Returns:
        ``[L_p_g, L_p_x, L_x_gen, L_x_g, L_x_p, L_g, L_reg_g, L_reg_p]``.
        ``M_prev`` is accepted but not used in this method.
    """
    def summed(parts):
        # Stack the parts along a new leading axis and add them together.
        return torch.sum(torch.stack(parts, dim=0), dim=0)

    # Squared error between the inferred grounded location and the grounded
    # location retrieved from the inferred abstract location.
    L_p_g = summed(utils.squared_error(p_inf, p_gen))

    # Squared error between the inferred grounded location and the grounded
    # location retrieved from sensory experience (zeros when disabled).
    if self.hyper['use_p_inf']:
        L_p_x = summed(utils.squared_error(p_inf, p_inf_x))
    else:
        L_p_x = torch.zeros_like(L_p_g)

    # Squared error between generated and inferred abstract location.
    L_g = summed(utils.squared_error(g_inf, g_gen))

    # Cross-entropy losses between sensory experience and the three model
    # predictions; the true labels are the argmax of x along dim 1.
    labels = torch.argmax(x, 1)
    L_x_gen = utils.cross_entropy(x_logits[2], labels)  # g_prev -> g -> p -> x
    L_x_g = utils.cross_entropy(x_logits[1], labels)    # g_inf -> p -> x
    L_x_p = utils.cross_entropy(x_logits[0], labels)    # p_inf -> x

    # Regularisation: squared L2 norm of g and L1 norm of p.
    L_reg_g = summed([torch.sum(g ** 2, dim=1) for g in g_inf])
    L_reg_p = summed([torch.sum(torch.abs(p), dim=1) for p in p_inf])

    return [L_p_g, L_p_x, L_x_gen, L_x_g, L_x_p, L_g, L_reg_g, L_reg_p]
def criterion(self, t, targets_old, outputs, targets):
    """Return the task-``t`` loss plus the weighted distillation loss.

    The distillation term sums ``utils.cross_entropy`` (with exponent
    ``1/self.T``) between the current and the stored outputs of every
    earlier task, and is scaled by ``self.lamb``.
    """
    # TODO: warm-up of the new layer (paper reports that improves
    # performance, but negligible)

    # Knowledge distillation loss for all previous tasks
    distillation = 0
    for previous_task in range(t):
        distillation += utils.cross_entropy(
            outputs[previous_task], targets_old[previous_task], exp=1/self.T)

    # Cross entropy loss on the current task
    current_task_loss = self.ce(outputs[t], targets)

    # We could add the weight decay regularization mentioned in the paper.
    # However, this might not be fair/comparable to other approaches
    return current_task_loss + self.lamb * distillation
def total_loss(self, data):
    """
    Compute the model's loss on the given data.

    :param data: validation or test dataset of (x, y) pairs
    :return: mean cross-entropy loss plus the L2 regularisation term
    """
    sample_count = len(data)
    loss = 0.0
    for x, y in data:
        prediction = self.predict(x)
        loss += utils.cross_entropy(prediction, y) / sample_count

    # Add the L2 regularisation term.
    squared_norms = sum(np.linalg.norm(w)**2 for w in self.weights)
    loss += 0.5 * (self.reg_lambda / sample_count) * squared_norms
    return loss
def fit(self, X, Y, learning_rate=1e-8, reg=1e-12, epochs=10000, show_fig=False):
    """Train ``self.W`` and ``self.b`` with full-batch gradient descent.

    The last 1000 shuffled samples are held out; every 100 epochs the
    validation cost and error are printed and the cost is recorded.
    """
    n_features = X.shape[1]
    n_classes = len(set(Y))

    X, Y = shuffle(X, Y)
    X_valid, Y_valid = X[-1000:], Y[-1000:]
    T_valid = one_hot_encoder(Y_valid)
    X, Y = X[:-1000], Y[:-1000]
    T = one_hot_encoder(Y)

    self.W = np.random.randn(n_features, n_classes) / np.sqrt(n_features)
    self.b = np.zeros(n_classes)

    costs = []
    best_validation_error = 1
    for epoch in range(epochs):
        Y_hat = self.forward(X)
        grad_W = self.dJ_dw(T, Y_hat, X) + reg * self.W
        self.W -= learning_rate * grad_W
        grad_b = self.dJ_db(T, Y_hat) + reg * self.b
        self.b -= learning_rate * grad_b

        if epoch % 100 != 0:
            continue

        # Periodic validation report.
        Y_hat_valid = self.forward(X_valid)
        c = cross_entropy(T_valid, Y_hat_valid)
        costs.append(c)
        e = error_rate(Y_valid, np.argmax(Y_hat_valid, axis=1))
        print("epoch:", epoch, "cost:", c, "error:", e)
        if e < best_validation_error:
            best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.title('Validation cost')
        plt.show()

    print("Final train classification_rate:", self.score(X, Y))
def loss_and_gradients(x, y, params):
    """
    Compute the loss and the gradients at point x with given parameters.
    y is a scalar indicating the correct label.

    returns:
        loss,[gW,gb]

    loss: scalar
    gW: matrix, gradients of W
    gb: vector, gradients of b
    """
    # The softmax is applied here for the loss computation; it is not
    # needed when only predicting.
    probs = softmax(classifier_output(x, params))
    loss = cross_entropy(probs, y)
    one_hot = create_1_hot_vec(y, probs)

    bias_grad = probs - one_hot
    # Broadcasting x as a column against the bias gradient gives the
    # weight gradient.
    x_column = np.array(x).reshape(-1, 1)
    weight_grad = bias_grad * x_column
    return loss, [weight_grad, bias_grad]
def main():
    """Train a logistic regression on the e-commerce data and plot its curves.

    Loads the data for ``user_action``, prepends a constant-one column, runs
    1000 full-batch gradient steps, then plots the cross entropy and the
    error rate per iteration.
    """
    user_action = 3
    X, T = get_ecommerce(user_action=user_action)
    # X, T = np.shuffle(X,T)

    N, D = X.shape
    X = np.concatenate((np.ones((N, 1)), X), axis=1)
    T = T.astype(np.int32)
    X = X.astype(np.float32)
    D += 1  # account for the constant-one column

    # params
    lr = 5e-4
    max_iteration = 1000
    W = np.random.randn(D) / np.sqrt(D)

    cost = []
    error = []
    # range / print(...) with one pre-formatted string behave identically
    # on Python 2 and Python 3.
    for i in range(max_iteration):
        Y = forward(W, X)
        cost.append(cross_entropy(T, Y))
        error.append(error_rate(T, Y))
        W += lr * X.T.dot(T - Y)
        # Progress is reported once per matching iteration; the second,
        # identical report that used to follow was removed.
        if i % 5 == 0:
            print("i=%d\tcost=%.3f\terror=%.3f" % (i, cost[-1], error[-1]))

    print("Final weight: %s" % (W,))

    for ylabel, series in (('cross entropy', cost), ('error rate', error)):
        plt.title('logistic regression user_action=%d' % (user_action))
        plt.xlabel('iterations')
        plt.ylabel(ylabel)
        plt.plot(series)
        plt.show()
def __init__(self, input_channels=3, n_classes=2):
    """Reset the default graph and build the model, loss and metric tensors.

    Args:
        input_channels: size of the last axis of the input placeholder.
        n_classes: size of the last axis of the label placeholder.
    """
    tf.reset_default_graph()
    self.n_classes = n_classes

    image_shape = [None, None, None, input_channels]
    label_shape = [None, None, None, self.n_classes]
    self.x = tf.placeholder(dtype=tf.float32, shape=image_shape)
    self.y = tf.placeholder(dtype=tf.float32, shape=label_shape)

    logits = model_build.build(self.x, self.n_classes)
    self.loss = self._get_loss(logits)

    # Pixel-level cross-entropy between the labels and the softmax output,
    # both flattened to (-1, n_classes).
    flat_labels = tf.reshape(self.y, [-1, self.n_classes])
    flat_probs = tf.reshape(utils.pixel_wise_softmax_2(logits),
                            [-1, self.n_classes])
    self.cross_entropy = tf.reduce_mean(
        utils.cross_entropy(flat_labels, flat_probs))

    self.predicter = utils.pixel_wise_softmax_2(logits)

    # Whether the arg-max class along axis 3 matches the label.
    predicted_class = tf.argmax(self.predicter, 3)
    true_class = tf.argmax(self.y, 3)
    self.correct_pred = tf.equal(predicted_class, true_class)

    # Accuracy: mean of the correctness indicator.
    self.accuracy = tf.reduce_mean(tf.cast(self.correct_pred, tf.float32))
def fit(self, X, Y, learning_rate=1e-6, reg=0, epochs=12000, show_figure=False):
    """Train ``self.W`` and ``self.b`` with full-batch gradient descent.

    The last 1000 shuffled samples are held out; every 20 epochs the
    validation cost and error are written to stdout.

    Args:
        X: 2-D feature array.
        Y: target vector.
        learning_rate: gradient step size.
        reg: L2 regularisation coefficient.
        epochs: number of full-batch updates.
        show_figure: when true, plot the recorded validation costs.
    """
    X, Y = shuffle(X, Y)
    Xvalid, Yvalid = X[-1000:, :], Y[-1000:]
    X, Y = X[:-1000, :], Y[:-1000]

    N, D = X.shape
    self.W = np.random.randn(D) / np.sqrt(D)
    self.b = 0

    costs = []
    best_validation_error = 1
    # range / print(...) with one pre-formatted string run unchanged on
    # Python 2 and Python 3 (xrange and the print statement do not).
    for i in range(epochs):
        pY = self.forward(X)

        # gradient descent step
        self.W -= learning_rate * (X.T.dot(pY - Y) + reg * self.W)
        self.b -= learning_rate * ((pY - Y).sum() + reg * self.b)

        if i % 20 == 0:
            pYvalid = self.forward(Xvalid)
            # c = sigmoid_cost(Yvalid, pYvalid)
            c = cross_entropy(Yvalid, pYvalid)
            costs.append(c)
            e = error_rate(Yvalid, pYvalid)
            sys.stdout.write("i:%s\tcost:%.4f\terror:%.4f\t\r" %
                             (format(i, '04d'), c, e))
            sys.stdout.flush()
            if e < best_validation_error:
                best_validation_error = e

    print("best_validation_error: %s" % (best_validation_error,))

    if show_figure:
        plt.plot(costs)
        plt.show()
def loss_and_gradients(x, y, params):
    """
    Compute the loss and a list of per-layer gradients for a tanh network.

    params: a list as created by create_classifier(...)

    returns:
        loss,[gW1, gb1, gW2, gb2, ...]

    loss: scalar
    gW1: matrix, gradients of W1
    gb1: vector, gradients of b1
    gW2: matrix, gradients of W2
    gb2: vector, gradients of b2
    ...

    (of course, if we request a linear classifier (ie, params is of length 2),
    you should not have gW2 and gb2.)

    NOTE(review): the code returns ``grads`` as a list of ``[gW, gb]`` pairs,
    appended starting from the output layer, not the flat
    ``[gW1, gb1, ...]`` list described above. Confirm what callers expect.
    """
    grads = []
    y_tag = softmax(classifier_output(x, params))
    loss = cross_entropy(y_tag, y)
    y_ = create_1_hot_vec(y, y_tag)

    # all_z[0] is the input; each later entry is the previous entry times W
    # plus b. all_activation[k] is tanh of all_z[k + 1].
    # NOTE(review): each element of params is unpacked as a (W, b) pair here;
    # confirm that matches the structure create_classifier returns.
    # NOTE(review): the next layer is fed all_z[-1] (the pre-activation), not
    # its tanh. Verify this agrees with classifier_output.
    all_z = [x]
    all_activation = []
    for (W, b) in params:
        all_z.append(np.array(all_z[-1]).dot(W) + b)
        all_activation.append(np.tanh(all_z[-1]))

    # Output layer gradients. Indexing all_activation[-2] requires at least
    # two entries in params.
    gb = y_tag - y_
    gW = gb * all_activation[-2].reshape(-1, 1)
    grads.append([gW, gb])

    # The output layer was handled above, so the loop starts from index
    # len(params) - 2 and walks backwards.
    # NOTE(review): the loop iterates over params[:-1][::-2] (every second
    # element, reversed) while cur_ind decreases by one per iteration.
    # Confirm the intended number of iterations.
    for i, layer in enumerate(params[:-1][::-2]):
        cur_ind = len(params) - 2 - i
        # tanh derivative times the previous gb propagated through the next
        # layer's W.
        gb = (1 - np.power(all_activation[cur_ind], 2)) * grads[-1][1].dot(
            params[cur_ind + 1][0].T)
        gW = gb * all_z[cur_ind].reshape(-1, 1)
        grads.append([gW, gb])
    return loss, grads
def fit(self, x_train, y_train, max_epochs, learning_rate=0.002):
    """Train with stochastic gradient descent, one sample per update.

    Returns:
        Tuple ``(x_axis, y_axis)``: the epoch indices and the summed
        cross-entropy loss of each epoch.
    """
    self.initialize()  # initialize all weights and biases of all layers
    epoch_indices = []
    epoch_losses = []
    for epoch in range(max_epochs):
        epoch_loss = 0
        for sample_idx, sample in enumerate(x_train):
            # Load the sample into the input layer.
            input_layer = self.layers[0]
            input_layer.values = sample
            input_layer.output = sample
            target = y_train[sample_idx]

            self.forward_prop()
            self.back_prop(target, learning_rate)

            # The loss uses the last layer's output as it stands after the
            # back-propagation call.
            prediction = self.layers[self.num_layers - 1].output
            epoch_loss += cross_entropy(target, prediction)
        epoch_indices.append(epoch)
        epoch_losses.append(epoch_loss)
        print("iter no. : %d, loss: %f" % (epoch, epoch_loss))
    return epoch_indices, epoch_losses
def backward(self, Y, cache):
    """Back-propagate through time and accumulate the gradients.

    Resets every entry of ``self._gradients`` to zeros, walks the time steps
    in reverse, adds each step's averaged loss to ``self._loss``, and
    finally stores the last-step activations of every layer in
    ``self._parameters['a']``.

    Args:
        Y: targets, indexed as ``Y[b, t]``.
        cache: tuple ``(x, a, y_hat, dropout)`` from the forward pass.
    """
    # Start from zeroed accumulators with the shapes of the current ones.
    self._gradients = {
        name: [np.zeros_like(entry) for entry in entries]
        for name, entries in self._gradients.items()
    }
    (x, a, y_hat, dropout) = cache
    normalizer = self._cell_length * self._batch_size
    samples = range(self._batch_size)

    for t in reversed(range(self._cell_length)):
        self._loss += sum([
            cross_entropy(y_hat[t][b, :], Y[b, t]) for b in samples
        ]) / normalizer
        grad_y = np.array([
            cross_entropy_d(y_hat[t][b, :], Y[b, t]) for b in samples
        ]) / normalizer

        # Output layer.
        self._gradients['dW_ay'][0] += np.dot(x[self._depth_size][t].T, grad_y)
        self._gradients['db_y'][0] += grad_y.sum(axis=0)
        grad_a = np.dot(grad_y, self._parameters['W_ay'][0].T)

        # Recurrent layers, top to bottom.
        for layer in reversed(range(self._depth_size)):
            # tanh derivative applied to the dropout-masked gradient from
            # above plus the gradient carried over from the later step.
            grad_a = (1 - a[layer][t]**2) * (
                grad_a * dropout[layer][t] + self._gradients['da'][layer])
            self._gradients['dW_xa'][layer] += np.dot(x[layer][t].T, grad_a)
            # NOTE(review): at t == 0 this reads a[layer][-1] (the last
            # step). Confirm that is the intended previous state.
            self._gradients['dW_aa'][layer] += np.dot(a[layer][t - 1].T, grad_a)
            self._gradients['db_a'][layer] += grad_a.sum(axis=0)
            self._gradients['da'][layer] = np.dot(
                grad_a, self._parameters['W_aa'][layer].T)
            grad_a = np.dot(grad_a, self._parameters['W_xa'][layer].T)

    last_step = self._cell_length - 1
    self._parameters['a'] = [
        a[layer][last_step] for layer in range(self._depth_size)
    ]
def sgd(self, data, iterations, learning_rate, initial_momentum, final_momentum,
        minibatch=10, annealing=None, max_epoch_without_improvement=30,
        early_stop=True):
    """
    Perform a stochastic gradient descent (SGD) optimisation of the network.

    Args:
        data: 2-D array; it is transposed internally and mini-batches are
            taken along the second axis of the transposed array.
        iterations: maximum number of epochs.
        learning_rate: initial step size.
        initial_momentum: momentum used at the start of training.
        final_momentum: momentum used once ``epoch > 100``.
        minibatch: number of samples per mini-batch.
        annealing: if not ``None``, the learning rate is divided by
            ``1 + epoch / annealing`` after every epoch.
        max_epoch_without_improvement: patience of the early-stopping rule.
        early_stop: when false, the early-stopping rule is disabled (this
            flag was previously ignored).

    Side effects:
        Appends each epoch's cost to ``self.train_history``, writes the
        optimised weights and biases back to ``self.layers`` and sets
        ``self.is_trained``.
    """
    # NOTE(review): m is read from data.shape[1] *before* the transpose but
    # is then used to index the second axis of the transposed array.
    # Confirm the expected orientation of ``data``.
    m = data.shape[1]
    data = data.T

    # Copy the weights and biases into a single state vector theta.
    weights = []
    biases = []
    for jj in range(self.mid * 2):
        weights.append(copy.copy(self.layers[jj].weights))
        biases.append(self.layers[jj].hidden_biases)
    theta, indices, weights_shape, biases_shape = self._roll(weights, biases)
    del weights, biases

    v_mom = 0
    best_cost = 1e8
    iter_best = 0  # defined up front so the patience test can never fail
    batch_indices = np.arange(m)
    # True ceiling division: every sample is covered and no empty trailing
    # mini-batch is produced. Builtin int replaces the removed np.int alias.
    n_minibatches = int(np.ceil(m / float(minibatch)))
    gamma = initial_momentum

    for epoch in range(iterations):
        np.random.shuffle(batch_indices)
        for ibatch in range(n_minibatches):
            ids = batch_indices[ibatch * minibatch:(ibatch + 1) * minibatch]
            batch = data[:, ids]
            _, thetan = self.cost(theta, indices, weights_shape, biases_shape,
                                  0, 0, 0, batch, cost_fct='cross-entropy',
                                  log_cost=False)
            # Momentum update.
            v_mom = gamma * v_mom + learning_rate * thetan
            theta -= v_mom

        actual = self.feedforward(data.T)
        cost = utils.cross_entropy(data.T, actual)
        print('Epoch %4d/%4d:\t%e' % (epoch + 1, iterations, cost))
        self.train_history.append(cost)

        if cost <= best_cost:
            best_cost = cost
            iter_best = epoch

        if early_stop and epoch - iter_best > max_epoch_without_improvement:
            print('STOP: %d epoches without improvment' %
                  max_epoch_without_improvement)
            break

        if annealing is not None:
            learning_rate /= (1. + float(epoch) / annealing)

        # Momentum schedule: sigmoid ramp centred on epoch 50, then constant.
        if epoch > 100:
            gamma = final_momentum
        else:
            gamma = initial_momentum + (final_momentum - initial_momentum) * \
                utils.sigmoid(epoch - 50)

        print('%s %s' % (learning_rate, gamma))

    # Unroll the state vector and save it to self.
    for jj in range(self.mid * 2):
        w, b = self._unroll(theta, jj, indices, weights_shape, biases_shape)
        self.layers[jj].weights = w
        self.layers[jj].hidden_biases = b

    # We're done!
    self.is_trained = True