def evaluate(net, data_iter):
    """Average loss and accuracy of `net` over `data_iter`.

    Relies on module-level `ctx`, `accuracy` and `softmax_cross_entropy`.

    :param net: model to evaluate
    :param data_iter: iterable of (data, label) batches supporting len()
    :return: (mean loss, mean accuracy), each averaged over batches
    """
    # Removed unused accumulator `n` — only per-batch means are averaged.
    loss, acc = 0., 0.
    steps = len(data_iter)
    for data, label in data_iter:
        data, label = data.as_in_context(ctx), label.as_in_context(ctx)
        output = net(data)
        acc += accuracy(output, label)
        loss += nd.mean(softmax_cross_entropy(output, label)).asscalar()
    return loss / steps, acc / steps
def predict(net, data, label):
    """Accuracy of the RNN `net` on (data, label), returned as a Python float.

    Each sample is reshaped to a hard-coded (5, 11) sequence — presumably
    5 time steps of 11 features (TODO confirm against the data pipeline) —
    run through the RNN with a fresh zero state, and the 5 per-step outputs
    are averaged before taking the argmax.
    """
    data = nd.array(data)
    label = nd.array(label)
    # Fresh zero hidden state; forced to CPU regardless of where `net` lives.
    hidden = net.begin_state(func=mx.nd.zeros, batch_size = data.shape[0], ctx=mx.cpu())
    # (batch, 5, 11) -> (5, batch, 11): time-major layout for the RNN.
    dd = nd.array(data.reshape((data.shape[0], 5, 11)).swapaxes(0, 1))
    output, hidden = net(dd, hidden)
    output = output.reshape((5, data.shape[0], 1))
    # Average the outputs of the 5 time steps.
    output = nd.sum(output, axis=0) / 5
    # NOTE(review): after the reshape the class axis has size 1, so this
    # argmax always returns 0 — confirm the intended output shape.
    l = nd.argmax(output, axis=1)
    res = nd.mean(l == label)
    return res.asscalar()
def train(net, trainer, img_dir, img_attr_file, img_landmark_file, ctx,
          batch_size, epochs, out_model_file, lr_schedule):
    """Train `net` on attribute + landmark + visibility targets.

    Per batch the loss is: L2 landmark loss (scaled by 1/100) + weighted
    cross-entropy attribute loss + one softmax loss per visibility column.
    Parameters are saved after every epoch as `out_model_file + '_' + epoch`,
    and the learning rate on `lr_schedule` is halved each epoch.
    Reconstructed indentation: the `with autograd.record()` block is assumed
    to cover the forward pass and loss assembly only.
    """
    loss_softmax = gluon.loss.SoftmaxCrossEntropyLoss()
    loss_weighted_cross_entropy = WeightedCrossEntropyLoss()
    loss_l2 = gluon.loss.L2Loss()
    # loss_hinge = gluon.loss.HingeLoss()
    for epoch in range(epochs):
        data_iter = get_data(img_dir, img_attr_file, img_landmark_file, ctx,
                             batch_size)
        # total_loss = 0
        loss_iter = 0
        for i, (data, label) in enumerate(data_iter):
            if not data:
                break
            with autograd.record():
                d = data[0]
                img_files = data[1]
                l, vs, classifier_output, _, _ = net(d)
                label, vis, landmarks = label
                # print(vis.shape)
                # One visibility target column per landmark.
                vis_data = [vis[:, k] for k in range(vis.shape[1])]
                loss_landmark = loss_l2(l, landmarks) / 100
                loss_attr = loss_weighted_cross_entropy(classifier_output, label)
                loss = loss_landmark + loss_attr
                # NOTE(review): loop variable `d` shadows `data[0]` above.
                for v, d in zip(vs, vis_data):
                    loss = loss + loss_softmax(v, d)
                # print(loss)
            loss.backward()
            loss_iter += nd.mean(loss).asscalar()
            trainer.step(batch_size, ignore_stale_grad=True)
            if (i + 1) % 40 == 0:
                print(img_files)
                print(l)
                print(landmarks)
                print(loss_landmark)
                print(loss_attr)
                print('epoch: %d, iter: %d, loss: %f' % (epoch, i + 1, loss_iter))
                loss_iter = 0
        # half the learning rate every epoch
        lr_schedule.learning_rate /= 2.0
        net.collect_params().save(out_model_file + '_' + str(epoch))
def evaluate_rnn(loss_func, data_iterator, model, hidden, ctx=[mx.cpu()]):
    """Per-sample accuracy and mean loss of an RNN `model` over the iterator.

    NOTE(review): `ctx=[mx.cpu()]` is a mutable default argument — harmless
    while it is only read, but shared across calls.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    # Accumulate on-device, copy to CPU lazily.
    acc = nd.array([0])
    total_loss = 0.0
    n = 0.
    if isinstance(data_iterator, mx.io.MXDataIter):
        data_iterator.reset()
    for batch in data_iterator:
        # One (X, y) slice per device in `ctx`.
        data, label, batch_size = _get_batch(batch, ctx)
        for X, y in zip(data, label):
            out = model(X, hidden)
            acc += nd.sum(out.argmax(axis=1) == y).copyto(mx.cpu())
            cur_loss = loss_func(out, y).copyto(mx.cpu())
            total_loss += nd.mean(cur_loss).asscalar()
            n += y.size
        acc.wait_to_read()  # don't push too many operators into backend
    return acc.asscalar() / n, total_loss / n
def evaluate_accuracy(self, data_iterator, net):
    """
    compute top-1 accuracy and average loss
    :param data_iterator: iterable of (data, label) batches supporting len()
    :param net: network to evaluate
    :return: (top-1 accuracy, average per-batch loss weighted by batch size)
    """
    loss = utils.AverageMeter()
    acc = mx.metric.Accuracy()
    for idx, (d, l) in enumerate(data_iterator):
        # Evaluate on the first configured device only.
        data = d.as_in_context(self.ctx[0])
        label = l.as_in_context(self.ctx[0])
        output = net(data)
        _loss = self.get_loss(output, label)
        curr_loss = nd.mean(_loss).asscalar()
        loss.update(curr_loss, data.shape[0])
        predictions = nd.argmax(output, axis=1)
        acc.update(preds=predictions, labels=label)
        utils.view_bar(idx + 1, len(data_iterator))  # view_bar
    return acc.get()[1], loss.avg
def trim(epoch, gradients, net, lr, f, byz, b=20):
    """Trimmed-mean gradient aggregation with a Byzantine attack hook.

    Flattens each worker's gradients into one column, lets `byz` corrupt up
    to `f` of them, then averages each coordinate after discarding the `b`
    smallest and `b` largest values, and applies the SGD update to `net`
    in place.

    :param epoch: current epoch (forwarded to `byz`)
    :param gradients: list (per worker) of lists of gradient NDArrays
    :param net: model whose parameters are updated in place
    :param lr: learning rate for the applied update
    :param f: number of Byzantine workers (forwarded to `byz`)
    :param byz: attack callable mutating/replacing the flattened gradients
    :param b: number of extreme values trimmed from each side
    """
    # One flattened column vector per worker.
    param_list = [
        nd.concat(*[xx.reshape((-1, 1)) for xx in x], dim=0) for x in gradients
    ]
    # Let the (possibly Byzantine) adversary rewrite up to f columns.
    param_list = byz(epoch, param_list, net, lr, f)
    sorted_array = nd.sort(nd.concat(*param_list, dim=1), axis=-1)
    n = len(param_list)
    m = n - b * 2  # values kept per coordinate after trimming both tails
    trim_nd = nd.mean(sorted_array[:, b:(b + m)], axis=-1, keepdims=1)
    # Removed unused local `q = f` and the unused enumerate index.
    # Write the aggregated update back into the network parameters.
    idx = 0
    for param in net.collect_params().values():
        if param.grad_req == 'null':
            continue
        param.set_data(
            param.data() - lr *
            trim_nd[idx:(idx + param.data().size)].reshape(param.data().shape))
        idx += param.data().size
def evaluate_loss(data_iterator, net, ctx=[mx.cpu()]):
    """Mean L2-style loss (0.5 * squared error, averaged over axis 1) of
    `net` over `data_iterator`, normalized by the number of samples.

    NOTE(review): `ctx=[mx.cpu()]` is a mutable default argument — read-only
    here, but shared across calls.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    acc = nd.array([0])
    n = 0.
    if isinstance(data_iterator, mx.io.MXDataIter) or isinstance(
            data_iterator, mx.image.ImageIter):
        data_iterator.reset()
    for batch in data_iterator:
        # One (X, y) slice per device in `ctx`.
        data, label, batch_size = _get_batch(batch, ctx)
        for X, y in zip(data, label):
            y = y.astype('float32')
            y0 = net(X)
            #acc += nd.sum( (y0-y)*(y0-y) ).copyto(mx.cpu())
            acc += nd.mean(0.5 * (y0 - y) * (y0 - y),
                           axis=1).copyto(mx.cpu()).sum()  #mean along dim of L2Loss
            n += y.shape[0]
        acc.wait_to_read()  # don't push too many operators into backend
    return acc.asscalar() / n  #mean of L2Loss
def test_compute_quantile_loss() -> None:
    """Check QuantileLoss.compute_quantile_loss against hand-computed values.

    With y_true == 1 and y_pred == 0 everywhere, the quantile loss at
    quantile q is 2 * q * |1 - 0| = 2q, i.e. 1.0 at q=0.5 and 1.8 at q=0.9.
    """
    y_true = nd.ones(shape=(10, 10, 10))
    y_pred = nd.zeros(shape=(10, 10, 10, 2))
    quantiles = [0.5, 0.9]
    loss = QuantileLoss(quantiles)
    correct_qt_loss = [1.0, 1.8]
    for idx, q in enumerate(quantiles):
        # BUG FIX: the original asserted `diff < 1e-5` one-sided, which also
        # passes when the computed loss is far too small; use abs().
        assert (
            abs(
                nd.mean(
                    loss.compute_quantile_loss(
                        nd.ndarray, y_true, y_pred[:, :, :, idx], q
                    )
                ) - correct_qt_loss[idx]
            ) < 1e-5
        ), f"computing quantile loss at quantile {q} fails!"
def partial_trim(epoch, v, net, f):
    # apply partial knowledge trimmed mean attack
    """Partial-knowledge trimmed-mean attack on the first `f` gradients.

    Estimates the mean and standard deviation of the `f` compromised
    workers' gradients, then overwrites each compromised gradient with a
    value pushed 3-4 standard deviations against the sign of the mean.
    Mutates and returns `v` (list of gradient NDArrays, one per worker).
    """
    vi_shape = v[0].shape
    #first compute the distribution parameters
    all_grads = nd.concat(*v, dim=1)
    # Columns 0..f-1 are the compromised workers.
    adv_grads = all_grads[:, :f]
    e_mu = nd.mean(adv_grads, axis=1)  # mean
    e_sigma = nd.sqrt(
        nd.sum(nd.square(nd.subtract(adv_grads, e_mu.reshape(-1, 1))), axis=1) /
        f)  # standard deviation
    for i in range(f):
        # apply attack to compromised worker devices with randomness:
        # shift each coordinate by (3 + U[0,1]) sigmas opposite the mean's sign.
        v[i] = (
            e_mu - nd.multiply(e_sigma, nd.sign(e_mu)) *
            (3. + nd.random.uniform(shape=e_sigma.shape))).reshape(vi_shape)
    return v
def evaluate(loader, net, ctx, loss):
    """
    Evaluate the loss function
    :param loader: data loader to be used in evaluation
    :param net: network
    :param ctx: prediction context
    :param loss: loss function (takes positive and negative predictions)
    """
    total, n_batches = 0.0, 0
    for batch in loader:
        u_cont, u_emb, i_cont, i_emb, neg_cont, neg_emb = [
            t.as_in_context(ctx) for t in batch
        ]
        # Loss depends on both positive and negative predictions.
        score_pos = net(u_cont, u_emb, i_cont, i_emb)
        score_neg = net(u_cont, u_emb, neg_cont, neg_emb)
        batch_loss = loss(score_pos, score_neg)
        total += nd.mean(batch_loss).asscalar()
        n_batches += 1
    return total / n_batches
def train(net, train_data, valid_data, num_epochs, lr, wd, ctx, lr_period,
          lr_decay):
    """Train `net` with SGD (momentum 0.99), decaying the LR every
    `lr_period` epochs; prints per-epoch loss/accuracy and timing.

    NOTE(review): `trainer.step(batch_size)` uses a module-level
    `batch_size`, not the actual size of each batch — confirm.
    """
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': lr,
        'momentum': 0.99,
        'wd': wd
    })
    prev_time = datetime.datetime.now()
    for epoch in range(num_epochs):
        print(epoch)
        train_loss = 0.0
        train_acc = 0.0
        if epoch > 0 and epoch % lr_period == 0:
            # Step decay of the learning rate.
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        count = 0
        for data, label in train_data:
            label = label.as_in_context(ctx)
            with autograd.record():
                output = net(data.as_in_context(ctx))
                loss = softmax_cross_entropy(output, label)
            loss.backward()
            trainer.step(batch_size)
            count += 1
            train_loss += nd.mean(loss).asscalar()
            train_acc += utils.accuracy(output, label)
        cur_time = datetime.datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)
        if valid_data is not None:
            valid_acc = utils.evaluate_accuracy(valid_data, net, ctx)
            epoch_str = ("Epoch %d. Loss: %f, Train acc %f, Valid acc %f, " %
                         (epoch, train_loss / len(train_data),
                          train_acc / len(train_data), valid_acc))
        else:
            epoch_str = ("Epoch %d. Loss: %f, Train acc %f, " %
                         (epoch, train_loss / len(train_data),
                          train_acc / len(train_data)))
        prev_time = cur_time
        print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))
def hybrid_forward(self, F, output, *args, **kwargs):
    """
    Masks the outputs to the sequence lengths and returns the cross entropy loss
    output is a (batch x max_name_length x log_probabilities) tensor of name predictions for each graph
    """
    (label, _), data_encoder = args
    # NOTE(review): mixes imperative `nd.*` / `.asnumpy()` with the hybrid
    # `F` API, so this block cannot actually be hybridized — confirm intent.
    loss = nd.pick(output, label.values, axis=2)
    # Masking output to max(where_RNN_emitted_PAD_token, length_of_label)
    output_preds = F.argmax(output, axis=2).asnumpy()
    output_lengths = []
    for row in output_preds:
        # Predicted length = position of the first emitted __PAD__ token + 1,
        # or the full sequence length if no PAD was emitted.
        end_token_idxs = np.where(
            row == data_encoder.all_node_name_subtokens['__PAD__'])[0]
        if len(end_token_idxs):
            output_lengths.append(int(min(end_token_idxs)) + 1)
        else:
            output_lengths.append(output.shape[1])
    output_lengths = F.array(output_lengths, ctx=output.context)
    mask_lengths = F.maximum(output_lengths, label.value_lengths)
    loss = F.SequenceMask(loss,
                          use_sequence_length=True,
                          sequence_length=mask_lengths,
                          axis=1)
    # Negative log-likelihood, averaged over every axis except the batch axis.
    return nd.mean(-loss, axis=0, exclude=True)
def get_returns(self, discount_factor=0.99):
    """
    Calculate the return for every state: the discounted sum of rewards
    collected after visiting that state.

    Args:
        discount_factor (float): how much distant rewards matter (1.0)
            versus immediate rewards (0.).

    Returns:
        normalized_returns (NDArray of float): the returns with the mean
            subtracted to reduce variance.
    """
    # Accumulate right-to-left, then restore chronological order.
    running = 0.
    discounted = []
    for reward in self.rewards[::-1]:
        running = reward + discount_factor * running
        discounted.append(running)
    discounted.reverse()
    returns_nd = nd.array(discounted)
    return returns_nd - nd.mean(returns_nd)
def _evaluate_accuracy(self, X, Y, batch_size=64):
    """Top-1 accuracy and running-mean cross-entropy loss of the model on (X, Y).

    Iterates at most len(X) // batch_size batches of the (unshuffled)
    generator and returns (accuracy, mean loss).
    """
    batches = self.generate_batch(X, Y, batch_size, shuffled=False)
    ce_loss = gluon.loss.SoftmaxCrossEntropyLoss()
    n_batches = len(X) // batch_size
    acc_metric = mx.metric.Accuracy()
    running_loss = 0.
    for step, (batch_x, batch_y) in enumerate(batches):
        batch_x = batch_x.as_in_context(self.model_ctx)
        batch_y = batch_y.as_in_context(self.model_ctx)
        logits = self.model(batch_x)
        preds = nd.argmax(logits, axis=1)
        batch_loss = ce_loss(logits, batch_y)
        acc_metric.update(preds=preds, labels=batch_y)
        # Incremental (running) mean of the per-batch loss.
        running_loss = (running_loss * step / (step + 1) +
                        nd.mean(batch_loss).asscalar() / (step + 1))
        if step + 1 == n_batches:
            break
    return acc_metric.get()[1], running_loss
def bulyan(epoch, gradients, net, lr, byz, f=0):
    """Bulyan-style robust gradient aggregation with a Byzantine attack hook.

    Iteratively selects the n - 2f workers whose gradients have the smallest
    summed distance to their k nearest neighbours, applies a coordinate-wise
    trimmed mean over the selection, and updates `net` in place with
    learning rate `lr`.  Returns (aggregated update, selected client
    indices padded with -1).
    """
    # One flattened column vector per worker.
    param_list = [
        nd.concat(*[xx.reshape((-1, 1)) for xx in x], dim=0) for x in gradients
    ]
    param_list = byz(epoch, param_list, net, f, lr, np.arange(len(param_list)))
    k = len(param_list) - f - 2
    # Pairwise Euclidean distances between workers (symmetric matrix).
    dist = mx.nd.zeros((len(param_list), len(param_list)))
    for i in range(0, len(param_list)):
        for j in range(0, i):
            dist[i][j] = nd.norm(param_list[i] - param_list[j])
            dist[j][i] = dist[i][j]
    sorted_dist = mx.nd.sort(dist)
    # Score per worker: distance to its k+1 closest workers (self included at 0).
    sum_dist = mx.nd.sum(sorted_dist[:, :k + 1], axis=1)
    bulyan_list = []
    bul_client_list = np.ones(len(param_list)) * (-1)
    for i in range(len(param_list) - 2 * f):
        chosen = int(nd.argmin(sum_dist).asscalar())
        # Exclude the chosen worker from subsequent rounds.
        sum_dist[chosen] = 10**8
        bul_client_list[i] = chosen
        bulyan_list.append(param_list[chosen])
        # Remove the chosen worker's contribution from everyone's score.
        for j in range(len(sum_dist)):
            sum_dist[j] = sum_dist[j] - dist[j][chosen]
    # Coordinate-wise trimmed mean over the selected gradients.
    sorted_array = nd.sort(nd.concat(*bulyan_list, dim=1), axis=-1)
    trim_nd = nd.mean(sorted_array[:, f:(len(bulyan_list) - f)],
                      axis=-1,
                      keepdims=1)
    # SGD update applied directly to the network parameters.
    idx = 0
    for j, (param) in enumerate(net.collect_params().values()):
        if param.grad_req == 'null':
            continue
        param.set_data(
            param.data() - lr *
            trim_nd[idx:(idx + param.data().size)].reshape(param.data().shape))
        idx += param.data().size
    return trim_nd, bul_client_list
def train(net, train_data, valid_data, num_epochs, batch_size, ctx, trainer,
          loss_func, lr_period, lr_decay, filename):
    """Generic training loop with step-decayed LR; saves params to `filename`.

    Prints per-epoch mean loss, train accuracy, optional validation accuracy
    (via module-level `evaluate_accuracy` / `accuracy`) and wall-clock time.
    """
    prev_time = datetime.datetime.now()
    for epoch in range(num_epochs):
        train_loss = 0.0
        train_acc = 0.0
        if epoch > 0 and epoch % lr_period == 0:
            # Step decay of the learning rate.
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
        for data, label in train_data:
            label = label.astype('float32').as_in_context(ctx)
            with autograd.record():
                output = net(data.as_in_context(ctx))
                loss = loss_func(output, label)
            loss.backward()
            trainer.step(batch_size)
            train_loss += nd.mean(loss).asscalar()
            train_acc += accuracy(output, label)
        cur_time = datetime.datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)
        if valid_data is not None:
            valid_acc = evaluate_accuracy(valid_data, net, ctx)
            epoch_str = ("Epoch %d. Loss: %f, Train acc %f, Valid acc %f, " %
                         (epoch, train_loss / len(train_data),
                          train_acc / len(train_data), valid_acc))
        else:
            epoch_str = ("Epoch %d. Loss: %f, Train acc %f, " %
                         (epoch, train_loss / len(train_data),
                          train_acc / len(train_data)))
        prev_time = cur_time
        print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))
    net.save_params(filename)
def vali_loss_cal(data_iter, net):
    """Exponential moving average of softmax cross-entropy loss over `data_iter`.

    Relies on module-level `ctx` and `softmax_cross_entropy`.

    :param data_iter: resettable MXNet data iterator to evaluate on
    :param net: model to evaluate
    :return: exponentially smoothed loss (smoothing constant 0.01)
    """
    data_iter.reset()
    moving_loss = 0
    smoothing_constant = .01
    # BUG FIX: the original iterated the global `train_iter` instead of the
    # `data_iter` argument, so validation loss was computed on training data.
    for i, batch in enumerate(data_iter):
        data = batch.data[0].as_in_context(ctx)
        label = batch.label[0].as_in_context(ctx)
        # No autograd.record(): validation needs no gradient graph.
        output = net(data)
        loss = softmax_cross_entropy(output, label)
        ##########################
        # Keep a moving average of the losses
        ##########################
        curr_loss = nd.mean(loss).asscalar()
        # Seed the average with the first batch's loss (the original also
        # tested an undefined global `e` here).
        moving_loss = (curr_loss if i == 0 else
                       (1 - smoothing_constant) * moving_loss +
                       smoothing_constant * curr_loss)
    return moving_loss
def evaluate_accuracy(data_iterator, net, ctx, loss_fun, num_classes):
    """
    Evaluate top-1 accuracy and running-average loss of `net` over an
    iterator (train or test).  Also takes the loss function used.
    Binary models (num_classes == 2) emit a signed score; multi-class
    models emit one logit per class.
    """
    acc_metric = mx.metric.Accuracy()
    mean_loss = 0.
    for step, (batch_x, batch_y) in enumerate(data_iterator):
        batch_x = batch_x.as_in_context(ctx)  # .reshape((-1,784))
        batch_y = batch_y.as_in_context(ctx)
        scores = net(batch_x)
        batch_loss = loss_fun(scores, batch_y)
        if num_classes == 2:
            # Map a signed score to {0, 1} and flatten to match the labels.
            preds = ((nd.sign(scores) + 1) / 2).reshape(-1)
        else:
            preds = nd.argmax(scores, axis=1)
        acc_metric.update(preds=preds, labels=batch_y)
        # Incremental mean of the per-batch loss.
        mean_loss = (mean_loss * step / (step + 1) +
                     nd.mean(batch_loss).asscalar() / (step + 1))
    return acc_metric.get()[1], mean_loss
def record_loss(losses, loss_names, summary_writer, step=0, exp=''):
    '''
    record a list of losses to summary_writer.

    Parameter:
    ----------
    losses: list of mxnet.ndarray
        each array is 1-D, length is batch size
    loss_names: list of string
        name of each loss; must have the same length as `losses`
    summary_writer: mxboard.SummaryWriter
    step: int
        training step
    exp: string
        record to which figure (scalar tag)
    '''
    assert len(losses) == len(loss_names), (
        'length of first arg(losses) should equal to second arg(loss_names)')
    for i, L in enumerate(losses):
        loss_name = loss_names[i]
        # Each loss is averaged over the batch before being logged;
        # presumably mxboard accepts a (name, value) tuple so all losses
        # share the single figure `exp` — verify against mxboard docs.
        summary_writer.add_scalar(exp, (loss_name, nd.mean(L).asnumpy()), step)
def train(epoch):
    """Run one training epoch over the global `train_data`, printing progress
    every 500 batches, then evaluate via the global `test()`."""
    running_loss = 0.
    for step, (images, targets) in enumerate(train_data):
        images = images.as_in_context(ctx)
        targets = targets.as_in_context(ctx)
        n_samples = images.shape[0]
        with autograd.record():
            logits = net(images)
            batch_loss = softmax_cross_entropy(logits, targets)
        batch_loss.backward()
        trainer.step(n_samples)
        running_loss += nd.mean(batch_loss).asscalar()
        if step % 500 == 0:
            progress = 100. * step / len(train_data)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, step * len(images), len(train_data.dataset),
                progress, running_loss / len(train_data)))
    test()
def pointEvaluator(self, nnModel, testX, testX2, testY, lossFunc, mode='Normal'):
    """Evaluate point forecasts: loss plus ND / SMAPE / NRMSE metrics.

    mode='logTransform' undoes a log(1 + y) target transform (via
    exp(x) - 1) before computing the metrics; the loss itself is still
    computed in the transformed space.
    Returns (loss, ND, SMAPE, NRMSE).
    """
    assert mode in set(['Normal', 'logTransform'])
    pred = self.predict(nnModel, testX, testX2)
    validPred = pred.asnumpy()
    validTrue = testY
    if (mode == 'logTransform'):
        # Invert log1p: exp(x) - 1.
        validPred = np.exp(validPred) - 1
        validTrue = np.exp(validTrue) - 1
    # The loss
    loss = nd.mean(
        lossFunc(pred, nd.array(testY, dtype='float32',
                                ctx=self.dataCtx))).asscalar()
    # The evaluation metrics
    validND, validSMAPE, validNRMSE = ND(validPred, validTrue), SMAPE(
        validPred, validTrue), NRMSE(validPred, validTrue)
    return loss, validND, validSMAPE, validNRMSE
def forward(self, is_train, req, in_data, out_data, aux):
    """Forward pass of a custom quantized-BatchNorm operator.

    Folds the normalization statistics into quantized gamma/beta and calls
    the stock BatchNorm with zero mean / unit variance so that only the
    folded (quantized) parameters are applied.
    """
    x = in_data[0]
    gamma = in_data[1]
    beta = in_data[2]
    moving_mean = in_data[3]
    moving_var = in_data[4]
    # print(x.sum())
    y = out_data[0]
    if is_train:
        # Per-channel batch statistics over N, H, W.
        mean = nd.mean(x, axis=(0, 2, 3))
        # Variance via numpy — forces a device-to-host copy.
        var = nd.array(np.var(x.asnumpy(), axis=(0, 2, 3)))
        #print(moving_mean ,self.momentum, mean)
        moving_mean = moving_mean * self.momentum + mean * (1 - self.momentum)
        moving_var = moving_var * self.momentum + var * (1 - self.momentum)
        # NOTE(review): running stats are written back through in_data using
        # req[0] — confirm these slots are meant to be updated in forward.
        self.assign(in_data[3], req[0], moving_mean)
        self.assign(in_data[4], req[0], moving_var)
    else:
        mean = moving_mean
        var = moving_var
    # Fold normalization into quantized gamma/beta.
    quan_gamma = self.quantize(gamma / (nd.sqrt(var + self.eps)))
    quan_beta = self.quantize(beta - mean * gamma / nd.sqrt(var + self.eps))
    y = nd.BatchNorm(x,
                     gamma=quan_gamma,
                     beta=quan_beta,
                     moving_mean=nd.zeros(shape=moving_mean.shape),
                     moving_var=nd.ones(shape=moving_var.shape),
                     eps=self.eps,
                     momentum=self.momentum,
                     fix_gamma=self.fix_gamma,
                     name=self.name)
    self.assign(out_data[0], req[0], mx.nd.array(y))
def _leave_one_out_gradient_estimator(h, f, zero_mean_h=False):
    """Estimate gradient of f using score function and control variate h.

    Optimal scaling of control variate is given by:
        a = Cov(h, f) / Var(h).
    """
    if h.ndim > f.ndim:
        # expand parameter dimension (last dimension summed over in f)
        f = nd.expand_dims(f, f.ndim)
    grad_f = h * f
    if zero_mean_h:
        # With E[h] = 0 the covariance/variance reduce to second moments.
        cov_h_f = _leave_one_out_mean(h * grad_f)
        var_h = _leave_one_out_mean(h * h)
    else:
        cov_h_f = _held_out_covariance(h, grad_f)
        var_h = _held_out_covariance(h, h)
    # sampling zero for low-variance score functions is probable, so add EPSILON!
    optimal_a = cov_h_f / (EPSILON + var_h)
    if h.ndim == 2:
        # If no batch dim: nd.Embedding removes batch dim for batches of size 1
        keepdims = True
    else:
        keepdims = False
    # Average the variance-reduced estimator over the sample axis.
    return nd.mean(grad_f - optimal_a * h, 0, keepdims=keepdims)
def mask_loss(self, mask_pred, mask_eoc, mask_target, matches, bt_target):
    """Per-image mask loss for positively matched anchors.

    For each image: take the matched boxes, assemble instance masks from
    per-anchor coefficients (`mask_eoc`) and prototype masks (`mask_pred`),
    crop each mask to its box, and weight the binary cross-entropy by
    inverse box area.  Returns one mean loss per image, concatenated into a
    1-D NDArray (zero for images with no positives).
    """
    samples = (matches >= 0)
    pos_num = samples.sum(axis=-1).asnumpy().astype('int')
    # Sorting -matches puts the positive (matched) anchors first.
    rank = (-matches).argsort(axis=-1)
    losses = []
    for i in range(mask_pred.shape[0]):
        if pos_num[i] == 0:
            # No positives in this image: contribute a zero loss.
            losses.append(nd.zeros(shape=(1, ), ctx=mask_pred.context))
            continue
        idx = rank[i, :pos_num[i]]
        pos_bboxe = nd.take(bt_target[i], idx)
        area = (pos_bboxe[:, 3] - pos_bboxe[:, 1]) * (pos_bboxe[:, 2] -
                                                      pos_bboxe[:, 0])
        # Smaller boxes get larger weight (normalized by the GT resolution).
        weight = self.gt_weidth * self.gt_height / area
        mask_gt = mask_target[i, matches[i, idx], :, :]
        # Linear combination of prototype masks by per-anchor coefficients.
        mask_preds = nd.dot(nd.take(mask_eoc[i], idx), mask_pred[i])
        _, h, w = mask_preds.shape
        # mask_preds = self.global_aware(mask_preds)
        mask_preds = nd.sigmoid(mask_preds)
        # Only the region inside each predicted box contributes to the loss.
        mask_preds = self.crop(pos_bboxe, h, w, mask_preds)
        loss = self.SBCELoss(mask_preds, mask_gt) * weight
        losses.append(nd.mean(loss))
    return nd.concat(*losses, dim=0)
def forward(self, x): embeds = self.embed(x) # batch * time step * embedding x_i = embeds.expand_dims(1) x_i = nd.repeat(x_i, repeats=self.sentence_length, axis=1) # batch * time step * time step * embedding x_j = embeds.expand_dims(2) x_j = nd.repeat(x_j, repeats=self.sentence_length, axis=2) # batch * time step * time step * embedding x_full = nd.concat( x_i, x_j, dim=3) # batch * time step * time step * (2 * embedding) # New input data _x = x_full.reshape((-1, 2 * self.emb_dim)) # Network for attention _attn = self.attn(_x) _att = _attn.reshape((-1, self.sentence_length, self.sentence_length)) _att = nd.sigmoid(_att) att = nd.softmax(_att, axis=1) _x = self.g_fc1(_x) # (batch * time step * time step) * hidden_dim _x = self.g_fc2(_x) # (batch * time step * time step) * hidden_dim # sentence_length*sentence_length개의 결과값을 모두 합해서 sentence representation으로 나타냄 x_g = _x.reshape( (-1, self.sentence_length, self.sentence_length, self.hidden_dim)) _inflated_att = _att.expand_dims(axis=-1) _inflated_att = nd.repeat(_inflated_att, repeats=self.hidden_dim, axis=3) x_q = nd.multiply(_inflated_att, x_g) sentence_rep = nd.mean(x_q.reshape(shape=(-1, self.sentence_length**2, self.hidden_dim)), axis=1) return sentence_rep, att
def train(data_iter):
    """Train the OCR LSTM with CTC loss over `data_iter` (batch size 1).

    NOTE(review): the LSTM state is carried across samples and epochs rather
    than re-initialized per sequence — confirm this is intentional.
    """
    lstm = OCRLSTM()
    lstm.collect_params().initialize(mx.init.Xavier(), ctx=mx.cpu())
    # NTC layout: (batch, time, channel) outputs, (batch, time) labels.
    loss = gluon.loss.CTCLoss(layout='NTC', label_layout='NT')
    trainer = gluon.Trainer(lstm.collect_params(), 'sgd',
                            {'learning_rate': 0.001})
    state = lstm.begin_state(batch_size=1)
    global_step = 0
    for epoch in range(100):
        print("epoch ", epoch)
        for sample in data_iter:
            data = sample[0]
            label = sample[1]
            train_loss = .0
            with autograd.record():
                # print("data ",state)
                output, state = lstm(data, state)
                # output = nd.expand_dims(output, axis=1)
                output = output.transpose((1, 0, 2))
                # label = nd.expand_dims(label, axis=1)
                # label = label.reshape((1,4))
                # print("output ", output.shape, label.shape)
                L = loss(output, label)
            L.backward()
            train_loss = nd.mean(L).asscalar()
            # sw.add_scalar(tag="loss",value=train_loss,global_step=global_step)
            global_step = global_step + 1
            # if epoch == 1 :
            #     sw.add_graph(net)
            trainer.step(1, ignore_stale_grad=True)
        if (epoch % 100 == 0):
            print('train_loss %.4f' % (train_loss))
            # print('output max', output.argmax(axis=2))
            predict(data, state)
def pick_the_best_function(self):
    """Train with softmax cross-entropy, reporting per-epoch train loss,
    train accuracy and test accuracy."""
    def accuracy(y_hat, y):
        # Note: y_hat's shape must be kept consistent with y's shape here.
        return nd.mean(y_hat.argmax(
            axis=1).reshape(y.shape) == y).asscalar()

    def evaluate_accuracy(data_iter, net, ctx):
        # Mean accuracy over all batches of the iterator.
        acc = 0.
        for batch_X, batch_y in data_iter:
            batch_X = batch_X.as_in_context(ctx)
            batch_y = batch_y.as_in_context(ctx)
            batch_y = batch_y.reshape((-1, 1))
            batch_y_hat = net(batch_X)
            acc += accuracy(batch_y_hat, batch_y)
        return acc / len(data_iter)

    for e in range(self.__epochs):
        train_loss = 0.
        train_acc = 0.
        for self.__batch_X, self.__batch_y in self.__train_data_iter:
            self.__batch_X = self.__batch_X.as_in_context(self.__ctx)
            self.__batch_y = self.__batch_y.reshape(
                (-1, 1)).as_in_context(self.__ctx)
            with autograd.record():
                self.__batch_y_hat = self.__net(self.__batch_X)
                loss = self.__softmax_cross_entropy(
                    self.__batch_y_hat, self.__batch_y)
            loss.backward()
            self.__trainer.step(self.__batch_size)
            train_loss += nd.mean(loss).asscalar()
            train_acc += accuracy(self.__batch_y_hat, self.__batch_y)
        test_acc = evaluate_accuracy(self.__test_data_iter, self.__net,
                                     self.__ctx)
        print("Epoch %d. Loss: %f, Train acc %f, Test acc %f" %
              (e, train_loss / len(self.__train_data_iter),
               train_acc / len(self.__train_data_iter), test_acc))
def train_step(model, optimizer, data, epoch):
    """One epoch of Mult-VAE / Mult-DAE training on a sparse user-item matrix.

    Shuffles user rows, densifies each mini-batch, anneals the KL weight
    (VAE only) and reports a running mean loss on a tqdm bar.  Relies on
    module-level `args`, `ctx`, `trainer`, `update_count`,
    `total_anneal_steps` and `vae_loss_fn`.
    NOTE(review): the `optimizer` parameter is unused — the global `trainer`
    performs the update; confirm.
    """
    running_loss = 0.0
    global update_count
    N = data.shape[0]
    idxlist = list(range(N))
    np.random.shuffle(idxlist)
    training_steps = len(range(0, N, args.batch_size))
    with trange(training_steps) as t:
        for batch_idx, start_idx in zip(t, range(0, N, args.batch_size)):
            t.set_description("epoch: {}".format(epoch + 1))
            end_idx = min(start_idx + args.batch_size, N)
            X_inp = data[idxlist[start_idx:end_idx]]
            # Densify the sparse batch before moving it onto the device.
            X_inp = nd.array(X_inp.toarray()).as_in_context(ctx)
            if args.constant_anneal:
                anneal = args.anneal_cap
            else:
                # Linear KL annealing, capped at anneal_cap.
                anneal = min(args.anneal_cap, update_count / total_anneal_steps)
            update_count += 1
            with autograd.record():
                if model.__class__.__name__ == "MultiVAE":
                    X_out, mu, logvar = model(X_inp)
                    loss = vae_loss_fn(X_inp, X_out, mu, logvar, anneal)
                    train_step.anneal = anneal
                elif model.__class__.__name__ == "MultiDAE":
                    X_out = model(X_inp)
                    # Multinomial log-likelihood objective.
                    loss = -nd.mean(nd.sum(nd.log_softmax(X_out) * X_inp, -1))
            loss.backward()
            trainer.step(X_inp.shape[0])
            running_loss += loss.asscalar()
            avg_loss = running_loss / (batch_idx + 1)
            t.set_postfix(loss=avg_loss)
def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Backward pass of the custom quantized-BatchNorm operator.

    Recomputes batch statistics, replays a stock BatchNorm forward under
    autograd, and uses autograd.grad to obtain dx / dgamma / dbeta from dy.
    NOTE(review): gradients are taken w.r.t. the *unquantized* gamma/beta
    (quan_gamma/quan_beta are aliases here) — confirm this is the intended
    straight-through behavior.
    """
    dx = in_grad[0]
    dgamma = in_grad[1]
    dbeta = in_grad[2]
    x = in_data[0]
    gamma = in_data[1]
    beta = in_data[2]
    y = out_data[0]
    dy = out_grad[0]
    # Per-channel batch statistics (variance via numpy: device-to-host copy).
    mean = nd.mean(x, axis=(0, 2, 3))
    var = nd.array(np.var(x.asnumpy(), axis=(0, 2, 3)))
    quan_gamma = gamma
    quan_beta = beta
    x.attach_grad(), gamma.attach_grad(), beta.attach_grad()
    with autograd.record():
        y = nd.BatchNorm(x,
                         gamma=quan_gamma,
                         beta=quan_beta,
                         moving_mean=mean,
                         moving_var=var,
                         eps=self.eps,
                         momentum=self.momentum,
                         fix_gamma=self.fix_gamma,
                         name=self.name)
    dx, dgamma, dbeta = autograd.grad(y, [x, quan_gamma, quan_beta],
                                      dy,
                                      retain_graph=True)
    self.assign(in_grad[0], req[0], dx)
    self.assign(in_grad[1], req[0], dgamma)
    self.assign(in_grad[2], req[0], dbeta)
def train_step(model, train_loader, trainer, metric, epoch, zero_padding):
    """One training epoch with a tqdm bar showing running loss and accuracy.

    Uses the module-level `ctx`, `criterion` and `p_zero_padding`; when
    `zero_padding` is truthy, `p_zero_padding(model)` is applied after the
    backward pass and before the optimizer step.
    """
    metric.reset()
    n_steps = len(train_loader)
    loss_sum = 0.0
    with trange(n_steps) as bar:
        for step, (features, labels) in zip(bar, train_loader):
            bar.set_description("epoch %i" % (epoch + 1))
            batch_x = features.as_in_context(ctx)
            batch_y = labels.as_in_context(ctx)
            with autograd.record():
                logits = model(batch_x)
                batch_loss = criterion(logits, batch_y)
            batch_loss.backward()
            if zero_padding:
                p_zero_padding(model)
            trainer.step(batch_x.shape[0])
            loss_sum += nd.mean(batch_loss).asscalar()
            mean_loss = loss_sum / (step + 1)
            metric.update(preds=nd.argmax(logits, axis=1), labels=batch_y)
            bar.set_postfix(acc=metric.get()[1], loss=mean_loss)
def cgc_filter(gradients, net, f, byz):
    """Gets rid of the largest f gradients away from the norm"""
    # Select the filtering strategy from the module-level config.
    cgc_method = cfg['cgc_method']
    if cgc_method == 'by-layer':
        output = cgc_by_layer(gradients, f)
    else:
        output = multiply_norms(gradients, f)
    # X is a 2d list of nd array
    param_list = [
        nd.concat(*[xx.reshape((-1, 1)) for xx in x], dim=0) for x in output
    ]
    # Let the Byzantine attack mutate the flattened gradients in place.
    byz(param_list, f)
    # Coordinate-wise mean of the surviving gradients.
    mean_nd = nd.mean(nd.concat(*param_list, dim=1), axis=-1)
    grad_collect = []
    idx = 0
    for j, (param) in enumerate(net.collect_params().values()):
        if param.grad_req != 'null':
            # mapping back to the collection of ndarray
            # append to list for uploading to cloud
            grad_collect.append(mean_nd[idx:(idx + param.data().size)].reshape(
                param.data().shape))
            idx += param.data().size
    return grad_collect
def validate(val_data, net, criterion, num_parts, ctx):
    """Multi-GPU validation: mean loss and accuracy over `val_data`.

    `net` returns `num_parts` outputs per sample (part-based model); loss
    and accuracy are averaged over the parts.  Relies on module-level `opt`.
    """
    loss = 0.0
    for data, label in val_data:
        # Split each batch across the configured devices.
        data_list = gluon.utils.split_and_load(data, ctx)
        label_list = gluon.utils.split_and_load(label, ctx)
        losses = []
        accurays = []
        for i in range(opt.num_gpus):
            outputs = [X for X in net(data_list[i])]
            # Loss/accuracy averaged over the model's parts.
            temp_loss = sum([criterion(X, label_list[i])
                             for X in outputs]) / num_parts
            losses.append(temp_loss)
            temp_acc = sum([
                nd.mean(X.argmax(
                    axis=1) == label_list[i].astype('float32')).asscalar()
                for X in outputs
            ]) / num_parts
            accurays.append(temp_acc)
        loss_list = [l.mean().asscalar() for l in losses]
        loss += sum(loss_list) / len(loss_list)
    # NOTE(review): `accurays` is re-created every batch, so the returned
    # accuracy reflects only the final batch — confirm intended.
    return loss / len(val_data), sum(accurays) / len(accurays)
def accuracy(output, label):
    """Fraction of rows whose argmax class matches `label`, as a Python float."""
    predictions = output.argmax(axis=1)
    return nd.mean(predictions == label).asscalar()
def accuracy(output, labels):
    """Mean top-1 accuracy of `output` against `labels`, as a Python float."""
    top1 = nd.argmax(output, axis=1)
    correct = top1 == labels
    return nd.mean(correct).asscalar()
def train(epochs, ctx):
    """Training function.

    Margin-loss metric-learning loop: 200 iterations per epoch, optional
    jointly-trained per-class `beta`, validation via `test(ctx)`, and
    parameter saving on a new best validation accuracy.  Uses many
    module-level names (`net`, `opt`, `beta`, `steps`, `train_data`, ...).
    Returns the best validation accuracy seen.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)
    opt_options = {'learning_rate': opt.lr, 'wd': opt.wd}
    if opt.optimizer == 'sgd':
        opt_options['momentum'] = 0.9
    if opt.optimizer == 'adam':
        opt_options['epsilon'] = 1e-7
    trainer = gluon.Trainer(net.collect_params(), opt.optimizer,
                            opt_options,
                            kvstore=opt.kvstore)
    if opt.lr_beta > 0.0:
        # Jointly train class-specific beta.
        # See "sampling matters in deep embedding learning" paper for details.
        beta.initialize(mx.init.Constant(opt.beta), ctx=ctx)
        trainer_beta = gluon.Trainer([beta], 'sgd',
                                     {'learning_rate': opt.lr_beta, 'momentum': 0.9},
                                     kvstore=opt.kvstore)
    loss = MarginLoss(margin=opt.margin, nu=opt.nu)
    best_val = 0.0
    for epoch in range(epochs):
        tic = time.time()
        prev_loss, cumulative_loss = 0.0, 0.0
        # Learning rate schedule.
        trainer.set_learning_rate(get_lr(opt.lr, epoch, steps, opt.factor))
        logging.info('Epoch %d learning rate=%f', epoch, trainer.learning_rate)
        if opt.lr_beta > 0.0:
            trainer_beta.set_learning_rate(get_lr(opt.lr_beta, epoch, steps, opt.factor))
            logging.info('Epoch %d beta learning rate=%f', epoch, trainer_beta.learning_rate)
        # Inner training loop.
        for i in range(200):
            batch = train_data.next()
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
            Ls = []
            with ag.record():
                for x, y in zip(data, label):
                    a_indices, anchors, positives, negatives, _ = net(x)
                    if opt.lr_beta > 0.0:
                        L = loss(anchors, positives, negatives, beta, y[a_indices])
                    else:
                        L = loss(anchors, positives, negatives, opt.beta, None)
                    # Store the loss and do backward after we have done forward
                    # on all GPUs for better speed on multiple GPUs.
                    Ls.append(L)
                    cumulative_loss += nd.mean(L).asscalar()
                for L in Ls:
                    L.backward()
            # Update.
            trainer.step(batch.data[0].shape[0])
            if opt.lr_beta > 0.0:
                trainer_beta.step(batch.data[0].shape[0])
            if (i+1) % opt.log_interval == 0:
                logging.info('[Epoch %d, Iter %d] training loss=%f' % (
                    epoch, i+1, cumulative_loss - prev_loss))
                prev_loss = cumulative_loss
        logging.info('[Epoch %d] training loss=%f'%(epoch, cumulative_loss))
        logging.info('[Epoch %d] time cost: %f'%(epoch, time.time()-tic))
        names, val_accs = test(ctx)
        for name, val_acc in zip(names, val_accs):
            logging.info('[Epoch %d] validation: %s=%f'%(epoch, name, val_acc))
        if val_accs[0] > best_val:
            best_val = val_accs[0]
            logging.info('Saving %s.' % opt.save_model_prefix)
            net.save_params('%s.params' % opt.save_model_prefix)
    return best_val
metric.update([real_label, ], [output, ]) # train with fake image fake_image = g_net(noise) output = d_net(fake_image.detach()).reshape((-1, 1)) errD_fake = loss(output, fake_label) errD = errD_real + errD_fake errD.backward() metric.update([fake_label, ], [output, ]) d_trainer.step(BATCH_SIZE) # update G with autograd.record(): fake_image = g_net(noise) output = d_net(fake_image).reshape(-1, 1) errG = loss(output, real_label) errG.backward() g_trainer.step(BATCH_SIZE) # print log infomation every 100 batches if i % 100 == 0: name, acc = metric.get() logging.info('discriminator loss = %f, generator loss = %f, \ binary training acc = %f at iter %d epoch %d', nd.mean(errD).asscalar(), nd.mean(errG).asscalar(), acc, i, epoch) if i == 0: save_image(fake_image, epoch, IMAGE_SIZE, BATCH_SIZE, OUTPUT_DIR) metric.reset()
def square_loss(yhat, y):
    """Mean squared error between predictions `yhat` and targets `y`."""
    residual = yhat - y
    return nd.mean(residual * residual)
# NOTE(review): training-loop fragment — the enclosing epoch loop and the
# definitions of `trainNum`, `batchNum`, `train_loss`, `train_acc`,
# `prev_time`, `check_freq`, `print_freq`, etc. are outside this chunk, and
# the chunk is truncated mid-expression at the end.
for datas, labels in train_data:
    #data, label, batch_size = _get_batch(batch, ctx)
    #batch_size = batch.shape[0]
    #pdb.set_trace()
    trainNum += datas.asnumpy().shape[0]
    labels = labels.astype('float32').as_in_context(ctx)
    with autograd.record():
        #each sample each time
        yhats = net(datas.as_in_context(ctx))
        #losses = [ loss_func(yhat, label) for yhat,label in zip(yhats,labels)]
        loss = loss_func(yhats, labels)
        #pdb.set_trace()
        #for loss in losses:
        #    loss.backward()
    loss.backward()
    trainer.step(batch_size)
    train_loss += nd.mean(loss).asscalar()
    train_acc += accuracy(yhats, labels)
    batchNum += 1
    cur_time = datetime.datetime.now()
    h, remainder = divmod((cur_time - prev_time).seconds, 3600)
    m, s = divmod(remainder, 60)
    time_str = "Time %02d:%02d:%02d" % (h, m, s)
    # Periodic validation / progress logging.
    if valid_data is not None and 0 == (batchNum % check_freq):
        valid_acc = evaluate_accuracy(valid_data, net, ctx)
        epoch_str = ("Epoch %d. Batch %d Loss: %f, Train acc %f, Valid acc %f, "
                     % (epoch, batchNum, train_loss / trainNum,
                        train_acc / trainNum, valid_acc))
        logging.info(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))
    elif 0 == (batchNum % print_freq):
        epoch_str = ("Epoch %d. Batch %d Loss: %f, Train acc %f, "
                     % (epoch, batchNum, train_loss / trainNum,
                        # NOTE(review): source chunk ends here, mid-expression.
# NOTE(review): fragment — the first four lines are the tail of an
# `evaluate_accuracy(...)` function whose `def` line is outside this chunk;
# they are indented here to show they belong to that function.
    output = net(data)
    predictions = nd.argmax(output, axis=1)
    acc.update(preds=predictions, labels=label)
    return acc.get()[1]

# Training script: track an exponentially smoothed loss across batches.
epochs = 10
smoothing_constant = .01
for e in range(epochs):
    train_data.reset()
    for i, batch in enumerate(train_data):
        data = batch.data[0].as_in_context(ctx)
        label = batch.label[0].as_in_context(ctx)
        with autograd.record():
            output = net(data)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        trainer.step(data.shape[0])
        ##########################
        # Keep a moving average of the losses
        ##########################
        curr_loss = nd.mean(loss).asscalar()
        moving_loss = (curr_loss if ((i == 0) and (e == 0)) else
                       (1 - smoothing_constant) * moving_loss +
                       (smoothing_constant) * curr_loss)
    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" %
          (e, moving_loss, train_accuracy, test_accuracy))
# state = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx) for e in range(epochs): ############################ # Attenuate the learning rate by a factor of 2 every 100 epochs. ############################ if ((e+1) % 100 == 0): learning_rate = learning_rate / 2.0 h = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx) c = nd.zeros(shape=(batch_size, num_hidden), ctx=ctx) for i in range(num_batches): data_one_hot = train_data[i] label_one_hot = train_label[i] with autograd.record(): outputs, h, c = gru_rnn(data_one_hot, h, c) loss = average_ce_loss(outputs, label_one_hot) loss.backward() SGD(params, learning_rate) ########################## # Keep a moving average of the losses ########################## if (i == 0) and (e == 0): moving_loss = nd.mean(loss).asscalar() else: moving_loss = .99 * moving_loss + .01 * nd.mean(loss).asscalar() print("Epoch %s. Loss: %s" % (e, moving_loss)) print(sample("The Time Ma", 1024, temperature=.1)) print(sample("The Medical Man rose, came to the lamp,", 1024, temperature=.1))
def cross_entropy(yhat, y):
    """Mean categorical cross-entropy; per-sample terms sum over every
    non-batch axis (axis=0 excluded)."""
    per_sample = nd.sum(y * nd.log(yhat), axis=0, exclude=True)
    return -nd.mean(per_sample)