def layerwise_relevance_zclip(self, out, use_bias=False, **kwargs):
    """Layer-wise relevance propagation, z-clip variant.

    Splits the layer's weights (and optionally biases) into positive and
    negative parts and propagates the relevance ``out`` back to the layer
    input via two separate linear passes.

    Parameters
    ----------
    out : NDArray
        Relevance of this block's output.
    use_bias : bool
        Whether to include the (sign-split) bias terms in the propagation.

    Returns
    -------
    NDArray
        Relevance of this block's input, same shape as the logged input.

    Raises
    ------
    RuntimeError
        If forward_logged was never run (no logged input available).
    """
    if self._in is None:
        raise RuntimeError('Block has not yet executed forward_logged!')
    R = out
    a = self._in[0]
    z = self._out
    weight = self.weight.data(ctx=a.context)
    wplus = nd.maximum(0., weight)
    wminus = nd.minimum(0., weight)
    bplus = None
    bminus = None
    # BUG FIX: the original tested `use_bias is not None`, which is always
    # True (the default is False), so the bias was unconditionally used.
    if use_bias:
        bias = self.bias.data(ctx=a.context)
        bplus = nd.maximum(0., bias)
        bminus = nd.minimum(0., bias)
    # Masks selecting outputs with positive / negative pre-activations.
    alpha = z > 0.
    beta = z < 0.
    a.attach_grad()
    with autograd.record():
        zplus = self._forward(data=a, weight=wplus, bias=bplus)
    # `(zplus == 0.)` nudges exact zeros to avoid division by zero.
    cplus, = autograd.grad(zplus, a, head_grads=alpha*R/(zplus + (zplus == 0.)))
    with autograd.record():
        zminus = self._forward(data=a, weight=wminus, bias=bminus)
    cminus, = autograd.grad(zminus, a, head_grads=beta*R/(zminus + (zminus == 0.)))
    return a*(cplus - cminus)
def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs,
          print_batches=None):
    """Train and evaluate a model.

    Runs `num_epochs` of training on (possibly multiple) devices, printing
    running loss/accuracy every `print_batches` mini-batches and the test
    accuracy after each epoch.
    """
    print("training on", ctx)
    # Normalize a single context into a list so the multi-device code path
    # works uniformly.
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(1, num_epochs + 1):
        train_l_sum, train_acc_sum, n, m = 0.0, 0.0, 0.0, 0.0
        # MXDataIter must be reset explicitly between epochs.
        if isinstance(train_iter, mx.io.MXDataIter):
            train_iter.reset()
        start = time()
        for i, batch in enumerate(train_iter):
            # _get_batch splits the batch across the devices in `ctx`.
            Xs, ys, batch_size = _get_batch(batch, ctx)
            ls = []
            with autograd.record():
                y_hats = [net(X) for X in Xs]
                ls = [loss(y_hat, y) for y_hat, y in zip(y_hats, ys)]
            for l in ls:
                l.backward()
            train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
                                  for y_hat, y in zip(y_hats, ys)])
            train_l_sum += sum([l.sum().asscalar() for l in ls])
            # Normalize the gradient step by the total batch size.
            trainer.step(batch_size)
            n += batch_size
            m += sum([y.size for y in ys])
            if print_batches and (i+1) % print_batches == 0:
                print("batch %d, loss %f, train acc %f" % (
                    n, train_l_sum / n, train_acc_sum / m
                ))
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print("epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec" % (
            epoch, train_l_sum / n, train_acc_sum / m, test_acc, time() - start
        ))
def train(epoch, ctx):
    """Train the module-level super-resolution `net` for `epoch` epochs.

    Relies on module-level `net`, `train_data`, `metric`, `opt` and `test`.
    Saves the trained parameters to 'superres.params' when done.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Orthogonal(), ctx=ctx)
    # re-initialize conv4's weight to be Orthogonal
    net.conv4.collect_params().initialize(mx.init.Orthogonal(scale=1), ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': opt.lr})
    loss = gluon.loss.L2Loss()
    for i in range(epoch):
        train_data.reset()
        for batch in train_data:
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx,
                                              batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx,
                                               batch_axis=0)
            outputs = []
            with ag.record():
                for x, y in zip(data, label):
                    z = net(x)
                    L = loss(z, y)
                    # NOTE(review): backward() is called inside the record
                    # scope here; MXNet permits this but it differs from the
                    # more common pattern of calling it outside.
                    L.backward()
                    outputs.append(z)
            trainer.step(batch.data[0].shape[0])
            metric.update(label, outputs)
        name, acc = metric.get()
        metric.reset()
        print('training mse at epoch %d: %s=%f'%(i, name, acc))
        test(ctx)
    net.save_params('superres.params')
def train(weight_decay):
    """Train a 1-output linear model with L2 weight decay.

    Plots train/test loss curves and returns the learned weights and bias.
    Relies on module-level `data_iter_train`, `square_loss`, `batch_size`,
    `test`, `X_train`, `y_train`, `X_test`, `y_test`.
    """
    learning_rate = 0.005
    epochs = 10
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(gluon.nn.Dense(1))
    net.initialize()
    # Note the 'wd' key here: this is where the L2 regularization
    # (weight decay) strength is configured on the trainer.
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': learning_rate, 'wd': weight_decay})
    # Plain gradient descent updates parameters as: w = w - lr*grad.
    # With weight decay added: w = w - lr*grad - wd*w
    # ?? (equivalently) w = w - lr*(grad + wd*w)
    train_loss = []
    test_loss = []
    for e in range(epochs):
        for data, label in data_iter_train:
            with autograd.record():
                output = net(data)
                loss = square_loss(output, label)
            loss.backward()
            trainer.step(batch_size)
        train_loss.append(test(net, X_train, y_train))
        test_loss.append(test(net, X_test, y_test))
    plt.plot(train_loss)
    plt.plot(test_loss)
    plt.legend(['train', 'test'])
    plt.show()
    return ('learned w[:10]:', net[0].weight.data()[:, :10],
            'learned b:', net[0].bias.data())
def train_gluon_ch7(trainer_name, trainer_hyperparams, features, labels,
                    batch_size=10, num_epochs=2):
    """Train a linear regression model with a given Gluon trainer.

    Records the full-dataset loss roughly every 100 examples and plots the
    resulting loss curve against epochs.
    """
    net = nn.Sequential()
    net.add(nn.Dense(1))
    net.initialize(init.Normal(sigma=0.01))
    loss = gloss.L2Loss()

    def eval_loss():
        # Loss over the whole dataset, as a Python float.
        return loss(net(features), labels).mean().asscalar()

    ls = [eval_loss()]
    data_iter = gdata.DataLoader(
        gdata.ArrayDataset(features, labels), batch_size, shuffle=True)
    trainer = gluon.Trainer(net.collect_params(), trainer_name,
                            trainer_hyperparams)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
            # Sample the dataset loss every 100 examples.
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    set_figsize()
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')
def train_ch7(model, data_iter, lr, num_epochs, ctx):
    """Train an encoder-decoder model.

    Parameters
    ----------
    model : Block
        Encoder-decoder model taking (X, Y_input, X_vlen, Y_vlen).
    data_iter : iterable
        Yields (X, X_vlen, Y, Y_vlen) batches.
    lr : float
        Adam learning rate.
    num_epochs : int
        Number of training epochs.
    ctx : Context
        Device to train on.
    """
    model.initialize(init.Xavier(), force_reinit=True, ctx=ctx)
    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': lr})
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    # BUG FIX: log roughly four times over training, but guard against
    # num_epochs < 4 where `num_epochs // 4` is 0 and `epoch % 0` would
    # raise ZeroDivisionError.
    log_interval = max(1, num_epochs // 4)
    for epoch in range(1, num_epochs+1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            X, X_vlen, Y, Y_vlen = [x.as_in_context(ctx) for x in batch]
            # Teacher forcing: decoder input is the target shifted by one.
            Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
            with autograd.record():
                Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
                l = loss(Y_hat, Y_label, Y_vlen)
            l.backward()
            grad_clipping_gluon(model, 5, ctx)
            num_tokens = Y_vlen.sum().asscalar()
            # Normalize the update by the number of real (unpadded) tokens.
            trainer.step(num_tokens)
            l_sum += l.sum().asscalar()
            num_tokens_sum += num_tokens
        if epoch % log_interval == 0:
            print("epoch %d, loss %.3f, time %.1f sec" % (
                epoch, l_sum/num_tokens_sum, time.time()-tic))
            tic = time.time()
def train(train_iter, test_iter, net, loss, trainer, ctx, num_epochs):
    """Train and evaluate a model.

    Multi-device training loop: each batch is split over the contexts in
    `ctx`, losses are accumulated per-device, and test accuracy is printed
    after every epoch.
    """
    print('training on', ctx)
    # Normalize a single context into a list for the multi-device path.
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, m, start = 0.0, 0.0, 0, 0, time.time()
        for i, batch in enumerate(train_iter):
            # _get_batch splits the batch across the devices in `ctx`.
            Xs, ys, batch_size = _get_batch(batch, ctx)
            ls = []
            with autograd.record():
                y_hats = [net(X) for X in Xs]
                ls = [loss(y_hat, y) for y_hat, y in zip(y_hats, ys)]
            for l in ls:
                l.backward()
            trainer.step(batch_size)
            train_l_sum += sum([l.sum().asscalar() for l in ls])
            # n counts loss elements; m counts label elements.
            n += sum([l.size for l in ls])
            train_acc_sum += sum([(y_hat.argmax(axis=1) == y).sum().asscalar()
                                  for y_hat, y in zip(y_hats, ys)])
            m += sum([y.size for y in ys])
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / m, test_acc,
                 time.time() - start))
def layerwise_relevance_zb(self, out, lo=-1, hi=1, use_bias=False, **kwargs):
    """Layer-wise relevance propagation, z^B rule (bounded input domain).

    Propagates the relevance ``out`` back to the layer input assuming the
    input lies in the box [lo, hi] (per element), per the z^B rule from the
    LRP literature.

    Parameters
    ----------
    out : NDArray
        Relevance of this block's output.
    lo, hi : float
        Lower / upper bound of the input domain.
    use_bias : bool
        Whether to include the (sign-split) bias terms.

    Returns
    -------
    NDArray
        Relevance of this block's input.

    Raises
    ------
    RuntimeError
        If forward_logged was never run (no logged input available).
    """
    if self._in is None:
        raise RuntimeError('Block has not yet executed forward_logged!')
    R = out
    a = self._in[0]
    weight = self.weight.data(ctx=a.context)
    wplus = nd.maximum(0., weight)
    wminus = nd.minimum(0., weight)
    bias = None
    bplus = None
    bminus = None
    # BUG FIX: the original tested `use_bias is not None`, which is always
    # True (the default is False), so the bias was unconditionally used.
    if use_bias:
        bias = self.bias.data(ctx=a.context)
        bplus = nd.maximum(0., bias)
        bminus = nd.minimum(0., bias)
    upper = nd.ones_like(a)*hi
    lower = nd.ones_like(a)*lo
    a.attach_grad()
    upper.attach_grad()
    lower.attach_grad()
    with autograd.record():
        # z^B denominator: actual pre-activation minus the extreme
        # contributions achievable within the [lo, hi] box.
        zlh = (self._forward(a, weight, bias)
               - self._forward(lower, wplus, bplus)
               - self._forward(upper, wminus, bminus))
    # `(zlh == 0.)` nudges exact zeros to avoid division by zero.
    zlh.backward(out_grad=R/(zlh + (zlh == 0.)))
    return a*a.grad + upper*upper.grad + lower*lower.grad
def train_and_predict_rnn_gluon(model, num_hiddens, vocab_size, ctx,
                                corpus_indices, idx_to_char, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes):
    """Train an Gluon RNN model and predict the next item in the sequence.

    Uses consecutive (adjacent) sampling, carrying the hidden state across
    mini-batches within an epoch; prints perplexity and sample predictions
    every `pred_period` epochs.
    """
    loss = gloss.SoftmaxCrossEntropyLoss()
    model.initialize(ctx=ctx, force_reinit=True, init=init.Normal(0.01))
    trainer = gluon.Trainer(model.collect_params(), 'sgd',
                            {'learning_rate': lr, 'momentum': 0, 'wd': 0})
    for epoch in range(num_epochs):
        loss_sum, start = 0.0, time.time()
        data_iter = data_iter_consecutive(
            corpus_indices, batch_size, num_steps, ctx)
        state = model.begin_state(batch_size=batch_size, ctx=ctx)
        for t, (X, Y) in enumerate(data_iter):
            # Detach the state so backprop stops at the batch boundary.
            for s in state:
                s.detach()
            with autograd.record():
                (output, state) = model(X, state)
                # Flatten targets to align with the (steps*batch) outputs.
                y = Y.T.reshape((-1,))
                l = loss(output, y).mean()
            l.backward()
            params = [p.data() for p in model.collect_params().values()]
            grad_clipping(params, clipping_theta, ctx)
            # Loss is already a mean, so step size 1 (no batch scaling).
            trainer.step(1)
            loss_sum += l.asscalar()
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(loss_sum / (t + 1)), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_gluon(
                    prefix, pred_len, model, vocab_size, ctx, idx_to_char,
                    char_to_idx))
def train_ch7(trainer_fn, states, hyperparams, features, labels,
              batch_size=10, num_epochs=2):
    """Train a linear regression model.

    Uses the from-scratch `linreg`/`squared_loss` helpers and a caller-
    supplied optimizer step `trainer_fn(params, states, hyperparams)`.
    Plots the loss curve over epochs.
    """
    net, loss = linreg, squared_loss
    # Initialize weights ~ N(0, 0.01) and a zero bias; attach gradients.
    w, b = nd.random.normal(scale=0.01,
                            shape=(features.shape[1], 1)), nd.zeros(1)
    w.attach_grad()
    b.attach_grad()

    def eval_loss():
        # Loss over the whole dataset, as a Python float.
        return loss(net(features, w, b), labels).mean().asscalar()

    ls = [eval_loss()]
    data_iter = gdata.DataLoader(
        gdata.ArrayDataset(features, labels), batch_size, shuffle=True)
    for _ in range(num_epochs):
        start = time.time()
        for batch_i, (X, y) in enumerate(data_iter):
            with autograd.record():
                l = loss(net(X, w, b), y).mean()
            l.backward()
            trainer_fn([w, b], states, hyperparams)
            # Sample the dataset loss every 100 examples.
            if (batch_i + 1) * batch_size % 100 == 0:
                ls.append(eval_loss())
    print('loss: %f, %f sec per epoch' % (ls[-1], time.time() - start))
    set_figsize()
    plt.plot(np.linspace(0, num_epochs, len(ls)), ls)
    plt.xlabel('epoch')
    plt.ylabel('loss')
def train(train_data, test_data, net, loss, trainer, ctx, num_epochs,
          print_batches=None):
    """Train a network.

    Multi-device training loop with optional per-`print_batches` progress
    output and per-epoch test accuracy evaluation.
    """
    print("Start training on ", ctx)
    # Normalize a single context into a list for the multi-device path.
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    for epoch in range(num_epochs):
        train_loss, train_acc, n, m = 0.0, 0.0, 0.0, 0.0
        # MXDataIter must be reset explicitly between epochs.
        if isinstance(train_data, mx.io.MXDataIter):
            train_data.reset()
        start = time()
        for i, batch in enumerate(train_data):
            # _get_batch splits the batch across the devices in `ctx`.
            data, label, batch_size = _get_batch(batch, ctx)
            losses = []
            with autograd.record():
                outputs = [net(X) for X in data]
                losses = [loss(yhat, y) for yhat, y in zip(outputs, label)]
            for l in losses:
                l.backward()
            train_acc += sum([(yhat.argmax(axis=1) == y).sum().asscalar()
                              for yhat, y in zip(outputs, label)])
            train_loss += sum([l.sum().asscalar() for l in losses])
            trainer.step(batch_size)
            n += batch_size
            m += sum([y.size for y in label])
            if print_batches and (i + 1) % print_batches == 0:
                print("Batch %d. Loss: %f, Train acc %f" % (
                    n, train_loss / n, train_acc / m
                ))
        test_acc = evaluate_accuracy(test_data, net, ctx)
        print("Epoch %d. Loss: %.3f, Train acc %.2f, Test acc %.2f, "
              "Time %.1f sec" % (
                  epoch, train_loss / n, train_acc / m, test_acc,
                  time() - start
              ))
def test_infer_multiout_op():
    """Gradient through a multi-output op (split) must keep float64 dtype."""
    source = mx.nd.arange(16, dtype=np.float64).reshape((4, 4))
    source.attach_grad()
    with autograd.record():
        halves = mx.nd.split(source, axis=0, num_outputs=2)
    # Backprop through only one of the two outputs.
    halves[0].backward()
    assert source.grad.dtype == np.float64
def test_infer_multiout_op2():
    """float32 and float64 gradients through gelqf must agree closely."""
    def factor_and_sum(matrix):
        # gelqf returns (Q, L); reduce L to a scalar for backward().
        _, lower = mx.nd.linalg.gelqf(matrix)
        return mx.nd.sum(lower)

    data32 = mx.nd.random.normal(shape=(2, 3), ctx=mx.cpu(), dtype=np.float32)
    data32.attach_grad()
    with autograd.record():
        out32 = factor_and_sum(data32)
    out32.backward()

    data64 = mx.nd.Cast(data32, dtype=np.float64)
    data64.attach_grad()
    with autograd.record():
        out64 = factor_and_sum(data64)
    out64.backward()

    assert_almost_equal(data64.grad.asnumpy(), data32.grad.asnumpy(),
                        atol=1e-5, rtol=1e-5)
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          corpus_indices, vocab, ctx, is_random_iter,
                          num_epochs, num_steps, lr, clipping_theta,
                          batch_size, prefixes):
    """Train an RNN model and predict the next item in the sequence."""
    if is_random_iter:
        data_iter_fn = data_iter_random
    else:
        data_iter_fn = data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    start = time.time()
    for epoch in range(1, num_epochs+1):
        if not is_random_iter:
            # If adjacent sampling is used, the hidden state is initialized
            # at the beginning of the epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n = 0.0, 0
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        for X, Y in data_iter:
            if is_random_iter:
                # If random sampling is used, the hidden state is initialized
                # before each mini-batch update
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # Otherwise, the detach function needs to be used to separate
                # the hidden state from the computational graph to avoid
                # backpropagation beyond the current sample
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(X, len(vocab))
                # outputs is num_steps terms of shape (batch_size, len(vocab))
                (outputs, state) = rnn(inputs, state, params)
                # After stitching it is (num_steps * batch_size, len(vocab))
                outputs = nd.concat(*outputs, dim=0)
                # The shape of Y is (batch_size, num_steps), and then becomes
                # a vector with a length of batch * num_steps after
                # transposition. This gives it a one-to-one correspondence
                # with output rows
                y = Y.T.reshape((-1,))
                # Average classification error via cross entropy loss
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, ctx)  # Clip the gradient
            # Since the error is the mean, no need to average gradients here
            sgd(params, lr, 1)
            l_sum += l.asscalar() * y.size
            n += y.size
        # NOTE(review): num_epochs < 4 would make these moduli zero and
        # raise ZeroDivisionError — callers appear to use larger values.
        if epoch % (num_epochs // 4) == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch, math.exp(l_sum / n), time.time() - start))
            start = time.time()
        if epoch % (num_epochs // 2) == 0:
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, 50, rnn, params,
                                        init_rnn_state, num_hiddens, vocab,
                                        ctx))
def train(ctx):
    """Train the module-level heatmap/pose `net`.

    Initializes either the new head layers (when a pretrained base is used)
    or the whole network, trains for `opt.num_epochs` with L2 loss on
    weighted heatmaps, logs progress, and periodically saves checkpoints.
    Relies on module-level `net`, `opt`, `optimizer`, `optimizer_params`,
    `train_data`, `train_batch_fn`, `batch_size`, `num_gpus`, `logger`,
    `save_frequency`, `save_dir`, `model_name`.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    if opt.use_pretrained_base:
        # Only the newly-added layers need initialization.
        net.deconv_layers.initialize(ctx=ctx)
        net.final_layer.initialize(ctx=ctx)
    else:
        net.initialize(mx.init.MSRAPrelu(), ctx=ctx)
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    L = gluon.loss.L2Loss()
    metric = HeatmapAccuracy()
    best_val_score = 1  # NOTE(review): assigned but never read in this block
    if opt.mode == 'hybrid':
        net.hybridize(static_alloc=True, static_shape=True)
    for epoch in range(opt.num_epochs):
        loss_val = 0
        tic = time.time()
        btic = time.time()
        metric.reset()
        for i, batch in enumerate(train_data):
            data, label, weight, imgid = train_batch_fn(batch, ctx)
            with ag.record():
                outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                # Compute the loss in float32 then cast back to opt.dtype.
                loss = [nd.cast(L(nd.cast(yhat, 'float32'), y, w), opt.dtype)
                        for yhat, y, w in zip(outputs, label, weight)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            metric.update(label, outputs)
            loss_val += sum([l.mean().asscalar() for l in loss]) / num_gpus
            if opt.log_interval and not (i+1)%opt.log_interval:
                metric_name, metric_score = metric.get()
                logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\tloss=%f\tlr=%f\t%s=%.3f'%(
                    epoch, i, batch_size*opt.log_interval/(time.time()-btic),
                    loss_val / (i+1), trainer.learning_rate, metric_name,
                    metric_score))
                btic = time.time()
        time_elapsed = time.time() - tic
        logger.info('Epoch[%d]\t\tSpeed: %d samples/sec over %d secs\tloss=%f\n'%(
            epoch, int(i*batch_size / time_elapsed), int(time_elapsed),
            loss_val / (i+1)))
        if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
            net.save_parameters('%s/%s-%d.params'%(save_dir, model_name, epoch))
            trainer.save_states('%s/%s-%d.states'%(save_dir, model_name, epoch))
    # Always save a final checkpoint tagged with the last epoch index.
    if save_frequency and save_dir:
        net.save_parameters('%s/%s-%d.params'%(save_dir, model_name, opt.num_epochs-1))
        trainer.save_states('%s/%s-%d.states'%(save_dir, model_name, opt.num_epochs-1))
    return net
def train(input_variable, target_variable, encoder, decoder,
          teacher_forcing_ratio, encoder_optimizer, decoder_optimizer,
          criterion, max_length, ctx):
    """Run one seq2seq training step (encoder + attention decoder).

    Encodes `input_variable`, decodes up to `target_length` steps (with or
    without teacher forcing), backprops the summed loss, and steps both
    optimizers.

    Returns
    -------
    float
        Average per-target-token loss for this example.
    """
    with autograd.record():
        loss = F.zeros((1,), ctx=ctx)
        encoder_hidden = encoder.initHidden(ctx)
        input_length = input_variable.shape[0]
        target_length = target_variable.shape[0]
        encoder_outputs, encoder_hidden = encoder(
            input_variable.expand_dims(0), encoder_hidden)
        # Pad encoder outputs to max_length so attention has a fixed size.
        if input_length < max_length:
            encoder_outputs = F.concat(
                encoder_outputs.flatten(),
                F.zeros((max_length - input_length, encoder.hidden_size),
                        ctx=ctx), dim=0)
        else:
            encoder_outputs = encoder_outputs.flatten()
        decoder_input = F.array([SOS_token], ctx=ctx)
        decoder_hidden = encoder_hidden
        # BUG FIX: was `True if random.random() < ratio else False`;
        # the comparison already yields a bool.
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        if use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                loss = F.add(loss, criterion(decoder_output,
                                             target_variable[di]))
                # BUG FIX: removed a stray Python-2 `print criterion(...)`
                # debug statement here (a SyntaxError under Python 3).
                decoder_input = target_variable[di]  # Teacher forcing
        else:
            # Without teacher forcing: use its own predictions as next input
            for di in range(target_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                topi = decoder_output.argmax(axis=1)
                decoder_input = F.array([topi.asscalar()], ctx=ctx)
                loss = F.add(loss, criterion(decoder_output,
                                             target_variable[di]))
                if topi.asscalar() == EOS_token:
                    break
    loss.backward()
    encoder_optimizer.step(1)
    decoder_optimizer.step(1)
    return loss.asscalar()/target_length
def relevance_sensitivity(self, data, out=None, **kwargs):
    """Sensitivity relevance: gradient of the output w.r.t. the input."""
    inputs = Mlist(data)
    inputs.attach_grad()
    with autograd.record():
        prediction = self.forward(inputs)
    prediction.backward(out_grad=out)
    # WARNING: is hacky and sucks
    self._out = prediction
    return inputs.grad
def forward_backward(network, data, label):
    """Record the forward pass on every device, then backprop each loss."""
    # Ask autograd to remember the forward pass
    with autograd.record():
        per_device_losses = []
        for X, Y in zip(data, label):
            per_device_losses.append(loss(network(X), Y))
    # Run the backward pass (calculate gradients) on all GPUs
    for device_loss in per_device_losses:
        device_loss.backward()
def main(net, batch_size, epochs, opt, ctx):
    """Train `net` with Adam, optional warmup and step LR decay.

    Parameters
    ----------
    net : Block
        Network to train (its parameters are saved at the end).
    batch_size : int
        Global batch size (used for trainer.step normalization).
    epochs : sequence of int
        epochs[0] is the warmup length, epochs[1:-1] are LR-decay
        milestones, epochs[-1] is the total number of epochs.
    opt : namespace
        Options: lr, wd, lr_decay, warmup, hybridize.
    ctx : list of Context
        Devices to train on.
    """
    train_data, val_data = get_data_iters(batch_size)
    if opt.hybridize:
        net.hybridize()
    trainer = gluon.Trainer(net.collect_params(), 'adam',
                            {'learning_rate': opt.lr, 'wd': opt.wd})
    criterion = gluon.loss.SoftmaxCrossEntropyLoss()
    lr = opt.lr
    if opt.warmup:
        # Linear warmup from 1% of lr up to lr over epochs[0] epochs.
        minlr = lr*0.01
        dlr = (lr-minlr)/(epochs[0]-1)
    prev_time = datetime.datetime.now()
    for epoch in range(epochs[-1]):
        _loss = 0.
        if opt.warmup:
            if epoch < epochs[0]:
                lr = minlr + dlr*epoch
        if epoch in epochs[1:]:
            lr = lr * opt.lr_decay
        trainer.set_learning_rate(lr)
        for data, label in train_data:
            data_list = gluon.utils.split_and_load(data, ctx)
            label_list = gluon.utils.split_and_load(label, ctx)
            with autograd.record():
                # BUG FIX: local was misspelled `outpus`.
                outputs = [net(X) for X in data_list]
                losses = [criterion(X, y) for X, y in zip(outputs, label_list)]
            for l in losses:
                l.backward()
            trainer.step(batch_size)
            _loss_list = [l.mean().asscalar() for l in losses]
            _loss += sum(_loss_list) / len(_loss_list)
        cur_time = datetime.datetime.now()
        h, remainder = divmod((cur_time - prev_time).seconds, 3600)
        m, s = divmod(remainder, 60)
        time_str = "Time %02d:%02d:%02d" % (h, m, s)
        __loss = _loss/len(train_data)
        if val_data is not None:
            # BUG FIX: "accuray" typo corrected in the name and message.
            val_loss, val_accuracy = validate(val_data, net, criterion, ctx)
            epoch_str = ("Epoch %d. Train loss: %f, Val loss %f, "
                         "Val accuracy %f, "
                         % (epoch, __loss, val_loss, val_accuracy))
        else:
            epoch_str = ("Epoch %d. Train loss: %f, " % (epoch, __loss))
        prev_time = cur_time
        print(epoch_str + time_str + ', lr ' + str(trainer.learning_rate))
    if not os.path.exists("params"):
        os.mkdir("params")
    net.save_parameters("params/resnet50.params")
def relevance_layerwise(self, out, *args, **kwargs):
    """Layer-wise relevance for a pooling layer via sum-pooling backprop."""
    relevance = out
    activation = self._in[0]
    pool_kwargs = self._kwargs.copy()
    pool_kwargs['pool_type'] = 'sum'
    # suppress mxnet warnings about sum-pooling nob being supported with cudnn
    pool_kwargs['cudnn_off'] = True
    activation.attach_grad()
    with autograd.record():
        pooled = nd.Pooling(activation, **pool_kwargs)
    # `(pooled == 0.)` nudges exact zeros to avoid division by zero.
    pooled.backward(out_grad=relevance / (pooled + (pooled == 0.)))
    return activation * activation.grad
def test_inference():
    """Check CPU and GPU inference agree for several pretrained models.

    For each model: load one validation batch, build identical CPU and GPU
    copies of the network, copy the CPU parameters to the GPU copy, run
    inference five times, and assert the normalized outputs match.
    """
    all_models = ['resnet50_v1', 'vgg19_bn', 'alexnet',  # 'inceptionv3',
                  'densenet201', 'squeezenet1.0', 'mobilenet0.25']
    batch_size = 10
    download_data()
    for model_name in all_models:
        eprint('testing inference on %s'%model_name)
        # Inception variants expect 299x299 inputs; everything else 224x224.
        data_shape = (3, 224, 224) if 'inception' not in model_name \
            else (3, 299, 299)
        dataIter = mx.io.ImageRecordIter(
            path_imgrec=VAL_DATA,
            label_width=1,
            preprocess_threads=1,
            batch_size=batch_size,
            data_shape=data_shape,
            label_name='softmax_label',
            rand_crop=False,
            rand_mirror=False)
        data_batch = dataIter.next()
        data = data_batch.data[0]
        label = data_batch.label[0]
        gpu_data = data.as_in_context(mx.gpu())
        gpu_label = label.as_in_context(mx.gpu())
        # This is to create a model and run the model once to initialize
        # all parameters.
        cpu_model = get_model(model_name)
        cpu_model.collect_params().initialize(ctx=mx.cpu())
        cpu_model(mx.nd.array(data, ctx=mx.cpu()))
        gpu_model = get_model(model_name)
        gpu_model.collect_params().initialize(ctx=mx.gpu())
        gpu_model(mx.nd.array(data, ctx=mx.gpu()))
        # Force the two models have the same parameters.
        cpu_params = cpu_model.collect_params()
        gpu_params = gpu_model.collect_params()
        for k in cpu_params.keys():
            k = k.replace(cpu_params.prefix, '')
            cpu_param = cpu_params.get(k)
            gpu_param = gpu_params.get(k)
            gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu()))
        for i in range(5):
            # Run inference.
            with autograd.record(train_mode=False):
                cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu()))
                gpu_out = gpu_model(gpu_data)
            out = cpu_out.asnumpy()
            max_val = np.max(np.abs(out))
            gpu_max_val = np.max(np.abs(gpu_out.asnumpy()))
            eprint(model_name + ": CPU " + str(max_val) + ", GPU "
                   + str(gpu_max_val))
            # Compare outputs normalized by the CPU max magnitude.
            assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val,
                                rtol=1e-3, atol=1e-3)
def optimize(args):
    """ Gatys et al. CVPR 2017
    ref: Image Style Transfer Using Convolutional Neural Networks

    Optimizes the output image directly (not a transformer network):
    content loss against VGG relu features of the content image, style
    loss against Gram matrices of the style image.
    """
    if args.cuda:
        ctx = mx.gpu(0)
    else:
        ctx = mx.cpu(0)
    # load the content and style target
    content_image = utils.tensor_load_rgbimage(args.content_image, ctx,
                                               size=args.content_size,
                                               keep_asp=True)
    content_image = utils.subtract_imagenet_mean_preprocess_batch(content_image)
    style_image = utils.tensor_load_rgbimage(args.style_image, ctx,
                                             size=args.style_size)
    style_image = utils.subtract_imagenet_mean_preprocess_batch(style_image)
    # load the pre-trained vgg-16 and extract features
    vgg = net.Vgg16()
    utils.init_vgg_params(vgg, 'models', ctx=ctx)
    # content feature
    f_xc_c = vgg(content_image)[1]
    # style feature
    features_style = vgg(style_image)
    gram_style = [net.gram_matrix(y) for y in features_style]
    # output: the image itself is the trainable parameter
    output = Parameter('output', shape=content_image.shape)
    output.initialize(ctx=ctx)
    output.set_data(content_image)
    # optimizer
    trainer = gluon.Trainer([output], 'adam', {'learning_rate': args.lr})
    mse_loss = gluon.loss.L2Loss()
    # optimizing the images
    for e in range(args.iters):
        utils.imagenet_clamp_batch(output.data(), 0, 255)
        # fix BN for pre-trained vgg
        with autograd.record():
            features_y = vgg(output.data())
            content_loss = 2 * args.content_weight * mse_loss(features_y[1],
                                                              f_xc_c)
            style_loss = 0.
            for m in range(len(features_y)):
                gram_y = net.gram_matrix(features_y[m])
                gram_s = gram_style[m]
                style_loss = style_loss + 2 * args.style_weight * \
                    mse_loss(gram_y, gram_s)
            total_loss = content_loss + style_loss
        total_loss.backward()
        trainer.step(1)
        if (e + 1) % args.log_interval == 0:
            print('loss:{:.2f}'.format(total_loss.asnumpy()[0]))
    # save the image
    output = utils.add_imagenet_mean_batch(output.data())
    utils.tensor_save_bgrimage(output[0], args.output_image, args.cuda)
def train_and_predict_rnn(rnn, is_random_iter, num_epochs, num_steps,
                          num_hiddens, lr, clipping_theta, batch_size,
                          vocab_size, pred_period, pred_len, prefixes,
                          get_params, get_inputs, ctx, corpus_indices,
                          idx_to_char, char_to_idx, is_lstm=False):
    """Train an RNN model and predict the next item in the sequence.

    From-scratch variant: hidden state(s) are managed explicitly, with an
    extra cell state when `is_lstm` is True. Prints perplexity and sample
    predictions every `pred_period` epochs.
    """
    if is_random_iter:
        data_iter = data_iter_random
    else:
        data_iter = data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(1, num_epochs + 1):
        # Consecutive sampling: initialize the state once per epoch.
        if not is_random_iter:
            state_h = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
            if is_lstm:
                state_c = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
        train_l_sum = nd.array([0], ctx=ctx)
        train_l_cnt = 0
        for X, Y in data_iter(corpus_indices, batch_size, num_steps, ctx):
            if is_random_iter:
                # Random sampling: fresh state for every mini-batch.
                state_h = nd.zeros(shape=(batch_size, num_hiddens), ctx=ctx)
                if is_lstm:
                    state_c = nd.zeros(shape=(batch_size, num_hiddens),
                                       ctx=ctx)
            else:
                # Detach so backprop stops at the batch boundary.
                state_h = state_h.detach()
                if is_lstm:
                    state_c = state_c.detach()
            with autograd.record():
                if is_lstm:
                    outputs, state_h, state_c = rnn(
                        get_inputs(X, vocab_size), state_h, state_c, *params)
                else:
                    outputs, state_h = rnn(
                        get_inputs(X, vocab_size), state_h, *params)
                # Flatten targets to align with the stitched outputs.
                y = Y.T.reshape((-1,))
                outputs = nd.concat(*outputs, dim=0)
                l = loss(outputs, y)
            l.backward()
            grad_clipping(params, clipping_theta, ctx)
            sgd(params, lr, 1)
            train_l_sum = train_l_sum + l.sum()
            train_l_cnt += l.size
        if epoch % pred_period == 0:
            # Perplexity = exp(mean cross-entropy loss).
            print("\nepoch %d, perplexity %f"
                  % (epoch, (train_l_sum / train_l_cnt).exp().asscalar()))
            for prefix in prefixes:
                print(' - ', predict_rnn(
                    rnn, prefix, pred_len, params, num_hiddens, vocab_size,
                    ctx, idx_to_char, char_to_idx, get_inputs, is_lstm))
def test_lstmp():
    """Check fused LSTM-with-projection against the unrolled LSTMPCell.

    Builds both with identical random weights, compares outputs and the
    per-parameter gradients after backward, then smoke-tests several
    fused-layer configurations (bidirectional, dropout).
    """
    hidden_size, projection_size = 3, 2
    # Loose tolerances: fused (cuDNN) and unrolled paths differ numerically.
    rtol, atol = 1e-2, 1e-2
    batch_size, seq_len = 7, 11
    input_size = 5
    ctx = mx.gpu(0)
    lstm_input = mx.nd.uniform(
        shape=(seq_len, batch_size, input_size), ctx=ctx)
    shapes = {'i2h_weight': (hidden_size * 4, input_size),
              'h2h_weight': (hidden_size * 4, projection_size),
              'i2h_bias': (hidden_size * 4,),
              'h2h_bias': (hidden_size * 4,),
              'h2r_weight': (projection_size, hidden_size)}
    weights = {k: rand_ndarray(v) for k, v in shapes.items()}
    lstm_layer = gluon.rnn.LSTM(hidden_size,
                                projection_size=projection_size,
                                input_size=input_size, prefix='lstm0_')
    lstm_cell = gluon.contrib.rnn.LSTMPCell(hidden_size=hidden_size,
                                            projection_size=projection_size,
                                            input_size=input_size,
                                            prefix='lstm0_l0_')
    lstm_layer.initialize(ctx=ctx)
    lstm_cell.initialize(ctx=ctx)
    layer_params = lstm_layer.collect_params()
    cell_params = lstm_cell.collect_params()
    # Give both implementations identical parameter values.
    for k, v in weights.items():
        layer_params['lstm0_l0_' + k].set_data(v.copy())
        cell_params['lstm0_l0_' + k].set_data(v.copy())
    with autograd.record():
        layer_output = lstm_layer(lstm_input.copy())
        cell_output = lstm_cell.unroll(seq_len, lstm_input.copy(),
                                       layout='TNC',
                                       merge_outputs=True)[0]
    assert_almost_equal(layer_output.asnumpy(), cell_output.asnumpy(),
                        rtol=rtol, atol=atol)
    layer_output.backward()
    cell_output.backward()
    # Gradients must also agree parameter-by-parameter.
    for k, v in weights.items():
        layer_grad = layer_params['lstm0_l0_' + k].grad()
        cell_grad = cell_params['lstm0_l0_' + k].grad()
        print('checking gradient for {}'.format('lstm0_l0_' + k))
        assert_almost_equal(layer_grad.asnumpy(), cell_grad.asnumpy(),
                            rtol=rtol, atol=atol)
    # Smoke tests for other fused-layer configurations.
    check_rnn_layer_forward(gluon.rnn.LSTM(
        10, 2, projection_size=5), mx.nd.ones((8, 3, 20)), ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5,
                                           bidirectional=True),
                            mx.nd.ones((8, 3, 20)),
                            [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))],
                            ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5,
                                           projection_size=5),
                            mx.nd.ones((8, 3, 20)),
                            run_only=True, ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, bidirectional=True,
                                           dropout=0.5, projection_size=5),
                            mx.nd.ones((8, 3, 20)),
                            [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))],
                            run_only=True, ctx=ctx)
def train(self, data, label, batch_size): """ Description : training for LipNet """ # pylint: disable=no-member sum_losses = 0 len_losses = 0 with autograd.record(): losses = [self.loss_fn(self.net(X), Y) for X, Y in zip(data, label)] for loss in losses: sum_losses += mx.nd.array(loss).sum().asscalar() len_losses += len(loss) loss.backward() self.trainer.step(batch_size) return sum_losses, len_losses
def train(epochs, ctx):
    """Train the module-level classifier `net` for `epochs` epochs.

    Uses a kvstore-aware data split for distributed training, logs accuracy
    every `opt.log_interval` batches, validates after each epoch, and saves
    the final parameters. Relies on module-level `net`, `opt`, `dataset`,
    `batch_size`, `get_data_iters`, `test`.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(magnitude=2), ctx=ctx)
    kv = mx.kv.create(opt.kvstore)
    train_data, val_data = get_data_iters(dataset, batch_size,
                                          kv.num_workers, kv.rank)
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': opt.lr,
                             'wd': opt.wd,
                             'momentum': opt.momentum},
                            kvstore=kv)
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    for epoch in range(epochs):
        tic = time.time()
        train_data.reset()
        metric.reset()
        btic = time.time()
        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx,
                                              batch_axis=0)
            label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx,
                                               batch_axis=0)
            outputs = []
            Ls = []
            with ag.record():
                for x, y in zip(data, label):
                    z = net(x)
                    L = loss(z, y)
                    # store the loss and do backward after we have done
                    # forward on all GPUs for better speed on multiple GPUs.
                    Ls.append(L)
                    outputs.append(z)
                for L in Ls:
                    L.backward()
            trainer.step(batch.data[0].shape[0])
            metric.update(label, outputs)
            if opt.log_interval and not (i+1)%opt.log_interval:
                name, acc = metric.get()
                logging.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f'%(
                    epoch, i, batch_size/(time.time()-btic), name, acc))
            btic = time.time()
        name, acc = metric.get()
        logging.info('[Epoch %d] training: %s=%f'%(epoch, name, acc))
        logging.info('[Epoch %d] time cost: %f'%(epoch, time.time()-tic))
        name, val_acc = test(ctx, val_data)
        logging.info('[Epoch %d] validation: %s=%f'%(epoch, name, val_acc))
    net.save_params('image-classifier-%s-%d.params'%(opt.model, epochs))
def train():
    """Train the module-level language model with LR backoff on plateau.

    After each epoch, if validation loss improved, evaluate on test and
    save the parameters; otherwise quarter the learning rate and reload
    the best checkpoint. Relies on module-level `args`, `model`, `loss`,
    `trainer`, `train_data`, `val_data`, `test_data`, `context`, `detach`,
    `eval`.
    """
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros,
                                   batch_size=args.batch_size, ctx=context)
        for i, (data, target) in enumerate(train_data):
            # Transpose to time-major layout expected by the model.
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            # Stop backprop at the batch boundary.
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                L = loss(output, target)
                L.backward()
            grads = [p.grad(context) for p in model.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(grads,
                                         args.clip * args.bptt * args.batch_size)
            trainer.step(args.batch_size)
            total_L += mx.nd.sum(L).asscalar()
            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.bptt / args.batch_size / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0
        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))
        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.collect_params().save(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            # No improvement: quarter the LR and restore the best params.
            args.lr = args.lr*0.25
            trainer._init_optimizer('sgd', {'learning_rate': args.lr,
                                            'momentum': 0, 'wd': 0})
            model.collect_params().load(args.save, context)
def explain_pattern(self, data, out=None, attribution=False):
    """Explain `data` via pattern attribution/explanation.

    Runs the pattern forward pass, swaps the weights for the learned
    patterns (attribution or plain variant), backprops `out` (defaults to
    the network output itself), then restores the original weights.
    """
    X = Mlist(data)
    X.attach_grad()
    with autograd.record():
        y = self.forward_pattern(X)
    # Replace weights with patterns before the backward pass.
    if attribution:
        self.overload_weight_attribution_pattern()
    else:
        self.overload_weight_pattern()
    if out is None:
        out = y
    y.backward(out_grad=out)
    # Restore the original weights.
    self.overload_weight_reset()
    return X.grad
def _get_grad(net, image, class_id=None, conv_layer_name=None,
              image_grad=False):
    """Internal helper usable for either of these (but not both at once):

    1. Record the output and output-gradient of an intermediate
       convolutional layer.
    2. Record the gradients of the image.

    Parameters
    ----------
    image : NDArray
        Preprocessed image to visualize.
    class_id : int
        Category ID this image belongs to. If not provided, the network's
        prediction is used.
    conv_layer_name : str
        Name of the convolutional layer whose output and output's
        gradients need to be captured.
    image_grad : bool
        Whether to capture gradients of the image.
    """
    if image_grad:
        image.attach_grad()
        Conv2D.capture_layer_name = None
        Activation.set_guided_backprop(True)
    else:
        # Tell convviz.Conv2D which layer's output and gradient needs to be
        # recorded
        Conv2D.capture_layer_name = conv_layer_name
        Activation.set_guided_backprop(False)
    # Run the network
    with autograd.record(train_mode=False):
        out = net(image)
    # If user didn't provide a class id, we'll use the class that the
    # network predicted
    # BUG FIX: `is None` instead of `== None` (identity check for None).
    if class_id is None:
        model_output = out.asnumpy()
        class_id = np.argmax(model_output)
    # Create a one-hot target with class_id and backprop with the created
    # target
    one_hot_target = mx.nd.one_hot(mx.nd.array([class_id]), 1000)
    out.backward(one_hot_target, train_mode=False)
    if image_grad:
        return image.grad[0].asnumpy()
    else:
        # Return the recorded convolution output and gradient
        conv_out = Conv2D.conv_output
        return conv_out[0].asnumpy(), conv_out.grad[0].asnumpy()
def train():
    """Train the word-language model (normalized-loss variant).

    Unlike the sibling trainer, the loss is pre-divided by
    `bptt * batch_size`, so gradients are clipped against `args.clip`
    directly and `trainer.step(1)` is used.  Supports exporting the
    symbolic model via `args.export_model`.
    """
    best_val = float("Inf")
    for epoch in range(args.epochs):
        total_L = 0.0
        start_time = time.time()
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=args.batch_size, ctx=context)
        for i, (data, target) in enumerate(train_data):
            # Time-major layout for the RNN; labels flattened to one column.
            data = data.as_in_context(context).T
            target = target.as_in_context(context).T.reshape((-1, 1))
            # Truncated BPTT: stop gradients flowing across batches.
            hidden = detach(hidden)
            with autograd.record():
                output, hidden = model(data, hidden)
                # Here L is a vector of size batch_size * bptt size
                L = loss(output, target)
                # Normalize so step(1) applies a per-token-mean gradient.
                L = L / (args.bptt * args.batch_size)
                L.backward()
            grads = [p.grad(context) for p in model.collect_params().values()]
            gluon.utils.clip_global_norm(grads, args.clip)
            trainer.step(1)
            total_L += mx.nd.sum(L).asscalar()
            if i % args.log_interval == 0 and i > 0:
                cur_L = total_L / args.log_interval
                print('[Epoch %d Batch %d] loss %.2f, ppl %.2f'%(
                    epoch, i, cur_L, math.exp(cur_L)))
                total_L = 0.0
            # Export-only mode: emit the symbol/params after one batch and stop.
            if args.export_model:
                model.export('model')
                return
        # NOTE(review): `eval` shadows the builtin; presumably the module's
        # evaluation helper — confirm.
        val_L = eval(val_data)
        print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
            epoch, time.time()-start_time, val_L, math.exp(val_L)))
        if val_L < best_val:
            best_val = val_L
            test_L = eval(test_data)
            model.save_parameters(args.save)
            print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
        else:
            # Anneal the learning rate when validation stalls.
            args.lr = args.lr*0.25
            trainer.set_learning_rate(args.lr)
def train(self, batch_size=64, num_epoch=10, optimizer='adam', optimizer_params=(('learning_rate', 0.001),), load_checkpoint=False, context='cpu', reconstruction_loss='mse', preprocessing=False, checkpoint_period=5, load_pretrained=False, normalize=False, log_period = 50, kl_loss_weight=1, print_images=False):
    """Train the VAE encoder/decoder pair on data from `self._data_loader`.

    Supports multi-GPU data parallelism ("processing units", PUs),
    periodic checkpointing, optional reconstruction-image dumps, and
    checkpoint/pretrained-weight resumption.

    NOTE(review): `optimizer_params` defaults to a tuple of pairs, but the
    body uses dict operations (`in`, item assignment, `del`) — callers
    presumably pass a dict; confirm.
    """
    preprocessing = False #TODO to be added - create an additional load_vae_data for preprocessing case
    # --- resolve compute contexts ---
    num_pus = 1
    if context == 'gpu':
        num_pus = mx.context.num_gpus()
        if num_pus >= 1:
            if num_pus == 1:
                mx_context = [mx.gpu(0)]
            else:
                mx_context = [mx.gpu(i) for i in range(num_pus)]
        else:
            logging.error("Context argument is '" + context + "'. But no gpu is present in the system.")
            sys.exit(1)
    elif context == 'cpu':
        mx_context = [mx.cpu()]
    else:
        logging.error("Context argument is '" + context + "'. Only 'cpu' and 'gpu are valid arguments'.")
        sys.exit(1)
    # Each PU processes an equal slice of the global batch.
    single_pu_batch_size = int(batch_size / num_pus)

    if print_images:
        try:
            logging.info("Creating 'images' directory...")
            if not os.path.isdir('images'):
                os.mkdir('images')
            else:
                logging.info("'images' directory already exists.")
        except:  # NOTE(review): bare except — best-effort directory creation
            logging.error("Creation of the 'images' directory failed.")

    input_names = [ "data"]
    train_iter, test_iter, data_mean, data_std, _, _ = self._data_loader.load_vae_data(batch_size=batch_size, input_names=input_names)

    # --- translate generic optimizer options into MXNet's naming ---
    if 'weight_decay' in optimizer_params:
        optimizer_params['wd'] = optimizer_params['weight_decay']
        del optimizer_params['weight_decay']
    if 'learning_rate_decay' in optimizer_params:
        min_learning_rate = 1e-08
        if 'learning_rate_minimum' in optimizer_params:
            min_learning_rate = optimizer_params['learning_rate_minimum']
            del optimizer_params['learning_rate_minimum']
        optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            optimizer_params['step_size'], factor=optimizer_params['learning_rate_decay'], stop_factor_lr=min_learning_rate)
        del optimizer_params['step_size']
        del optimizer_params['learning_rate_decay']

    # --- build or restore encoder/decoder networks ---
    begin_epoch = 0
    if load_checkpoint:
        begin_epoch = self._enc_creator.load(mx_context)
        _ = self._dec_creator.load(mx_context)
    elif load_pretrained:
        self._enc_creator.load_pretrained_weights(mx_context)
        self._dec_creator.load_pretrained_weights(mx_context)
    else:
        # Fresh run: wipe any stale model directories.
        if os.path.isdir(self._enc_creator._model_dir_):
            shutil.rmtree(self._enc_creator._model_dir_)
        if os.path.isdir(self._dec_creator._model_dir_):
            shutil.rmtree(self._dec_creator._model_dir_)
    if normalize:
        self._enc_creator.construct(context=mx_context, batch_size=batch_size, data_mean=data_mean, data_std=data_std)
        self._dec_creator.construct(context=mx_context, batch_size=batch_size, data_mean=data_mean, data_std=data_std)
    else:
        self._enc_creator.construct(context=mx_context, batch_size=batch_size)
        self._dec_creator.construct(context=mx_context, batch_size=batch_size)

    encoder_nets = self._enc_creator.networks
    decoder_nets = self._dec_creator.networks
    if len(encoder_nets) > 1:
        logging.error("VAE-components don't support multiple networkmodels yet. Encoder-Networks found: " + str(len(encoder_nets)))
        sys.exit(1)
    elif len(decoder_nets) > 1:
        logging.error("VAE-components don't support multiple networkmodels yet. Decoder-Networks found: " + str(len(decoder_nets)))
        sys.exit(1)

    loss_ctx_list = []
    loss_ctx_list.append(encoder_nets[0].loss_ctx_dict)
    # One Trainer per network that actually owns parameters.
    enc_trainers = [mx.gluon.Trainer(network.collect_params(), optimizer, optimizer_params) for network in encoder_nets.values() if len(network.collect_params().values()) != 0]
    dec_trainers = [mx.gluon.Trainer(network.collect_params(), optimizer, optimizer_params) for network in decoder_nets.values() if len(network.collect_params().values()) != 0]
    loss_function = VAELoss(recon_loss=reconstruction_loss, kl_loss_weight=kl_loss_weight, loss_ctx_list=loss_ctx_list)
    loss_function.hybridize()

    tic = None
    avg_speed = 0
    n = 0
    train_lost_list = []
    test_lost_list = []

    for epoch in range(begin_epoch, begin_epoch + num_epoch):
        global_loss_train = 0.0
        global_reconloss = 0.0
        train_batches = 0
        loss_total = 0
        recon_total = 0
        train_iter.reset()
        for batch_i, batch in enumerate(train_iter):
            with autograd.record():
                indexed_labels = 0
                indexed_data = 0
                # Generated code: constant comparison picks the data stream.
                if "data" == "label":
                    data_ = gluon.utils.split_and_load(batch.label[indexed_labels], ctx_list=mx_context, even_split=False)
                    indexed_labels += 1
                else:
                    data_ = gluon.utils.split_and_load(batch.data[indexed_data], ctx_list=mx_context, even_split=False)
                    indexed_data += 1
                # Per-PU accumulators.
                lossList = []
                loss_param_list = []
                reconstruction_losses = []
                encoding_ = []
                pred_ = []
                for i in range(num_pus):
                    lossList.append([])
                    loss_param_list.append([])
                    reconstruction_losses.append([])
                    encoding_.append([])
                    pred_.append([])
                nd.waitall()
                # Encoder forward pass per PU.
                for i in range(num_pus):
                    feature_vec, loss_params_enc = encoder_nets[0]( data_[i])
                    loss_param_list[i].append(loss_params_enc)
                    encoding_[i] = feature_vec[0]
                nd.waitall()
                # Decoder forward pass per PU.
                for i in range(num_pus):
                    res_ = decoder_nets[0]( encoding_[i])
                    pred_[i] = res_[0][0]
                nd.waitall()
                # ELBO + reconstruction loss per PU.
                for i in range(num_pus):
                    elbo, reconstruction_loss = loss_function(pred_[i], data_[i], loss_param_list[i])
                    lossList[i].append(elbo)
                    reconstruction_losses[i].append(reconstruction_loss)
                losses = [0] * num_pus
                reconLosses = [0] * num_pus
                for i in range(num_pus):
                    for element in lossList[i]:
                        losses[i] = losses[i] + element
                    for r in reconstruction_losses[i]:
                        reconLosses[i] = reconLosses[i] + r
            for loss in losses:
                loss.backward()
                loss_total += loss.sum().asscalar()
                global_loss_train += loss.sum().asscalar()
            for loss in reconLosses:
                recon_total += loss.sum().asscalar()
                global_reconloss += loss.sum().asscalar()
            train_batches += 1
            # Step both halves with the global batch size.
            for trainer in dec_trainers:
                trainer.step(batch_size)
            for trainer in enc_trainers:
                trainer.step(batch_size)
            if tic is None:
                tic = time.time()
            else:
                if batch_i % log_period == 0:
                    try:
                        speed = log_period * batch_size / (time.time() - tic)
                    except ZeroDivisionError:
                        speed = float("inf")
                    loss_avg = loss_total / (batch_size * log_period)
                    recon_avg = recon_total / (batch_size * log_period)
                    loss_total = 0
                    recon_total = 0
                    logging.info("Epoch[%d] Batch[%d] Speed: %.2f samples/sec Average Negative-ELBO Loss: %.5f, Reconstruction Loss: %.5f" % (
                        epoch, batch_i, speed, loss_avg, recon_avg))
                    avg_speed += speed
                    n += 1
                    tic = time.time()
        global_loss_train /= (train_batches * batch_size)
        global_reconloss /= (train_batches * batch_size)
        tic = None

        # --- validation pass (no autograd, per-PU batch size) ---
        global_loss_test = 0.0
        test_batches = 0
        test_iter.batch_size = single_pu_batch_size
        test_iter.reset()
        for batch_i, batch in enumerate(test_iter):
            indexed_labels = 0
            indexed_data = 0
            if "data" == "label":
                data_ = gluon.utils.split_and_load(batch.label[indexed_labels], ctx_list=mx_context, even_split=False)
                indexed_labels += 1
            else:
                data_ = gluon.utils.split_and_load(batch.data[indexed_data], ctx_list=mx_context, even_split=False)
                indexed_data += 1
            lossList = []
            loss_param_list = []
            encoding_ = []
            pred_ = []
            for i in range(num_pus):
                lossList.append([])
                loss_param_list.append([])
                encoding_.append([])
                pred_.append([])
            nd.waitall()
            for i in range(num_pus):
                feature_vec, loss_params_enc = encoder_nets[0]( data_[i])
                loss_param_list[i].append(loss_params_enc)
                encoding_[i] = feature_vec[0]
            nd.waitall()
            for i in range(num_pus):
                res_ = decoder_nets[0]( encoding_[i])
                pred_[i] = res_[0][0]
            nd.waitall()
            for i in range(num_pus):
                elbo, reconstruction_loss = loss_function(pred_[i], data_[i], loss_param_list[i])
                lossList[i].append(elbo)
            losses = [0] * num_pus
            for i in range(num_pus):
                for element in lossList[i]:
                    losses[i] = losses[i] + element
            for loss in losses:
                global_loss_test += loss.sum().asscalar()
            test_batches += 1
        global_loss_test /= (test_batches * single_pu_batch_size)
        logging.info("Epoch[%d], Epoch Train Loss: %f, Epoch Reconstruction Loss: %f, Validation Loss: %f" % (
            epoch, global_loss_train, global_reconloss, global_loss_test))

        # --- periodic checkpoint ---
        if (epoch+1) % checkpoint_period == 0:
            for i, network in encoder_nets.items():
                if network.save_specific_params_list:
                    for name, param_dic in network.save_specific_params_list:
                        param_dic.save(self.encoder_parameter_path(i) + '-' + name + '.params')
                network.save_parameters(self.encoder_parameter_path(i) + '-' + str(epoch).zfill(4) + '.params')
            for i, network in decoder_nets.items():
                if network.save_specific_params_list:
                    for name, param_dic in network.save_specific_params_list:
                        param_dic.save(self.decoder_parameter_path(i) + '-' + name + '.params')
                network.save_parameters(self.decoder_parameter_path(i) + '-' + str(epoch).zfill(4) + '.params')

        if print_images:
            train_lost_list.append(global_loss_train)
            test_lost_list.append(global_loss_test)
            try:
                #Reconstructions
                filename = 'test_reconstruction_%06d%06d.png' % (epoch, batch_i)
                fig = plt.figure()
                ax = fig.add_subplot(1, 2, 1)
                plt.imshow(data_[0][0].squeeze(0).asnumpy())
                ax.set_title('Original')
                ax = fig.add_subplot(1, 2, 2)
                plt.imshow(pred_[0][0].squeeze(0).asnumpy())
                ax.set_title('Reconstruction')
                plt.tight_layout()
                plt.savefig('images/' + filename)
                plt.close()
            except:  # NOTE(review): bare except — plotting is best-effort
                logging.info("Could not print reconstruction images.")

    if print_images:
        if num_epoch != 1:
            try:
                #Loss plot
                batch_x = np.linspace(1, num_epoch, len(train_lost_list))
                filename = 'loss_graph.png'
                plt.plot(batch_x, np.array(train_lost_list))
                plt.plot(batch_x, np.array(test_lost_list))
                plt.legend(['Train loss', 'Validation Loss'])
                plt.savefig('images/' + filename)
            except:  # NOTE(review): bare except — plotting is best-effort
                logging.info("Could not print loss plot image.")

    # --- final save + export of both halves (and the hybridized loss) ---
    for i, network in encoder_nets.items():
        if network.save_specific_params_list:
            for name, param_dic in network.save_specific_params_list:
                param_dic.save(self.encoder_parameter_path(i) + '-' + name + '.params')
        network.save_parameters(self.encoder_parameter_path(i) + '-' + str((num_epoch-1) + begin_epoch).zfill(4) + '.params')
        network.export(self.encoder_parameter_path(i) + '_newest', epoch=0)
        loss_function.export(self.encoder_parameter_path(i) + '_newest_loss', epoch=0)
    for i, network in decoder_nets.items():
        if network.save_specific_params_list:
            for name, param_dic in network.save_specific_params_list:
                param_dic.save(self.decoder_parameter_path(i) + '-' + name + '.params')
        network.save_parameters(self.decoder_parameter_path(i) + '-' + str((num_epoch-1) + begin_epoch).zfill(4) + '.params')
        network.export(self.decoder_parameter_path(i) + '_newest', epoch=0)
        loss_function.export(self.decoder_parameter_path(i) + '_newest_loss', epoch=0)
def train_mnist(epochs, input_shape, n_class, num_routing, recon_loss_weight, ctx=mx.gpu(0), log_interval=20, **kwargs):
    """Train a CapsNet on MNIST, logging curves and reconstructions to visdom.

    Parameters
    ----------
    epochs : int
        Number of training epochs.
    input_shape : tuple
        (batch_size, C, H, W) of the input batch.
    n_class : int
        Number of output classes (10 for MNIST).
    num_routing : int
        Dynamic-routing iterations of the capsule layer.
    recon_loss_weight : float
        Mix weight between the margin loss and the reconstruction (mask) loss.

    Fixes vs. original: Python-2 `print` statements converted to the print
    function, and `60000 / batch_size` changed to floor division so the
    batches-per-epoch count stays an integer under Python 3 (matching the
    original Python-2 integer division).
    """
    batch_size, C, H, W = input_shape
    capsnet = CapsNet(n_class, num_routing, input_shape)
    capsnet.initialize(init=mx.init.Xavier(), ctx=ctx)
    capsnet.hybridize()
    train_iter = mx.io.MNISTIter(image="data/train-images.idx3-ubyte",
                                 label="data/train-labels.idx1-ubyte",
                                 batch_size=batch_size, shuffle=True)
    val_iter = mx.io.MNISTIter(image="data/t10k-images.idx3-ubyte",
                               label="data/t10k-labels.idx1-ubyte",
                               batch_size=batch_size, shuffle=False)
    # A fixed validation batch used to render reconstructions every log step.
    draw_num = 32
    draw_batch = val_iter.next()
    draw_data = draw_batch.data[0].as_in_context(ctx)
    draw_label = draw_batch.label[0].as_in_context(ctx)
    draw_label = mx.nd.one_hot(draw_label, n_class)
    learning_rate = 0.001
    lr_scheduler = SimpleLRScheduler(learning_rate)
    decay = 0.9
    trainer = gluon.Trainer(capsnet.collect_params(), optimizer='adam',
                            optimizer_params={'lr_scheduler': lr_scheduler})
    train_plt = viz.line(Y=np.zeros((1, 3)), X=np.zeros((1, 3)), opts=dict(
        xlabel='Batch', ylabel='Loss and Acc', title='CapsNet traning plot',
        legend=['Accuracy', 'Digit Loss', 'Mask Loss']))
    val_plt = viz.line(Y=np.zeros((1, 3)), X=np.zeros((1, 3)), opts=dict(
        xlabel='Epoch', ylabel='Loss and Acc', title='CapsNet validation plot',
        legend=['Accuracy', 'Digit Loss', 'Mask Loss']))
    mask_plt = viz.images(np.random.randn(draw_num * 2, 1, 28, 28),
                          opts=dict(title='Mask images', caption='Mask'))
    hist_acc = 0
    loss_metric = LossMetric(batch_size, 1)
    val_metric = LossMetric(batch_size, 1)
    # Floor division: keep this an int (was true division under Python 3).
    batches_one_epoch = 60000 // batch_size
    for epoch in range(epochs):
        train_iter.reset()
        val_iter.reset()
        loss_metric.reset()
        for i, batch in enumerate(train_iter):
            tic = time.time()
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            y_ori = y
            y = mx.nd.one_hot(y, n_class)
            with autograd.record():
                out_caps, out_mask = capsnet(x, y)
                margin_loss_ = margin_loss(mx.nd, y, out_caps)
                mask_loss_ = mask_mse_loss(mx.nd, x, out_mask)
                # Blend classification (margin) and reconstruction (mask) losses.
                loss = (1 - recon_loss_weight) * margin_loss_ + recon_loss_weight * mask_loss_
            loss.backward()
            trainer.step(batch_size)
            loss_metric.update([y_ori], [out_caps, loss, mask_loss_])
            if i % log_interval == 0:
                acc, digit_loss, mask_loss = loss_metric.get_name_value()
                viz.line(Y=np.array([acc, digit_loss, mask_loss]).reshape((1, 3)),
                         X=np.ones((1, 3)) * batches_one_epoch * epoch + i,
                         win=train_plt, update='append')
                take_num = min(draw_num, batch_size)
                pred_label, pred_mask = capsnet(draw_data, draw_label)
                draw = np.concatenate([
                    draw_data[:take_num].asnumpy(),
                    pred_mask[:take_num].asnumpy()
                ])
                viz.images(draw, win=mask_plt)
                elasp = time.time() - tic
                print('Epoch %2d, train %s %.5f, time %.1f sec, %d samples/s' % (
                    epoch, "acc", acc, elasp, int(batch_size / elasp)))
        # Exponential LR decay per epoch.
        lr_scheduler.learning_rate = learning_rate * (decay**(epoch + 1))
        val_metric.reset()
        for i, batch in enumerate(val_iter):
            x = batch.data[0].as_in_context(ctx)
            y = batch.label[0].as_in_context(ctx)
            y_ori = y
            y = mx.nd.one_hot(y, n_class)
            out_caps, out_mask = capsnet(x, y)
            margin_loss_ = margin_loss(mx.nd, y, out_caps)
            mask_loss_ = mask_mse_loss(mx.nd, x, out_mask)
            loss = (1 - recon_loss_weight) * margin_loss_ + recon_loss_weight * mask_loss_
            val_metric.update([y_ori], [out_caps, loss, mask_loss_])
        acc, digit_loss, mask_loss = val_metric.get_name_value()
        viz.line(Y=np.array([acc, digit_loss, mask_loss]).reshape((1, 3)),
                 X=np.ones((1, 3)) * epoch, win=val_plt, update='append')
        # Checkpoint whenever validation accuracy improves.
        if acc > hist_acc:
            hist_acc = acc
            capsnet.save_params("model/capsnet_%f.params" % acc)
        print('Epoch %2d, validation %s %.5f' % (epoch, "acc", acc))
def train(ctx):
    """Train the action-recognition network on `ctx` (a Context or list).

    Uses module-level `opt`, `net`, `optimizer`, `optimizer_params`, `kv`,
    `train_data`, `val_data`, `batch_fn`, `train_metric`, `sw` (summary
    writer), `logger`, `batch_size` and `model_name`.

    Fixes vs. original:
    * `opt.resume_states is not ''` compared identity against a string
      literal (always True on CPython by accident at best); now `!= ''`.
    * With `opt.accumulate > 1` the original still called `trainer.step`
      on every iteration (the else-branch of the boundary check), which,
      combined with `grad_req='add'`, applied stale accumulated gradients;
      now the trainer only steps at accumulation boundaries.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]

    if opt.no_wd:
        # Disable weight decay on batchnorm gamma/beta and all biases.
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if opt.partial_bn:
        train_patterns = None
        if 'inceptionv3' in opt.model:
            train_patterns = '.*weight|.*bias|inception30_batchnorm0_gamma|inception30_batchnorm0_beta|inception30_batchnorm0_running_mean|inception30_batchnorm0_running_var'
        else:
            logger.info('Current model does not support partial batch normalization.')
        if opt.kvstore is not None:
            trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params, kvstore=kv, update_on_kvstore=False)
        else:
            trainer = gluon.Trainer(net.collect_params(train_patterns), optimizer, optimizer_params, update_on_kvstore=False)
    else:
        if opt.kvstore is not None:
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params, kvstore=kv, update_on_kvstore=False)
        else:
            trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params, update_on_kvstore=False)

    if opt.accumulate > 1:
        # Accumulate gradients across iterations instead of overwriting.
        params = [p for p in net.collect_params().values() if p.grad_req != 'null']
        for p in params:
            p.grad_req = 'add'

    if opt.resume_states != '':
        trainer.load_states(opt.resume_states)

    if opt.use_amp:
        amp.init_trainer(trainer)

    L = gluon.loss.SoftmaxCrossEntropyLoss()
    best_val_score = 0
    lr_decay_count = 0

    for epoch in range(opt.resume_epoch, opt.num_epochs):
        tic = time.time()
        train_metric.reset()
        btic = time.time()
        num_train_iter = len(train_data)
        train_loss_epoch = 0
        train_loss_iter = 0

        for i, batch in enumerate(train_data):
            data, label = batch_fn(batch, ctx)

            with ag.record():
                outputs = []
                for _, X in enumerate(data):
                    # Reshape special values: each -3 merges two consecutive
                    # axes, -2 keeps the remainder — collapses the clip/frame
                    # axes into the batch axis for the 2D backbone.
                    X = X.reshape((-3, -3, -2))
                    pred = net(X.astype(opt.dtype, copy=False))
                    outputs.append(pred)
                loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]

                if opt.use_amp:
                    with amp.scale_loss(loss, trainer) as scaled_loss:
                        ag.backward(scaled_loss)
                else:
                    ag.backward(loss)

            if opt.accumulate > 1:
                # Step only at accumulation boundaries; scale the step by the
                # number of accumulated mini-batches, then clear gradients.
                if (i + 1) % opt.accumulate == 0:
                    if opt.kvstore is not None:
                        trainer.step(batch_size * kv.num_workers * opt.accumulate)
                    else:
                        trainer.step(batch_size * opt.accumulate)
                    net.collect_params().zero_grad()
            else:
                if opt.kvstore is not None:
                    trainer.step(batch_size * kv.num_workers)
                else:
                    trainer.step(batch_size)

            train_metric.update(label, outputs)
            train_loss_iter = sum([l.mean().asscalar() for l in loss]) / len(loss)
            train_loss_epoch += train_loss_iter

            train_metric_name, train_metric_score = train_metric.get()
            sw.add_scalar(tag='train_acc_top1_iter', value=train_metric_score*100, global_step=epoch * num_train_iter + i)
            sw.add_scalar(tag='train_loss_iter', value=train_loss_iter, global_step=epoch * num_train_iter + i)
            sw.add_scalar(tag='learning_rate_iter', value=trainer.learning_rate, global_step=epoch * num_train_iter + i)

            if opt.log_interval and not (i+1) % opt.log_interval:
                logger.info('Epoch[%03d] Batch [%04d]/[%04d]\tSpeed: %f samples/sec\t %s=%f\t loss=%f\t lr=%f' % (
                    epoch, i, num_train_iter, batch_size*opt.log_interval/(time.time()-btic),
                    train_metric_name, train_metric_score*100, train_loss_epoch/(i+1), trainer.learning_rate))
                btic = time.time()

        train_metric_name, train_metric_score = train_metric.get()
        throughput = int(batch_size * i /(time.time() - tic))
        mx.ndarray.waitall()

        # Dummy kvstore keys used for cross-worker synchronization in test().
        if opt.kvstore is not None and epoch == opt.resume_epoch:
            kv.init(111111, nd.zeros(1))
            kv.init(555555, nd.zeros(1))
            kv.init(999999, nd.zeros(1))

        if opt.kvstore is not None:
            acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data, kv)
        else:
            acc_top1_val, acc_top5_val, loss_val = test(ctx, val_data)

        logger.info('[Epoch %03d] training: %s=%f\t loss=%f' % (epoch, train_metric_name, train_metric_score*100, train_loss_epoch/num_train_iter))
        logger.info('[Epoch %03d] speed: %d samples/sec\ttime cost: %f' % (epoch, throughput, time.time()-tic))
        logger.info('[Epoch %03d] validation: acc-top1=%f acc-top5=%f loss=%f' % (epoch, acc_top1_val*100, acc_top5_val*100, loss_val))

        sw.add_scalar(tag='train_loss_epoch', value=train_loss_epoch/num_train_iter, global_step=epoch)
        sw.add_scalar(tag='val_loss_epoch', value=loss_val, global_step=epoch)
        sw.add_scalar(tag='val_acc_top1_epoch', value=acc_top1_val*100, global_step=epoch)

        if acc_top1_val > best_val_score:
            best_val_score = acc_top1_val
            net.save_parameters('%s/%.4f-%s-%s-%03d-best.params'%(opt.save_dir, best_val_score, opt.dataset, model_name, epoch))
            trainer.save_states('%s/%.4f-%s-%s-%03d-best.states'%(opt.save_dir, best_val_score, opt.dataset, model_name, epoch))
        else:
            if opt.save_frequency and opt.save_dir and (epoch + 1) % opt.save_frequency == 0:
                net.save_parameters('%s/%s-%s-%03d.params'%(opt.save_dir, opt.dataset, model_name, epoch))
                trainer.save_states('%s/%s-%s-%03d.states'%(opt.save_dir, opt.dataset, model_name, epoch))

    # save the last model
    net.save_parameters('%s/%s-%s-%03d.params'%(opt.save_dir, opt.dataset, model_name, opt.num_epochs-1))
    trainer.save_states('%s/%s-%s-%03d.states'%(opt.save_dir, opt.dataset, model_name, opt.num_epochs-1))
def relu(X):
    """Element-wise rectified linear unit: max(X, 0)."""
    return nd.maximum(X, 0)


def net(X):
    """Two-layer MLP over module-level params w1/b1/w2/b2.

    Flattens each sample to `num_input` features; returns raw logits
    (softmax is folded into the loss).
    """
    X = X.reshape((-1, num_input))
    h1 = relu(nd.dot(X, w1) + b1)
    output = nd.dot(h1, w2) + b2
    return output


loss = gloss.SoftmaxCrossEntropyLoss()

epochs = 5
learning_rate = 0.01
for i in range(epochs):
    # FIX: reset the accumulators each epoch — the original initialized them
    # once before the loop, so printed "per-epoch" averages kept growing
    # across epochs.
    train_loss = 0.
    train_acc = 0.
    for X, Y in train_iter:
        with ag.record():
            Y_hat = net(X)
            l = loss(Y_hat, Y)
        l.backward()
        # Divide lr by batch_size because the loss gradient is summed over
        # the batch.
        utils.SGD(params, learning_rate / batch_size)
        train_loss += nd.mean(l).asscalar()
        train_acc += utils.accuracy(Y_hat, Y)
    test_acc = utils.evaluate_accuracy(test_iter, net)
    # FIX: Python-2 print statement converted to the print function.
    print("Epoch %d train_loss %f train_acc %f test_acc %f" % (
        i, train_loss / len(train_iter), train_acc / len(train_iter), test_acc))
def train():
    """Fine-tune the task model on `data/train_valid/<task>/{train,val}`.

    Builds the data loaders and SGD trainer from module-level config
    (`task`, `batch_size`, `lr`, `momentum`, `wd`, `lr_steps`, `lr_factor`,
    `epochs`, `ctx`, transforms), trains with per-epoch accuracy/mAP
    logging, and saves the final parameters under `CKPT_PATH`.

    Returns
    -------
    (net, saved_path) : tuple
        The fine-tuned network and the path its parameters were saved to.

    Fix vs. original: the `lr_steps[lr_counter]` lookup is now guarded so
    it no longer raises IndexError once every scheduled step has fired.
    """
    logging.info('Start Training for Task: %s\n' % (task))

    finetune_net = build_model()

    # Define DataLoader
    train_data = gluon.data.DataLoader(
        gluon.data.vision.ImageFolderDataset(
            os.path.join('data/train_valid', task, 'train'),
            transform=transform_train),
        batch_size=batch_size, shuffle=True, num_workers=num_workers, last_batch='discard')
    val_data = gluon.data.DataLoader(
        gluon.data.vision.ImageFolderDataset(
            os.path.join('data/train_valid', task, 'val'),
            transform=transform_val),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Define Trainer
    trainer = gluon.Trainer(finetune_net.collect_params(), 'sgd', {
        'learning_rate': lr, 'momentum': momentum, 'wd': wd})
    metric = mx.metric.Accuracy()
    L = gluon.loss.SoftmaxCrossEntropyLoss()
    lr_counter = 0
    num_batch = len(train_data)

    # Start Training
    for epoch in range(epochs):
        # Guarded step-decay schedule (original indexed unconditionally).
        if lr_counter < len(lr_steps) and epoch == lr_steps[lr_counter]:
            trainer.set_learning_rate(trainer.learning_rate*lr_factor)
            lr_counter += 1

        tic = time.time()
        train_loss = 0
        metric.reset()
        AP = 0.
        AP_cnt = 0

        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
            with ag.record():
                outputs = [finetune_net(X) for X in data]
                loss = [L(yhat, y) for yhat, y in zip(outputs, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss)
            metric.update(label, outputs)
            ap, cnt = calculate_ap(label, outputs)
            AP += ap
            AP_cnt += cnt
            progressbar(i, num_batch-1)

        train_map = AP / AP_cnt
        _, train_acc = metric.get()
        train_loss /= num_batch
        val_acc, val_map, val_loss = validate(finetune_net, val_data, ctx)

        logging.info('[Epoch %d] Train-acc: %.3f, mAP: %.3f, loss: %.3f | Val-acc: %.3f, mAP: %.3f, loss: %.3f | time: %.1fs' %
                     (epoch, train_acc, train_map, train_loss, val_acc, val_map, val_loss, time.time() - tic))

    saved_path = os.path.join(CKPT_PATH, '%s-%s-epoch-%d.params' % (
        task, time.strftime("%Y-%m-%d-%H-%M", time.localtime(time.time())), epoch))
    finetune_net.save_params(saved_path)
    logging.info('\nsave results at %s' % saved_path)
    return (finetune_net, saved_path)
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Train `net` with part of its backbone frozen, logging loss/metric and
    periodically validating and checkpointing via `save_params`.

    Fix vs. original: two log messages embedded `%`-style specifiers
    (`CELoss=%.3f`) inside `str.format` templates — `.format` leaves them
    untouched, so the literal text "CELoss=%.3f" was printed and the loss
    values were silently dropped as surplus arguments.  The templates now
    use `{}` placeholders for every value.

    NOTE(review): the start banner logs `args.start_epoch` but the epoch
    loop runs `range(args.epochs)` from 0 — confirm whether resuming should
    use `range(args.start_epoch, args.epochs)`.
    """
    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)

    # Freeze the parameters matching the network's fixed-params pattern.
    fix_pattern = get_fix_params_pattern(args.network)
    param_dict = net.collect_params(fix_pattern)
    for _, param in param_dict.items():
        param.grad_req = 'null'
    logger.info('Fixed such params for net:\n%s' % param_dict)

    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': args.lr,
        'wd': args.wd,
        'momentum': args.momentum
    })

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_metric = [0]
    loss = gluon.loss.SoftmaxCrossEntropyLoss()
    loss_metric = mx.metric.Loss('CELoss')
    num_batch = len(train_data)

    # Start Training
    for epoch in range(args.epochs):
        # Consume every decay step whose epoch threshold has passed.
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))

        tic = time.time()
        btic = time.time()
        loss_metric.reset()
        eval_metric.reset()

        for i, batch in enumerate(train_data):
            data_list = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
            label_list = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
            output_list = []
            loss_list = []
            with autograd.record():
                for data, label in zip(data_list, label_list):
                    output = net(data)
                    output_list.append(output)
                    loss_list.append(loss(output, label))
            autograd.backward(loss_list)
            trainer.step(args.batch_size)
            loss_metric.update(None, loss_list)
            eval_metric.update(label_list, output_list)
            if args.log_interval and not (i + 1) % args.log_interval:
                _, train_loss = loss_metric.get()
                metric_name, metric_value = eval_metric.get()
                speed = args.log_interval * args.batch_size / (time.time() - btic)
                # Fixed format string: all six values now have placeholders.
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, CELoss={:.3f}, {}={}'
                    .format(epoch, i, speed, train_loss, metric_name, metric_value))
                btic = time.time()

        _, train_loss = loss_metric.get()
        metric_name, metric_value = eval_metric.get()
        if not isinstance(metric_value, (list, tuple)):
            metric_name = [metric_name]
            metric_value = [metric_value]
        metric_msg = '\n'.join(
            ['{}={}'.format(k, v) for k, v in zip(metric_name, metric_value)])
        # Fixed format string: CELoss now has a real placeholder.
        logger.info(
            '[Epoch {}] Training cost: {:.3f}, CELoss={:.3f}, \n{}'.format(
                epoch, (time.time() - tic), train_loss, metric_msg))

        if not (epoch + 1) % args.val_interval:
            metric_name, metric_value = validate(net, val_data, ctx,
                                                 eval_metric)
            if not isinstance(metric_value, (list, tuple)):
                metric_name = [metric_name]
                metric_value = [metric_value]
            metric_msg = '\n'.join([
                '{}={}'.format(k, v)
                for k, v in zip(metric_name, metric_value)
            ])
            logger.info('[Epoch {}] Validation: \n{}'.format(
                epoch, metric_msg))
            current_metric = metric_value[-1]
        else:
            current_metric = 0.
        save_params(net, logger, best_metric, current_metric, epoch,
                    args.save_interval, args.save_prefix)
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline for an SSD detector.

    Supports single/multi-GPU via split_and_load, Horovod distributed
    training (`args.horovod`), mixed precision (`args.amp`), and a DALI
    data pipeline (`args.dali`).  Validates every `args.val_interval`
    epochs and checkpoints via the module-level `save_params`.
    """
    net.collect_params().reset_ctx(ctx)
    if args.horovod:
        # All workers start from rank 0's weights.
        hvd.broadcast_parameters(net.collect_params(), root_rank=0)
        trainer = hvd.DistributedTrainer(net.collect_params(), 'sgd', {
            'learning_rate': args.lr,
            'wd': args.wd,
            'momentum': args.momentum
        })
    else:
        trainer = gluon.Trainer(
            net.collect_params(), 'sgd', {
                'learning_rate': args.lr,
                'wd': args.wd,
                'momentum': args.momentum
            },
            # AMP's loss scaling requires local updates.
            update_on_kvstore=(False if args.amp else None))

    if args.amp:
        amp.init_trainer(trainer)

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])

    mbox_loss = gcv.loss.SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        # Consume every decay step whose epoch threshold has passed.
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True, static_shape=True)
        for i, batch in enumerate(train_data):
            if args.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [
                    nd.cast(d.label[1], dtype='float32') for d in batch
                ]
            else:
                data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
                cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
                box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if args.amp:
                    with amp.scale_loss(sum_loss, trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            trainer.step(1)
            # Only rank 0 logs/tracks metrics under Horovod.
            if (not args.horovod or hvd.rank() == 0):
                local_batch_size = int(args.batch_size // (hvd.size() if args.horovod else 1))
                ce_metric.update(0, [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(
                    0, [l * local_batch_size for l in box_loss])
                if args.log_interval and not (i + 1) % args.log_interval:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    logger.info(
                        '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i, args.batch_size / (time.time() - btic),
                                name1, loss1, name2, loss2))
                btic = time.time()
        if (not args.horovod or hvd.rank() == 0):
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}'.
                format(epoch, (time.time() - tic), name1, loss1, name2, loss2))
            if (epoch % args.val_interval == 0) or (args.save_interval and epoch % args.save_interval == 0):
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                val_msg = '\n'.join(
                    ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
                logger.info('[Epoch {}] Validation: \n{}'.format(
                    epoch, val_msg))
                current_map = float(mean_ap[-1])
            else:
                current_map = 0.
            save_params(net, best_map, current_map, epoch, args.save_interval,
                        args.save_prefix)
def test_lstmp():
    """Check that the fused LSTM layer with projection agrees with an
    unrolled LSTMPCell on both outputs and parameter gradients, then
    smoke-test larger stacked/bidirectional/dropout configurations and a
    save/load round trip.
    """
    num_hidden, num_proj = 3, 2
    tolerance = 1e-2
    n_batch, n_steps = 7, 11
    n_input = 5
    ctx = mx.gpu(0)
    inputs = mx.nd.uniform(shape=(n_steps, n_batch, n_input), ctx=ctx)
    # Shared random parameter values assigned to both implementations.
    param_shapes = {
        'i2h_weight': (num_hidden * 4, n_input),
        'h2h_weight': (num_hidden * 4, num_proj),
        'i2h_bias': (num_hidden * 4, ),
        'h2h_bias': (num_hidden * 4, ),
        'h2r_weight': (num_proj, num_hidden),
    }
    weights = {name: rand_ndarray(shape) for name, shape in param_shapes.items()}
    fused = gluon.rnn.LSTM(num_hidden,
                           projection_size=num_proj,
                           input_size=n_input)
    cell = gluon.rnn.LSTMPCell(hidden_size=num_hidden,
                               projection_size=num_proj,
                               input_size=n_input)
    fused.initialize(ctx=ctx)
    cell.initialize(ctx=ctx)
    fused_params = fused.collect_params()
    cell_params = cell.collect_params()
    # Fused-layer parameters carry an 'l0_' (layer 0) prefix.
    for name, value in weights.items():
        fused_params['l0_' + name].set_data(value.copy())
        cell_params[name].set_data(value.copy())
    with autograd.record():
        fused_out = fused(inputs.copy())
        cell_out = cell.unroll(n_steps,
                               inputs.copy(),
                               layout='TNC',
                               merge_outputs=True)[0]
    assert_almost_equal(fused_out, cell_out, rtol=tolerance, atol=tolerance)
    fused_out.backward()
    cell_out.backward()
    for name in weights:
        fused_grad = fused_params['l0_' + name].grad()
        cell_grad = cell_params[name].grad()
        print('checking gradient for {}'.format('lstm0_l0_' + name))
        assert_almost_equal(fused_grad, cell_grad,
                            rtol=tolerance, atol=tolerance)
    # Forward smoke tests on bigger variants (stacked, bidirectional, dropout).
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, projection_size=5),
                            mx.nd.ones((8, 3, 20)),
                            ctx=ctx)
    check_rnn_layer_forward(
        gluon.rnn.LSTM(10, 2, projection_size=5, bidirectional=True),
        mx.nd.ones((8, 3, 20)),
        [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))],
        ctx=ctx)
    check_rnn_layer_forward(gluon.rnn.LSTM(10, 2, dropout=0.5,
                                           projection_size=5),
                            mx.nd.ones((8, 3, 20)),
                            run_only=True,
                            ctx=ctx)
    check_rnn_layer_forward(
        gluon.rnn.LSTM(10, 2, bidirectional=True, dropout=0.5,
                       projection_size=5),
        mx.nd.ones((8, 3, 20)),
        [mx.nd.ones((4, 3, 5)), mx.nd.ones((4, 3, 10))],
        run_only=True,
        ctx=ctx)
    # Round-trip parameter serialization.
    fused.save_parameters('gpu_tmp.params')
    fused.load_parameters('gpu_tmp.params')
def _train_loop(self, train_data, val_data, train_eval_data, time_limit=math.inf):
    """Core SSD fit loop with wall-clock budgeting and early stopping.

    Trains ``self.net`` from ``max(cfg.train.start_epoch, self.epoch)`` to
    ``cfg.train.epochs``, validating every ``cfg.valid.val_interval`` epochs
    and checkpointing whenever validation mAP improves.  Accumulates elapsed
    time into ``self._time_elapsed`` and returns early once ``time_limit``
    is exceeded.

    Parameters
    ----------
    train_data / val_data / train_eval_data : batch iterators for training,
        validation, and final training-set evaluation respectively.
    time_limit : wall-clock budget in seconds (default: unbounded).

    Returns
    -------
    dict with 'train_map', 'valid_map', 'time' and 'checkpoint' keys.
    """
    start_tic = time.time()
    # fix seed for mxnet, numpy and python builtin random generator.
    gutils.random.seed(self._cfg.train.seed)
    # loss and metric
    mbox_loss = SSDMultiBoxLoss()
    ce_metric = mx.metric.Loss('CrossEntropy')
    smoothl1_metric = mx.metric.Loss('SmoothL1')
    # lr decay policy
    lr_decay = float(self._cfg.train.lr_decay)
    lr_steps = sorted([float(ls) for ls in self._cfg.train.lr_decay_epoch])
    self._logger.info('Start training from [Epoch %d]',
                      max(self._cfg.train.start_epoch, self.epoch))
    self.net.collect_params().reset_ctx(self.ctx)
    early_stopper = EarlyStopperOnPlateau(
        patience=self._cfg.train.early_stop_patience,
        min_delta=self._cfg.train.early_stop_min_delta,
        baseline_value=self._cfg.train.early_stop_baseline,
        max_value=self._cfg.train.early_stop_max_value)
    mean_ap = [-1]  # sentinel until the first evaluation runs
    cp_name = ''
    self._time_elapsed += time.time() - start_tic
    for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                            self._cfg.train.epochs):
        epoch = self.epoch
        tic = time.time()
        last_tic = time.time()
        if self._best_map >= 1.0:
            self._logger.info(
                '[Epoch {}] Early stopping as mAP is reaching 1.0'.format(
                    epoch))
            break
        should_stop, stop_message = early_stopper.get_early_stop_advice()
        if should_stop:
            self._logger.info('[Epoch {}] '.format(epoch) + stop_message)
            break
        # catch up on any lr-decay steps already passed (e.g. when resuming)
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = self.trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            self.trainer.set_learning_rate(new_lr)
            self._logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        ce_metric.reset()
        smoothl1_metric.reset()
        self.net.hybridize(static_alloc=True, static_shape=True)
        for i, batch in enumerate(train_data):
            btic = time.time()
            # budget check happens per batch so an over-long epoch cannot
            # blow far past the limit
            if self._time_elapsed > time_limit:
                self._logger.warning(
                    f'`time_limit={time_limit}` reached, exit early...')
                return {
                    'train_map': float(mean_ap[-1]),
                    'valid_map': self._best_map,
                    'time': self._time_elapsed,
                    'checkpoint': cp_name
                }
            if self._cfg.train.dali:
                # dali iterator returns a mxnet.io.DataBatch
                data = [d.data[0] for d in batch]
                box_targets = [d.label[0] for d in batch]
                cls_targets = [
                    nd.cast(d.label[1], dtype='float32') for d in batch
                ]
            else:
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=self.ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                cls_targets = gluon.utils.split_and_load(batch[1],
                                                         ctx_list=self.ctx,
                                                         batch_axis=0,
                                                         even_split=False)
                box_targets = gluon.utils.split_and_load(batch[2],
                                                         ctx_list=self.ctx,
                                                         batch_axis=0,
                                                         even_split=False)
            with autograd.record():
                cls_preds = []
                box_preds = []
                for x in data:
                    cls_pred, box_pred, _ = self.net(x)
                    cls_preds.append(cls_pred)
                    box_preds.append(box_pred)
                sum_loss, cls_loss, box_loss = mbox_loss(
                    cls_preds, box_preds, cls_targets, box_targets)
                if self._cfg.ssd.amp:
                    with amp.scale_loss(sum_loss,
                                        self.trainer) as scaled_loss:
                        autograd.backward(scaled_loss)
                else:
                    autograd.backward(sum_loss)
            # since we have already normalized the loss, we don't want to normalize
            # by batch-size anymore
            self.trainer.step(1)
            if not self._cfg.horovod or hvd.rank() == 0:
                # metric bookkeeping and logging only on rank 0
                local_batch_size = int(
                    self._cfg.train.batch_size //
                    (hvd.size() if self._cfg.horovod else 1))
                ce_metric.update(0,
                                 [l * local_batch_size for l in cls_loss])
                smoothl1_metric.update(
                    0, [l * local_batch_size for l in box_loss])
                if self._cfg.train.log_interval and not (
                        i + 1) % self._cfg.train.log_interval:
                    name1, loss1 = ce_metric.get()
                    name2, loss2 = smoothl1_metric.get()
                    self._logger.info(
                        '[Epoch %d][Batch %d], Speed: %f samples/sec, %s=%f, %s=%f',
                        epoch, i, self._cfg.train.batch_size /
                        (time.time() - last_tic), name1, loss1, name2, loss2)
                    last_tic = time.time()
            self._time_elapsed += time.time() - btic
        post_tic = time.time()
        if not self._cfg.horovod or hvd.rank() == 0:
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            self._logger.info('[Epoch %d] Training cost: %f, %s=%f, %s=%f',
                              epoch, (time.time() - tic), name1, loss1,
                              name2, loss2)
            if (epoch % self._cfg.valid.val_interval == 0) or \
                    (self._cfg.save_interval and epoch % self._cfg.save_interval == 0):
                # consider reduce the frequency of validation to save time
                map_name, mean_ap = self._evaluate(val_data)
                val_msg = '\n'.join([
                    '{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)
                ])
                self._logger.info('[Epoch %d] Validation: \n%s', epoch,
                                  str(val_msg))
                current_map = float(mean_ap[-1])
                if current_map > self._best_map:
                    cp_name = os.path.join(self._logdir,
                                           _BEST_CHECKPOINT_FILE)
                    self._logger.info(
                        '[Epoch %d] Current best map: %f vs previous %f, saved to %s',
                        self.epoch, current_map, self._best_map, cp_name)
                    self.save(cp_name)
                    self._best_map = current_map
                if self._reporter:
                    self._reporter(epoch=epoch, map_reward=current_map)
                early_stopper.update(current_map, epoch=epoch)
        self._time_elapsed += time.time() - post_tic
    # map on train data
    tic = time.time()
    map_name, mean_ap = self._evaluate(train_eval_data)
    self._time_elapsed += time.time() - tic
    return {
        'train_map': float(mean_ap[-1]),
        'valid_map': self._best_map,
        'time': self._time_elapsed,
        'checkpoint': cp_name
    }
def run(outdir):
    """Run a set of CFR training/validation experiments and store results.

    For each replication (experiment) of the dataset: trains a CFRNet with a
    factual L2 loss plus a Wasserstein imbalance penalty, evaluates ITE/ATE/
    PEHE within-sample and out-of-sample, and writes logs, TSNE plots, the
    exported model, and aggregate scores under ``outdir``.

    Fixes vs. previous revision:
    - control-group reweighting used ``(2 * 1 - p)`` = ``2 - p`` instead of
      the CFR formula ``2 * (1 - p)``;
    - ``sample_weight`` was a plain float when reweighting is disabled, which
      crashed on the later fancy-indexing ``sample_weight[t1_idx]``;
    - the log file was truncated twice; the simpler ``epoch == epochs`` test
      replaces ``epoch == range(epochs + 1)[-1]``.

    Parameters
    ----------
    outdir : str
        Output directory (with trailing separator) for logs and artifacts.

    Returns
    -------
    dict with formatted 'ite', 'ate', 'pehe' test scores and 'mean_duration'.
    """
    # Hyperparameters
    epochs = int(FLAGS.iterations)
    learning_rate = float(FLAGS.learning_rate)
    wd = float(FLAGS.weight_decay)
    train_experiments = int(FLAGS.experiments)
    learning_rate_factor = float(FLAGS.learning_rate_factor)
    learning_rate_steps = int(
        FLAGS.learning_rate_steps
    )  # changes the learning rate for every n updates.

    # Logging: truncate the log file once at start.
    logfile = outdir + 'log.txt'
    open(logfile, 'w').close()

    data_train = FLAGS.data_dir + FLAGS.data_train
    data_train_valid = FLAGS.data_dir + FLAGS.data_test

    # Set GPUs/CPUs
    num_gpus = mx.context.num_gpus()
    num_workers = int(
        FLAGS.num_workers)  # replace num_workers with the number of cores
    ctx = mx.gpu() if num_gpus > 0 else mx.cpu()
    units = num_gpus if num_gpus > 0 else 1
    batch_size_per_unit = int(FLAGS.batch_size_per_unit)  # mini-batch size
    batch_size = batch_size_per_unit * max(units, 1)

    # Set random seeds for reproducibility.
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    mx.random.seed(FLAGS.seed)

    # Save run configuration.
    save_config(outdir + 'config.txt', FLAGS)
    log(
        logfile, 'Training with hyperparameters: alpha=%.2g, lambda=%.2g' %
        (FLAGS.p_alpha, FLAGS.weight_decay))

    # Load dataset
    train_dataset = load_data(data_train, normalize=FLAGS.normalize_input)
    log(logfile, 'Training data: ' + data_train)
    log(logfile, 'Valid data: ' + data_train_valid)
    log(
        logfile, 'Loaded data with shape [%d,%d]' %
        (train_dataset['n'], train_dataset['dim']))

    # CFR Neural Network Architecture for ITE estimation
    net = CFRNet(FLAGS.dim_rep, FLAGS.dim_hyp, FLAGS.weight_init_scale,
                 train_dataset['dim'], FLAGS.batch_norm)
    net.initialize(ctx=ctx)
    net.hybridize()  # hybridize for better performance

    # Metric, Loss and Optimizer
    rmse_metric = mx.metric.RMSE()
    l2_loss = gluon.loss.L2Loss()
    wass_loss = WassersteinLoss(
        lam=FLAGS.wass_lambda,
        its=FLAGS.wass_iterations,
        square=True,
        backpropT=FLAGS.wass_bpg)  # Change too at hybrid_test_net_with_cfr
    scheduler = mx.lr_scheduler.FactorScheduler(step=learning_rate_steps,
                                                factor=learning_rate_factor,
                                                base_lr=learning_rate)
    optimizer = mx.optimizer.Adam(learning_rate=learning_rate,
                                  lr_scheduler=scheduler)
    trainer = gluon.Trainer(net.collect_params(), optimizer=optimizer)

    # Per-experiment score and duration accumulators.
    train_scores = np.zeros((train_experiments, 3))
    train_durations = np.zeros((train_experiments, 1))
    test_scores = np.zeros((train_experiments, 3))
    # Per-experiment yf normalization statistics (saved for inference).
    means = np.array([])
    stds = np.array([])

    for train_experiment in range(train_experiments):
        # Slice out this replication of the dataset.
        x = train_dataset['x'][:, :, train_experiment]
        t = np.reshape(train_dataset['t'][:, train_experiment], (-1, 1))
        yf = train_dataset['yf'][:, train_experiment]
        ycf = train_dataset['ycf'][:, train_experiment]
        mu0 = train_dataset['mu0'][:, train_experiment]
        mu1 = train_dataset['mu1'][:, train_experiment]
        train, valid, test, _ = split_data_in_train_valid_test(
            x, t, yf, ycf, mu0, mu1)

        # Within-sample evaluator (train + valid) and out-of-sample evaluator.
        train_evaluator = Evaluator(
            np.concatenate([train['t'], valid['t']]),
            np.concatenate([train['yf'], valid['yf']]),
            y_cf=np.concatenate([train['ycf'], valid['ycf']], axis=0),
            mu0=np.concatenate([train['mu0'], valid['mu0']], axis=0),
            mu1=np.concatenate([train['mu1'], valid['mu1']], axis=0))
        test_evaluator = Evaluator(test['t'], test['yf'], test['ycf'],
                                   test['mu0'], test['mu1'])

        if train_experiment == 0:
            # Learned representations of first experiment for TSNE visualization
            first_exp_reps = []

        # Normalize yf and remember the statistics for de-normalization.
        if FLAGS.normalize_input:
            yf_m, yf_std = np.mean(train['yf'],
                                   axis=0), np.std(train['yf'], axis=0)
            train['yf'] = (train['yf'] - yf_m) / yf_std
            valid['yf'] = (valid['yf'] - yf_m) / yf_std
            test['yf'] = (test['yf'] - yf_m) / yf_std
            means = np.append(means, yf_m)
            stds = np.append(stds, yf_std)

        # Datasets: factual features are [x | t].
        factual_features = np.hstack((train['x'], train['t']))
        train_factual_dataset = gluon.data.ArrayDataset(
            mx.nd.array(factual_features), mx.nd.array(train['yf']))
        train_rmse_ite_dataset = gluon.data.ArrayDataset(
            mx.nd.array(np.concatenate([train['x'], valid['x']])))
        valid_factual_features = np.hstack((valid['x'], valid['t']))
        valid_factual_dataset = gluon.data.ArrayDataset(
            mx.nd.array(valid_factual_features), mx.nd.array(valid['yf']))
        test_rmse_ite_dataset = gluon.data.ArrayDataset(
            mx.nd.array(test['x']))  # todo rename, rmse_ite has nothing to do

        # DataLoaders
        train_factual_loader = gluon.data.DataLoader(train_factual_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=True,
                                                     num_workers=num_workers)
        train_rmse_ite_loader = gluon.data.DataLoader(
            train_rmse_ite_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers)
        valid_factual_loader = gluon.data.DataLoader(valid_factual_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)
        test_rmse_ite_loader = gluon.data.DataLoader(test_rmse_ite_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)
        number_of_batches = len(train_factual_loader)

        # Empirical treatment probability p(t=1) on the training split.
        treatment_probability = np.mean(train['t'])

        train_start = time.time()
        for epoch in range(
                1, epochs +
                1):  # start with epoch 1 for easier learning rate calculation
            train_loss = 0
            rmse_metric.reset()
            obj_loss = 0
            imb_err = 0
            for i, (batch_f_features,
                    batch_yf) in enumerate(train_factual_loader):
                batch_f_features = batch_f_features.as_in_context(ctx)
                batch_yf = batch_yf.as_in_context(ctx)
                x = batch_f_features[:, :-1]
                t = batch_f_features[:, -1]
                # Treatment/control indices. Batch_size must be large enough
                # to contain at least one t=1 sample.
                t1_idx = np.where(t == 1)[0]
                t0_idx = np.where(t == 0)[0]
                if t1_idx.shape[0] == 0:
                    # NOTE(review): training continues with an empty treated
                    # group; downstream indexing/Wasserstein terms may fail —
                    # confirm intended behavior.
                    log(
                        logfile,
                        'Encountered no treatment samples at batch ' +
                        str(i) + '.')
                # CFR sample reweighting: w = t/(2p) + (1-t)/(2(1-p)).
                if FLAGS.reweight_sample:
                    w_t = t / (2 * treatment_probability)
                    # FIX: was (2 * 1 - treatment_probability) == 2 - p;
                    # the CFR formula requires 2 * (1 - p).
                    w_c = (1 - t) / (2 * (1 - treatment_probability))
                    sample_weight = w_t + w_c
                else:
                    # FIX: was the float 1.0, which is not indexable by
                    # sample_weight[t1_idx] below; unit weights preserve the
                    # intended "no reweighting" semantics.
                    sample_weight = mx.nd.ones_like(t)

                # Buffers to reassemble per-sample outputs/losses in batch order.
                outputs = np.zeros(batch_yf.shape)
                loss = np.zeros(batch_yf.shape)

                # Forward (Factual)
                with autograd.record():
                    t1_o, t0_o, rep_o = net(x, mx.nd.array(t1_idx),
                                            mx.nd.array(t0_idx))
                    risk = 0
                    t1_o_loss = l2_loss(t1_o, batch_yf[t1_idx],
                                        sample_weight[t1_idx])
                    np.put(loss, t1_idx, t1_o_loss.asnumpy())
                    np.put(outputs, t1_idx, t1_o.asnumpy())
                    risk = risk + t1_o_loss.sum()
                    t0_o_loss = l2_loss(t0_o, batch_yf[t0_idx],
                                        sample_weight[t0_idx])
                    np.put(loss, t0_idx, t0_o_loss.asnumpy())
                    np.put(outputs, t0_idx, t0_o.asnumpy())
                    risk = risk + t0_o_loss.sum()
                    # Optionally project representations onto the unit sphere
                    # before measuring distributional imbalance.
                    if FLAGS.normalization == 'divide':
                        h_rep_norm = rep_o / mx_safe_sqrt(
                            mx.nd.sum(mx.nd.square(rep_o),
                                      axis=1,
                                      keepdims=True))
                    else:
                        h_rep_norm = 1.0 * rep_o
                    imb_dist = wass_loss(h_rep_norm[t1_idx],
                                         h_rep_norm[t0_idx])
                    imb_error = FLAGS.p_alpha * imb_dist
                    tot_error = risk
                    if FLAGS.p_alpha > 0:
                        tot_error = tot_error + imb_error

                # Save last-epoch reps of the first experiment for TSNE vis.
                # FIX: `epoch == range(epochs + 1)[-1]` simplified.
                if train_experiment == 0 and epoch == epochs:
                    first_exp_reps.extend(rep_o)

                # Backward and optimize.
                tot_error.backward()
                trainer.step(batch_size)

                train_loss += loss.mean()
                rmse_metric.update(batch_yf, mx.nd.array(outputs))
                obj_loss += tot_error.asscalar()
                imb_err += imb_error.asscalar()

            if epoch % FLAGS.epoch_output_iter == 0 or epoch == 1:
                _, train_rmse_factual = rmse_metric.get()
                train_loss /= number_of_batches
                (_, valid_rmse_factual), _, _ = hybrid_test_net_with_cfr(
                    net, valid_factual_loader, ctx, FLAGS,
                    np.mean(valid['t']))
                log(
                    logfile,
                    '[Epoch %d/%d] Train-rmse-factual: %.3f | Loss: %.3f | learning-rate: '
                    '%.3E | ObjLoss: %.3f | ImbErr: %.3f | Valid-rmse-factual: %.3f'
                    % (epoch, epochs, train_rmse_factual, train_loss,
                       trainer.learning_rate, obj_loss, imb_err,
                       valid_rmse_factual))

        # Plot first experiment learned TSNE visualization
        if train_experiment == 0:
            tsne_plot_pca(data=train['x'],
                          label=train['t'],
                          learned_representation=np.asarray(
                              [ind.asnumpy() for ind in first_exp_reps]),
                          outdir=outdir + FLAGS.architecture.lower())

        train_durations[train_experiment, :] = time.time() - train_start

        # Within-sample ITE estimates (train + valid).
        y_t0, y_t1 = hybrid_predict_treated_and_controlled_with_cfr(
            net, train_rmse_ite_loader, ctx)
        if FLAGS.normalize_input:
            y_t0, y_t1 = y_t0 * yf_std + yf_m, y_t1 * yf_std + yf_m
        train_score = train_evaluator.get_metrics(y_t1, y_t0)
        train_scores[train_experiment, :] = train_score

        # Out-of-sample ITE estimates.
        y_t0, y_t1 = hybrid_predict_treated_and_controlled_with_cfr(
            net, test_rmse_ite_loader, ctx)
        if FLAGS.normalize_input:
            y_t0, y_t1 = y_t0 * yf_std + yf_m, y_t1 * yf_std + yf_m
        test_score = test_evaluator.get_metrics(y_t1, y_t0)
        test_scores[train_experiment, :] = test_score

        log(logfile,
            '[Train Replication {}/{}]: train RMSE ITE: {:0.3f}, train ATE: {:0.3f}, train PEHE: {:0.3f},' \
            ' test RMSE ITE: {:0.3f}, test ATE: {:0.3f}, test PEHE: {:0.3f}'.format(train_experiment + 1,
                                                                                    train_experiments,
                                                                                    train_score[0],
                                                                                    train_score[1],
                                                                                    train_score[2],
                                                                                    test_score[0],
                                                                                    test_score[1],
                                                                                    test_score[2]))

    # Save means and stds NDArray values for inference
    if FLAGS.normalize_input:
        mx.nd.save(
            outdir + FLAGS.architecture.lower() + '_means_stds_ihdp_' +
            str(train_experiments) + '_.nd', {
                "means": mx.nd.array(means),
                "stds": mx.nd.array(stds)
            })

    # Export trained models
    # See mxnet.apache.org/api/python/docs/tutorials/packages/gluon/blocks/save_load_params.html
    net.export(outdir + FLAGS.architecture.lower() + "-ihdp-predictions-" +
               str(train_experiments))  # hybrid

    log(logfile,
        '\n{} architecture total scores:'.format(FLAGS.architecture.upper()))

    # Aggregate train scores (mean ± standard error).
    means, stds = np.mean(train_scores, axis=0), sem(train_scores,
                                                     axis=0,
                                                     ddof=0)
    r_pehe_mean, r_pehe_std = np.mean(np.sqrt(train_scores[:, 2]),
                                      axis=0), sem(np.sqrt(train_scores[:,
                                                                        2]),
                                                   axis=0,
                                                   ddof=0)
    train_total_scores_str = 'train RMSE ITE: {:.2f} ± {:.2f}, train ATE: {:.2f} ± {:.2f}, train PEHE: {:.2f} ± {:.2f}, ' \
                             'train root PEHE: {:.2f} ± {:.2f}' \
                             ''.format(means[0], stds[0], means[1], stds[1], means[2], stds[2],
                                       r_pehe_mean, r_pehe_std)
    # Aggregate test scores; note `means`/`stds` now hold test statistics,
    # which the return value below intentionally uses.
    means, stds = np.mean(test_scores, axis=0), sem(test_scores,
                                                    axis=0,
                                                    ddof=0)
    r_pehe_mean, r_pehe_std = np.mean(np.sqrt(test_scores[:, 2]),
                                      axis=0), sem(np.sqrt(test_scores[:,
                                                                       2]),
                                                   axis=0,
                                                   ddof=0)
    test_total_scores_str = 'test RMSE ITE: {:.2f} ± {:.2f}, test ATE: {:.2f} ± {:.2f}, test PEHE: {:.2f} ± {:.2f}, ' \
                            'test root PEHE: {:.2f} ± {:.2f}' \
                            ''.format(means[0], stds[0], means[1], stds[1], means[2], stds[2],
                                      r_pehe_mean, r_pehe_std)

    log(logfile, train_total_scores_str)
    log(logfile, test_total_scores_str)

    mean_duration = float("{0:.2f}".format(
        np.mean(train_durations, axis=0)[0]))

    with open(outdir + FLAGS.architecture.lower() + "-total-scores-" +
              str(train_experiments),
              "w",
              encoding="utf8") as text_file:
        print(train_total_scores_str, "\n", test_total_scores_str,
              file=text_file)

    return {
        "ite": "{:.2f} ± {:.2f}".format(means[0], stds[0]),
        "ate": "{:.2f} ± {:.2f}".format(means[1], stds[1]),
        "pehe": "{:.2f} ± {:.2f}".format(means[2], stds[2]),
        "mean_duration": mean_duration
    }
nn.Dense(10)) net.initialize(init=init.Xavier()) softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1}) def acc(output, label): return (output.argmax(axis=1) == label.astype('float32')).mean().asscalar() for epoch in range(10): train_loss, train_acc, valid_acc = 0., 0., 0. tic = time.time() for data, label in train_data: with autograd.record(): output = net(data) loss = softmax_cross_entropy(output, label) loss.backward() # update params trainer.step(batch_size) # calc training metrics train_loss += loss.mean().asscalar() train_acc += acc(output, label) for data, label in valid_data: valid_acc += acc(net(data), label) print("Epoch %d: loss %.3f, train acc %.3f, test acc %.3f, in %.1f sec" % (epoch, train_loss/len(train_data), train_acc/len(train_data),
def train():
    """Train a pix2pix-style masked image-to-image GAN.

    Alternates discriminator (netD) and generator (netG) updates per batch,
    logging losses/images to a TensorBoard ``SummaryWriter`` and periodically
    checkpointing both networks.  Relies on module-level globals: netG, netD,
    trainerG, trainerD, train_data, val_data, GAN_loss, loss_2nd, augmenter,
    norm3, ctx, epochs, batch_size, pool_size, offset, aug_methods,
    no_loss_weights, lambda1, check_point_interval, dir_out_sw,
    dir_out_checkpoints, facc.
    """
    image_pool = ImagePool(pool_size)  # history buffer of generated images
    metric = mx.metric.CustomMetric(facc)
    stamp = datetime.now().strftime('%Y_%m_%d-%H_%M')
    logging.basicConfig(level=logging.DEBUG)
    # define a summary writer that logs data and flushes to the file every 5 seconds
    sw = SummaryWriter(logdir='%s' % dir_out_sw, flush_secs=5, verbose=False)
    global_step = 0
    for epoch in range(epochs):
        if epoch == 0:
            netG.hybridize()
            netD.hybridize()
            # sw.add_graph(netG)
            # sw.add_graph(netD)
        tic = time.time()
        btic = time.time()
        train_data.reset()
        val_data.reset()
        # NOTE(review): `iter` shadows the builtin; used only as a batch
        # counter within this function.
        iter = 0
        for local_step, batch in enumerate(train_data):
            ############################
            # (1) Update D network: maximize log(D(x, y)) + log(1 - D(x, G(x, z)))
            ###########################
            # Channels: [0]=input, [1]=target, [2]=mask; augmented jointly so
            # crops/flips stay aligned.
            tmp = mx.nd.concat(batch.data[0],
                               batch.data[1],
                               batch.data[2],
                               dim=1)
            tmp = augmenter(tmp,
                            patch_size=128,
                            offset=offset,
                            aug_type=1,
                            aug_methods=aug_methods,
                            random_crop=False)
            real_in = tmp[:, :1].as_in_context(ctx)
            real_out = tmp[:, 1:2].as_in_context(ctx)
            m = tmp[:, 2:3].as_in_context(ctx)  # mask
            fake_out = netG(real_in) * m
            # loss weight based on mask, applied on L1 loss if no_loss_weights
            if no_loss_weights:
                loss_weight = m
            else:
                # down-weight (not zero out) pixels outside the mask
                loss_weight = m.asnumpy()
                loss_weight[loss_weight == 0] = .1
                loss_weight = mx.nd.array(loss_weight, ctx=m.context)
            fake_concat = image_pool.query(nd.concat(real_in, fake_out,
                                                     dim=1))
            with autograd.record():
                # Train with fake image
                # Use image pooling to utilize history images
                output = netD(fake_concat)
                fake_label = nd.zeros(output.shape, ctx=ctx)
                errD_fake = GAN_loss(output, fake_label)
                metric.update([
                    fake_label,
                ], [
                    output,
                ])
                # Train with real image
                real_concat = nd.concat(real_in, real_out, dim=1)
                output = netD(real_concat)
                real_label = nd.ones(output.shape, ctx=ctx)
                errD_real = GAN_loss(output, real_label)
                errD = (errD_real + errD_fake) * 0.5
                errD.backward()
                metric.update([
                    real_label,
                ], [
                    output,
                ])
            trainerD.step(batch.data[0].shape[0])
            ############################
            # (2) Update G network: maximize log(D(x, G(x, z))) - lambda1 * L1(y, G(x, z))
            ###########################
            with autograd.record():
                fake_out = netG(real_in)
                fake_concat = nd.concat(real_in, fake_out, dim=1)
                output = netD(fake_concat)
                real_label = nd.ones(output.shape, ctx=ctx)
                errG = GAN_loss(output, real_label) + loss_2nd(
                    real_out, fake_out, loss_weight) * lambda1
                errG.backward()
            trainerG.step(batch.data[0].shape[0])
            sw.add_scalar(tag='loss',
                          value=('d_loss', errD.mean().asscalar()),
                          global_step=global_step)
            sw.add_scalar(tag='loss',
                          value=('g_loss', errG.mean().asscalar()),
                          global_step=global_step)
            global_step += 1
            # very first batch of the very first epoch only
            if epoch + local_step == 0:
                sw.add_graph((netG))
                img_in_list, img_out_list, m_val = val_data.next().data
                m_val = m_val.as_in_context(ctx)
                sw.add_image('first_minibatch_train_real', norm3(real_out))
                sw.add_image('first_minibatch_val_real',
                             norm3(img_out_list.as_in_context(ctx)))
                netG.export('%snetG' % dir_out_checkpoints)
            if local_step == 0:
                # Log the first batch of images of each epoch (training)
                sw.add_image('first_minibatch_train_fake',
                             norm3(fake_out * m) * m, epoch)
                sw.add_image(
                    'first_minibatch_val_fake',
                    norm3(netG(img_in_list.as_in_context(ctx)) * m_val) *
                    m_val, epoch)
                # norm3(netG(img_in_list.as_in_context(ctx)) * m_val.as_in_context(ctx)), epoch)
            if (iter + 1) % 10 == 0:
                name, acc = metric.get()
                logging.info('speed: {} samples/s'.format(
                    batch_size / (time.time() - btic)))
                logging.info(
                    'discriminator loss = %f, generator loss = %f, binary training acc = %f at iter %d epoch %d'
                    % (nd.mean(errD).asscalar(), nd.mean(errG).asscalar(),
                       acc, iter, epoch))
            iter += 1
            btic = time.time()
        # NOTE(review): `acc` is only bound inside the `(iter + 1) % 10 == 0`
        # branch; an epoch with fewer than 10 batches on the first epoch
        # would raise NameError here — confirm batch counts.
        sw.add_scalar(tag='binary_training_acc',
                      value=('acc', acc),
                      global_step=epoch)
        name, acc = metric.get()
        metric.reset()
        # Validation generator loss on a fixed validation batch.
        fake_val = netG(val_data.data[0][1].as_in_context(ctx))
        loss_val = loss_2nd(val_data.data[1][1].as_in_context(ctx), fake_val,
                            val_data.data[2][1].as_in_context(ctx)) * lambda1
        sw.add_scalar(tag='loss_val',
                      value=('g_loss', loss_val.mean().asscalar()),
                      global_step=epoch)
        # checkpoint periodically and always on the final epoch
        if (epoch % check_point_interval == 0) | (epoch == epochs - 1):
            netD.save_params('%snetD-%04d' % (dir_out_checkpoints, epoch))
            netG.save_params('%snetG-%04d' % (dir_out_checkpoints, epoch))
        logging.info('\nbinary training acc at epoch %d: %s=%f' %
                     (epoch, name, acc))
        logging.info('time: %f' % (time.time() - tic))
    sw.export_scalars('scalar_dict.json')
    sw.close()
def run(args, outdir):
    """ Run training for NN4 architecture with Variational Bayes.

    Trains a 4-hidden-layer feed-forward net whose weights are sampled each
    step from per-parameter variational (exponential-prior) distributions
    parameterized by ``lambdas``; repeats over ``args.experiments``
    replications of the dataset and reports ITE/ATE/PEHE scores.

    Parameters
    ----------
    args : parsed CLI namespace (iterations, learning_rate, data_dir, ...).
    outdir : str
        Output directory (with trailing separator) for logs and artifacts.

    Returns
    -------
    dict with formatted 'ite', 'ate', 'pehe' test scores and 'mean_duration'.
    """
    ''' Hyperparameters '''
    epochs = int(args.iterations)
    learning_rate = float(args.learning_rate)
    wd = float(args.weight_decay)
    hidden_size = int(args.hidden_size)
    train_experiments = int(args.experiments)
    learning_rate_factor = float(args.learning_rate_factor)
    learning_rate_steps = int(
        args.learning_rate_steps
    )  # changes the learning rate for every n updates.
    epoch_output_iter = int(args.epoch_output_iter)

    ''' Logging '''
    logfile = outdir + 'log.txt'
    f = open(logfile, 'w')
    f.close()

    # Prior hyperparameters for the BBB (Bayes-by-Backprop) loss.
    config = {
        # TODO may need adjustments
        # "sigma_p1": 1.5,
        "sigma_p1": 1.75,  # og
        # "sigma_p2": 0.25,
        # "sigma_p2": 0.5,  # og
        "sigma_p2": 0.5,
        "pi": 0.5,
        "lambda_p": 24.5
    }

    ''' Set GPUs/CPUs '''
    num_gpus = mx.context.num_gpus()
    num_workers = int(
        args.num_workers)  # replace num_workers with the number of cores
    ctx = [mx.gpu(i) for i in range(num_gpus)
           ] if num_gpus > 0 else [mx.cpu()]  # todo change as cfr_net_train
    batch_size_per_unit = int(args.batch_size_per_unit)  # mini-batch size
    batch_size = batch_size_per_unit * max(num_gpus, 1)

    ''' Set seeds '''
    for c in ctx:
        mx.random.seed(int(args.seed), c)
    np.random.seed(int(args.seed))

    ''' Feed Forward Neural Network Model (4 hidden layers) '''
    net = ff4_relu_architecture(hidden_size)

    ''' Load datasets '''
    # train_dataset = load_data('../' + args.data_dir + args.data_train)  # PyCharm run
    train_dataset = load_data(args.data_dir + args.data_train)  # Terminal run
    log(logfile, 'Training data: ' + args.data_dir + args.data_train)
    log(logfile, 'Valid data: ' + args.data_dir + args.data_test)
    log(
        logfile, 'Loaded data with shape [%d,%d]' %
        (train_dataset['n'], train_dataset['dim']))

    ''' Instantiate net '''
    ''' Param. init. '''
    net.collect_params().initialize(mx.init.Xavier(), ctx=ctx)
    net.hybridize()

    ''' Forward-propagate a single data set entry once to set up all network
    parameters (weights and biases) with the desired initializer specified
    above. '''
    x = train_dataset['x'][:, :, 0]
    t = np.reshape(train_dataset['t'][:, 0], (-1, 1))
    yf = train_dataset['yf'][:, 0]
    yf_m, yf_std = np.mean(yf, axis=0), np.std(yf, axis=0)
    yf = (yf - yf_m) / yf_std
    factual_features = np.hstack((x, t))
    zero_train_factual_dataset = gluon.data.ArrayDataset(
        mx.nd.array(factual_features), mx.nd.array(yf))
    zero_train_factual_loader = gluon.data.DataLoader(
        zero_train_factual_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers)
    # One dummy forward pass triggers deferred parameter initialization.
    for i, (batch_f_features,
            batch_yf) in enumerate(zero_train_factual_loader):
        batch_f_features = gluon.utils.split_and_load(batch_f_features,
                                                      ctx_list=ctx,
                                                      even_split=False)
        [net(x) for x in batch_f_features]
        break

    weight_scale = .1
    rho_offset = -3
    lambda_init = 25

    ''' Initialize variational parameters; mean and variance for each weight '''
    # Only the exponential-prior `lambdas` variant is active; the Gaussian
    # mu/rho variant is kept commented for reference.
    mus = []
    rhos = []
    lambdas = []
    shapes = list(map(lambda x: x.shape, net.collect_params().values()))
    for shape in shapes:
        # mu = gluon.Parameter('mu', shape=shape, init=mx.init.Normal(weight_scale))
        # rho = gluon.Parameter('rho', shape=shape, init=mx.init.Constant(rho_offset))
        lmb = gluon.Parameter('lmb',
                              shape=shape,
                              init=mx.init.Constant(lambda_init))
        # mu.initialize(ctx=ctx)
        # rho.initialize(ctx=ctx)
        lmb.initialize(ctx=ctx)
        # mus.append(mu)
        # rhos.append(rho)
        lambdas.append(lmb)
    # variational_params = mus + rhos
    variational_params = lambdas
    # raw_mus = list(map(lambda x: x.data(ctx[0]), mus))
    # raw_rhos = list(map(lambda x: x.data(ctx[0]), rhos))
    raw_lambdas = list(map(lambda x: x.data(ctx[0]), lambdas))

    ''' Metric, Loss and Optimizer '''
    rmse_metric = mx.metric.RMSE()
    l2_loss = gluon.loss.L2Loss()
    bbb_loss = BBBLoss(ctx[0],
                       log_prior="exponential",
                       sigma_p1=config['sigma_p1'],
                       sigma_p2=config['sigma_p2'],
                       pi=config['pi'],
                       lambda_p=config['lambda_p'])
    # bbb_loss = BBBLoss(ctx[0], log_prior="scale_mixture", sigma_p1=config['sigma_p1'], sigma_p2=config['sigma_p2'],
    #                    pi=config['pi'])
    scheduler = mx.lr_scheduler.FactorScheduler(step=learning_rate_steps,
                                                factor=learning_rate_factor,
                                                base_lr=learning_rate)
    # optimizer = mx.optimizer.Adam(learning_rate=learning_rate, lr_scheduler=scheduler)
    optimizer = mx.optimizer.RMSProp(learning_rate=learning_rate,
                                     lr_scheduler=scheduler,
                                     wd=wd)
    # optimizer = mx.optimizer.Adam(learning_rate=learning_rate)
    trainer = gluon.Trainer(variational_params, optimizer=optimizer)

    ''' Initialize train score results '''
    train_scores = np.zeros((train_experiments, 3))

    ''' Initialize train experiment durations '''
    train_durations = np.zeros((train_experiments, 1))

    ''' Initialize test score results '''
    test_scores = np.zeros((train_experiments, 3))

    ''' Train experiments means and stds '''
    means = np.array([])
    stds = np.array([])

    ''' Train '''
    for train_experiment in range(train_experiments):

        ''' Create training dataset '''
        x = train_dataset['x'][:, :, train_experiment]
        t = np.reshape(train_dataset['t'][:, train_experiment], (-1, 1))
        yf = train_dataset['yf'][:, train_experiment]
        ycf = train_dataset['ycf'][:, train_experiment]
        mu0 = train_dataset['mu0'][:, train_experiment]
        mu1 = train_dataset['mu1'][:, train_experiment]

        train, valid, test, _ = split_data_in_train_valid_test(
            x, t, yf, ycf, mu0, mu1)

        ''' With-in sample '''
        train_evaluator = Evaluator(
            np.concatenate([train['t'], valid['t']]),
            np.concatenate([train['yf'], valid['yf']]),
            y_cf=np.concatenate([train['ycf'], valid['ycf']], axis=0),
            mu0=np.concatenate([train['mu0'], valid['mu0']], axis=0),
            mu1=np.concatenate([train['mu1'], valid['mu1']], axis=0))

        test_evaluator = Evaluator(test['t'], test['yf'], test['ycf'],
                                   test['mu0'], test['mu1'])

        ''' Normalize yf '''  # TODO check for normalize input?
        yf_m, yf_std = np.mean(train['yf'], axis=0), np.std(train['yf'],
                                                            axis=0)
        train['yf'] = (train['yf'] - yf_m) / yf_std
        valid['yf'] = (valid['yf'] - yf_m) / yf_std
        test['yf'] = (test['yf'] - yf_m) / yf_std

        ''' Save mean and std '''
        means = np.append(means, yf_m)
        stds = np.append(stds, yf_std)

        ''' Train dataset '''
        factual_features = np.hstack((train['x'], train['t']))
        train_factual_dataset = gluon.data.ArrayDataset(
            mx.nd.array(factual_features), mx.nd.array(train['yf']))

        ''' With-in sample '''
        train_rmse_ite_dataset = gluon.data.ArrayDataset(
            mx.nd.array(np.concatenate([train['x'], valid['x']])))

        ''' Valid dataset '''
        valid_factual_features = np.hstack((valid['x'], valid['t']))
        valid_factual_dataset = gluon.data.ArrayDataset(
            mx.nd.array(valid_factual_features), mx.nd.array(valid['yf']))

        ''' Test dataset '''
        test_rmse_ite_dataset = gluon.data.ArrayDataset(mx.nd.array(test['x']))

        ''' Train DataLoader '''
        train_factual_loader = gluon.data.DataLoader(train_factual_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=True,
                                                     num_workers=num_workers)
        train_rmse_ite_loader = gluon.data.DataLoader(
            train_rmse_ite_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=num_workers)

        ''' Valid DataLoader '''
        valid_factual_loader = gluon.data.DataLoader(valid_factual_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)

        ''' Test DataLoader '''
        test_rmse_ite_loader = gluon.data.DataLoader(test_rmse_ite_dataset,
                                                     batch_size=batch_size,
                                                     shuffle=False,
                                                     num_workers=num_workers)

        num_batch = len(train_factual_loader)

        train_start = time.time()

        train_acc = []
        test_acc = []

        ''' Train model '''
        for epoch in range(
                1, epochs +
                1):  # start with epoch 1 for easier learning rate calculation
            train_loss = 0
            rmse_metric.reset()

            for i, (batch_f_features,
                    batch_yf) in enumerate(train_factual_loader):
                ''' Get data and labels into slices and copy each slice into a context.'''
                # NOTE(review): feature width 26 is hard-coded here
                # (25 covariates + treatment for IHDP) — confirm it matches
                # train_dataset['dim'] + 1.
                batch_f_features = batch_f_features.as_in_context(
                    ctx[0]).reshape((-1, 26))
                batch_yf = batch_yf.as_in_context(ctx[0]).reshape(
                    (len(batch_yf), -1))

                ''' Forward '''
                with autograd.record():
                    ''' Generate sample '''
                    # layer_params, sigmas = generate_weight_sample(shapes, raw_mus, raw_rhos, ctx[0])
                    layer_params = generate_weight_sample_exp(
                        shapes, raw_lambdas, ctx[0])

                    ''' Overwrite network parameters with sampled parameters '''
                    # NOTE(review): writes through the private Parameter._data
                    # slot so gradients flow to the sampled values; this
                    # bypasses set_data() on purpose but depends on MXNet
                    # internals.
                    for sample, param in zip(layer_params,
                                             net.collect_params().values()):
                        param._data[0] = sample

                    ''' Forward-propagate the batch '''
                    outputs = net(batch_f_features)

                    # if epoch == epochs:
                    #     ''' Factual outcomes and batch_yf histograms '''
                    #     import pandas as pd
                    #     df = pd.DataFrame({'layer_params': layer_params[6][0].asnumpy().flatten()}, columns=['layer_params'])
                    #     df = pd.DataFrame(
                    #         {'outputs': outputs.asnumpy().flatten(), 'batch_yf': batch_yf.asnumpy().flatten()},
                    #         columns=['outputs', 'batch_yf'])
                    #     df.plot(kind='hist', alpha=0.5)
                    #     df.plot.kde()

                    ''' Calculate the loss '''
                    l2_loss_value = l2_loss(outputs, batch_yf)
                    # bbb_loss_value = bbb_loss(outputs, batch_yf, layer_params, raw_mus, sigmas, num_batch)
                    bbb_loss_value = bbb_loss(outputs, batch_yf, layer_params,
                                              raw_lambdas, [], num_batch)
                    loss = bbb_loss_value + l2_loss_value
                    # loss = bbb_loss_value
                    # loss = l2_loss_value

                ''' Backpropagate for gradient calculation '''
                loss.backward()

                ''' Optimize '''
                trainer.step(batch_size)

                train_loss += sum([l.mean().asscalar()
                                   for l in loss]) / len(loss)
                rmse_metric.update(batch_yf, outputs)

            if epoch % epoch_output_iter == 0 or epoch == 1:
                _, train_rmse_factual = rmse_metric.get()
                train_loss /= num_batch
                # NOTE(review): validation uses the weights sampled in the
                # *last* batch of this epoch (layer_params), not an average.
                _, valid_rmse_factual = test_net_vb(net, valid_factual_loader,
                                                    layer_params, ctx)
                # _, train_RMSE = evaluate_RMSE(train_factual_loader, net, raw_mus, ctx)
                # _, test_RMSE = evaluate_RMSE(valid_factual_loader, net, raw_mus, ctx)
                # train_acc.append(np.asscalar(train_RMSE))
                # test_acc.append(np.asscalar(test_RMSE))
                # print("Epoch %s. Train-RMSE %s, Test-RMSE %s" %
                #       (epoch, train_RMSE, test_RMSE))
                log(
                    logfile, 'l2-loss: %.3f, bbb-loss: %.3f' %
                    (l2_loss_value[0].asscalar(),
                     bbb_loss_value[0].asscalar()))
                log(
                    logfile,
                    '[Epoch %d/%d] Train-rmse-factual: %.3f, loss: %.3f | Valid-rmse-factual: %.3f | learning-rate: '
                    '%.3E' % (epoch, epochs, train_rmse_factual, train_loss,
                              valid_rmse_factual, trainer.learning_rate))

        train_durations[train_experiment, :] = time.time() - train_start

        ''' Test model '''
        # y_t0, y_t1 = predict_treated_and_controlled_vb(net, train_rmse_ite_loader, raw_mus, ctx)
        y_t0, y_t1 = predict_treated_and_controlled_vb(
            net, train_rmse_ite_loader, layer_params, ctx)
        y_t0, y_t1 = y_t0 * yf_std + yf_m, y_t1 * yf_std + yf_m
        train_score = train_evaluator.get_metrics(y_t1, y_t0)
        train_scores[train_experiment, :] = train_score

        # y_t0, y_t1 = predict_treated_and_controlled_vb(net, test_rmse_ite_loader, raw_mus, ctx)
        y_t0, y_t1 = predict_treated_and_controlled_vb(
            net, test_rmse_ite_loader, layer_params, ctx)
        y_t0, y_t1 = y_t0 * yf_std + yf_m, y_t1 * yf_std + yf_m
        test_score = test_evaluator.get_metrics(y_t1, y_t0)
        test_scores[train_experiment, :] = test_score

        log(logfile,
            '[Train Replication {}/{}]: train RMSE ITE: {:0.3f}, train ATE: {:0.3f}, train PEHE: {:0.3f},' \
            ' test RMSE ITE: {:0.3f}, test ATE: {:0.3f}, test PEHE: {:0.3f}'.format(train_experiment + 1,
                                                                                    train_experiments,
                                                                                    train_score[0],
                                                                                    train_score[1],
                                                                                    train_score[2],
                                                                                    test_score[0],
                                                                                    test_score[1],
                                                                                    test_score[2]))

    # plt.plot(train_acc)
    # plt.plot(test_acc)

    ''' Save means and stds NDArray values for inference '''
    mx.nd.save(
        outdir + args.architecture.lower() + '_means_stds_ihdp_' +
        str(train_experiments) + '_.nd', {
            "means": mx.nd.array(means),
            "stds": mx.nd.array(stds)
        })

    ''' Export trained model '''
    net.export(outdir + args.architecture.lower() + "-ihdp-predictions-" +
               str(train_experiments),
               epoch=epochs)

    log(logfile,
        '\n{} architecture total scores:'.format(args.architecture.upper()))

    ''' Train and test scores '''
    # NOTE: `means`/`stds` are re-bound below to aggregate score statistics;
    # the return value uses the *test* aggregates.
    means, stds = np.mean(train_scores, axis=0), sem(train_scores,
                                                     axis=0,
                                                     ddof=0)
    r_pehe_mean, r_pehe_std = np.mean(np.sqrt(train_scores[:, 2]),
                                      axis=0), sem(np.sqrt(train_scores[:,
                                                                        2]),
                                                   axis=0,
                                                   ddof=0)
    train_total_scores_str = 'train RMSE ITE: {:.2f} ± {:.2f}, train ATE: {:.2f} ± {:.2f}, train PEHE: {:.2f} ± {:.2f}, ' \
                             'train root PEHE: {:.2f} ± {:.2f}' \
                             ''.format(means[0], stds[0], means[1], stds[1], means[2], stds[2],
                                       r_pehe_mean, r_pehe_std)
    means, stds = np.mean(test_scores, axis=0), sem(test_scores,
                                                    axis=0,
                                                    ddof=0)
    r_pehe_mean, r_pehe_std = np.mean(np.sqrt(test_scores[:, 2]),
                                      axis=0), sem(np.sqrt(test_scores[:,
                                                                       2]),
                                                   axis=0,
                                                   ddof=0)
    test_total_scores_str = 'test RMSE ITE: {:.2f} ± {:.2f}, test ATE: {:.2f} ± {:.2f}, test PEHE: {:.2f} ± {:.2f}, ' \
                            'test root PEHE: {:.2f} ± {:.2f}' \
                            ''.format(means[0], stds[0], means[1], stds[1], means[2], stds[2],
                                      r_pehe_mean, r_pehe_std)

    log(logfile, train_total_scores_str)
    log(logfile, test_total_scores_str)

    mean_duration = float("{0:.2f}".format(
        np.mean(train_durations, axis=0)[0]))

    with open(outdir + args.architecture.lower() + "-total-scores-" +
              str(train_experiments),
              "w",
              encoding="utf8") as text_file:
        print(train_total_scores_str, "\n", test_total_scores_str,
              file=text_file)

    return {
        "ite": "{:.2f} ± {:.2f}".format(means[0], stds[0]),
        "ate": "{:.2f} ± {:.2f}".format(means[1], stds[1]),
        "pehe": "{:.2f} ± {:.2f}".format(means[2], stds[2]),
        "mean_duration": mean_duration
    }
def train(self, train_data, epochs=1, batch_size=32, validation_data=None,
          train_resize_batch_num=None):
    """Train the model and update the model parameters.

    Runs only on worker processes (``self.is_worker``); other roles return
    empty stats. Two execution paths exist: an imperative Gluon loop when
    ``self.trainer`` is set, otherwise the symbolic ``Module.fit`` API.

    Parameters
    ----------
    train_data : callable
        Factory invoked as ``train_data(config, self.kv)`` returning a data
        iterator/loader for training.
    epochs : int, optional
        Number of passes over the training data.
    batch_size : int, optional
        Used as the config's batch size when absent there, and passed to
        ``trainer.step`` for gradient normalization on the imperative path.
    validation_data : callable, optional
        Same factory signature as ``train_data``; evaluated each epoch.
    train_resize_batch_num : int, optional
        When given, stored into the config under "train_resize_batch_num";
        presumably consumed by the data factory — TODO confirm with caller.

    Returns
    -------
    list
        Single-element list containing a stats dict (metric name -> value,
        plus "epoch_time"); ``[{}]`` on non-worker processes.
    """
    stats = dict()
    if self.is_worker:
        # Work on a copy so defaults injected here don't leak into
        # self.config across calls.
        config = self.config.copy()
        if "batch_size" not in config:
            config["batch_size"] = batch_size
        if train_resize_batch_num is not None:
            config["train_resize_batch_num"] = train_resize_batch_num
        train_data_iter = train_data(config, self.kv)
        val_data_iter = validation_data(
            config, self.kv) if validation_data else None
        start_time = time.time()
        if self.trainer:  # Imperative API
            def cpu_context(target_data):
                # Recursively move NDArray(s) to CPU so forward/backward
                # run in the worker's CPU context.
                if isinstance(target_data, list):
                    return [cpu_context(d) for d in target_data]
                else:
                    return target_data.as_in_context(mx.cpu())

            for epoch in range(epochs):
                # DataLoader doesn't need to be reset.
                if isinstance(train_data_iter, mx.io.DataIter):
                    train_data_iter.reset()
                if self.eval_metrics:
                    self.eval_metrics.reset(
                    )  # metrics will accumulate for one batch.
                batch_start_time = time.time()
                epoch_start_time = time.time()
                for i, batch in enumerate(train_data_iter):
                    data = cpu_context(batch.data)
                    label = cpu_context(batch.label)
                    # Normalize to lists so *data / *label unpacking below
                    # works for single- and multi-input models alike.
                    if not isinstance(data, list):
                        data = [data]
                    if not isinstance(label, list):
                        label = [label]
                    from mxnet import autograd as ag
                    with ag.record():
                        output = self.model(*data)  # forward
                        if not isinstance(output, list):
                            output = [output]
                        Ls = self.loss(*output, *label)
                        ag.backward(Ls)
                    self.trainer.step(batch_size)
                    if self.eval_metrics:
                        self.eval_metrics.update(label, output)
                    if not (i + 1) % self.config["log_interval"]:
                        # This would be logged on driver for each worker process.
                        iteration_log = \
                            "Epoch[%d] Batch[%d] Speed: %f samples/sec %s=%f" \
                            % (epoch, i,
                               batch_size / (time.time() - batch_start_time),
                               "loss", Ls.asnumpy().mean())
                        if self.eval_metrics:
                            names, accs = self.eval_metrics.get()
                            names, accs = to_list(names), to_list(accs)
                            for name, acc in zip(names, accs):
                                iteration_log += " %s=%f" % (name, acc)
                        self.logger.info(iteration_log)
                        batch_start_time = time.time()
                # Epoch time log.
                self.logger.info("[Epoch %d] time cost: %f" %
                                 (epoch, time.time() - epoch_start_time))
                # Epoch metrics log on train data.
                if self.eval_metrics:
                    epoch_train_log = "[Epoch %d] training: " % epoch
                    names, accs = self.eval_metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        epoch_train_log += "%s=%f " % (name, acc)
                    self.logger.info(epoch_train_log)
                # Epoch metrics log on validation data if any.
                if val_data_iter:
                    if isinstance(val_data_iter, mx.io.DataIter):
                        val_data_iter.reset()
                    self.val_metrics.reset()
                    for batch in val_data_iter:
                        data = cpu_context(batch.data)
                        label = cpu_context(batch.label)
                        if not isinstance(data, list):
                            data = [data]
                        if not isinstance(label, list):
                            label = [label]
                        # Plain forward pass — no gradient recording needed
                        # for evaluation.
                        output = self.model(*data)
                        if not isinstance(output, list):
                            output = [output]
                        self.val_metrics.update(label, output)
                    epoch_val_log = "[Epoch %d] validation: " % epoch
                    names, accs = self.val_metrics.get()
                    names, accs = to_list(names), to_list(accs)
                    for name, acc in zip(names, accs):
                        epoch_val_log += "%s=%f " % (name, acc)
                    self.logger.info(epoch_val_log)
            # TODO: save checkpoints
            if self.eval_metrics:
                # Final training metrics (from the last epoch) go into the
                # returned stats.
                names, accs = self.eval_metrics.get()
                names, accs = to_list(names), to_list(accs)
                for name, acc in zip(names, accs):
                    stats[name] = acc
        else:  # Symbolic API
            # TODO: seems no history (i.e. validation accuracy) returned by fit?
            if "init" not in self.config:
                from mxnet.initializer import Uniform
                self.config["init"] = Uniform(
                    0.01)  # This is the default value for MXNet.
            if self.eval_metrics is None:
                self.eval_metrics = 'acc'  # This is the default value for MXNet.
            self.model.fit(
                train_data=train_data_iter,
                num_epoch=epochs,
                initializer=self.config["init"],
                kvstore=self.kv,
                optimizer=self.config["optimizer"],
                optimizer_params=self.config["optimizer_params"],
                eval_data=val_data_iter,
                eval_metric=self.eval_metrics,
                validation_metric=self.val_metrics,
                batch_end_callback=mx.callback.Speedometer(
                    batch_size, self.config["log_interval"]),
                epoch_end_callback=None if "model" not in self.config
                else mx.callback.do_checkpoint(self.config["model"]))
        epoch_time = time.time() - start_time
        stats["epoch_time"] = epoch_time
    return [stats]
def create(input_dataset, target, feature=None, validation_set='auto',
           warm_start='auto', batch_size=256,
           max_iterations=100, verbose=True):
    """
    Create a :class:`DrawingClassifier` model.

    Parameters
    ----------
    input_dataset : SFrame
        Input data. The columns named by the ``feature`` and ``target``
        parameters will be extracted for training the drawing classifier.

    target : string
        Name of the column containing the target variable. The values in this
        column must be of string or integer type.

    feature : string, optional
        Name of the column containing the input drawings. 'None' (the default)
        indicates the column in `dataset` named "drawing" should be used as
        the feature.
        The feature column can contain both bitmap-based drawings as well as
        stroke-based drawings. Bitmap-based drawing input can be a grayscale
        tc.Image of any size.
        Stroke-based drawing input must be in the following format:
        Every drawing must be represented by a list of strokes, where each
        stroke must be a list of points in the order in which they were drawn
        on the canvas.
        Each point must be a dictionary with two keys, "x" and "y", and their
        respective values must be numerical, i.e. either integer or float.

    validation_set : SFrame, optional
        A dataset for monitoring the model's generalization performance.
        The format of this SFrame must be the same as the training set.
        By default this argument is set to 'auto' and a validation set is
        automatically sampled and used for progress printing. If
        validation_set is set to None, then no additional metrics
        are computed. The default value is 'auto'.

    warm_start : string, optional
        A string to denote which pretrained model to use. Set to "auto"
        by default which uses a model trained on 245 of the 345 classes in the
        Quick, Draw! dataset. Here is a list of all the pretrained models that
        can be passed in as this argument:
        "auto": Uses quickdraw_245_v0
        "quickdraw_245_v0": Uses a model trained on 245 of the 345 classes in
                            the Quick, Draw! dataset.

    batch_size : int, optional
        The number of drawings per training step. If not set, a default
        value of 256 will be used. If you are getting memory errors,
        try decreasing this value. If you have a powerful computer, increasing
        this value may improve performance.

    max_iterations : int, optional
        The maximum number of allowed passes through the data. More passes
        over the data can result in a more accurately trained model.

    verbose : bool, optional
        If True, print progress updates and model details.

    Returns
    -------
    out : DrawingClassifier
        A trained :class:`DrawingClassifier` model.

    See Also
    --------
    DrawingClassifier

    Examples
    --------
    .. sourcecode:: python

        # Train a drawing classifier model
        >>> model = turicreate.drawing_classifier.create(data)

        # Make predictions on the training set and as column to the SFrame
        >>> data['predictions'] = model.predict(data)
    """
    import mxnet as _mx
    from mxnet import autograd as _autograd
    from ._model_architecture import Model as _Model
    from ._sframe_loader import SFrameClassifierIter as _SFrameClassifierIter

    start_time = _time.time()
    # @TODO: Should be able to automatically choose number of iterations
    # based on data size: Tracked in Github Issue #1576

    # automatically infer feature column
    if feature is None:
        feature = _tkutl._find_only_drawing_column(input_dataset)

    _raise_error_if_not_drawing_classifier_input_sframe(
        input_dataset, feature, target)

    # Stroke-based input (anything that isn't an Image column) is converted
    # to bitmaps before training.
    is_stroke_input = (input_dataset[feature].dtype != _tc.Image)
    dataset = _extensions._drawing_classifier_prepare_data(
        input_dataset, feature) if is_stroke_input else input_dataset

    iteration = 0

    # Sorted class list gives a deterministic label -> index mapping.
    classes = dataset[target].unique()
    classes = sorted(classes)
    class_to_index = {name: index for index, name in enumerate(classes)}

    validation_set_corrective_string = (
        "'validation_set' parameter must be " +
        "an SFrame, or None, or must be set to 'auto' for the toolkit to " +
        "automatically create a validation set.")
    if isinstance(validation_set, _tc.SFrame):
        _raise_error_if_not_drawing_classifier_input_sframe(
            validation_set, feature, target)
        is_validation_stroke_input = (validation_set[feature].dtype !=
                                      _tc.Image)
        validation_dataset = _extensions._drawing_classifier_prepare_data(
            validation_set,
            feature) if is_validation_stroke_input else validation_set
    elif isinstance(validation_set, str):
        if validation_set == 'auto':
            # Only sample a validation split when there is enough data;
            # otherwise silently disable validation tracking.
            if dataset.num_rows() >= 100:
                if verbose:
                    print(
                        "PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.\n"
                        " You can set ``validation_set=None`` to disable validation tracking.\n"
                    )
                dataset, validation_dataset = dataset.random_split(
                    TRAIN_VALIDATION_SPLIT)
            else:
                validation_set = None
                validation_dataset = _tc.SFrame()
        else:
            raise _ToolkitError("Unrecognized value for 'validation_set'. " +
                                validation_set_corrective_string)
    elif validation_set is None:
        validation_dataset = _tc.SFrame()
    else:
        raise TypeError("Unrecognized type for 'validation_set'." +
                        validation_set_corrective_string)

    # Main training iterator: cycles for max_iterations passes.
    train_loader = _SFrameClassifierIter(dataset, batch_size,
                                         feature_column=feature,
                                         target_column=target,
                                         class_to_index=class_to_index,
                                         load_labels=True,
                                         shuffle=True,
                                         iterations=max_iterations)
    # Single-pass iterators used only for accuracy computation.
    train_loader_to_compute_accuracy = _SFrameClassifierIter(
        dataset, batch_size,
        feature_column=feature,
        target_column=target,
        class_to_index=class_to_index,
        load_labels=True,
        shuffle=True,
        iterations=1)
    validation_loader = _SFrameClassifierIter(validation_dataset, batch_size,
                                              feature_column=feature,
                                              target_column=target,
                                              class_to_index=class_to_index,
                                              load_labels=True,
                                              shuffle=True,
                                              iterations=1)
    if verbose and iteration == 0:
        column_names = ['iteration', 'train_loss', 'train_accuracy', 'time']
        column_titles = [
            'Iteration', 'Training Loss', 'Training Accuracy',
            'Elapsed Time (seconds)'
        ]
        if validation_set is not None:
            column_names.insert(3, 'validation_accuracy')
            column_titles.insert(3, 'Validation Accuracy')
        table_printer = _tc.util._ProgressTablePrinter(column_names,
                                                       column_titles)

    ctx = _mxnet_utils.get_mxnet_context(max_devices=batch_size)
    model = _Model(num_classes=len(classes), prefix="drawing_")
    model_params = model.collect_params()
    model_params.initialize(_mx.init.Xavier(), ctx=ctx)

    if warm_start is not None:
        # allow_missing=True: the pretrained checkpoint may not cover the
        # final classification layer for this class count.
        pretrained_model = _pre_trained_models.DrawingClassifierPreTrainedModel(
            warm_start)
        pretrained_model_params_path = pretrained_model.get_model_path()
        model.load_params(pretrained_model_params_path,
                          ctx=ctx,
                          allow_missing=True)
    softmax_cross_entropy = _mx.gluon.loss.SoftmaxCrossEntropyLoss()
    model.hybridize()
    trainer = _mx.gluon.Trainer(model.collect_params(), 'adam')

    train_accuracy = _mx.metric.Accuracy()
    validation_accuracy = _mx.metric.Accuracy()

    def get_data_and_label_from_batch(batch):
        # A padded (last, short) batch is sliced down to its real size and
        # kept on the first device only; full batches are split across all
        # devices in ctx.
        if batch.pad is not None:
            size = batch_size - batch.pad
            batch_data = (
                [_mx.nd.slice_axis(batch.data[0], axis=0, begin=0, end=size)]
                + [None] * (len(ctx) - 1))
            batch_label = (
                [_mx.nd.slice_axis(batch.label[0], axis=0, begin=0, end=size)
                 ] + [None] * (len(ctx) - 1))
        else:
            batch_data = _mx.gluon.utils.split_and_load(batch.data[0],
                                                        ctx_list=ctx,
                                                        batch_axis=0)
            batch_label = _mx.gluon.utils.split_and_load(batch.label[0],
                                                         ctx_list=ctx,
                                                         batch_axis=0)
        return batch_data, batch_label

    def compute_accuracy(accuracy_metric, batch_loader):
        # One full pass over batch_loader, accumulating into the metric.
        batch_loader.reset()
        accuracy_metric.reset()
        for batch in batch_loader:
            batch_data, batch_label = get_data_and_label_from_batch(batch)
            outputs = []
            for x, y in zip(batch_data, batch_label):
                if x is None or y is None:
                    continue
                z = model(x)
                outputs.append(z)
            accuracy_metric.update(batch_label, outputs)

    for train_batch in train_loader:
        train_batch_data, train_batch_label = get_data_and_label_from_batch(
            train_batch)
        with _autograd.record():
            # Inside training scope
            for x, y in zip(train_batch_data, train_batch_label):
                z = model(x)
                # Computes softmax cross entropy loss.
                loss = softmax_cross_entropy(z, y)
                # Backpropagate the error for one iteration.
                loss.backward()
        # Make one step of parameter update. Trainer needs to know the
        # batch size of data to normalize the gradient by 1/batch_size.
        trainer.step(train_batch.data[0].shape[0])
        # calculate training metrics
        train_loss = loss.mean().asscalar()
        train_time = _time.time() - start_time

        # The loader signals a completed pass by bumping .iteration; compute
        # accuracies once per pass, not per batch.
        if train_batch.iteration > iteration:
            # Compute training accuracy
            compute_accuracy(train_accuracy, train_loader_to_compute_accuracy)
            # Compute validation accuracy
            if validation_set is not None:
                compute_accuracy(validation_accuracy, validation_loader)
            iteration = train_batch.iteration
            if verbose:
                kwargs = {
                    "iteration": iteration,
                    "train_loss": float(train_loss),
                    "train_accuracy": train_accuracy.get()[1],
                    "time": train_time
                }
                if validation_set is not None:
                    kwargs["validation_accuracy"] = validation_accuracy.get(
                    )[1]
                table_printer.print_row(**kwargs)

    state = {
        '_model': model,
        '_class_to_index': class_to_index,
        'num_classes': len(classes),
        'classes': classes,
        'input_image_shape': (1, BITMAP_WIDTH, BITMAP_HEIGHT),
        'batch_size': batch_size,
        'training_loss': train_loss,
        'training_accuracy': train_accuracy.get()[1],
        'training_time': train_time,
        'validation_accuracy': validation_accuracy.get()[1],
        # nan if validation_set=None
        'max_iterations': max_iterations,
        'target': target,
        'feature': feature,
        'num_examples': len(input_dataset)
    }
    return DrawingClassifier(state)
def check_layer_bidirectional_varseqlen(size, in_size):
    """Check that a bidirectional LSTM with use_sequence_length=True matches
    a per-sample reference LSTM, in both forward outputs and weight gradients.

    size: hidden size of the LSTM; in_size: input feature size.
    """
    # Shared random weights for both nets: left/right directions, layer 0,
    # input-to-hidden and hidden-to-hidden weights and biases (4 gates each,
    # hence the size * 4 leading dimension).
    weights = {}
    for d in ['l', 'r']:
        weights['{}0_i2h_weight'.format(d)] = mx.random.uniform(
            shape=(size * 4, in_size))
        weights['{}0_h2h_weight'.format(d)] = mx.random.uniform(shape=(size *
                                                                       4,
                                                                       size))
        weights['{}0_i2h_bias'.format(d)] = mx.random.uniform(shape=(size *
                                                                     4, ))
        weights['{}0_h2h_bias'.format(d)] = mx.random.uniform(shape=(size *
                                                                     4, ))

    # Same architecture; only the variable-sequence-length handling differs.
    net = gluon.rnn.LSTM(size, bidirectional=True, use_sequence_length=True)
    ref_net = gluon.rnn.LSTM(size,
                             bidirectional=True,
                             use_sequence_length=False)
    net.initialize()
    ref_net.initialize()
    net_params = net.collect_params()
    ref_net_params = ref_net.collect_params()
    # Overwrite both nets' parameters with the identical random weights.
    for k in weights:
        net_params[k].set_data(weights[k])
        ref_net_params[k].set_data(weights[k])

    batch_size = 10
    num_timesteps = 11
    # Layout is TNC: (time, batch, features).
    data = mx.random.uniform(shape=(num_timesteps, batch_size, in_size))
    data_np = data.asnumpy()
    # One random valid length in [1, num_timesteps] per batch element.
    # NOTE(review): shape=(batch_size) is just the int batch_size, not a
    # 1-tuple; mxnet appears to accept either — confirm if touched.
    sequence_length = nd.random.randint(1, num_timesteps + 1,
                                        shape=(batch_size)).astype("int32")
    sequence_length_np = sequence_length.asnumpy().astype("int32")

    # Reference net is processing batch elements one at a time, so that it is "perfectly sized"
    # Because of that, we need to accumulate gradients in reference net.
    for p in ref_net.collect_params().values():
        p.grad_req = 'add'

    ref_net_output = []
    with autograd.record():
        net_output = net(data.copy(), sequence_length=sequence_length.copy())

        # Reference path: run each batch element separately, truncated to its
        # own valid length (batch dimension of 1).
        for b in range(batch_size):
            data_slice = mx.nd.array(
                data_np[:sequence_length_np[b], b, :]).reshape(
                    sequence_length_np[b], 1, in_size)
            ref_output_slice = ref_net(data_slice)
            ref_net_output.append(ref_output_slice)

    net_output_np = net_output.asnumpy()

    # TODO: test state return value as well output
    # Only compare the valid sections for each batch entry
    for b in range(batch_size):
        assert_allclose(net_output_np[:sequence_length_np[b], b],
                        ref_net_output[b].asnumpy().squeeze(1),
                        rtol=1e-2,
                        atol=1e-6)

    # Now test backward
    net_output.backward()
    for ref_output_slice in ref_net_output:
        ref_output_slice.backward()

    # Gradients in ref_net were accumulated across per-sample backwards
    # (grad_req='add'), so they are directly comparable to net's.
    ref_net_params = ref_net.collect_params()

    for k in weights:
        net_grad = net_params[k].grad()
        ref_net_grad = ref_net_params[k].grad()
        assert_almost_equal(net_grad.asnumpy(),
                            ref_net_grad.asnumpy(),
                            rtol=1e-2,
                            atol=1e-6)
def train(net, train_data, val_data, eval_metric, ctx, args):
    """Training pipeline.

    Trains a two-stage (RPN + RCNN) detection network with SGD, optional
    mixup, learning-rate warmup/decay, periodic validation, and checkpoint
    saving via save_params. `args` carries all hyperparameters (lr, wd,
    momentum, decay schedule, logging/validation intervals, save prefix).
    """
    # Freeze everything, then re-enable gradients only for the trainable
    # subset returned by collect_train_params().
    net.collect_params().setattr('grad_req', 'null')
    net.collect_train_params().setattr('grad_req', 'write')
    trainer = gluon.Trainer(
        net.collect_train_params(),  # fix batchnorm, fix first stage, etc...
        'sgd',
        {
            'learning_rate': args.lr,
            'wd': args.wd,
            'momentum': args.momentum,
            'clip_gradient': 5
        })

    # lr decay policy
    lr_decay = float(args.lr_decay)
    lr_steps = sorted(
        [float(ls) for ls in args.lr_decay_epoch.split(',') if ls.strip()])
    lr_warmup = float(args.lr_warmup)  # avoid int division

    # TODO(zhreshold) losses?
    rpn_cls_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
        from_sigmoid=False)
    rpn_box_loss = mx.gluon.loss.HuberLoss(rho=1 / 9.)  # == smoothl1
    rcnn_cls_loss = mx.gluon.loss.SoftmaxCrossEntropyLoss()
    rcnn_box_loss = mx.gluon.loss.HuberLoss()  # == smoothl1
    # Running means of the four loss components, for logging.
    metrics = [
        mx.metric.Loss('RPN_Conf'),
        mx.metric.Loss('RPN_SmoothL1'),
        mx.metric.Loss('RCNN_CrossEntropy'),
        mx.metric.Loss('RCNN_SmoothL1'),
    ]

    # Secondary metrics computed from raw targets/predictions.
    rpn_acc_metric = RPNAccMetric()
    rpn_bbox_metric = RPNL1LossMetric()
    rcnn_acc_metric = RCNNAccMetric()
    rcnn_bbox_metric = RCNNL1LossMetric()
    metrics2 = [
        rpn_acc_metric, rpn_bbox_metric, rcnn_acc_metric, rcnn_bbox_metric
    ]

    # set up logger
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    if args.verbose:
        logger.info('Trainable parameters:')
        logger.info(net.collect_train_params().keys())
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    for epoch in range(args.start_epoch, args.epochs):
        mix_ratio = 1.0
        if args.mixup:
            # TODO(zhreshold) only support evenly mixup now, target generator needs to be modified otherwise
            train_data._dataset.set_mixup(np.random.uniform, 0.5, 0.5)
            mix_ratio = 0.5
            # Disable mixup for the final no_mixup_epochs epochs.
            if epoch >= args.epochs - args.no_mixup_epochs:
                train_data._dataset.set_mixup(None)
                mix_ratio = 1.0
        # Apply every decay step whose epoch threshold has been reached.
        while lr_steps and epoch >= lr_steps[0]:
            new_lr = trainer.learning_rate * lr_decay
            lr_steps.pop(0)
            trainer.set_learning_rate(new_lr)
            logger.info("[Epoch {}] Set learning rate to {}".format(
                epoch, new_lr))
        for metric in metrics:
            metric.reset()
        tic = time.time()
        btic = time.time()
        net.hybridize(static_alloc=True)
        base_lr = trainer.learning_rate
        for i, batch in enumerate(train_data):
            # Linear warmup over the first lr_warmup iterations of epoch 0.
            if epoch == 0 and i <= lr_warmup:
                # adjust based on real percentage
                new_lr = base_lr * get_lr_at_iter(i / lr_warmup)
                if new_lr != trainer.learning_rate:
                    if i % args.log_interval == 0:
                        logger.info(
                            '[Epoch 0 Iteration {}] Set learning rate to {}'.
                            format(i, new_lr))
                    trainer.set_learning_rate(new_lr)
            batch = split_and_load(batch, ctx_list=ctx)
            batch_size = len(batch[0])
            losses = []
            metric_losses = [[] for _ in metrics]
            add_losses = [[] for _ in metrics2]
            with autograd.record():
                # One iteration per device slice of the batch.
                for data, label, rpn_cls_targets, rpn_box_targets, rpn_box_masks in zip(
                        *batch):
                    # Label layout: columns 0-3 are the box, column 4 the
                    # class id.
                    gt_label = label[:, :, 4:5]
                    gt_box = label[:, :, :4]
                    cls_pred, box_pred, roi, samples, matches, rpn_score, rpn_box, anchors = net(
                        data, gt_box)

                    # losses of rpn
                    rpn_score = rpn_score.squeeze(axis=-1)
                    # Targets < 0 are ignored; normalize by the count of
                    # non-ignored anchors.
                    num_rpn_pos = (rpn_cls_targets >= 0).sum()
                    rpn_loss1 = rpn_cls_loss(
                        rpn_score, rpn_cls_targets, rpn_cls_targets >=
                        0) * rpn_cls_targets.size / num_rpn_pos
                    rpn_loss2 = rpn_box_loss(
                        rpn_box, rpn_box_targets,
                        rpn_box_masks) * rpn_box.size / num_rpn_pos
                    # rpn overall loss, use sum rather than average
                    rpn_loss = rpn_loss1 + rpn_loss2
                    # generate targets for rcnn
                    cls_targets, box_targets, box_masks = net.target_generator(
                        roi, samples, matches, gt_label, gt_box)
                    # losses of rcnn
                    num_rcnn_pos = (cls_targets >= 0).sum()
                    rcnn_loss1 = rcnn_cls_loss(
                        cls_pred, cls_targets, cls_targets >= 0
                    ) * cls_targets.size / cls_targets.shape[0] / num_rcnn_pos
                    rcnn_loss2 = rcnn_box_loss(
                        box_pred, box_targets, box_masks
                    ) * box_pred.size / box_pred.shape[0] / num_rcnn_pos
                    rcnn_loss = rcnn_loss1 + rcnn_loss2
                    # overall losses
                    losses.append(rpn_loss.sum() * mix_ratio +
                                  rcnn_loss.sum() * mix_ratio)
                    metric_losses[0].append(rpn_loss1.sum() * mix_ratio)
                    metric_losses[1].append(rpn_loss2.sum() * mix_ratio)
                    metric_losses[2].append(rcnn_loss1.sum() * mix_ratio)
                    metric_losses[3].append(rcnn_loss2.sum() * mix_ratio)
                    add_losses[0].append(
                        [[rpn_cls_targets, rpn_cls_targets >= 0],
                         [rpn_score]])
                    add_losses[1].append([[rpn_box_targets, rpn_box_masks],
                                          [rpn_box]])
                    add_losses[2].append([[cls_targets], [cls_pred]])
                    add_losses[3].append([[box_targets, box_masks],
                                          [box_pred]])
                autograd.backward(losses)
                for metric, record in zip(metrics, metric_losses):
                    metric.update(0, record)
                for metric, records in zip(metrics2, add_losses):
                    for pred in records:
                        metric.update(pred[0], pred[1])
            trainer.step(batch_size)
            # update metrics
            if args.log_interval and not (i + 1) % args.log_interval:
                # msg = ','.join(['{}={:.3f}'.format(*metric.get()) for metric in metrics])
                msg = ','.join([
                    '{}={:.3f}'.format(*metric.get())
                    for metric in metrics + metrics2
                ])
                logger.info(
                    '[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}'.
                    format(
                        epoch, i,
                        args.log_interval * batch_size /
                        (time.time() - btic), msg))
                btic = time.time()

        msg = ','.join(
            ['{}={:.3f}'.format(*metric.get()) for metric in metrics])
        logger.info('[Epoch {}] Training cost: {:.3f}, {}'.format(
            epoch, (time.time() - tic), msg))
        if not (epoch + 1) % args.val_interval:
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            current_map = float(mean_ap[-1])
        else:
            current_map = 0.
        # save_params tracks best_map internally (list used as a mutable
        # cell) and decides whether to write a checkpoint.
        save_params(net, logger, best_map, current_map, epoch,
                    args.save_interval, args.save_prefix)
def nnTrain(model_mark, nnModel, train_data, valid_data_X, valid_data_Y,
            test_data_X, test_data_Y, batch_size, loss_func, epochs,
            optimizer, optimizer_params, lr_decay_rate=1):
    """Train ``nnModel`` and checkpoint it when validation SMAPE improves.

    Providing 3 approaches to train the model: momentum (sgd), adadelta
    and adam.

    Parameters
    ----------
    model_mark : str
        Prefix used to name saved checkpoints.
    nnModel : gluon.Block
        Network to train; its parameters are (re-)initialized here.
    train_data : dataset
        Training samples, wrapped in a shuffling DataLoader.
    valid_data_X, valid_data_Y : NDArray
        Validation inputs/targets; used for model selection.
    test_data_X, test_data_Y : NDArray
        Test inputs/targets; used for reporting only.
    batch_size : int
        Mini-batch size for the DataLoader and ``trainer.step``.
    loss_func : callable
        Training loss, called as ``loss_func(output, label)``.
    epochs : int
        Number of passes over the training data.
    optimizer : str
        One of 'sgd', 'adadelta', 'adam'.
    optimizer_params : dict
        Forwarded to ``gluon.Trainer``.
    lr_decay_rate : float, optional
        Multiplicative learning-rate decay applied once per epoch.
    """
    assert optimizer in set(['sgd', 'adadelta', 'adam'])
    random.seed(1)
    train_iter = gluon.data.DataLoader(train_data, batch_size, shuffle=True)
    nTrain = len(train_data)
    # BUG FIX: the validation sample count must come from the validation
    # targets, not the test targets — previously `nValid = len(test_data_Y)`
    # normalized the reported validation loss by the wrong set size whenever
    # the two splits differ in length.
    nValid = len(valid_data_Y)

    # The model: fixed seed so re-initialization is reproducible.
    mx.random.seed(123456)
    nnModel.collect_params().initialize(mx.initializer.MSRAPrelu(),
                                        ctx=context)
    trainer = gluon.Trainer(nnModel.collect_params(),
                            optimizer=optimizer,
                            optimizer_params=optimizer_params)
    best_smape = 1
    for e in range(epochs):
        # if(e>=2):
        trainer.set_learning_rate(trainer.learning_rate * lr_decay_rate)
        train_loss = 0
        for data, label in train_iter:
            label = label.as_in_context(context)
            with autograd.record():
                output = nnModel(data)
                loss = loss_func(output, label)
            loss.backward()
            trainer.step(batch_size)
            train_loss += nd.sum(loss).asscalar()

        # The valid loss
        valid_pred = DLPred(nnModel, valid_data_X)[:, 0].asnumpy()
        valid_true = valid_data_Y.asnumpy()
        # The test loss
        test_pred = DLPred(nnModel, test_data_X)[:, 0].asnumpy()
        test_true = test_data_Y.asnumpy()
        valid_loss = nd.sum(
            abs_loss(nd.array(valid_true), nd.array(valid_pred))).asscalar()
        test_loss = nd.sum(abs_loss(nd.array(test_true),
                                    nd.array(test_pred))).asscalar()
        valid_smape = smape(valid_true, valid_pred)
        test_smape = smape(test_true, test_pred)
        print("Epoch %d, train loss: %f, valid_loss: %f" %
              (e, train_loss / nTrain, valid_loss / nValid))
        print("Valid smape %f; Test smape %f" % (valid_smape, test_smape))

        # Save the model whenever validation SMAPE improves (never at epoch
        # 0, preserving the original behavior).
        if (e == 0 or valid_smape < best_smape):
            best_smape = valid_smape
            # NOTE(review): best_smape was just updated, so the
            # `valid_smape < best_smape + 0.3` guard is always true; kept
            # as-is for parity, but the intent may have been to compare
            # against the pre-update best — confirm with the author.
            if e > 0 and valid_smape < best_smape + 0.3:
                save_checkpoint(nnModel, model_mark + str(e),
                                round(valid_smape, 2),
                                save_path="checkpoints")
def train(channel_input_dirs, hyperparameters, **kwargs):
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    """Fit the MNIST network on the SageMaker-provided training channel.

    Args:
        channel_input_dirs: mapping of channel name -> local data directory;
            only the "training" entry is consumed here.
        hyperparameters: dict of tunables set in the notebook; defaults are
            applied for any missing key.
        **kwargs: extra SageMaker container arguments (unused).

    Returns:
        The trained gluon network.
    """
    ctx = mx.cpu()

    # Pull hyperparameters with the usual fallbacks.
    hp = hyperparameters
    batch_size = hp.get("batch_size", 100)
    epochs = hp.get("epochs", 10)
    learning_rate = hp.get("learning_rate", 0.1)
    momentum = hp.get("momentum", 0.9)
    log_interval = hp.get("log_interval", 100)

    training_data = channel_input_dirs["training"]

    # gluon.data.vision.MNIST has built-in mnist pre-processing; pointing the
    # loaders at the SageMaker-staged files avoids re-downloading the data.
    train_data = get_train_data(training_data, batch_size)
    val_data = get_val_data(training_data, batch_size)

    # Build the network and initialize all of its parameters.
    net = define_network()
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)

    # Trainer applies gradient updates to the collected parameters.
    trainer = gluon.Trainer(
        net.collect_params(), "sgd",
        {"learning_rate": learning_rate, "momentum": momentum}
    )
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    for epoch in range(epochs):
        # Fresh metric and timer at the start of every epoch.
        metric.reset()
        tick = time.time()
        for batch_idx, (x, y) in enumerate(train_data):
            # Move the batch onto the training context if necessary.
            x = x.as_in_context(ctx)
            y = y.as_in_context(ctx)
            # Record the computation graph so backward() can differentiate it.
            with autograd.record():
                preds = net(x)
                batch_loss = loss(preds, y)
            batch_loss.backward()
            # One optimizer step, normalized by the actual batch size.
            trainer.step(x.shape[0])
            # Update the running accuracy last.
            metric.update([y], [preds])

            if batch_idx % log_interval == 0 and batch_idx > 0:
                name, acc = metric.get()
                logger.info(
                    "[Epoch %d Batch %d] Training: %s=%f, %f samples/s"
                    % (epoch, batch_idx, name, acc,
                       batch_size / (time.time() - tick))
                )
                tick = time.time()

        name, acc = metric.get()
        logger.info("[Epoch %d] Training: %s=%f" % (epoch, name, acc))

        name, val_acc = test(ctx, net, val_data)
        logger.info("[Epoch %d] Validation: %s=%f" % (epoch, name, val_acc))

    return net
import mxnet as mx
def train(self, batch_size=64, num_epoch=10, eval_metric='acc',
          optimizer='adam', optimizer_params=(('learning_rate', 0.001), ),
          load_checkpoint=True, context='gpu', checkpoint_period=5,
          normalize=True, noise_distribution='gaussian',
          noise_distribution_params=(('mean_value', 0), ('spread_value', 1), ),
          discriminator_optimizer='adam',
          discriminator_optimizer_params=(('learning_rate', 0.001), ),
          constraint_distributions={}, constraint_losses={},
          preprocessing=False, k_value=1, generator_loss=None,
          generator_target_name="", noise_input="", gen_loss_weight=1,
          dis_loss_weight=1, log_period=50, print_images=False):
    """Train a (conditional, optionally InfoGAN-style) GAN.

    Runs a discriminator update on every batch and a generator (plus,
    when ``self.use_qnet`` is set, Q-network) update every ``k_value``
    batches. Both networks are checkpointed every ``checkpoint_period``
    epochs and exported at the end.

    ``optimizer_params`` / ``discriminator_optimizer_params`` /
    ``noise_distribution_params`` may be dicts or tuples of key/value
    pairs. ``noise_input`` names the generator input fed from
    ``noise_distribution``; ``constraint_distributions`` and
    ``constraint_losses`` configure the Q-network latent codes.
    Returns None; trained parameters are written to the net creators'
    model directories.
    """
    if context == 'gpu':
        mx_context = mx.gpu()
    elif context == 'cpu':
        mx_context = mx.cpu()
    else:
        # FIX: previously only logged (with a misplaced quote in the
        # message) and then crashed later with a NameError on
        # mx_context; fail fast with a clear error instead.
        logging.error("Context argument is '" + context +
                      "'. Only 'cpu' and 'gpu' are valid arguments.")
        raise ValueError("Invalid context: " + str(context))

    # FIX: accept both dicts and the tuple-of-pairs defaults; the item
    # assignments / `del`s below would fail on tuples, and copying also
    # avoids mutating a caller-owned dict.
    optimizer_params = dict(optimizer_params)
    discriminator_optimizer_params = dict(discriminator_optimizer_params)
    noise_distribution_params = dict(noise_distribution_params)

    # Input/output names are stored with a trailing "_"; strip it.
    gen_input_names = [name[:-1] for name in self._net_creator_gen.getInputs().keys()]
    dis_input_names = [name[:-1] for name in self._net_creator_dis.getInputs().keys()]
    if self.use_qnet:
        qnet_input_names = [name[:-1] for name in self._net_creator_qnet.getOutputs().keys()]

    dis_real_input = list(self._net_creator_gen.getOutputs().keys())[0][:-1]
    gen_output_name = list(self._net_creator_gen.getOutputs().keys())[0][:-1]

    # Conditional (cGAN) inputs are generator inputs that are neither the
    # noise input nor Q-network latent codes.
    if self.use_qnet:
        cGAN_input_names = set(gen_input_names).difference(qnet_input_names)
    else:
        cGAN_input_names = set(gen_input_names)
    cGAN_input_names.discard(noise_input)
    cGAN_input_names = list(cGAN_input_names)

    if preprocessing:
        preproc_lib = "CNNPreprocessor_defaultGAN_defaultGANConnector_predictor_executor"

    self._data_loader._output_names_ = []
    if not generator_target_name == "":
        self._data_loader._input_names_ = cGAN_input_names + [gen_output_name] + [generator_target_name]
    else:
        self._data_loader._input_names_ = cGAN_input_names + [gen_output_name]

    if preprocessing:
        train_iter, test_iter, data_mean, data_std, _, _ = \
            self._data_loader.load_preprocessed_data(batch_size, preproc_lib)
    else:
        train_iter, test_iter, data_mean, data_std, _, _ = \
            self._data_loader.load_data(batch_size)

    # Map each training-data field name (with trailing "_") to its index
    # in a batch's data list.
    traindata_to_index = {}
    for cur_index, data_tuple in enumerate(train_iter.data):
        traindata_to_index[data_tuple[0] + "_"] = cur_index

    # Translate generated-config keys into mxnet optimizer arguments.
    if 'weight_decay' in optimizer_params:
        optimizer_params['wd'] = optimizer_params['weight_decay']
        del optimizer_params['weight_decay']
    if 'learning_rate_decay' in optimizer_params:
        min_learning_rate = 1e-08
        if 'learning_rate_minimum' in optimizer_params:
            min_learning_rate = optimizer_params['learning_rate_minimum']
            del optimizer_params['learning_rate_minimum']
        optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            optimizer_params['step_size'],
            factor=optimizer_params['learning_rate_decay'],
            stop_factor_lr=min_learning_rate)
        del optimizer_params['step_size']
        del optimizer_params['learning_rate_decay']

    if 'weight_decay' in discriminator_optimizer_params:
        discriminator_optimizer_params['wd'] = discriminator_optimizer_params['weight_decay']
        del discriminator_optimizer_params['weight_decay']
    # FIX: this block previously tested `optimizer_params` (copy/paste),
    # so the discriminator's learning-rate decay was never applied.
    if 'learning_rate_decay' in discriminator_optimizer_params:
        min_learning_rate = 1e-08
        if 'learning_rate_minimum' in discriminator_optimizer_params:
            min_learning_rate = discriminator_optimizer_params['learning_rate_minimum']
            del discriminator_optimizer_params['learning_rate_minimum']
        discriminator_optimizer_params['lr_scheduler'] = mx.lr_scheduler.FactorScheduler(
            discriminator_optimizer_params['step_size'],
            factor=discriminator_optimizer_params['learning_rate_decay'],
            stop_factor_lr=min_learning_rate)
        del discriminator_optimizer_params['step_size']
        del discriminator_optimizer_params['learning_rate_decay']

    # Construct the networks (discriminator optionally with input
    # normalization statistics from the data loader).
    if normalize:
        self._net_creator_dis.construct([mx_context], batch_size=batch_size,
                                        data_mean=data_mean, data_std=data_std)
    else:
        self._net_creator_dis.construct([mx_context], batch_size=batch_size)
    self._net_creator_gen.construct([mx_context])

    if self.use_qnet:
        self._net_creator_qnet.construct([mx_context])
        if load_checkpoint:
            self._net_creator_qnet.load([mx_context])
        else:
            if os.path.isdir(self._net_creator_qnet._model_dir_):
                shutil.rmtree(self._net_creator_qnet._model_dir_)
            try:
                os.makedirs(self._net_creator_qnet._model_dir_)
            except OSError:
                # Ignore a concurrent creation; re-raise anything else.
                if not os.path.isdir(self._net_creator_qnet._model_dir_):
                    raise
        q_net = self._net_creator_qnet.networks[0]

    begin_epoch = 0
    if load_checkpoint:
        begin_epoch = self._net_creator_dis.load([mx_context])
        self._net_creator_gen.load([mx_context])
    else:
        if os.path.isdir(self._net_creator_dis._model_dir_):
            shutil.rmtree(self._net_creator_dis._model_dir_)
        if os.path.isdir(self._net_creator_gen._model_dir_):
            shutil.rmtree(self._net_creator_gen._model_dir_)

    dis_net = self._net_creator_dis.networks[0]
    gen_net = self._net_creator_gen.networks[0]

    try:
        os.makedirs(self._net_creator_gen._model_dir_)
        os.makedirs(self._net_creator_dis._model_dir_)
    except OSError:
        # An already-existing directory (e.g. after load_checkpoint) is fine.
        if not (os.path.isdir(self._net_creator_gen._model_dir_)
                and os.path.isdir(self._net_creator_dis._model_dir_)):
            raise

    gen_trainer = mx.gluon.Trainer(gen_net.collect_params(), optimizer,
                                   optimizer_params)
    dis_trainer = mx.gluon.Trainer(dis_net.collect_params(),
                                   discriminator_optimizer,
                                   discriminator_optimizer_params)
    if self.use_qnet:
        qnet_trainer = mx.gluon.Trainer(q_net.collect_params(),
                                        discriminator_optimizer,
                                        discriminator_optimizer_params)

    dis_loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    dis_loss.hybridize()

    if generator_loss is not None:
        if generator_loss == "l2":
            generator_loss_func = mx.gluon.loss.L2Loss()
            generator_loss_func.hybridize()
        elif generator_loss == "l1":
            generator_loss_func = mx.gluon.loss.L1Loss()
            generator_loss_func.hybridize()
        else:
            # FIX: previously only logged and crashed later with a
            # NameError on generator_loss_func; fail fast instead.
            logging.error("Invalid generator loss parameter")
            raise ValueError("Invalid generator loss: " + str(generator_loss))

    # Created for interface parity with other generated trainers; the GAN
    # loop below does not update these metrics.
    metric_dis = mx.metric.create(eval_metric)
    metric_gen = mx.metric.create(eval_metric)

    gen_inputs = self._net_creator_gen.getInputs()
    dis_inputs = self._net_creator_dis.getInputs()

    qnet_outputs = []
    if self.use_qnet:
        qnet_outputs = self._net_creator_qnet.getOutputs()

    qnet_losses = []
    generators = {}
    if self.use_qnet:
        # Build one sampling function and one loss per Q-network latent code.
        for name in qnet_outputs:
            domain = gen_inputs[name]
            dom_min = domain[1]
            dom_max = domain[2]
            if name[:-1] in constraint_distributions:
                dist_dict = constraint_distributions[name[:-1]]
                # FIX: was `dist_name is "gaussian"` (identity comparison on
                # a string literal); also bind dist_dict as a default to
                # avoid the late-binding-closure bug across loop iterations.
                if dist_dict['name'] == "gaussian":
                    generators[name] = lambda domain=domain, dist_dict=dist_dict: mx.nd.cast(
                        mx.ndarray.random.normal(dist_dict["mean_value"],
                                                 dist_dict["spread_value"],
                                                 shape=(batch_size, ) + domain[3],
                                                 dtype=domain[0],
                                                 ctx=mx_context),
                        dtype="float32")
            else:
                # No constraint distribution configured: default samplers
                # derived from the input's declared element type.
                if domain[0] == float:
                    generators[name] = lambda domain=domain, dom_min=dom_min, dom_max=dom_max: mx.nd.cast(
                        mx.ndarray.random.uniform(dom_min, dom_max,
                                                  shape=(batch_size, ) + domain[3],
                                                  dtype=domain[0],
                                                  ctx=mx_context),
                        dtype="float32")
                elif domain[0] == int:
                    generators[name] = lambda domain=domain, dom_min=dom_min, dom_max=dom_max: mx.ndarray.one_hot(
                        mx.ndarray.random.randint(low=0,
                                                  high=int(dom_max - dom_min) + 1,
                                                  shape=(batch_size, ),
                                                  dtype=int,
                                                  ctx=mx_context),
                        depth=int(dom_max - dom_min) + 1,
                        on_value=1).reshape((batch_size, ) + domain[3])
            # FIX: was `if name[-1] in constraint_losses` (last character of
            # the name instead of the name with the trailing "_" stripped).
            if name[:-1] in constraint_losses:
                loss_dict = constraint_losses[name[:-1]]
                loss_name = loss_dict['name']
                # `margin` is an accepted config key but unused by the
                # losses instantiated here.
                margin = loss_dict['margin'] if 'margin' in loss_dict else 1.0
                sparseLabel = loss_dict['sparse_label'] if 'sparse_label' in loss_dict else True
                ignore_indices = [loss_dict['ignore_indices']] if 'ignore_indices' in loss_dict else []
                fromLogits = loss_dict['from_logits'] if 'from_logits' in loss_dict else False
                if loss_name == 'softmax_cross_entropy':
                    qnet_losses += [mx.gluon.loss.SoftmaxCrossEntropyLoss(
                        from_logits=fromLogits, sparse_label=sparseLabel)]
                elif loss_name == 'softmax_cross_entropy_ignore_indices':
                    qnet_losses += [SoftmaxCrossEntropyLossIgnoreIndices(
                        ignore_indices=ignore_indices,
                        from_logits=fromLogits,
                        sparse_label=sparseLabel)]
                elif loss_name == 'sigmoid_binary_cross_entropy':
                    qnet_losses += [mx.gluon.loss.SigmoidBinaryCrossEntropyLoss(
                        from_sigmoid=True)]
                elif loss_name == 'cross_entropy':
                    qnet_losses += [CrossEntropyLoss(sparse_label=sparseLabel)]
                elif loss_name == 'l2':
                    qnet_losses += [mx.gluon.loss.L2Loss()]
                elif loss_name == 'l1':
                    # FIX: the 'l1' branch previously instantiated L2Loss.
                    qnet_losses += [mx.gluon.loss.L1Loss()]
                elif loss_name == 'log_cosh':
                    qnet_losses += [LogCoshLoss()]
                else:
                    logging.error("Invalid loss parameter for constraint:" +
                                  name[:-1] + ".")
            else:
                # Default loss by declared element type.
                if domain[0] == float:
                    qnet_losses += [mx.gluon.loss.L2Loss()]
                elif domain[0] == int:
                    qnet_losses += [lambda pred, labels: mx.gluon.loss.
                                    SoftmaxCrossEntropyLoss(sparse_label=False)
                                    (pred, labels)]

    # Sampler for the noise input of the generator.
    for name in gen_inputs:
        if name == noise_input + "_":
            domain = gen_inputs[name]
            dom_min = domain[1]
            dom_max = domain[2]
            if noise_distribution == "gaussian":
                generators[name] = lambda domain=domain: mx.nd.cast(
                    mx.ndarray.random.normal(
                        noise_distribution_params["mean_value"],
                        noise_distribution_params["spread_value"],
                        shape=(batch_size, ) + domain[3],
                        dtype=domain[0],
                        ctx=mx_context),
                    dtype="float32")
            elif noise_distribution == "uniform":
                generators[name] = lambda domain=domain, dom_min=dom_min, dom_max=dom_max: mx.nd.cast(
                    mx.ndarray.random.uniform(low=dom_min, high=dom_max,
                                              shape=(batch_size, ) + domain[3],
                                              dtype=domain[0],
                                              ctx=mx_context),
                    dtype="float32")

    def create_generator_input(cur_batch):
        """Assemble generator inputs for a batch; also return the sampled
        Q-network codes the Q-net is expected to reconstruct."""
        expected_qnet_output = []
        gen_input = []
        for name in gen_inputs:
            if name in traindata_to_index.keys():
                gen_input += [cur_batch.data[traindata_to_index[name]]
                              .as_in_context(mx_context)]
            elif name in qnet_outputs:
                value = generators[name]()
                expected_qnet_output += [value]
                gen_input += [value]
            else:
                gen_input += [generators[name]()]
        return gen_input, expected_qnet_output

    def create_discriminator_input(cur_batch):
        """Conditional inputs shared by generator and discriminator."""
        conditional_input = []
        for name in gen_inputs:
            if name in traindata_to_index.keys():
                conditional_input += [cur_batch.data[traindata_to_index[name]]
                                      .as_in_context(mx_context)]
        return conditional_input

    tic = None
    for epoch in range(begin_epoch, begin_epoch + num_epoch):
        train_iter.reset()
        for batch_i, batch in enumerate(train_iter):
            real_data = batch.data[traindata_to_index[dis_real_input + "_"]] \
                .as_in_context(mx_context)
            dis_conditional_input = create_discriminator_input(batch)
            gen_input, exp_qnet_output = create_generator_input(batch)

            # --- Discriminator update ------------------------------------
            with autograd.record():
                fake_data = gen_net(*gen_input)[0][0]
                # FIX: detach() returns a new array; the result was
                # discarded before, so discriminator gradients flowed
                # into the generator during this update.
                fake_data = fake_data.detach()
                discriminated_fake_dis = dis_net(fake_data,
                                                 *dis_conditional_input)[0][0]
                if self.use_qnet:
                    discriminated_fake_dis, _ = discriminated_fake_dis
                fake_labels = mx.nd.zeros(discriminated_fake_dis.shape,
                                          ctx=mx_context)
                real_labels = mx.nd.ones(discriminated_fake_dis.shape,
                                         ctx=mx_context)
                loss_resultF = dis_loss(discriminated_fake_dis, fake_labels)
                discriminated_real_dis = dis_net(real_data,
                                                 *dis_conditional_input)[0][0]
                if self.use_qnet:
                    discriminated_real_dis, _ = discriminated_real_dis
                loss_resultR = dis_loss(discriminated_real_dis, real_labels)
                loss_resultD = dis_loss_weight * (loss_resultR + loss_resultF)
                loss_resultD.backward()
            dis_trainer.step(batch_size)

            # --- Generator (and Q-network) update every k_value batches --
            if batch_i % k_value == 0:
                with autograd.record():
                    fake_data = gen_net(*gen_input)[0][0]
                    discriminated_fake_gen = dis_net(
                        fake_data, *dis_conditional_input)[0][0]
                    if self.use_qnet:
                        discriminated_fake_gen, features = discriminated_fake_gen
                    # Non-saturating generator loss: fake data labeled real.
                    loss_resultG = dis_loss(discriminated_fake_gen, real_labels)
                    if generator_loss is not None:
                        condition = batch.data[traindata_to_index[
                            generator_target_name + "_"]]
                        loss_resultG = loss_resultG + gen_loss_weight * \
                            generator_loss_func(fake_data, condition)
                    if self.use_qnet:
                        qnet_discriminated = [q_net(features)[0][0]]
                        for i, qnet_out in enumerate(qnet_discriminated):
                            loss_resultG = loss_resultG + qnet_losses[i](
                                qnet_out, exp_qnet_output[i])
                    loss_resultG.backward()
                gen_trainer.step(batch_size)
                if self.use_qnet:
                    qnet_trainer.step(batch_size)

            if tic is None:
                tic = time.time()
            else:
                if batch_i % log_period == 0:
                    try:
                        speed = log_period * batch_size / (time.time() - tic)
                    except ZeroDivisionError:
                        speed = float("inf")
                    logging.info(" Discriminator loss on real data: " +
                                 str(loss_resultR[0].asnumpy().item()))
                    logging.info(" Discriminator loss on fake data: " +
                                 str(loss_resultF[0].asnumpy().item()))
                    logging.info(" Generator loss: " +
                                 str(loss_resultG[0].asnumpy().item()))
                    logging.info(
                        "Epoch[%d] Batch[%d] Speed: %.2f samples/sec \n"
                        % (epoch, batch_i, speed))
                    tic = time.time()
                    if print_images:
                        pyplot.subplot(1, 2, 1)
                        fake_img = fake_data[0]
                        visualize(fake_img)
                        filename = 'plot_%06d%06d.png' % (epoch, batch_i)
                        pyplot.savefig(filename)
                        pyplot.close()

        if (epoch - begin_epoch) % checkpoint_period == 0:
            gen_net.save_parameters(self.parameter_path_gen() + '-' +
                                    str(epoch).zfill(4) + '.params')
            dis_net.save_parameters(self.parameter_path_dis() + '-' +
                                    str(epoch).zfill(4) + '.params')

    # Final parameter snapshots and symbol exports.
    gen_net.save_parameters(self.parameter_path_gen() + '-' +
                            str(num_epoch + begin_epoch).zfill(4) + '.params')
    gen_net.export(self.parameter_path_gen() + '_newest', epoch=0)
    dis_net.save_parameters(self.parameter_path_dis() + '-' +
                            str(num_epoch + begin_epoch).zfill(4) + '.params')
    dis_net.export(self.parameter_path_dis() + '_newest', epoch=0)
    if generator_loss is not None:
        generator_loss_func.export(self.parameter_path_gen() + '_newest_loss',
                                   epoch=0)
    dis_loss.export(self.parameter_path_dis() + '_newest_loss', epoch=0)
def CNN(epoch=100, batch_size=128, save_period=10, load_period=100,
        optimizer="sgd", learning_rate=0.01, dataset="MNIST", ctx=mx.gpu(0)):
    """Build and train a small Conv-BN-ReLU CNN classifier.

    Args:
        epoch: number of training epochs.
        batch_size: mini-batch size used by the data loaders.
        save_period: save weights every this many epochs.
        load_period: epoch number of the checkpoint to resume from.
        optimizer: gluon optimizer name.
        learning_rate: optimizer learning rate.
        dataset: one of "MNIST", "CIFAR10", "FashionMNIST".
        ctx: mxnet context to train on.

    Returns:
        "optimization completed" on success, or an error string when the
        dataset name is unknown.
    """
    # --- dataset selection -------------------------------------------------
    if dataset == "MNIST":
        train_data, test_data = MNIST(batch_size)
        path = "weights/MNIST-{}.params".format(load_period)
    elif dataset == "CIFAR10":
        train_data, test_data = CIFAR10(batch_size)
        path = "weights/CIFAR10-{}.params".format(load_period)
    elif dataset == "FashionMNIST":
        train_data, test_data = FashionMNIST(batch_size)
        path = "weights/FashionMNIST-{}.params".format(load_period)
    else:
        return "The dataset does not exist."

    # Convolution Neural Network
    # formula : output_size=((input-weights+2*Padding)/Stride)+1
    # data size
    # MNIST,FashionMNIST = (batch size , 1 , 28 , 28)
    # CIFAR = (batch size , 3 , 32 , 32)
    net = gluon.nn.HybridSequential()  # hybridizable for faster learning
    with net.name_scope():
        net.add(
            gluon.nn.Conv2D(channels=60, kernel_size=(3, 3), strides=(1, 1),
                            use_bias=True)
        )  # MNIST : result = ( batch size , 60 , 26 , 26) , CIFAR10 : result = ( batch size , 60 , 30 , 30)
        net.add(
            gluon.nn.BatchNorm(axis=1, momentum=0.9, epsilon=1e-05,
                               center=True, scale=True,
                               beta_initializer="zeros",
                               gamma_initializer="ones",
                               running_mean_initializer="zeros",
                               running_variance_initializer="ones"))
        net.add(gluon.nn.Activation("relu"))
        net.add(
            gluon.nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        )  # MNIST : result = (batch size , 60 , 13 , 13) , CIFAR10 : result = (batch size , 60 , 15 , 15)
        net.add(
            gluon.nn.Conv2D(channels=30, kernel_size=(6, 6), strides=(1, 1),
                            use_bias=True)
        )  # MNIST : result = ( batch size , 30 , 8 , 8), CIFAR10 : result = ( batch size , 30 , 10 , 10)
        net.add(
            gluon.nn.BatchNorm(axis=1, momentum=0.9, epsilon=1e-05,
                               center=True, scale=True,
                               beta_initializer="zeros",
                               gamma_initializer="ones",
                               running_mean_initializer="zeros",
                               running_variance_initializer="ones"))
        net.add(gluon.nn.Activation("relu"))
        net.add(
            gluon.nn.MaxPool2D(pool_size=(2, 2), strides=(2, 2))
        )  # MNIST : result = (batch size , 30 , 4 , 4) , CIFAR10 : result = (batch size , 30 , 5 , 5)
        net.add(gluon.nn.Dense(units=120, use_bias=True, flatten=True))
        net.add(
            gluon.nn.BatchNorm(axis=1, momentum=0.9, epsilon=1e-05,
                               center=True, scale=True,
                               beta_initializer="zeros",
                               gamma_initializer="ones",
                               running_mean_initializer="zeros",
                               running_variance_initializer="ones"))
        net.add(gluon.nn.Activation("relu"))
        net.add(gluon.nn.Dropout(0.0))
        net.add(gluon.nn.Dense(units=64, use_bias=True))
        net.add(
            gluon.nn.BatchNorm(axis=1, momentum=0.9, epsilon=1e-05,
                               center=True, scale=True,
                               beta_initializer="zeros",
                               gamma_initializer="ones",
                               running_mean_initializer="zeros",
                               running_variance_initializer="ones"))
        net.add(gluon.nn.Activation("relu"))
        net.add(gluon.nn.Dropout(0.0))
        net.add(gluon.nn.Dense(10, use_bias=True))
    net.hybridize()  # for faster learning

    # --- weights initialization (resume from checkpoint when available) ----
    if os.path.exists(path):
        print("loading weights")
        net.load_params(filename=path, ctx=ctx)  # weights load
    else:
        print("initializing weights")
        net.collect_params().initialize(mx.init.Normal(sigma=0.1), ctx=ctx)

    # optimizer
    trainer = gluon.Trainer(net.collect_params(), optimizer,
                            {"learning_rate": learning_rate})

    # FIX: build the loss block once instead of constructing a new
    # SoftmaxCrossEntropyLoss object for every batch inside autograd.record().
    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss(
        axis=-1, sparse_label=True)

    # --- training loop -----------------------------------------------------
    for i in tqdm(range(1, epoch + 1, 1)):
        for data, label in train_data:
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            with autograd.record(train_mode=True):
                output = net(data)
                loss = softmax_cross_entropy(output, label)
                # asscalar() forces a sync so the last batch cost can be
                # printed below.
                cost = nd.mean(loss).asscalar()
                loss.backward()
            trainer.step(batch_size, ignore_stale_grad=True)
        print(" epoch : {} , last batch cost : {}".format(i, cost))

        # weight_save
        if i % save_period == 0:
            if not os.path.exists("weights"):
                os.makedirs("weights")
            print("saving weights")
            if dataset == "MNIST":
                net.save_params("weights/MNIST-{}.params".format(i))
            if dataset == "FashionMNIST":
                net.save_params("weights/FashionMNIST-{}.params".format(i))
            elif dataset == "CIFAR10":
                net.save_params("weights/CIFAR10-{}.params".format(i))

    test_accuracy = evaluate_accuracy(test_data, net, ctx)
    print("Test_acc : {}".format(test_accuracy))
    return "optimization completed"
def train(net, train_data, val_data, eval_metric, polygon_metric, ctx, args):
    """Training pipeline for the polygon-aware YOLO-style detector.

    Trains ``net`` on ``train_data`` with SGD + an LRScheduler, logging the
    per-component losses (objectness, box center/scale, polygon coefficients,
    class). Validation is currently disabled (see the ``if False`` gate near
    the end), so checkpoints are driven by ``save_params`` with map 0.

    Args:
        net: detector network; called as net(x, gt_boxes, *fixed_targets)
            and returns 5 per-component losses.
        train_data / val_data: data loaders.
        eval_metric / polygon_metric: metrics passed through to validate().
        ctx: list of contexts for split_and_load.
        args: namespace of hyperparameters (lr, wd, epochs, mixup, ...).
    """
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        # Disable weight decay for BN parameters and biases.
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    if args.label_smooth:
        net._target_generator._label_smooth = True
    if args.lr_decay_period > 0:
        # Evenly spaced decay epochs.
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    lr_scheduler = LRScheduler(mode=args.lr_mode,
                               baselr=args.lr,
                               niters=args.num_samples // args.batch_size,
                               nepochs=args.epochs,
                               step=lr_decay_epoch,
                               step_factor=args.lr_decay,
                               power=2,
                               warmup_epochs=args.warmup_epochs)
    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'wd': args.wd,
        'momentum': args.momentum,
        'lr_scheduler': lr_scheduler
    },
                            kvstore='local')
    # targets
    # NOTE(review): sigmoid_ce / l1_loss are created but not used in this
    # loop — the per-component losses come from the network itself.
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()
    # metrics
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    # coef_center_metrics = mx.metric.Loss('CoefCenterLoss')
    coef_metrics = mx.metric.Loss('CoefLoss')
    # w_metrics = mx.metric.Loss('wLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')
    # set up logger: console + per-run log file next to save_prefix
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]  # mutated in place by save_params
    for epoch in range(args.start_epoch, args.epochs):
        if args.mixup:
            # TODO(threshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            # Switch mixup off for the last no_mixup_epochs epochs.
            if epoch >= args.epochs - args.no_mixup_epochs:
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0],
                                              ctx_list=ctx,
                                              batch_axis=0)
            # batch[1:7] are the pre-computed training targets; batch[7]
            # holds the ground-truth boxes.
            fixed_targets = [
                gluon.utils.split_and_load(batch[it],
                                           ctx_list=ctx,
                                           batch_axis=0) for it in range(1, 7)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[7],
                                                  ctx_list=ctx,
                                                  batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            # coef_center_losses = []
            coef_losses = []
            cls_losses = []
            with autograd.record():
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, coef_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    if (args.only_bbox):
                        # Box-only training: polygon coefficient loss excluded.
                        sum_losses.append(obj_loss + center_loss +
                                          scale_loss + cls_loss)
                    else:
                        sum_losses.append(obj_loss + center_loss +
                                          scale_loss + coef_loss + cls_loss)
                        # coef_center_losses.append(coef_center_loss)
                        coef_losses.append(coef_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                autograd.backward(sum_losses)
            lr_scheduler.update(i, epoch)
            trainer.step(batch_size)
            if (args.only_bbox == False):
                # coef_center_metrics.update(0, coef_center_losses)
                coef_metrics.update(0, coef_losses)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                if (args.only_bbox == False):
                    # name4, loss4 = coef_center_metrics.get()
                    name5, loss5 = coef_metrics.get()
                name6, loss6 = cls_metrics.get()
                if (args.only_bbox):
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i, trainer.learning_rate,
                                batch_size / (time.time() - btic), name1,
                                loss1, name2, loss2, name3, loss3, name6,
                                loss6))
                else:
                    logger.info(
                        '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                        .format(epoch, i, trainer.learning_rate,
                                batch_size / (time.time() - btic), name1,
                                loss1, name2, loss2, name3, loss3, name5,
                                loss5, name6, loss6))
                btic = time.time()
        # End-of-epoch summary of the (running) loss metrics.
        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        if (args.only_bbox == False):
            # name4, loss4 = coef_center_metrics.get()
            name5, loss5 = coef_metrics.get()
        name6, loss6 = cls_metrics.get()
        if (args.only_bbox):
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                .format(epoch, (time.time() - tic), name1, loss1, name2,
                        loss2, name3, loss3, name6, loss6))
        else:
            logger.info(
                '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                .format(epoch, (time.time() - tic), name1, loss1, name2,
                        loss2, name3, loss3, name5, loss5, name6, loss6))
        # NOTE(review): the `if False and ...` gate deliberately disables
        # validation, so current_map is always 0 and save_params never sees
        # an improved map. Confirm this is intentional before release.
        if False and not (epoch) % args.val_interval:
            # consider reduce the frequency of validation to save time
            map_bbox, map_polygon = validate(net, val_data, ctx, eval_metric,
                                             polygon_metric, args)
            map_name, mean_ap = map_bbox
            polygonmap_name, polygonmean_ap = map_polygon
            val_msg = '\n'.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            logger.info('[Epoch {}] Validation: \n{}'.format(epoch, val_msg))
            polygonval_msg = '\n'.join([
                '{}={}'.format(k, v)
                for k, v in zip(polygonmap_name, polygonmean_ap)
            ])
            logger.info('[Epoch {}] PolygonValidation: \n{}'.format(
                epoch, polygonval_msg))
            current_map = float(polygonmean_ap[-1])
        else:
            current_map = 0.
        save_params(net, best_map, current_map, epoch, args.save_interval,
                    args.save_prefix)
def _train_loop(self, train_data, val_data):
    """Run the classification training loop and return a result summary.

    Supports optional mixup, label smoothing and knowledge distillation
    (teacher network). Returns a dict with 'train_acc' (last epoch's
    training metric), 'valid_acc' (best top-1 seen) and 'time' (elapsed).
    """
    if self._cfg.train.no_wd:
        # Exclude BN parameters and biases from weight decay.
        for k, v in self.net.collect_params(
                '.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0
    # Mixup / label smoothing produce dense (soft) labels.
    if self._cfg.train.label_smoothing or self._cfg.train.mixup:
        sparse_label_loss = False
    else:
        sparse_label_loss = True
    if self.distillation:
        # `loss` here is a module providing the distillation loss —
        # imported elsewhere in this file.
        L = loss.DistillationSoftmaxCrossEntropyLoss(
            temperature=self._cfg.train.temperature,
            hard_weight=self._cfg.train.hard_weight,
            sparse_label=sparse_label_loss)
    else:
        L = gluon.loss.SoftmaxCrossEntropyLoss(
            sparse_label=sparse_label_loss)
    if self._cfg.train.mixup:
        # With soft mixup labels, accuracy is not meaningful; use RMSE.
        train_metric = mx.metric.RMSE()
    else:
        train_metric = mx.metric.Accuracy()
    if self._cfg.train.mode == 'hybrid':
        self.net.hybridize(static_alloc=True, static_shape=True)
        if self.distillation:
            self.teacher.hybridize(static_alloc=True, static_shape=True)
    self._logger.info('Start training from [Epoch %d]',
                      max(self._cfg.train.start_epoch, self.epoch))
    # Loop variable is self.epoch so the estimator state tracks progress
    # (e.g. for resume).
    for self.epoch in range(max(self._cfg.train.start_epoch, self.epoch),
                            self._cfg.train.epochs):
        epoch = self.epoch
        tic = time.time()
        btic = time.time()
        if self._cfg.train.use_rec:
            train_data.reset()
        train_metric.reset()
        # pylint: disable=undefined-loop-variable
        for i, batch in enumerate(train_data):
            data, label = self.batch_fn(batch, self.ctx)
            if self._cfg.train.mixup:
                lam = np.random.beta(self._cfg.train.mixup_alpha,
                                     self._cfg.train.mixup_alpha)
                # Disable mixup for the last mixup_off_epoch epochs.
                if epoch >= self._cfg.train.epochs - self._cfg.train.mixup_off_epoch:
                    lam = 1
                # Mix each device slice with its reversed batch.
                data = [lam * X + (1 - lam) * X[::-1] for X in data]
                if self._cfg.train.label_smoothing:
                    eta = 0.1
                else:
                    eta = 0.0
                # NOTE(review): `classes` is not defined in this method —
                # presumably a module-level name (number of classes);
                # confirm it exists or mixup/label-smoothing will raise
                # NameError.
                label = mixup_transform(label, classes, lam, eta)
            elif self._cfg.train.label_smoothing:
                # Keep hard labels for the metric; train on smoothed ones.
                hard_label = label
                label = smooth(label, classes)
            if self.distillation:
                # Teacher soft targets at the configured temperature.
                teacher_prob = [nd.softmax(self.teacher(X.astype(self._cfg.train.dtype, copy=False)) \
                    / self._cfg.train.temperature) for X in data]
            with ag.record():
                outputs = [
                    self.net(X.astype(self._cfg.train.dtype, copy=False))
                    for X in data
                ]
                if self.distillation:
                    losses = [L(yhat.astype('float32', copy=False),
                                y.astype('float32', copy=False),
                                p.astype('float32', copy=False)) \
                        for yhat, y, p in zip(outputs, label, teacher_prob)]
                else:
                    losses = [
                        L(yhat, y.astype(self._cfg.train.dtype, copy=False))
                        for yhat, y in zip(outputs, label)
                    ]
            for l in losses:
                l.backward()
            self.trainer.step(self.batch_size)
            if self._cfg.train.mixup:
                output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False)) \
                    for out in outputs]
                train_metric.update(label, output_softmax)
            else:
                if self._cfg.train.label_smoothing:
                    train_metric.update(hard_label, outputs)
                else:
                    train_metric.update(label, outputs)
            if self._cfg.train.log_interval and not (
                    i + 1) % self._cfg.train.log_interval:
                train_metric_name, train_metric_score = train_metric.get()
                self._logger.info(
                    'Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f',
                    epoch, i, self._cfg.train.batch_size *
                    self._cfg.train.log_interval / (time.time() - btic),
                    train_metric_name, train_metric_score,
                    self.trainer.learning_rate)
                btic = time.time()
        train_metric_name, train_metric_score = train_metric.get()
        throughput = int(self.batch_size * i / (time.time() - tic))
        top1_val, top5_val = self._evaluate(val_data)
        self._logger.info('[Epoch %d] training: %s=%f', epoch,
                          train_metric_name, train_metric_score)
        self._logger.info(
            '[Epoch %d] speed: %d samples/sec\ttime cost: %f', epoch,
            throughput, time.time() - tic)
        self._logger.info('[Epoch %d] validation: top1=%f top5=%f', epoch,
                          top1_val, top5_val)
        if top1_val > self._best_acc:
            cp_name = os.path.join(self._logdir, 'best_checkpoint.pkl')
            self._logger.info(
                '[Epoch %d] Current best top-1: %f vs previous %f, saved to %s',
                self.epoch, top1_val, self._best_acc, cp_name)
            self.save(cp_name)
            self._best_acc = top1_val
        if self._reporter:
            # Report to the hyperparameter-search scheduler, if any.
            self._reporter(epoch=epoch, acc_reward=top1_val)
        # NOTE(review): btic was last reset at the final logging interval,
        # so this adds only the tail of the epoch — confirm whether the
        # whole epoch (tic) was intended.
        self._time_elapsed += time.time() - btic
    return {
        'train_acc': train_metric_score,
        'valid_acc': self._best_acc,
        'time': self._time_elapsed
    }
def check_nth_order_unary(x, op, grad_ops, orders, rtol=None, atol=None): """Assert n-th order autograd gradient against expected gradient. Multiple order of gradients can be checked by passing list of function computing the particular order gradient and passing the corresponding list of order. Note ---- 1. Orders should always be monotonically increasing. 2. Elements of grads_ops should correspond to elements of orders i.e. grads_op = [grad_op, grad_grad_grad_op] should be passed with orders = [1, 3] Parameters ---------- x : mxnet.NDArray Input Array. op : Callable Operation to perform on Input Array. grad_ops : Callable or List of Callable Function to compute and assert gradient of given order. orders : int or List of int Order/s to assert expected and computed gradients. Returns ------- None """ if isinstance(orders, int): orders = [orders] grad_ops = [grad_ops] assert all(i < j for i, j in zip(orders[0:-1], orders[1:])), \ "orders should be monotonically increasing" assert len(set(orders)) == len(orders), \ "orders should have unique elements" highest_order = max(orders) x = nd.array(x) x.attach_grad() expected_grads = [grad_op(x) for grad_op in grad_ops] computed_grads = [] head_grads = [] # Perform compute. with autograd.record(): y = op(x) for current_order in range(1, highest_order + 1): head_grad = nd.random.normal(shape=x.shape) y = autograd.grad(heads=y, variables=x, head_grads=head_grad, create_graph=True, retain_graph=True)[0] if current_order in orders: computed_grads.append(y) head_grads.append(head_grad) # Validate all the gradients. for order, grad, computed_grad in \ zip(orders, expected_grads, computed_grads): # Compute expected values. expected_grad = grad.asnumpy() for head_grad in head_grads[:order]: expected_grad *= head_grad.asnumpy() assert_almost_equal(expected_grad, computed_grad.asnumpy(), rtol=rtol, atol=atol)
def train(args):
    """Train the MNIST network inside a SageMaker container, with optional
    host-level data sharding for distributed runs, checkpointing of the
    best validation score, and a final model save on the lead host.
    """
    # SageMaker passes num_cpus, num_gpus and other args we can use to tailor training to
    # the current container environment, but here we just use simple cpu context.
    ctx = mx.cpu()

    # retrieve the hyperparameters we set in notebook (with some defaults)
    batch_size = args.batch_size
    epochs = args.epochs
    learning_rate = args.learning_rate
    momentum = args.momentum
    log_interval = args.log_interval

    num_gpus = int(os.environ['SM_NUM_GPUS'])
    current_host = args.current_host
    hosts = args.hosts
    model_dir = args.model_dir
    CHECKPOINTS_DIR = '/opt/ml/checkpoints'
    # Checkpointing is active only when SageMaker mounted the directory.
    checkpoints_enabled = os.path.exists(CHECKPOINTS_DIR)

    # load training and validation data
    # we use the gluon.data.vision.MNIST class because of its built in mnist pre-processing logic,
    # but point it at the location where SageMaker placed the data files, so it doesn't download them again.
    training_dir = args.train
    train_data = get_train_data(training_dir + '/train', batch_size)
    val_data = get_val_data(training_dir + '/test', batch_size)

    # define the network
    net = define_network()

    # Collect all parameters from net and its children, then initialize them.
    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
    # Trainer is for updating parameters with gradient.
    # kvstore choice: single host uses local aggregation; multiple hosts use
    # the distributed synchronous stores.
    if len(hosts) == 1:
        kvstore = 'device' if num_gpus > 0 else 'local'
    else:
        kvstore = 'dist_device_sync' if num_gpus > 0 else 'dist_sync'
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': learning_rate, 'momentum': momentum},
                            kvstore=kvstore)
    metric = mx.metric.Accuracy()
    loss = gluon.loss.SoftmaxCrossEntropyLoss()

    # shard the training data in case we are doing distributed training. Alternatively to splitting in memory,
    # the data could be pre-split in S3 and use ShardedByS3Key to do distributed training.
    if len(hosts) > 1:
        # Materialize the loader so it can be sliced per host.
        train_data = [x for x in train_data]
        shard_size = len(train_data) // len(hosts)
        # NOTE(review): assumes current_host is always present in hosts;
        # otherwise start/end would be unbound below.
        for i, host in enumerate(hosts):
            if host == current_host:
                start = shard_size * i
                end = start + shard_size
                break

        train_data = train_data[start:end]

    net.hybridize()

    best_val_score = 0.0
    for epoch in range(epochs):
        # reset data iterator and metric at begining of epoch.
        metric.reset()
        btic = time.time()
        for i, (data, label) in enumerate(train_data):
            # Copy data to ctx if necessary
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            # Start recording computation graph with record() section.
            # Recorded graphs can then be differentiated with backward.
            with autograd.record():
                output = net(data)
                L = loss(output, label)
                L.backward()
            # take a gradient step with batch_size equal to data.shape[0]
            trainer.step(data.shape[0])
            # update metric at last.
            metric.update([label], [output])

            if i % log_interval == 0 and i > 0:
                name, acc = metric.get()
                print('[Epoch %d Batch %d] Training: %s=%f, %f samples/s' %
                      (epoch, i, name, acc, batch_size / (time.time() - btic)))
                btic = time.time()

        name, acc = metric.get()
        print('[Epoch %d] Training: %s=%f' % (epoch, name, acc))

        name, val_acc = test(ctx, net, val_data)
        print('[Epoch %d] Validation: %s=%f' % (epoch, name, val_acc))
        # checkpoint the model, params and optimizer states in the folder /opt/ml/checkpoints
        if checkpoints_enabled and val_acc > best_val_score:
            best_val_score = val_acc
            logging.info('Saving the model, params and optimizer state.')
            net.export(CHECKPOINTS_DIR + "/%.4f-gluon_mnist"%(best_val_score), epoch)
            trainer.save_states(CHECKPOINTS_DIR + '/%.4f-gluon_mnist-%d.states'%(best_val_score, epoch))

    # Only the lead host persists the final model artifact.
    if current_host == hosts[0]:
        save(net, model_dir)
def train(net, train_data, val_data, eval_metric, ctx, args, reporter, final_fit):
    """Training pipeline.

    Trains a YOLOv3-style detector with LR warmup followed by scheduled
    decay, logs per-loss-component metrics, validates every
    ``args.val_interval`` epochs (skipped entirely during final_fit), and
    reports the latest validation mAP to ``reporter`` after every epoch.
    """
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        # Disable weight decay on BatchNorm beta/gamma and all bias params.
        for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        # Fixed-period decay schedule.
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        # Explicit comma-separated decay epochs.
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    # Shift decay points: the second scheduler counts epochs after warmup.
    lr_decay_epoch = [e - args.warmup_epochs for e in lr_decay_epoch]
    num_batches = args.num_samples // args.batch_size
    lr_scheduler = LRSequential([
        # Linear warmup from 0 to the target LR over warmup_epochs.
        LRScheduler('linear', base_lr=0, target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=num_batches),
        # Main schedule (mode from args) for the remaining epochs.
        LRScheduler(args.lr_mode, base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=num_batches,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay, power=2),
    ])

    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'wd': args.wd,
        'momentum': args.momentum,
        'lr_scheduler': lr_scheduler
    }, kvstore='local')

    # targets
    # NOTE(review): sigmoid_ce and l1_loss are constructed but never used
    # below — the network itself returns the loss components.
    sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    l1_loss = gluon.loss.L1Loss()

    # metrics — one running-mean tracker per loss component.
    obj_metrics = mx.metric.Loss('ObjLoss')
    center_metrics = mx.metric.Loss('BoxCenterLoss')
    scale_metrics = mx.metric.Loss('BoxScaleLoss')
    cls_metrics = mx.metric.Loss('ClassLoss')

    # set up logger (root logger, with an extra file handler next to save_prefix)
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    log_file_path = args.save_prefix + '_train.log'
    log_dir = os.path.dirname(log_file_path)
    if log_dir and not os.path.exists(log_dir):
        os.makedirs(log_dir)
    fh = logging.FileHandler(log_file_path)
    logger.addHandler(fh)
    logger.info(args)
    #logger.info('Start training from [Epoch {}]'.format(args.start_epoch))
    best_map = [0]
    # Last computed validation mAP; reused on epochs where validation is skipped.
    pre_current_map = 0
    for epoch in range(args.start_epoch, args.epochs):
        #tbar2.next()
        if args.mixup:
            # TODO(zhreshold): more elegant way to control mixup during runtime
            try:
                train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
            except AttributeError:
                # Dataset is wrapped (e.g. transformed); reach one level deeper.
                train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
            if epoch >= args.epochs - args.no_mixup_epochs:
                # Disable mixup for the final no_mixup_epochs epochs.
                try:
                    train_data._dataset.set_mixup(None)
                except AttributeError:
                    train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        for i, batch in enumerate(train_data):
            batch_size = batch[0].shape[0]
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            # objectness, center_targets, scale_targets, weights, class_targets
            fixed_targets = [
                gluon.utils.split_and_load(batch[it], ctx_list=ctx, batch_axis=0)
                for it in range(1, 6)
            ]
            gt_boxes = gluon.utils.split_and_load(batch[6], ctx_list=ctx, batch_axis=0)
            sum_losses = []
            obj_losses = []
            center_losses = []
            scale_losses = []
            cls_losses = []
            with autograd.record():
                # One forward pass per device slice; the net returns the four
                # loss components directly when given targets.
                for ix, x in enumerate(data):
                    obj_loss, center_loss, scale_loss, cls_loss = net(
                        x, gt_boxes[ix], *[ft[ix] for ft in fixed_targets])
                    sum_losses.append(obj_loss + center_loss + scale_loss + cls_loss)
                    obj_losses.append(obj_loss)
                    center_losses.append(center_loss)
                    scale_losses.append(scale_loss)
                    cls_losses.append(cls_loss)
                # Backprop only through the summed total loss.
                autograd.backward(sum_losses)
            trainer.step(batch_size)
            obj_metrics.update(0, obj_losses)
            center_metrics.update(0, center_losses)
            scale_metrics.update(0, scale_losses)
            cls_metrics.update(0, cls_losses)
            if args.log_interval and not (i + 1) % args.log_interval:
                name1, loss1 = obj_metrics.get()
                name2, loss2 = center_metrics.get()
                name3, loss3 = scale_metrics.get()
                name4, loss4 = cls_metrics.get()
                logger.info(
                    '[Epoch {}][Batch {}], LR: {:.2E}, Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
                    .format(epoch, i, trainer.learning_rate,
                            batch_size / (time.time() - btic), name1, loss1,
                            name2, loss2, name3, loss3, name4, loss4))
            btic = time.time()

        # Epoch summary of the running loss metrics.
        name1, loss1 = obj_metrics.get()
        name2, loss2 = center_metrics.get()
        name3, loss3 = scale_metrics.get()
        name4, loss4 = cls_metrics.get()
        logger.info(
            '[Epoch {}] Training cost: {:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}, {}={:.3f}'
            .format(epoch, (time.time() - tic), name1, loss1, name2, loss2,
                    name3, loss3, name4, loss4))
        if (not (epoch + 1) % args.val_interval) and not final_fit:
            # consider reduce the frequency of validation to save time
            map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
            val_msg = ' '.join(
                ['{}={}'.format(k, v) for k, v in zip(map_name, mean_ap)])
            #$tbar.set_description('[Epoch {}] Validation: {}'.format(epoch, val_msg))
            logger.info('[Epoch {}] Validation: {}'.format(epoch, val_msg))
            # The last entry of mean_ap is the overall mAP used as the reward.
            current_map = float(mean_ap[-1])
            pre_current_map = current_map
        else:
            current_map = pre_current_map
        reporter(epoch=epoch, map_reward=current_map)
def main(args):
    """Train a small CNN on MNIST with Horovod data-parallel distribution.

    Each worker downloads and extracts its own copy of the dataset, trains on
    its shard of the training set (via MNISTIter num_parts/part_index), and
    rank 0 handles logging, timing, and the final accuracy sanity check.

    Parameters
    ----------
    args : argparse.Namespace
        Expects: batch_size, epochs, lr, momentum, dtype, no_cuda.
    """

    # Function to get mnist iterator given a rank: each rank extracts the
    # data into its own data-<rank> directory and reads only its train shard.
    def get_mnist_iterator(rank):
        data_dir = "data-%d" % rank
        if not os.path.isdir(data_dir):
            os.makedirs(data_dir)
        zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
                                 dirname=data_dir)
        with zipfile.ZipFile(zip_file_path) as zf:
            zf.extractall(data_dir)

        input_shape = (1, 28, 28)
        batch_size = args.batch_size

        train_iter = mx.io.MNISTIter(
            image="%s/train-images-idx3-ubyte" % data_dir,
            label="%s/train-labels-idx1-ubyte" % data_dir,
            input_shape=input_shape,
            batch_size=batch_size,
            shuffle=True,
            flat=False,
            num_parts=hvd.size(),
            part_index=hvd.rank())

        # Validation is not sharded: every rank evaluates the full test set.
        val_iter = mx.io.MNISTIter(
            image="%s/t10k-images-idx3-ubyte" % data_dir,
            label="%s/t10k-labels-idx1-ubyte" % data_dir,
            input_shape=input_shape,
            batch_size=batch_size,
            flat=False,
        )

        return train_iter, val_iter

    # Network hyperparameters (LeNet-style conv net).
    kernel_size = 5
    strides = 2
    pool_size = 2
    hidden_dim = 512
    output_dim = 10
    activation = 'relu'

    # Function to define neural network
    def conv_nets():
        net = gluon.nn.HybridSequential()
        with net.name_scope():
            net.add(
                gluon.nn.Conv2D(channels=20,
                                kernel_size=kernel_size,
                                activation=activation))
            net.add(gluon.nn.MaxPool2D(pool_size=pool_size, strides=strides))
            net.add(
                gluon.nn.Conv2D(channels=50,
                                kernel_size=kernel_size,
                                activation=activation))
            net.add(gluon.nn.MaxPool2D(pool_size=pool_size, strides=strides))
            net.add(gluon.nn.Flatten())
            net.add(gluon.nn.Dense(hidden_dim, activation=activation))
            net.add(gluon.nn.Dense(output_dim))
        return net

    # Function to evaluate accuracy for a model over a full data iterator.
    def evaluate(model, data_iter, context):
        data_iter.reset()
        metric = mx.metric.Accuracy()
        for _, batch in enumerate(data_iter):
            data = batch.data[0].as_in_context(context)
            label = batch.label[0].as_in_context(context)
            output = model(data.astype(args.dtype, copy=False))
            metric.update([label], [output])
        return metric.get()

    # Initialize Horovod
    hvd.init()

    # Horovod: pin context to local rank
    context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(
        hvd.local_rank())
    num_workers = hvd.size()

    # Load training and validation data
    train_data, val_data = get_mnist_iterator(hvd.rank())

    # Build model
    model = conv_nets()
    model.cast(args.dtype)
    model.hybridize()

    # Create optimizer; LR is scaled linearly with the number of workers.
    optimizer_params = {
        'momentum': args.momentum,
        'learning_rate': args.lr * hvd.size()
    }
    opt = mx.optimizer.create('sgd', **optimizer_params)

    # Initialize parameters
    initializer = mx.init.Xavier(rnd_type='gaussian',
                                 factor_type="in",
                                 magnitude=2)
    model.initialize(initializer, ctx=context)

    # Horovod: fetch and broadcast parameters so all ranks start identical.
    params = model.collect_params()
    if params is not None:
        hvd.broadcast_parameters(params, root_rank=0)

    # Horovod: create DistributedTrainer, a subclass of gluon.Trainer
    trainer = hvd.DistributedTrainer(params, opt)

    # Create loss function and train metric
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    metric = mx.metric.Accuracy()

    # Global training timing
    if hvd.rank() == 0:
        global_tic = time.time()

    # Train model
    for epoch in range(args.epochs):
        tic = time.time()
        train_data.reset()
        metric.reset()
        for nbatch, batch in enumerate(train_data, start=1):
            data = batch.data[0].as_in_context(context)
            label = batch.label[0].as_in_context(context)
            with autograd.record():
                output = model(data.astype(args.dtype, copy=False))
                loss = loss_fn(output, label)
            loss.backward()
            trainer.step(args.batch_size)
            metric.update([label], [output])

            if nbatch % 100 == 0:
                name, acc = metric.get()
                logging.info('[Epoch %d Batch %d] Training: %s=%f' %
                             (epoch, nbatch, name, acc))

        if hvd.rank() == 0:
            elapsed = time.time() - tic
            # Aggregate throughput across all workers.
            speed = nbatch * args.batch_size * hvd.size() / elapsed
            logging.info('Epoch[%d]\tSpeed=%.2f samples/s\tTime cost=%f',
                         epoch, speed, elapsed)

        # Evaluate model accuracy
        _, train_acc = metric.get()
        name, val_acc = evaluate(model, val_data, context)
        if hvd.rank() == 0:
            logging.info('Epoch[%d]\tTrain: %s=%f\tValidation: %s=%f', epoch,
                         name, train_acc, name, val_acc)

        if hvd.rank() == 0 and epoch == args.epochs - 1:
            # Fail loudly if the converged accuracy regressed.
            # Fixed: the message was split by a stray line continuation,
            # producing a garbled format string with embedded whitespace.
            assert val_acc > 0.96, \
                "Achieved accuracy (%f) is lower than expected (0.96)" % val_acc

    if hvd.rank() == 0:
        global_training_time = time.time() - global_tic
        # Fixed typo in log message: "elpased" -> "elapsed".
        print(
            "Global elapsed time on training:{}".format(global_training_time))
        device = context.device_type + str(num_workers)
        logging.info('Device info: %s', device)
def train(epochs, ctx):
    """Train the module-level ``net`` on CIFAR-10 and checkpoint progress.

    Parameters
    ----------
    epochs : int
        Number of epochs to train for.
    ctx : mx.Context or list of mx.Context
        Device(s) to train on; a single context is wrapped in a list.

    Relies on module-level state: ``net``, ``batch_size``, ``num_workers``,
    ``optimizer``, ``opt``, ``transform_train``/``transform_test``,
    ``lr_decay_epoch``, ``lr_decay``, ``test``, ``plot_path``,
    ``model_name``, ``save_dir``, ``save_period``.
    """
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    net.initialize(mx.init.Xavier(), ctx=ctx)

    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size,
        shuffle=True,
        last_batch='discard',
        num_workers=num_workers)

    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers)

    trainer = gluon.Trainer(net.collect_params(), optimizer, {
        'learning_rate': opt.lr,
        'wd': opt.wd,
        'momentum': opt.momentum
    })
    train_metric = mx.metric.Accuracy()
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    train_history = TrainingHistory(['training-error', 'validation-error'])

    iteration = 0
    lr_decay_count = 0
    best_val_score = 0

    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0
        num_batch = len(train_data)

        # Step the learning rate at each scheduled epoch. The bounds check is
        # a fix: without it, indexing lr_decay_epoch raises IndexError once
        # every decay point has been consumed.
        if lr_decay_count < len(lr_decay_epoch) and epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            lr_decay_count += 1

        for i, batch in enumerate(train_data):
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            for l in loss:
                l.backward()
            trainer.step(batch_size)
            train_loss += sum([l.sum().asscalar() for l in loss])
            train_metric.update(label, output)
            iteration += 1

        train_loss /= batch_size * num_batch
        name, acc = train_metric.get()
        # Fix: evaluate the validation set once per epoch and reuse the
        # result; the original ran test() twice per epoch, doubling the
        # validation cost for an identical value. (Also removed the dead
        # locals: `alpha`, a write-only `metric` accumulator, and a per-batch
        # train_metric.get() whose result was discarded.)
        name, val_acc = test(ctx, val_data)
        train_history.update([1 - acc, 1 - val_acc])
        train_history.plot(save_path='%s/%s_history.png' % (plot_path, model_name))

        # Checkpoint whenever validation accuracy improves.
        if val_acc > best_val_score:
            best_val_score = val_acc
            net.save_parameters('%s/%.4f-cifar-%s-%d-best.params' %
                                (save_dir, best_val_score, model_name, epoch))

        logging.info('[Epoch %d] train=%f val=%f loss=%f time: %f' %
                     (epoch, acc, val_acc, train_loss, time.time() - tic))

        # Periodic snapshot independent of validation score.
        if save_period and save_dir and (epoch + 1) % save_period == 0:
            net.save_parameters('%s/cifar10-%s-%d.params' %
                                (save_dir, model_name, epoch))

    # Final snapshot after the last epoch.
    if save_period and save_dir:
        net.save_parameters('%s/cifar10-%s-%d.params' %
                            (save_dir, model_name, epochs - 1))
def train(net, async_net, ctx, args):
    """Training pipeline.

    Streams the training set in "stream-batches" via pipe_detection_minibatch,
    builds a DataLoader per stream-batch, trains a YOLOv3-style detector whose
    forward pass returns its loss components, validates every
    ``args.val_interval`` epochs, checkpoints via save_progress, and supports
    early stopping on the validation score.
    """
    net.collect_params().reset_ctx(ctx)
    if args.no_wd:
        # Disable weight decay on BatchNorm beta/gamma and all bias params.
        for k, v in net.collect_params(".*beta|.*gamma|.*bias").items():
            v.wd_mult = 0.0

    if args.label_smooth:
        net._target_generator._label_smooth = True

    if args.lr_decay_period > 0:
        # Fixed-period decay schedule.
        lr_decay_epoch = list(
            range(args.lr_decay_period, args.epochs, args.lr_decay_period))
    else:
        # Explicit comma-separated decay epochs.
        lr_decay_epoch = [int(i) for i in args.lr_decay_epoch.split(',')]
    # NOTE(review): iters_per_epoch is set to args.batch_size here, whereas
    # comparable pipelines pass the number of batches per epoch — confirm
    # this is intentional (the true epoch size may be unknown when streaming).
    lr_scheduler = LRSequential([
        # Linear warmup from 0 to target LR over warmup_epochs.
        LRScheduler("linear",
                    base_lr=0,
                    target_lr=args.lr,
                    nepochs=args.warmup_epochs,
                    iters_per_epoch=args.batch_size),
        # Main schedule for the remaining epochs.
        LRScheduler(args.lr_mode,
                    base_lr=args.lr,
                    nepochs=args.epochs - args.warmup_epochs,
                    iters_per_epoch=args.batch_size,
                    step_epoch=lr_decay_epoch,
                    step_factor=args.lr_decay,
                    power=2),
    ])

    # Optimizer-specific Trainer construction: sgd takes wd/momentum, adam
    # only the scheduler, anything else gets library defaults.
    if (args.optimizer == "sgd"):
        trainer = gluon.Trainer(net.collect_params(), args.optimizer, {
            "wd": args.wd,
            "momentum": args.momentum,
            "lr_scheduler": lr_scheduler
        }, kvstore="local")
    elif (args.optimizer == "adam"):
        trainer = gluon.Trainer(net.collect_params(), args.optimizer,
                                {"lr_scheduler": lr_scheduler},
                                kvstore="local")
    else:
        trainer = gluon.Trainer(net.collect_params(), args.optimizer,
                                kvstore="local")

    # targets
    #sigmoid_ce = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)
    #l1_loss = gluon.loss.L1Loss()

    # Intermediate Metrics: running means of the four network loss components
    # plus their total, in the same order the net emits them.
    train_metrics = (
        mx.metric.Loss("ObjLoss"),
        mx.metric.Loss("BoxCenterLoss"),
        mx.metric.Loss("BoxScaleLoss"),
        mx.metric.Loss("ClassLoss"),
        mx.metric.Loss("TotalLoss"),
    )
    train_metric_ixs = range(len(train_metrics))
    target_metric_ix = -1  # Train towards TotalLoss (the last one)

    # Evaluation Metrics:
    # NOTE(review): val_metric is never used below — validate() is handed a
    # fresh VOC07MApMetric instead.
    val_metric = VOC07MApMetric(iou_thresh=0.5)

    # Data transformations: 6 stacked target tensors + padded gt boxes.
    train_batchify_fn = Tuple(*([Stack() for _ in range(6)] +
                                [Pad(axis=0, pad_val=-1) for _ in range(1)]))
    # Either a single fixed-shape transform, or one per multiple of 32px
    # (320..608) for random-shape training.
    train_transforms = (YOLO3DefaultTrainTransform(
        args.data_shape, args.data_shape, net=async_net, mixup=args.mixup)
                        if args.no_random_shape else [
                            YOLO3DefaultTrainTransform(
                                x * 32, x * 32,
                                net=async_net, mixup=args.mixup)
                            for x in range(10, 20)
                        ])

    validation_batchify_fn = None
    validation_transforms = None
    if args.validation:
        validation_batchify_fn = Tuple(Stack(), Pad(pad_val=-1))
        validation_transforms = YOLO3DefaultValTransform(
            args.data_shape, args.data_shape)

    logger.info(args)
    logger.info(f"Start training from [Epoch {args.start_epoch}]")
    prev_best_score = float("-inf")
    best_epoch = args.start_epoch
    logger.info("Sleeping for 3s in case training data file not yet ready")
    time.sleep(3)
    for epoch in range(args.start_epoch, args.epochs):
        # if args.mixup:
        #     # TODO(zhreshold): more elegant way to control mixup during runtime
        #     try:
        #         train_data._dataset.set_mixup(np.random.beta, 1.5, 1.5)
        #     except AttributeError:
        #         train_data._dataset._data.set_mixup(np.random.beta, 1.5, 1.5)
        #     if epoch >= args.epochs - args.no_mixup_epochs:
        #         try:
        #             train_data._dataset.set_mixup(None)
        #         except AttributeError:
        #             train_data._dataset._data.set_mixup(None)
        tic = time.time()
        btic = time.time()
        mx.nd.waitall()
        net.hybridize()
        logger.debug(
            f'Input data dir contents: {os.listdir("/opt/ml/input/data/")}')
        # Stream the epoch's data in large "stream-batches" from the channel.
        train_data_gen = pipe_detection_minibatch(
            epoch, channel=args.train, batch_size=args.stream_batch_size)
        for ix_streambatch, train_dataset in enumerate(train_data_gen):
            # TODO: Mixup is kinda rubbish if it's only within a (potentially small) batch
            if args.mixup:
                train_dataset = MixupDetection(train_dataset)

            # Create dataloader for the stream-batch:
            if args.no_random_shape:
                logger.debug(
                    "Creating train DataLoader without random transform")
                train_dataloader = gluon.data.DataLoader(
                    train_dataset.transform(train_transforms),
                    batch_size=args.batch_size,
                    batchify_fn=train_batchify_fn,
                    last_batch="discard",
                    num_workers=args.num_workers,
                    shuffle=True,
                )
            else:
                logger.debug("Creating train DataLoader with random transform")
                train_dataloader = RandomTransformDataLoader(
                    train_transforms,
                    train_dataset,
                    interval=10,
                    batch_size=args.batch_size,
                    batchify_fn=train_batchify_fn,
                    last_batch="discard",
                    num_workers=args.num_workers,
                    shuffle=True,
                )

            if args.mixup:
                logger.debug("Shuffling stream-batch")
                # TODO(zhreshold): more elegant way to control mixup during runtime
                try:
                    train_dataloader._dataset.set_mixup(
                        np.random.beta, 1.5, 1.5)
                except AttributeError:
                    # Dataset is wrapped; reach one level deeper.
                    train_dataloader._dataset._data.set_mixup(
                        np.random.beta, 1.5, 1.5)
                if epoch >= args.epochs - args.no_mixup_epochs:
                    # Disable mixup for the final no_mixup_epochs epochs.
                    try:
                        train_dataloader._dataset.set_mixup(None)
                    except AttributeError:
                        train_dataloader._dataset._data.set_mixup(None)

            logger.debug(
                f"Training on stream-batch {ix_streambatch} ({len(train_dataset)} records)"
            )

            # TODO: Improve stream-batching robustness to drop loop guard clauses
            # While it would be nice to simply `for i, batch in enumerate(train_dataloader):`,
            # corrupted image buffers are somehow sneaking through the stream-batch at the moment.
            #
            # For now, we catch and tolerate these errors - trying to resume stream-batch process
            # where possible and otherwise discarding the remainder of the stream-batch :-(
            done = False
            i = -1
            dataiter = iter(train_dataloader)
            while not done:
                i += 1
                batch = None
                # Pull the next minibatch, skipping ValueError-corrupted ones
                # and abandoning the stream-batch on any other failure.
                while not batch:
                    try:
                        batch = next(dataiter)
                    except StopIteration:
                        done = True
                        break
                    except ValueError:
                        # Some problem with the minibatch prevented loading - try the next
                        # NOTE(review): logger.warn is deprecated in favour of
                        # logger.warning.
                        logger.warn(
                            f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                            f"Failed to load minibatch {i}, trying next...")
                        i += 1
                    except:
                        # NOTE(review): bare except — deliberately tolerant
                        # here, but it also swallows KeyboardInterrupt etc.
                        logger.error(
                            f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                            f"Failed to iterate minibatch {i}: Discarding remainder"
                        )
                        break
                if not batch:
                    logger.debug(
                        f"[Epoch {epoch}][Streambatch {ix_streambatch}] "
                        f"Done after {i} minibatches")
                    break
                logger.debug(
                    f"Epoch {epoch}, stream batch {ix_streambatch}, minibatch {i}"
                )
                batch_size = batch[0].shape[0]
                data = gluon.utils.split_and_load(batch[0],
                                                  ctx_list=ctx,
                                                  batch_axis=0,
                                                  even_split=False)
                # objectness, center_targets, scale_targets, weights, class_targets
                fixed_targets = [
                    gluon.utils.split_and_load(batch[it],
                                               ctx_list=ctx,
                                               batch_axis=0,
                                               even_split=False)
                    for it in range(1, 6)
                ]
                gt_boxes = gluon.utils.split_and_load(batch[6],
                                                      ctx_list=ctx,
                                                      batch_axis=0,
                                                      even_split=False)
                # One list per train metric, collecting losses across devices.
                loss_trackers = tuple([] for metric in train_metrics)
                with autograd.record():
                    for ix, x in enumerate(data):
                        losses_raw = net(x, gt_boxes[ix],
                                         *[ft[ix] for ft in fixed_targets])
                        # net outputs: [obj_loss, center_loss, scale_loss, cls_loss]
                        # Each a mx.ndarray 1xbatch_size. This is the same order as our
                        # train_metrics, so we just need to add a total vector:
                        total_loss = sum(losses_raw)
                        losses = losses_raw + [total_loss]

                        # If any sample's total loss is non-finite, sum will be:
                        if not isfinite(sum(total_loss)):
                            logger.error(
                                f"[Epoch {epoch}][Streambatch {ix_streambatch}][Minibatch {i}] "
                                f"got non-finite losses: {losses_raw}")
                            # TODO: Terminate training if losses or gradient go infinite?

                        # NOTE(review): this loop reuses (shadows) the outer
                        # device index `ix`; harmless since `ix` is
                        # reassigned by enumerate, but worth renaming.
                        for ix in train_metric_ixs:
                            loss_trackers[ix].append(losses[ix])
                    # Backprop only through the TotalLoss tracker.
                    autograd.backward(loss_trackers[target_metric_ix])
                trainer.step(batch_size)

                for ix in train_metric_ixs:
                    train_metrics[ix].update(0, loss_trackers[ix])
                if args.log_interval and not (i + 1) % args.log_interval:
                    train_metrics_current = map(lambda metric: metric.get(),
                                                train_metrics)
                    metrics_msg = "; ".join([
                        f"{name}={val:.3f}"
                        for name, val in train_metrics_current
                    ])
                    logger.info(
                        f"[Epoch {epoch}][Streambatch {ix_streambatch}][Minibatch {i}] "
                        f"LR={trainer.learning_rate:.2E}; "
                        f"Speed={batch_size/(time.time()-btic):.3f} samples/sec; {metrics_msg};"
                    )
                    btic = time.time()

        # Epoch summary of the running loss metrics.
        train_metrics_current = map(lambda metric: metric.get(),
                                    train_metrics)
        metrics_msg = "; ".join(
            [f"{name}={val:.3f}" for name, val in train_metrics_current])
        logger.info(
            f"[Epoch {epoch}] TrainingCost={time.time()-tic:.3f}; {metrics_msg};"
        )

        if not (epoch + 1) % args.val_interval:
            logger.info(f"Validating [Epoch {epoch}]")
            metric_names, metric_values = validate(
                net, args.validation, epoch, ctx,
                VOC07MApMetric(iou_thresh=0.5), validation_transforms,
                validation_batchify_fn, args)
            if isinstance(metric_names, list):
                val_msg = "; ".join(
                    [f"{k}={v}" for k, v in zip(metric_names, metric_values)])
                # The last metric value is the overall score used for
                # checkpointing / early stopping.
                current_score = float(metric_values[-1])
            else:
                val_msg = f"{metric_names}={metric_values}"
                current_score = metric_values
            logger.info(f"[Epoch {epoch}] Validation: {val_msg};")
        else:
            # No validation this epoch: never treated as an improvement.
            current_score = float("-inf")

        save_progress(net, current_score, prev_best_score, args.model_dir,
                      epoch, args.checkpoint_interval, args.checkpoint_dir)
        if current_score > prev_best_score:
            prev_best_score = current_score
            best_epoch = epoch

        if (args.early_stopping and epoch >= args.early_stopping_min_epochs
                and (epoch - best_epoch) >= args.early_stopping_patience):
            logger.info(
                f"[Epoch {epoch}] No improvement since epoch {best_epoch}: Stopping early"
            )
            break