def test_profile_tune_pause_resume():
    """Exercise profiler pause/resume around two profiling workloads.

    The profiler is enabled via ``enable_profiler()`` and paused, then
    ``test_profile_task`` runs while paused and ``test_profile_event`` runs
    after ``profiler.resume()``. The profiler is paused again and finally
    stopped.

    The stop is performed in a ``finally`` clause so that a failure inside
    either workload does not leave the process-wide profiler running and
    leak state into subsequent tests.
    """
    enable_profiler()
    try:
        profiler.pause()
        # "test_profile_task" should *not* show up in tuning analysis
        test_profile_task()
        profiler.resume()
        # "test_profile_event" should show up in tuning analysis
        test_profile_event()
        profiler.pause()
    finally:
        # Always return the global profiler to the stopped state.
        profiler.set_state('stop')
def test_profile_tune_pause_resume():
    """Exercise profiler pause/resume around two profiling workloads.

    The profiler is enabled via ``enable_profiler`` with the file name
    ``'test_profile_tune_pause_resume.json'`` and paused, then
    ``test_profile_task`` runs while paused and ``test_profile_event`` runs
    after ``profiler.resume()``. The profiler is paused again and finally
    stopped.

    The stop is performed in a ``finally`` clause so that a failure inside
    either workload does not leave the process-wide profiler running and
    leak state into subsequent tests.
    """
    enable_profiler('test_profile_tune_pause_resume.json')
    try:
        profiler.pause()
        # "test_profile_task" should *not* show up in tuning analysis
        test_profile_task()
        profiler.resume()
        # "test_profile_event" should show up in tuning analysis
        test_profile_event()
        profiler.pause()
    finally:
        # Always return the global profiler to the stopped state.
        profiler.set_state('stop')
# Add the auxiliary states to the running parameter-size total.
# The factor 4 is bytes per element (assumes 4-byte values such as
# float32 -- TODO confirm against the model's dtype).
for aux_array in aux_params.values():
    param_size += aux_array.size * 4
print("Parameter size", param_size / 1024 / 1024, " MB")

repeat_times = 10
# Number of batches actually executed; used as the timing denominator so the
# per-image figure stays correct regardless of epoch count or iterator length.
batches_run = 0

profiler.set_state('run')
# Run 5 passes over the data iterator, each capped at repeat_times + 1
# batches (batch indices 0..repeat_times inclusive).
start = time.time()
for epoch in range(5):
    train_data.reset()
    metric.reset()
    for i, batch in enumerate(train_data):
        if i == 1:
            # NOTE(review): no profiler.pause() precedes this call in the
            # visible code, so this resume() presumably has no effect --
            # confirm whether a pause was intended after set_state('run').
            profiler.resume()
        mod.forward(batch, is_train=True)       # compute predictions
        mod.update_metric(metric, batch.label)  # accumulate prediction accuracy
        mod.backward()                          # compute gradients
        mod.update()                            # update parameters
        batches_run += 1
        if i == repeat_times:
            # Stop the pass after repeat_times + 1 batches.
            break

# Wait for all outstanding asynchronous work before reading the clock, and
# read it *before* stopping/dumping the profiler so that profiler output is
# not billed to the training time.
mx.nd.waitall()
end = time.time()
profiler.set_state('stop')
profiler.dump()

images_run = batch_size * batches_run
# Guard against an empty data iterator instead of dividing by zero.
time_per_img = (end - start) / images_run if images_run else float("nan")

print("batch\tthreshold\tthread number\ttime per image\tmemory (GB)")
print("%d\t%d\t%s\t%s\t%f" % (batch_size, threshold,
                              os.environ["MXNET_CPU_WORKER_NTHREADS"],
                              time_per_img, cpuStats()))
def check_ln_speed(nbatch, nchannel, eps, nrepeat):
    """Validate ``mx.nd.LayerNorm`` against NumPy and time forward/backward.

    Runs ``nrepeat + 1`` iterations on fresh random ``(nbatch, nchannel)``
    inputs. The first iteration (index 0) is checked for correctness but is
    excluded from the timing and from profiling. Every iteration asserts that
    the MXNet outputs and gradients match a float64 NumPy reference.

    Reads the module-level names ``ctx``, ``dtype``, ``args`` (for
    ``args.profile``), ``profiler`` and ``npy_ln_grad``.

    Parameters
    ----------
    nbatch : int
        Size of the first (batch) axis of the input.
    nchannel : int
        Size of the last axis, over which the normalization is computed.
    eps : float
        Epsilon passed to ``LayerNorm`` and added to the variance in the
        NumPy reference.
    nrepeat : int
        Number of timed iterations. Must be non-zero: the averages divide by
        ``nrepeat``, so ``0`` raises ``ZeroDivisionError``.

    Returns
    -------
    tuple of float
        ``(forward, backward)``: accumulated ``time.time()`` deltas divided by
        ``nrepeat`` and multiplied by 1000000 (i.e. microseconds per call).

    Raises
    ------
    AssertionError
        If any MXNet result differs from the reference beyond the tolerance.
    """
    fwd_check_eps = 1E-1 if dtype == np.float16 else 1E-4
    bwd_check_eps = 1E-1 if dtype == np.float16 else 1E-3
    B, C = nbatch, nchannel

    # Warm-up: run a couple of simple ops and block on their results before
    # any measurement starts.
    for _ in range(2):
        in_data = mx.nd.random.normal(shape=(B, C), ctx=ctx, dtype=dtype)
        out_data = in_data * in_data
        out_data.asnumpy()
    mx.nd.waitall()

    fwd_time = 0
    bwd_time = 0
    if args.profile:
        profiler.set_state('run')
        profiler.pause()
    try:
        for i in range(nrepeat + 1):
            # Iteration 0 is an untimed, unprofiled warm-up of LayerNorm itself.
            timed = i > 0

            in_data = mx.nd.random.normal(shape=(B, C), ctx=ctx, dtype=dtype)
            ograd = mx.nd.random.normal(shape=(B, C), ctx=ctx, dtype=dtype)
            nd_gamma = mx.nd.ones(shape=(C, ), ctx=ctx, dtype=dtype)
            nd_beta = mx.nd.zeros(shape=(C, ), ctx=ctx, dtype=dtype)

            # Float64 NumPy reference (gamma is all ones and beta all zeros,
            # so they do not appear in the forward reference).
            npy_in_data = in_data.asnumpy().astype(np.float64)
            gt_out = (npy_in_data - npy_in_data.mean(axis=-1, keepdims=True)) \
                / np.sqrt(npy_in_data.var(axis=-1, keepdims=True) + eps)
            gt_in_data_grad, gt_gamma_grad, gt_beta_grad = npy_ln_grad(
                npy_in_data,
                ograd.asnumpy().astype(np.float64),
                eps,
                nd_gamma.asnumpy().astype(np.float64))
            mx.nd.waitall()

            in_data.attach_grad()
            nd_gamma.attach_grad()
            nd_beta.attach_grad()
            # Blocking reads; the values themselves are not needed.
            nd_gamma.asnumpy()
            nd_beta.asnumpy()
            mx.nd.waitall()

            # Profile Forward + Backward
            with mx.autograd.record():
                mx.nd.waitall()
                if args.profile and timed:
                    profiler.resume()
                start = time.time()
                out_data, mean_val, std_val = mx.nd.LayerNorm(
                    in_data, gamma=nd_gamma, beta=nd_beta, axis=-1, eps=eps,
                    output_mean_var=True)
                out_data.wait_to_read()
                if timed:
                    fwd_time += time.time() - start
                mx.nd.waitall()

                start = time.time()
                out_data.backward(ograd)
                mx.nd.waitall()
                # Read the clock before touching the profiler so the pause()
                # call is not counted as backward time.
                bwd_elapsed = time.time() - start
                if timed:
                    bwd_time += bwd_elapsed
                    if args.profile:
                        profiler.pause()

            mx_in_data_grad = in_data.grad.asnumpy()
            mx_gamma_grad = nd_gamma.grad.asnumpy()
            mx_beta_grad = nd_beta.grad.asnumpy()

            npt.assert_allclose(mean_val.asnumpy()[:, 0],
                                npy_in_data.mean(axis=-1).astype(dtype),
                                fwd_check_eps, fwd_check_eps)
            npt.assert_allclose(std_val.asnumpy()[:, 0],
                                np.sqrt(npy_in_data.var(axis=-1) + eps).astype(dtype),
                                fwd_check_eps, fwd_check_eps)
            npt.assert_allclose(out_data.asnumpy(), gt_out.astype(dtype),
                                fwd_check_eps, fwd_check_eps)
            # Compare the input gradient row by row. A dedicated index name is
            # used so the outer iteration counter is not shadowed.
            # NOTE(review): this check uses the forward tolerance rather than
            # bwd_check_eps -- confirm that is intended.
            for row in range(B):
                npt.assert_allclose(mx_in_data_grad[row, :],
                                    gt_in_data_grad[row, :].astype(dtype),
                                    fwd_check_eps, fwd_check_eps)
            npt.assert_allclose(mx_gamma_grad, gt_gamma_grad.astype(dtype),
                                bwd_check_eps, bwd_check_eps)
            npt.assert_allclose(mx_beta_grad, gt_beta_grad.astype(dtype),
                                bwd_check_eps, bwd_check_eps)
    finally:
        # Stop the profiler even when a correctness assertion fails, so the
        # process-wide profiler state does not leak into later calls.
        if args.profile:
            profiler.set_state('stop')

    return fwd_time / nrepeat * 1000000, bwd_time / nrepeat * 1000000