def test_fetch(self):
    """Check that ``jt.fetch`` delivers the value of a list-wrapped var to its callback."""
    doubled = jt.array([1, 2, 3]) * 2
    received = []

    def on_fetch(value):
        received.append(value)

    jt.fetch([doubled], on_fetch)
    # Sync so the queued fetch callback has run before the result is inspected.
    jt.sync_all(True)
    assert len(received) == 1 and (received[0] == [2, 4, 6]).all()
def prep_benchmark(dets_out, h, w):
    """Post-process detections and queue fetches of every output for benchmarking.

    The fetched values are gathered in a local dict that is never returned;
    the function exists for the work it triggers, which is timed through the
    'Postprocess', 'Copy' and 'Sync' timer sections.

    Args:
        dets_out: raw network output handed to ``postprocess``.
        h: image height passed to ``postprocess``.
        w: image width passed to ``postprocess``.
    """
    with timer.env('Postprocess'):
        detections = postprocess(dets_out, w, h, crop_masks=args.crop,
                                 score_threshold=args.score_threshold)

    result = {}

    def store_as(key):
        # Build a fetch callback that files the fetched value under `key`.
        return lambda value: result.update({key: value})

    with timer.env('Copy'):
        classes, scores, boxes, masks = [x[:args.top_k] for x in detections]

        if isinstance(scores, list):
            # A list carries two score entries: box scores, then mask scores.
            jt.fetch(scores[0], store_as('box_scores'))
            jt.fetch(scores[1], store_as('mask_scores'))
        else:
            jt.fetch(scores, store_as('scores'))

        jt.fetch(classes, store_as('classes'))
        jt.fetch(boxes, store_as('boxes'))
        jt.fetch(masks, store_as('masks'))

    with timer.env('Sync'):
        # Just in case
        jt.sync_all()
def evaluate(net, epoch, dataloader):
    """Run one validation pass and return the classification accuracy.

    Args:
        net: model; switched to eval mode and called as ``net(pts)``.
        epoch: epoch number, used for the progress-bar label and the log line.
        dataloader: iterable yielding ``(pts, normals, labels)`` batches.

    Returns:
        Fraction of correctly classified samples, or ``0.0`` when the
        dataloader yields no samples (this used to raise ZeroDivisionError).
    """
    net.eval()
    total_acc = 0
    total_num = 0

    # Execute anything still pending so it is not counted in the timing below.
    jt.sync_all(True)
    start_time = time.time()
    for pts, normals, labels in tqdm(dataloader, desc=f'Epoch {epoch} [Val]'):
        # Only the points are fed to the network; normals are unpacked but unused.
        output = net(pts)
        pred = np.argmax(output.data, axis=1)
        total_acc += np.sum(pred == labels.data)
        total_num += labels.shape[0]
    jt.sync_all(True)
    total_time = time.time() - start_time
    print('epoch ', epoch, 'testing total time', total_time)

    if total_num == 0:
        return 0.0
    return total_acc / total_num
def test_backward_once_cuda(self):
    """Check how many cublas matmul ops are looked up for forward and backward.

    With CUDA enabled, the forward pass of ``Model2`` is expected to produce
    exactly one "Jit op key ... cublas_matmul" log entry, and the two
    ``jt.grad`` calls together exactly two.
    """
    with jt.flag_scope(use_cuda=1):
        # Fixed seeds keep the generated data and parameters reproducible.
        np.random.seed(0)
        jt.set_seed(3)
        model = Model2()
        n = 1
        batch_size = 50

        def get_data(n):
            # Yield n batches of (x, x*x) pairs as float32 vars.
            for i in range(n):
                x = np.random.rand(batch_size, 1)
                y = x * x
                yield jt.float32(x), jt.float32(y)

        for i, (x, y) in enumerate(get_data(n)):
            pred_y = model(x).name("pred_y")
            # Only the sync runs inside the capture scope, so the captured
            # log lines belong to executing the forward pass built above.
            with jt.log_capture_scope(log_v=0, log_vprefix="op.cc=100") as logs:
                jt.sync_all()
            logs = find_log_with_re(
                logs, "Jit op key (not )?found: (cublas)_matmul.*")
            assert (len(logs) == 1)
            # Build both gradients and execute them inside one capture scope.
            with jt.log_capture_scope(
                    log_silent=1, log_v=0,
                    log_vprefix="op.cc=100,exe=1000") as logs_b:
                gs = jt.grad(pred_y, x)
                gs2 = jt.grad(pred_y, model.linear1.weight)
                jt.sync_all()
            logs_b = find_log_with_re(
                logs_b, "Jit op key (not )?found: (cublas)_matmul.*")
            assert len(logs_b) == 2, len(logs_b)
        jt.clean()
def test_segfault(self):
    """Differentiate ``2 * sum(maximum(a, 0))`` and check input and gradient values."""
    values = jt.array([1.0, 2.0, 3.0])
    clipped = jt.maximum(values, 0)
    total = clipped.sum() * 2.0
    grad_values = jt.grad(total, values)
    jt.sync_all()
    # The input must be untouched and every element's gradient equals 2.
    assert (values.data == [1, 2, 3]).all()
    assert (grad_values.data == [2, 2, 2]).all()
def test_simple_model_train(self):
    """Run one training step with ``trace_py_var=2`` and validate the trace dump.

    The dump is also pickled to ``<cache_path>/simple_model_train.pkl``.
    """
    with jt.flag_scope(trace_py_var=2):
        model = Model(input_size=1)
        opt = jt.optim.SGD(model.parameters(), 0.1)

        batch_size = 10
        x = jt.float32(np.random.rand(batch_size, 1))
        y = model(x)
        opt.step(y**2)
        jt.sync_all()

        data = jt.dump_trace_data()
        jt.clear_trace_data()

        node_data = data["node_data"]

        # Every node listed in an executed op's fused_ops must have node data.
        for op_info in data["execute_op_info"].values():
            for node_id in op_info['fused_ops']:
                assert node_id in node_data, (node_id, "not found")

        # No node may still carry the "unname" placeholder.
        for node in list(node_data.values()):
            assert node["attrs"]["name"] != "unname"

        print(len(node_data))
        with open(f"{jt.flags.cache_path}/simple_model_train.pkl", "wb") as f:
            pickle.dump(data, f)
def main():
    """Train a TD3 agent on the environment named by ``FLAGS.env_id``.

    Builds the policy, two Q-functions and their target copies, then runs
    ``FLAGS.n_iters`` environment steps. The first ``FLAGS.n_expl_steps``
    steps use random actions; afterwards the policy acts with Gaussian
    exploration noise and the trainer is stepped once per environment step.
    The policy is evaluated every 10,000 steps on a fresh environment.
    """

    def make_env():
        # Rescale the action range to [-1, 1], matching the clip applied below.
        return gym.wrappers.RescaleAction(gym.make(FLAGS.env_id), -1, 1)

    env = make_env()
    dim_observation = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    buffer = ReplayBuffer(env, FLAGS.max_buf_size)

    policy = MLPPolicy(dim_observation, dim_action)
    qfns = [
        MLPQFunction(dim_observation, dim_action),
        MLPQFunction(dim_observation, dim_action),
    ]
    policy_target = MLPPolicy(dim_observation, dim_action)
    qfns_target = [
        MLPQFunction(dim_observation, dim_action),
        MLPQFunction(dim_observation, dim_action),
    ]

    # Initialise each target network from its online counterpart
    # (polyak_copy called with coefficient 1).
    polyak_copy(policy, policy_target, 1)
    for qfn, qfn_target in zip(qfns, qfns_target):
        polyak_copy(qfn, qfn_target, 1)

    algo_policy = TD3Trainer(policy, policy_target, qfns, qfns_target,
                             FLAGS=FLAGS.TD3, sampler=buffer.sample)

    print("start training")
    observation = env.reset()
    for t in range(FLAGS.n_iters):
        if t % 10_000 == 0:
            evaluate(t, policy, make_env())

        if t < FLAGS.n_expl_steps:
            action = env.action_space.sample()
        else:
            noise = FLAGS.expl_noise * np.random.randn(dim_action)
            action = policy.get_actions(observation) + noise
            action = np.clip(action, -1, 1)

        next_observation, reward, done, info = env.step(action)
        # Only a genuine termination counts as "done" for learning. The
        # truncation key may be absent from `info` when the episode ends
        # before the time limit, so default to "not truncated" instead of
        # raising KeyError.
        real_done = done and not info.get('TimeLimit.truncated', False)
        buffer.add_transition({
            'observation': observation,
            'action': action,
            'next_observation': next_observation,
            'reward': reward,
            'done': real_done,
        })

        if t >= FLAGS.n_expl_steps:
            algo_policy.step()

        observation = env.reset() if done else next_observation
        # Execute the work queued during this step.
        jt.sync_all()
def test_fetch(self):
    """Check ``jt.fetch`` with a single var and with mixed plain/var arguments."""
    doubled = jt.array([1, 2, 3]) * 2
    received = []
    jt.fetch(doubled, lambda value: received.append(value))

    def check_mixed_args(x, y, z, arr):
        # Plain values must arrive unchanged; the var must arrive as an ndarray.
        self.assertTrue(
            x == 1 and y == 2 and z == 3 and isinstance(arr, np.ndarray))

    jt.fetch(1, 2, 3, doubled, check_mixed_args)
    jt.sync_all(True)
    assert len(received) == 1 and (received[0] == [2, 4, 6]).all()
def test_resnet_train_profile(self):
    """Run a single ResNet-18 SGD step inside a profiling scope.

    The test has no assertions; it passes as long as the step and the sync
    complete without raising.
    """
    # NOTE(review): trace_py_var presumably attaches Python-level variable
    # information to the profile -- confirm against the Jittor docs.
    with jt.profile_scope(trace_py_var=1):
        resnet18 = resnet.Resnet18()
        opt = jt.optim.SGD(resnet18.parameters(), 0.1)
        x = jt.float32(np.random.rand(2, 3, 224, 224))
        y = resnet18(x)
        # y**2 is passed to the optimizer as the loss.
        opt.step(y**2)
        jt.sync_all()
def test_data(self):
    """Compare two MobileNetV2 copies that differ only in ``is_depthwise_conv``.

    Both models start from the same parameters. The first has
    ``is_depthwise_conv`` forced to False on every conv, the second keeps
    its own setting. Forward outputs must agree to a mean relative error
    below 1e-4, and parameters are compared after one SGD step each.
    """

    def build_model():
        # Train mode overall, but keep classifier[0] and every BatchNorm in
        # eval mode to avoid dropout layer & bn errors.
        net = jtmodels.__dict__['mobilenet_v2']()
        return net

    def freeze_stochastic_layers(net):
        net.train()
        net.classifier[0].eval()
        for layer in net.modules():
            if isinstance(layer, jt.nn.BatchNorm):
                layer.eval()

    test_img = np.random.random((64, 3, 224, 224)).astype('float32')
    jittor_test_img = jt.array(test_img)
    lr = 100

    plain_model = build_model()
    depthwise_model = build_model()
    freeze_stochastic_layers(plain_model)
    freeze_stochastic_layers(depthwise_model)

    load_parameters(depthwise_model, plain_model)

    for layer in plain_model.modules():
        if isinstance(layer, jt.nn.Conv):
            layer.is_depthwise_conv = False

    depthwise_count = sum(
        1 for layer in depthwise_model.modules()
        if isinstance(layer, jt.nn.Conv) and layer.is_depthwise_conv)
    assert depthwise_count == 17, (depthwise_count, '!=', 17)

    plain_optimizer = jt.nn.SGD(plain_model.parameters(), lr=lr)
    depthwise_optimizer = jt.nn.SGD(depthwise_model.parameters(), lr=lr)

    plain_result = plain_model(jittor_test_img)
    mask = jt.random(plain_result.shape, plain_result.dtype)
    plain_optimizer.step(plain_result * mask)
    jt.sync_all(True)

    depthwise_result = depthwise_model(jittor_test_img)
    depthwise_loss = depthwise_result * mask

    # The small epsilon keeps the relative error finite for zero outputs.
    actual = depthwise_result.data + 1e-8
    expected = plain_result.data + 1e-8
    diff = (abs(actual - expected) / abs(expected)).mean()
    assert diff < 1e-4, (diff, 'forword')

    depthwise_optimizer.step(depthwise_loss)
    jt.sync_all(True)
    compare_parameters(plain_model, depthwise_model)

    jt.clean()
    jt.gc()
def train(model, dataloader, optimizer, epoch, iteration):
    """Train for one pass over ``dataloader`` and return the updated iteration count.

    Args:
        model: network; called as ``model(**inputs)`` and expected to return the loss.
        dataloader: iterable of dict batches.
        optimizer: optimizer stepped with the loss each iteration.
        epoch: current epoch, used for logging and snapshot names.
        iteration: global iteration counter on entry.

    Returns:
        The global iteration counter after this pass.
    """
    model.train()
    averMeters.clear()

    def record_loss(meter, fetched_loss):
        # Fetch callback: push the fetched loss value into the loss meter.
        meter.update(fetched_loss.data[0])

    tick = time.time()
    for step, inputs in enumerate(dataloader):
        for value in inputs.values():
            print(type(value[0]))
        averMeters['data_time'].update(time.time() - tick)

        iteration += 1
        lr = adjust_learning_rate(optimizer,
                                  iteration,
                                  BASE_LR=0.0002,
                                  WARM_UP_FACTOR=1.0 / 3,
                                  WARM_UP_ITERS=1000,
                                  STEPS=(0, 14150 * 15, 14150 * 20),
                                  GAMMA=0.1)

        # Forward pass; the model output is used directly as the loss.
        loss = model(**inputs)
        jt.sync_all()

        # Queue the loss-meter update, then run the optimizer step.
        jt.fetch(averMeters['loss'], loss, record_loss)
        optimizer.step(loss)

        # Measure elapsed time for the whole iteration.
        averMeters['batch_time'].update(time.time() - tick)
        tick = time.time()

        if step % 10 == 0:
            message = ('Epoch: [{0}][{1}/{2}]\t'
                       'Lr: [{3:.8f}]\t'
                       'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                       'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                       'loss {loss.val:.5f} ({loss.avg:.5f})\t').format(
                           epoch,
                           step,
                           len(dataloader),
                           lr,
                           batch_time=averMeters['batch_time'],
                           data_time=averMeters['data_time'],
                           loss=averMeters['loss'])
            logger.info(message)

        if step % 10000 == 0:
            snapshot_name = '%d_%d.pkl' % (epoch, step)
            model.save(os.path.join(SNAPSHOTDIR, snapshot_name))

    model.save(os.path.join(SNAPSHOTDIR, 'last.pkl'))
    return iteration
def update(self, model):
    """Blend the current weights of ``model`` into the EMA copy.

    Increments ``self.updates``, derives the decay from it, and applies
    ``ema = decay * ema + (1 - decay) * current`` to every float32 entry of
    the EMA state dict. Entries of other dtypes are left untouched.
    """
    with jt.no_grad():
        self.updates += 1
        decay = self.decay(self.updates)

        source_state = model.state_dict()
        for key, ema_value in self.ema.state_dict().items():
            if ema_value.dtype != "float32":
                continue
            # Kept as two augmented assignments on the state-dict entry,
            # exactly as before.
            ema_value *= decay
            ema_value += (1. - decay) * source_state[key].detach()
    jt.sync_all()
def test_resnet_train(self):
    """Run one ResNet-18 SGD step with ``trace_py_var=2`` and dump the trace.

    The test has no assertions; it passes if the step, the dump and the
    clear all complete without raising.
    """
    with jt.flag_scope(trace_py_var=2):
        resnet18 = resnet.Resnet18()
        opt = jt.optim.SGD(resnet18.parameters(), 0.1)
        x = jt.float32(np.random.rand(2, 3, 224, 224))
        y = resnet18(x)
        # y**2 is passed to the optimizer as the loss.
        opt.step(y**2)
        jt.sync_all()
        # The dump is only produced, not inspected; clearing afterwards
        # resets the collected trace data.
        data = jt.dump_trace_data()
        jt.clear_trace_data()
def test_densenet(self):
    """Train ``MnistNet`` for one pass over the loader and check convergence.

    The mean loss over the last 50 reported batches must be below 0.3 and
    the mean accuracy above 0.9.
    """
    global prev

    self.setup_seed(1)
    loss_list = []
    acc_list = []
    mnist_net = MnistNet()
    prev = time.time()
    SGD = nn.SGD(mnist_net.parameters(), self.learning_rate, self.momentum,
                 self.weight_decay)

    def report(batch_idx, loss, output, target):
        # Fetch callback: record and print loss/accuracy for one batch.
        predicted = np.argmax(output, axis=1)
        accuracy = np.mean(target == predicted)
        loss_list.append(loss[0])
        acc_list.append(accuracy)
        print(
            'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f} \tTime:{:.3f}'
            .format(0, batch_idx, 600, 1. * batch_idx / 6.0, loss[0],
                    accuracy, time.time() - prev))

    for batch_idx, (data, target) in enumerate(self.train_loader):
        output = mnist_net(data)
        loss = nn.cross_entropy_loss(output, target)
        SGD.step(loss)
        jt.fetch(batch_idx, loss, output, target, report)
        # Example output:
        # Train Epoch: 0 [0/600 (0%)] Loss: 2.402650 Acc: 0.060000
        # Train Epoch: 0 [1/600 (0%)] Loss: 2.770145 Acc: 0.100000
        # Train Epoch: 0 [2/600 (0%)] Loss: 3.528072 Acc: 0.100000
        # Train Epoch: 0 [3/600 (0%)] Loss: 2.992042 Acc: 0.100000
        # Train Epoch: 0 [4/600 (1%)] Loss: 4.672772 Acc: 0.060000
        # Train Epoch: 0 [5/600 (1%)] Loss: 5.003410 Acc: 0.080000
        # Train Epoch: 0 [6/600 (1%)] Loss: 5.417546 Acc: 0.100000
        # Train Epoch: 0 [7/600 (1%)] Loss: 5.137665 Acc: 0.100000
        # Train Epoch: 0 [8/600 (1%)] Loss: 5.241075 Acc: 0.070000
        # Train Epoch: 0 [9/600 (2%)] Loss: 4.515363 Acc: 0.100000
        # Train Epoch: 0 [10/600 (2%)] Loss: 3.357187 Acc: 0.170000
        # Train Epoch: 0 [20/600 (3%)] Loss: 2.265879 Acc: 0.100000
        # Train Epoch: 0 [30/600 (5%)] Loss: 2.107000 Acc: 0.250000
        # Train Epoch: 0 [40/600 (7%)] Loss: 1.918214 Acc: 0.290000
        # Train Epoch: 0 [50/600 (8%)] Loss: 1.645694 Acc: 0.400000

    # Make sure every queued callback has run before reading the lists.
    jt.sync_all(True)
    assert np.mean(loss_list[-50:]) < 0.3
    assert np.mean(acc_list[-50:]) > 0.9
def test_cuda_knn():
    """Time the Python ``knn_point`` against the ``KNN`` module, 100 runs each.

    Each run is bracketed by device syncs and its wall-clock time printed.
    The shape and first row of each result are printed after its loop.
    """
    import time
    from jittor import init

    jt.flags.use_cuda = 1

    input_q = init.gauss([32, 128, 1024], dtype='float')
    input_r = init.gauss([32, 128, 256], dtype='float')
    cuda_knn = KNN(k=200)

    for _ in range(100):
        jt.sync_all(True)
        tic = time.time()
        idx = knn_point(200, input_r.permute(0, 2, 1),
                        input_q.permute(0, 2, 1))
        jt.sync_all(True)
        toc = time.time()
        print('python time', toc - tic)
    print(idx.shape)
    print(idx[0, 0, :])

    for _ in range(100):
        jt.sync_all(True)
        tic = time.time()
        idx_cuda = cuda_knn(input_q, input_r)
        jt.sync_all(True)
        toc = time.time()
        print('cuda run time', toc - tic)
    # Permute the KNN module's output the same way the inputs to knn_point
    # were permuted before printing it.
    idx_cuda = idx_cuda.permute(0, 2, 1)
    print(idx_cuda[0, 0, :])
    print(idx_cuda.shape)
def test_simple_model_train(self):
    """Run one SGD step of a one-input ``Model`` with ``trace_py_var=2``.

    The trace data is dumped and cleared but not inspected, so the test
    passes if nothing raises.
    """
    with jt.flag_scope(trace_py_var=2):
        model = Model(input_size=1)
        opt = jt.optim.SGD(model.parameters(), 0.1)
        batch_size = 10
        x = jt.float32(np.random.rand(batch_size, 1))
        y = model(x)
        # y**2 is passed to the optimizer as the loss.
        opt.step(y**2)
        jt.sync_all()
        # The dump is only produced, not checked; clearing resets the
        # collected trace data afterwards.
        data = jt.dump_trace_data()
        jt.clear_trace_data()
def batchify_rays(rays_flat, chunk=1024 * 32, **kwargs):
    """Render rays chunk by chunk to avoid OOM and concatenate the results.

    Args:
        rays_flat: rays to render; sliced along the first axis.
        chunk: number of rays rendered per ``render_rays`` call.
        **kwargs: forwarded unchanged to ``render_rays``.

    Returns:
        Dict with, for every key returned by ``render_rays``, the per-chunk
        outputs concatenated along axis 0.
    """
    collected = {}
    for start in range(0, rays_flat.shape[0], chunk):
        rendered = render_rays(rays_flat[start:start + chunk], **kwargs)
        for key, value in rendered.items():
            collected.setdefault(key, []).append(value)
        # When gradients are disabled, execute each chunk right away.
        if jt.flags.no_grad:
            jt.sync_all()

    return {key: jt.concat(parts, 0) for key, parts in collected.items()}
def check(xshape, wshape, stride=(1,1,1), padding=(0,0,0), dilation=(1,1,1), group=1):
    """Compare ``conv3d`` and its gradients inside vs. outside a CUDA flag scope.

    The same inputs, weights and output mask are used for both runs; the
    outputs and the gradients w.r.t. input and weight must match.
    """
    conv_args = (None, stride, padding, dilation, group)

    with jt.flag_scope(use_cuda=1):
        x = jt.random(xshape)
        w = jt.random(wshape)
        cuda_out = jt.nn.conv3d(x, w, *conv_args)
        # A random mask makes the gradient depend on every output element.
        masky = jt.rand_like(cuda_out)
        cuda_dx, cuda_dw = jt.grad(masky * cuda_out, [x, w])
        jt.sync_all()

    # Reference run outside the CUDA scope.
    ref_out = jt.nn.conv3d(x, w, *conv_args)
    ref_dx, ref_dw = jt.grad(masky * ref_out, [x, w])

    np.testing.assert_allclose(cuda_out.data, ref_out.data)
    np.testing.assert_allclose(cuda_dx.data, ref_dx.data, rtol=1e-5, atol=1e-3)
    np.testing.assert_allclose(cuda_dw.data, ref_dw.data, rtol=1e-5, atol=1e-3)
def test_cudnn_rnn_speed(self):
    """Print training-loop timings for torch, Jittor+cuDNN and Jittor native RNNs.

    All three run ``iters`` SGD steps on the same random input with the same
    initial weights (loaded from the torch module). For the native run,
    ``jt.cudnn`` is temporarily replaced by ``None``; it is restored even if
    that run raises, so later tests never see a disabled cuDNN.
    """
    from time import time

    iters = 100

    h0 = np.random.rand(1, 128, 256).astype(np.float32)
    # Named `inputs` rather than `input` to avoid shadowing the builtin.
    inputs = np.random.rand(128, 128, 128).astype(np.float32)

    dev = torch.device('cuda:0')
    t_rnn = tnn.RNN(128, 256, nonlinearity='relu').to(dev)
    t_optim = torch.optim.SGD(t_rnn.parameters(), lr=1e-3, momentum=0.9)
    t_input = torch.from_numpy(inputs).to(dev)
    t_h0 = torch.from_numpy(h0).to(dev)

    start_time = time()
    for _ in range(iters):
        t_optim.zero_grad()
        t_output, th = t_rnn(t_input, t_h0)
        t_loss = (t_output**2).sum() + (th**2).sum()
        t_loss.backward()
        t_optim.step()
    # Wait for the queued CUDA work before reading the clock, mirroring the
    # jt.sync_all(True) used for the Jittor timings below.
    torch.cuda.synchronize()
    print('torch time = ', time() - start_time)

    j_input, j_h0 = jt.array(inputs), jt.array(h0)

    def time_jittor_rnn():
        # Build a fresh RNN from the torch weights and time `iters` SGD steps.
        j_rnn = nn.RNN(128, 256, nonlinearity='relu')
        j_rnn.load_state_dict(t_rnn.state_dict())
        j_optim = nn.SGD(j_rnn.parameters(), lr=1e-3, momentum=0.9)

        begin = time()
        for _ in range(iters):
            j_output, jh = j_rnn(j_input, j_h0)
            j_loss = (j_output**2).sum() + (jh**2).sum()
            j_optim.step(j_loss)
        jt.sync_all(True)
        return time() - begin

    print('jittor Cudnn time = ', time_jittor_rnn())

    # Hide cuDNN for the native measurement and always put it back.
    jt_cudnn, jt.cudnn = jt.cudnn, None
    try:
        print('jittor native time = ', time_jittor_rnn())
    finally:
        jt.cudnn = jt_cudnn
def test_oom(self):
    """Keep alive more var data than the CUDA device has memory for.

    One 1 GiB float32 array is uploaded ``0.6 * total_GiB`` times and each
    upload also produces a same-sized ``a + 1`` result. Both are retained,
    so about 1.2x the reported CUDA memory stays referenced. The test
    passes if this completes without raising.
    """
    held = []
    jt.flags.use_cuda = 1

    one_gib_block = np.ones((1024 * 1024 * 1024 // 4, ), "float32")
    total_gib = jt.get_mem_info().total_cuda_ram // (1024**3)
    block_count = int(total_gib * 0.6)

    for _ in range(block_count):
        a = jt.array(one_gib_block)
        b = a + 1
        b.sync()
        # Keep both vars referenced so their memory cannot be released yet.
        held.append((a, b))

    jt.sync_all(True)
    # Drop the references collected above.
    held = []
def train(model, train_loader, optimizer, epoch, init_lr, writer):
    """Train for one epoch, timing and logging every iteration.

    Args:
        model: network returning ``(context, pred)`` and providing ``get_loss``.
        train_loader: iterable of ``(image, target)`` batches; must support ``len``.
        optimizer: optimizer stepped with the loss.
        epoch: current epoch index.
        init_lr: initial learning rate handed to the poly scheduler.
        writer: summary writer receiving the per-iteration loss.
    """
    model.train()
    iters_per_epoch = len(train_loader)

    for step, (image, target) in enumerate(train_loader):
        poly_lr_scheduler(optimizer, init_lr, step, epoch, iters_per_epoch,
                          settings.EPOCHS)
        image = image.float32()

        # Sync on both sides so the measured span covers only this step.
        jt.sync_all()
        tic = time.time()

        context, pred = model(image)
        loss = model.get_loss(target, pred, context, settings.IGNORE_INDEX)
        optimizer.step(loss)

        jt.sync_all()
        toc = time.time()
        print('total time =', toc - tic)

        loss_value = loss.data
        global_step = step + iters_per_epoch * epoch
        writer.add_scalar('train/total_loss_iter', loss_value, global_step)
        print('Training in epoch {} iteration {} loss = {}'.format(
            epoch, step, loss_value[0]))
def test_resnet_trainx(self):
    """Trace one ResNet-18 SGD step, pickle the trace and sanity-check it.

    The dump is written to ``<cache_path>/resnet_train.pkl``. Every node
    listed in an executed op's ``fused_ops`` must have node data; nodes
    whose attrs lack a ``name`` are printed but do not fail the test.
    """
    with jt.flag_scope(trace_py_var=2):
        resnet18 = resnet.Resnet18()
        opt = jt.optim.SGD(resnet18.parameters(), 0.1)
        x = jt.float32(np.random.rand(2, 3, 224, 224))
        y = resnet18(x)
        opt.step(y**2)
        jt.sync_all()

        data = jt.dump_trace_data()
        jt.clear_trace_data()

        with open(f"{jt.flags.cache_path}/resnet_train.pkl", "wb") as f:
            pickle.dump(data, f)

        node_data = data["node_data"]
        for op_info in data["execute_op_info"].values():
            for node_id in op_info['fused_ops']:
                assert node_id in node_data, (node_id, "not found")

        for node in node_data.values():
            if 'name' not in node["attrs"]:
                print(node)
def test_resnet(self):
    """Train ``MnistNet`` for a few batches and check the max-memory treemap.

    After just over ``iters`` batches the treemap text is retrieved twice
    (default and ``build_by=1``) and specific rows are checked against
    expected frame names.
    """
    self.setup_seed(1)
    loss_list=[]
    acc_list=[]
    mnist_net = MnistNet()
    global prev
    prev = time.time()
    SGD = nn.SGD(mnist_net.parameters(), self.learning_rate, self.momentum, self.weight_decay)

    iters = 10
    for batch_idx, (data, target) in enumerate(self.train_loader):
        # Stops once batch_idx exceeds iters, i.e. after iters + 1 batches.
        if (batch_idx > iters):
            break
        jt.display_memory_info()
        output = mnist_net(data)
        loss = nn.cross_entropy_loss(output, target)
        SGD.step(loss)
        def callback(batch_idx, loss, output, target):
            # Fetch callback: record and print loss/accuracy for one batch.
            global prev
            pred = np.argmax(output, axis=1)
            acc = np.mean(target==pred)
            loss_list.append(loss[0])
            acc_list.append(acc)
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f} \tTime:{:.3f}'
                .format(0, batch_idx, iters,1. * batch_idx / 6.0, loss[0], acc, time.time()-prev))
        jt.fetch(batch_idx, loss, output, target, callback)
    jt.sync_all(True)
    jt.display_max_memory_info()
    # NOTE(review): the rows checked below are expected to end with
    # `_run_module_as_main` / `_run_code`, which look like the frames of a
    # `python -m ...` launch -- confirm how this test is meant to be run.
    _, out = jt.get_max_memory_treemap()
    out_ = out.split('\n')
    assert(out_[0] == 'root()')
    assert(out_[3].endswith('(_run_module_as_main)'))
    assert(out_[7].endswith('(_run_code)'))
    # Same check for the treemap built with build_by=1; the expected rows
    # sit one line further down.
    _, out = jt.get_max_memory_treemap(build_by=1)
    out_ = out.split('\n')
    assert(out_[0] == 'root()')
    assert(out_[4].endswith('(_run_module_as_main)'))
    assert(out_[8].endswith('(_run_code)'))
def check(xshape, wshape, stride=(1,1,1), padding=(0,0,0), dilation=(1,1,1), group=1):
    """Compare ``conv_transpose3d`` and its gradients inside vs. outside a CUDA flag scope.

    Inputs and weights are created under CUDA. The reference output and
    gradients are built outside the CUDA scope, the tested ones inside it.
    """
    deconv_args = (None, stride, padding, 0, group, dilation)

    with jt.flag_scope(use_cuda=1):
        x = jt.random(xshape)
        w = jt.random(wshape)
        jt.sync_all()

    # Reference output, built outside the CUDA scope.
    ref_out = jt.nn.conv_transpose3d(x, w, *deconv_args)
    jt.sync_all()

    with jt.flag_scope(use_cuda=1):
        cuda_out = jt.nn.conv_transpose3d(x, w, *deconv_args)
        # A random mask makes the gradient depend on every output element.
        masky = jt.rand_like(cuda_out)
        cuda_dx, cuda_dw = jt.grad(masky * cuda_out, [x, w])
        jt.sync_all()

    ref_dx, ref_dw = jt.grad(masky * ref_out, [x, w])
    jt.sync_all()

    np.testing.assert_allclose(cuda_out.numpy(), ref_out.numpy(), rtol=1e-6, atol=1e-4)
    np.testing.assert_allclose(cuda_dx.numpy(), ref_dx.numpy(), rtol=1e-6, atol=1e-4)
    np.testing.assert_allclose(cuda_dw.numpy(), ref_dw.numpy(), rtol=1e-5, atol=1e-3)
def test(test_loader, model, args):
    """Benchmark inference throughput on the first batch of ``test_loader``.

    Runs 10 warm-up forward passes, then 300 timed ones on the same batch,
    printing a running iterations-per-second figure and finally the batch
    size and frames per second.

    Args:
        test_loader: iterable yielding ``(images, names, size)`` batches.
        model: network to benchmark; switched to eval mode.
        args: unused; kept for interface compatibility.

    Raises:
        ValueError: if ``test_loader`` yields no batch (this used to fail
            later with an unbound-variable error).
    """
    warmup_iters = 10
    timed_iters = 300

    model.eval()

    # Only the first batch is used; it is replayed for every iteration.
    for data in test_loader:
        images, names, size = data
        break
    else:
        raise ValueError("test_loader yielded no batches; nothing to benchmark")

    jt.sync_all(True)
    for _ in range(warmup_iters):
        model(images).sync()
    jt.sync_all(True)

    start = time.time()
    for i in range(timed_iters):
        elapsed = time.time() - start
        # The clock may not have advanced yet on the first iterations;
        # report 0.0 instead of dividing by zero.
        print(i, i / elapsed if elapsed > 0 else 0.0)
        model(images).sync()
    jt.sync_all(True)
    total = time.time() - start

    print("BS:", images.shape[0], "FPS:", timed_iters * images.shape[0] / total)
def evaluate(net: Yolact, dataset, train_mode=False):
    """Evaluate ``net`` according to the global ``args``.

    Depending on ``args`` this either processes a single image, a set of
    images or a video and returns, or runs the main dataset loop in one of
    three modes: display, benchmark, or metric collection.

    Args:
        net: the Yolact network to evaluate.
        dataset: dataset iterated by the main loop; also provides ``ids``.
        train_mode: when True, the AP data is not pickled to
            ``args.ap_data_file`` before the mAP is computed.

    Returns:
        The result of ``calc_map(ap_data)`` in metric mode when
        ``args.output_coco_json`` is not set; otherwise ``None``.
    """
    net.detect.use_fast_nms = args.fast_nms
    net.detect.use_cross_class_nms = args.cross_class_nms
    cfg.mask_proto_debug = args.mask_proto_debug

    # TODO Currently we do not support Fast Mask Re-scoring in evalimage, evalimages, and evalvideo
    # Single-image, image-set and video modes run their own helper and
    # return early. An "input:output" argument is split into the two paths.
    if args.image is not None:
        if ':' in args.image:
            inp, out = args.image.split(':')
            evalimage(net, inp, out)
        else:
            evalimage(net, args.image)
        return
    elif args.images is not None:
        inp, out = args.images.split(':')
        evalimages(net, inp, out)
        return
    elif args.video is not None:
        if ':' in args.video:
            inp, out = args.video.split(':')
            evalvideo(net, inp, out)
        else:
            evalvideo(net, args.video)
        return

    frame_times = MovingAverage()
    # A negative max_images means "use the whole dataset".
    dataset_size = len(dataset) if args.max_images < 0 else min(
        args.max_images, len(dataset))
    progress_bar = ProgressBar(30, dataset_size)

    print()

    if not args.display and not args.benchmark:
        # For each class and iou, stores tuples (score, isPositive)
        # Index ap_data[type][iouIdx][classIdx]
        ap_data = {
            'box': [[APDataObject() for _ in cfg.dataset.class_names]
                    for _ in iou_thresholds],
            'mask': [[APDataObject() for _ in cfg.dataset.class_names]
                     for _ in iou_thresholds]
        }
        detections = Detections()
    else:
        timer.disable('Load Data')

    dataset_indices = list(range(len(dataset)))

    if args.shuffle:
        random.shuffle(dataset_indices)
    elif not args.no_sort:
        # Do a deterministic shuffle based on the image ids
        #
        # I do this because on python 3.5 dictionary key order is *random*, while in 3.6 it's
        # the order of insertion. That means on python 3.6, the images come in the order they are in
        # in the annotations file. For some reason, the first images in the annotations file are
        # the hardest. To combat this, I use a hard-coded hash function based on the image ids
        # to shuffle the indices we use. That way, no matter what python version or how pycocotools
        # handles the data, we get the same result every time.
        hashed = [badhash(x) for x in dataset.ids]
        dataset_indices.sort(key=lambda x: hashed[x])

    # dataset_size=1000
    # NOTE(review): dataset_indices is computed and truncated here, but the
    # loop below enumerates `dataset` directly and never reads it, so the
    # shuffle/sort order and the max_images limit do not appear to affect
    # which images are processed -- confirm whether that is intended.
    dataset_indices = dataset_indices[:dataset_size]

    try:
        # Main eval loop
        dataset.batch_size = 1
        dataset.num_workers = 1
        for it, batch in enumerate(dataset):
            timer.reset()

            # Each batch holds a single sample (batch_size is set to 1 above).
            image_idx, img, gt, gt_masks, h, w, num_crowd = batch[0]

            if not args.benchmark:
                gt = gt.numpy()
                gt_masks = gt_masks.numpy()

            # Add a leading batch axis of size 1.
            batch = img.reshape(1, img.shape[0], img.shape[1], img.shape[2])
            # batch = jt.array([img])

            with timer.env('Network Extra'):
                preds = net(batch)

            if args.display:
                img_numpy = prep_display(preds, img, h, w)
            elif args.benchmark:
                prep_benchmark(preds, h, w)
            else:
                prep_metrics(ap_data, preds, img, gt, gt_masks, h, w,
                             num_crowd, dataset.ids[image_idx], detections)

            # First couple of images take longer because we're constructing the graph.
            # Since that's technically initialization, don't include those in the FPS calculations.
            if it > 1:
                frame_times.add(timer.total_time())

            if args.display:
                if it > 1:
                    print('Avg FPS: %.4f' % (1 / frame_times.get_avg()))
                plt.imshow(img_numpy)
                plt.title(str(dataset.ids[image_idx]))
                plt.show()
            elif not args.no_bar:
                if it > 1:
                    fps = 1 / frame_times.get_avg()
                else:
                    fps = 0
                progress = (it + 1) / dataset_size * 100
                progress_bar.set_val(it + 1)
                print(
                    '\rProcessing Images %s %6d / %6d (%5.2f%%) %5.2f fps '
                    % (repr(progress_bar), it + 1, dataset_size, progress,
                       fps),
                    end='')

        # Execute everything still queued before results are reported.
        jt.sync_all(True)

        if not args.display and not args.benchmark:
            print()
            if args.output_coco_json:
                print('Dumping detections..')
                if args.output_web_json:
                    detections.dump_web()
                else:
                    detections.dump()
            else:
                if not train_mode:
                    print('Saving data..')
                    with open(args.ap_data_file, 'wb') as f:
                        pickle.dump(ap_data, f)

                return calc_map(ap_data)
        elif args.benchmark:
            print()
            print()
            print('Stats for the last frame:')
            timer.print_stats()
            avg_seconds = frame_times.get_avg()
            print('Average: %5.2f fps, %5.2f ms' %
                  (1 / frame_times.get_avg(), 1000 * avg_seconds))

    except KeyboardInterrupt:
        # Ctrl-C ends the evaluation quietly instead of printing a traceback.
        print('Stopping..')
def time_synchronized():
    """Return the current wall-clock time after pending device work has finished.

    When CUDA is enabled, all queued work is executed and the device is
    synchronized first, so differences between two return values measure
    completed work rather than merely launched work.

    Returns:
        ``time.time()`` taken after the synchronization.
    """
    if jt.flags.use_cuda == 1:
        # Request a device sync as well; without it the timestamp can be
        # taken while the launched kernels are still running.
        jt.sync_all(True)
    return time.time()
# --------------------- # Train Discriminator # --------------------- real_loss = adversarial_loss(discriminator(real_imgs), valid) fake_loss = adversarial_loss(discriminator(gen_imgs.detach()), fake) gradient_penalty = compute_gradient_penalty(discriminator, real_imgs) d_loss = (real_loss + fake_loss) / 2 + gradient_penalty optimizer_D.step(d_loss) if warmup_times == -1: print(('[Epoch %d/%d] [Batch %d/%d] [D loss: %f] [G loss: %f]' % (epoch, opt.n_epochs, i, len(dataloader), d_loss.numpy()[0], g_loss.numpy()[0]))) else: jt.sync_all() cnt += 1 print(cnt) if cnt == warmup_times: jt.sync_all(True) sta = time.time() if cnt > warmup_times + run_times: jt.sync_all(True) total_time = time.time() - sta print( f"run {run_times} iters cost {total_time} seconds, and avg {total_time / run_times} one iter." ) exit(0) if warmup_times == -1: save_image(gen_imgs.data, ('images/%d.png' % epoch),
def test(name, model_name, bs):
    """Benchmark one model under torch or Jittor and write the result to a file.

    Args:
        name: framework to benchmark, ``"torch"`` or ``"jittor"``.
        model_name: model constructor name; a ``"train_"`` prefix selects
            training (forward + backward + SGD step) instead of inference.
        bs: batch size.

    The duration and FPS of the timed run are printed and written to
    ``{home_path}/.cache/jittor/{name}-{model_name}-{bs}.txt``, which is
    then made readable and writable by everyone.
    """
    print("hello", name, model_name, bs)
    import numpy as np
    import time

    is_train = False
    _model_name = model_name  # original name, kept for the result file
    if model_name.startswith("train_"):
        is_train = True
        model_name = model_name[6:]

    if name == "torch":
        import torch
        import torchvision.models as tcmodels
        from torch import optim
        from torch import nn
        torch.backends.cudnn.deterministic = False
        torch.backends.cudnn.benchmark = True
        model = tcmodels.__dict__[model_name]()
        model = model.cuda()
    else:
        import jittor as jt
        from jittor import optim
        from jittor import nn
        jt.flags.use_cuda = 1
        jt.cudnn.set_algorithm_cache_size(10000)
        import jittor.models as jtmodels
        model = jtmodels.__dict__[model_name]()
        # Compare the model *name*: the previous code compared the model
        # object itself to a string, so this limit was never applied.
        if model_name in ("resnet152", "resnet101") and bs == 128 and is_train:
            jt.cudnn.set_max_workspace_ratio(0.05)

    if is_train:
        model.train()
    else:
        model.eval()

    img_size = 224
    if model_name == "inception_v3":
        img_size = 300
    test_img = np.random.random((bs, 3, img_size, img_size)).astype("float32")
    if is_train:
        label = (np.random.random((bs,)) * 1000).astype("int32")

    if name == "torch":
        test_img = torch.Tensor(test_img).cuda()
        if is_train:
            label = torch.LongTensor(label).cuda()
            opt = optim.SGD(model.parameters(), 0.001)
        sync = lambda: torch.cuda.synchronize()
        # From here on `jt` refers to torch, so the shared code below
        # works for both frameworks.
        jt = torch
    else:
        test_img = jt.array(test_img).stop_grad()
        if is_train:
            label = jt.array(label).stop_grad()
            opt = optim.SGD(model.parameters(), 0.001)
        sync = lambda: jt.sync_all(True)

    sync()
    use_profiler = os.environ.get("use_profiler", "0") == "1"
    # NOTE(review): the attribute probed is `nograd` while the one called is
    # `no_grad`; confirm whether this guard is deliberately inactive (an
    # active no-grad scope would also cover the training branch).
    if hasattr(jt, "nograd"):
        ng = jt.no_grad()
        ng.__enter__()

    def run_iteration():
        # One forward pass, plus backward and optimizer step when training.
        x = model(test_img)
        if isinstance(x, tuple):
            x = x[0]
        if is_train:
            loss = nn.CrossEntropyLoss()(x, label)
            if name == "jittor":
                opt.step(loss)
            else:
                opt.zero_grad()
                loss.backward()
                opt.step()
        else:
            x.sync()

    # Two untimed rounds before the measured one.
    sync()
    for i in time_iter():
        run_iteration()
    sync()
    for i in time_iter():
        run_iteration()
    sync()

    if use_profiler:
        if name == "torch":
            prof = torch.autograd.profiler.profile(use_cuda=True)
        else:
            prof = jt.profile_scope()
        prof.__enter__()
    if name == "jittor":
        if hasattr(jt.flags, "use_parallel_op_compiler"):
            jt.flags.use_parallel_op_compiler = 0

    start = time.time()
    for i in time_iter(10):
        run_iteration()
    sync()
    end = time.time()

    if use_profiler:
        prof.__exit__(None, None, None)
        if name == "torch":
            print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=30))

    # `i` is the last index produced by time_iter, so i + 1 iterations ran.
    total_iter = i + 1
    print("duration:", end - start, "FPS:", total_iter * bs / (end - start))
    fpath = f"{home_path}/.cache/jittor/{name}-{_model_name}-{bs}.txt"
    with open(fpath, 'w') as f:
        f.write(f"duration: {end-start} FPS: {total_iter*bs/(end-start)}")
    # Permission bits are octal: 0o666 is rw-rw-rw-. The previous hex
    # literal 0x666 set an unrelated mode.
    os.chmod(fpath, 0o666)
def test_resnet(self):
    """Train ``MnistNet`` on an endless loader until epoch 2 and check convergence.

    Each iteration also asserts that allocator usage and the number of live
    vars stay below fixed limits. At the end, the mean loss over the last
    50 reported batches must be below 0.5 and the mean accuracy above 0.8.
    """
    self.setup_seed(1)
    loss_list = []
    acc_list = []
    mnist_net = MnistNet()
    global prev
    prev = time.time()
    SGD = nn.SGD(mnist_net.parameters(), self.learning_rate, self.momentum,
                 self.weight_decay)
    # The loader never stops on its own; the loop exits via the epoch
    # check at the bottom.
    self.train_loader.endless = True

    for data, target in self.train_loader:
        batch_id = self.train_loader.batch_id
        epoch_id = self.train_loader.epoch_id

        # train step
        # with jt.log_capture_scope(
        #     log_silent=1,
        #     log_v=1, log_vprefix="op.cc=100,exe=10",
        # ) as logs:
        output = mnist_net(data)
        loss = nn.cross_entropy_loss(output, target)
        SGD.step(loss)

        def callback(epoch_id, batch_id, loss, output, target):
            # print train info
            global prev
            pred = np.argmax(output, axis=1)
            acc = np.mean(target == pred)
            loss_list.append(loss[0])
            acc_list.append(acc)
            print(
                'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f} \tTime:{:.3f}'
                .format(epoch_id, batch_id, 600, 1. * batch_id / 6.0,
                        loss[0], acc, time.time() - prev))
            # prev = time.time()

        jt.fetch(epoch_id, batch_id, loss, output, target, callback)

        # log_conv = find_log_with_re(logs,
        #     "Jit op key (not )?found: ((mkl)|(cudnn))_conv.*")
        # log_matmul = find_log_with_re(logs,
        #     "Jit op key (not )?found: ((mkl)|(cublas))_matmul.*")
        # if batch_id > 2:
        #     assert len(log_conv)==59 and len(log_matmul)==6, (len(log_conv), len(log_matmul))

        # Bytes currently held: total allocated minus total freed.
        mem_used = jt.flags.stat_allocator_total_alloc_byte \
            -jt.flags.stat_allocator_total_free_byte
        # assert mem_used < 4e9, mem_used
        # TODO: why bigger?
        assert mem_used < 5.6e9, mem_used
        # example log:
        # Train Epoch: 0 [0/100 (0%)] Loss: 2.352903 Acc: 0.110000
        # Train Epoch: 0 [1/100 (1%)] Loss: 2.840830 Acc: 0.080000
        # Train Epoch: 0 [2/100 (2%)] Loss: 3.473594 Acc: 0.100000
        # Train Epoch: 0 [3/100 (3%)] Loss: 3.131615 Acc: 0.200000
        # Train Epoch: 0 [4/100 (4%)] Loss: 2.524094 Acc: 0.230000
        # Train Epoch: 0 [5/100 (5%)] Loss: 7.780025 Acc: 0.080000
        # Train Epoch: 0 [6/100 (6%)] Loss: 3.890721 Acc: 0.160000
        # Train Epoch: 0 [7/100 (7%)] Loss: 6.370137 Acc: 0.140000
        # Train Epoch: 0 [8/100 (8%)] Loss: 11.390827 Acc: 0.150000
        # Train Epoch: 0 [9/100 (9%)] Loss: 21.598564 Acc: 0.080000
        # Train Epoch: 0 [10/100 (10%)] Loss: 23.369165 Acc: 0.130000
        # Train Epoch: 0 [20/100 (20%)] Loss: 4.804510 Acc: 0.100000
        # Train Epoch: 0 [30/100 (30%)] Loss: 3.393924 Acc: 0.110000
        # Train Epoch: 0 [40/100 (40%)] Loss: 2.286762 Acc: 0.130000
        # Train Epoch: 0 [50/100 (50%)] Loss: 2.055014 Acc: 0.290000
        # The limit on live vars is higher when running under MPI.
        if jt.in_mpi:
            assert jt.core.number_of_lived_vars(
            ) < 8100, jt.core.number_of_lived_vars()
        else:
            assert jt.core.number_of_lived_vars(
            ) < 7000, jt.core.number_of_lived_vars()
        if self.train_loader.epoch_id >= 2:
            break

    # Make sure every queued callback has run before reading the lists.
    jt.sync_all(True)
    assert np.mean(loss_list[-50:]) < 0.5
    assert np.mean(acc_list[-50:]) > 0.8