def check(xshape, wshape, stride=1, padding=0, dilation=1):
    """Compare a tuned CUDA run of ``conv`` against an untuned CPU run.

    The forward output and the gradients w.r.t. the input and the weight are
    first computed with ``use_cuda=1, enable_tuner=1`` while the log is
    captured, then recomputed with ``use_cuda=0, enable_tuner=0``.  The
    captured log must contain exactly three ``cudnn_conv`` jit-key entries,
    the first of which mentions ``oihw``, and both runs must agree.

    Note: ``dilation`` is accepted but is not forwarded to ``conv``.
    """
    cudnn_key_pattern = "(Jit op key (not )?found: cudnn_conv.*)"

    # Tuned CUDA pass; everything logged inside this scope is captured.
    with jt.log_capture_scope(
        use_cuda=1,
        enable_tuner=1,
        log_v=1,
        log_vprefix="op.cc=100,exe=1000",
    ) as captured:
        inp = jt.random(xshape)
        weight = jt.random(wshape)
        out = conv(inp, weight, stride, padding)
        mask = jt.random(out.shape)
        masked = mask * out
        grad_inp, grad_weight = jt.grad(masked, [inp, weight])
        jt.sync([out, masked, grad_inp, grad_weight])

    # Reference pass.  The tuner stays off here: it fails when
    # enable_tuner=1, something wrong with mkl_conv_backward_x maybe.
    with jt.flag_scope(use_cuda=0, enable_tuner=0):
        ref_out = conv(inp, weight, stride, padding)
        ref_masked = mask * ref_out
        ref_grad_inp, ref_grad_weight = jt.grad(ref_masked, [inp, weight])
        jt.sync([ref_out, ref_masked, ref_grad_inp, ref_grad_weight])

    matches = find_log_with_re(captured, cudnn_key_pattern)
    assert len(matches) == 3 and "oihw" in matches[0][0], matches
    assert np.allclose(out.data, ref_out.data)
    assert np.allclose(grad_inp.data, ref_grad_inp.data, 1e-2)
    assert np.allclose(grad_weight.data, ref_grad_weight.data, 1e-2)
def check_backward(xshape, wshape, stride, padding, dilation, groups, use_cuda, nhwc):
    """Check grouped-conv forward/backward results against a CPU reference.

    Only the NCHW layout is supported (``nhwc`` must equal 0).  The first
    pass runs ``test_nchw`` with the tuner enabled on the requested device;
    the second pass repeats it with ``use_cuda=0`` and the tuner disabled.
    Output and both gradients must match.
    """
    assert nhwc == 0
    conv_fn = test_nchw

    # only check cudnn
    with jt.log_capture_scope(
        use_cuda=use_cuda,
        enable_tuner=1,
        log_v=10,
        log_vprefix="conv_tuner.cc=1000",
    ) as _captured:
        inp = jt.random(xshape)
        weight = jt.random(wshape)
        out = conv_fn(inp, weight, stride, padding, dilation, groups)
        grad_inp, grad_weight = jt.grad(out, [inp, weight])
        jt.sync([out, grad_inp, grad_weight])

    with jt.flag_scope(use_cuda=0, enable_tuner=0, compile_options={"test": 233}):
        ref_out = conv_fn(inp, weight, stride, padding, dilation, groups)
        ref_grad_inp, ref_grad_weight = jt.grad(ref_out, [inp, weight])
        jt.sync([ref_out, ref_grad_inp, ref_grad_weight])

    assert np.allclose(out.data, ref_out.data)
    assert np.allclose(grad_weight.data, ref_grad_weight.data, 1e-3), (
        grad_weight.data,
        ref_grad_weight.data,
        np.abs(grad_weight.data - ref_grad_weight.data).max(),
    )
    assert np.allclose(grad_inp.data, ref_grad_inp.data, 1e-3), (
        grad_inp.data,
        ref_grad_inp.data,
        np.abs(grad_inp.data - ref_grad_inp.data).max(),
    )
def check_backward(xshape, wshape, stride, padding, dilation, use_cuda, nhwc):
    """Check conv forward/backward on the tuned backend against a CPU reference.

    ``nhwc`` selects ``test_nhwc`` or ``test_nchw``; ``use_cuda == 1`` makes
    the log check look for ``cudnn_conv`` keys, otherwise ``mkl_conv``.  The
    loss is the mean of the output.  The captured log must contain exactly
    three matching jit-key entries (the first mentioning ``oihw``), and the
    output and both gradients must agree with the untuned CPU run.
    """
    conv_fn = test_nhwc if nhwc else test_nchw
    op_name = "cudnn_conv" if use_cuda == 1 else "mkl_conv"

    with jt.log_capture_scope(
        use_cuda=use_cuda,
        enable_tuner=1,
        log_v=1,
        log_vprefix="op.cc=1000,exe=1000,conv_t=1000",
        compile_options={"test": 244},
    ) as captured:
        inp = jt.random(xshape)
        weight = jt.random(wshape)
        out = conv_fn(inp, weight, stride, padding, dilation)
        mean_loss = out.mean()
        grad_inp, grad_weight = jt.grad(mean_loss, [inp, weight])
        jt.sync([out, mean_loss, grad_inp, grad_weight])

    with jt.flag_scope(use_cuda=0, enable_tuner=0, compile_options={"test": 233}):
        ref_out = conv_fn(inp, weight, stride, padding, dilation)
        ref_loss = ref_out.mean()
        ref_grad_inp, ref_grad_weight = jt.grad(ref_loss, [inp, weight])
        jt.sync([ref_out, ref_loss, ref_grad_inp, ref_grad_weight])

    matches = find_log_with_re(captured, f"(Jit op key (not )?found: {op_name}.*)")
    assert len(matches) == 3 and "oihw" in matches[0][0], (matches)
    assert np.allclose(out.data, ref_out.data, 1e-3)
    assert np.allclose(grad_weight.data, ref_grad_weight.data, 1e-3), (
        grad_weight.data,
        ref_grad_weight.data,
    )
    assert np.allclose(grad_inp.data, ref_grad_inp.data, 1e-3), (
        grad_inp.data,
        ref_grad_inp.data,
        np.abs(ref_grad_inp.data).max(),
        np.abs(grad_inp.data - ref_grad_inp.data).max(),
    )
def test_scalar_fuse_unary(self):
    """Profile a one-element array and its negation after cloning both.

    Checks the values of both results and that the profile report has
    exactly two entries.
    """
    with jt.profile_scope() as rep:
        a = jt.array([1])
        b = -a
        # Rebinding the names drops the references to the pre-clone vars.
        a = a.clone()
        b = b.clone()
        jt.sync([a, b])
        assert a.data == 1
        assert b.data == -1
    # NOTE(review): presumably one header row plus a single fused op entry --
    # confirm against the jt.profile_scope report format.
    assert len(rep) == 2
def step(self, loss):
    """Apply one Adam update to ``self.parameters`` using gradients of ``loss``.

    Increments ``self.adam_step``, updates the first/second moment buffers
    (``self.m`` / ``self.values``) in place, applies the bias-corrected
    update to every parameter, detaches it in place, and finally syncs
    ``self.no_grad_parameters``.
    """
    params = self.parameters
    grads = jt.grad(loss, params)
    self.adam_step += 1
    step_count = float(self.adam_step)
    beta1, beta2 = self.betas
    for param, grad, second_moment, first_moment in zip(params, grads, self.values, self.m):
        first_moment.assign(beta1 * first_moment + (1 - beta1) * grad)
        second_moment.assign(beta2 * second_moment + (1 - beta2) * grad * grad)
        # Bias-corrected step size for the current step count.
        step_size = self.lr * jt.sqrt(1 - beta2 ** step_count) / (1 - beta1 ** step_count)
        param -= first_moment * step_size / (jt.sqrt(second_moment) + self.eps)
        param.detach_inplace()
    jt.sync(self.no_grad_parameters)
def test_wrong_fuse2(self):
    """Multiplying by a one-element array of 1 must leave both operands unchanged.

    Two random vectors of different lengths (10 and 100) are each multiplied
    by the same ``[1]`` array and synced together; each product must equal
    its source vector.
    """
    one = jt.array([1])
    short_vec = jt.random([10, ])
    long_vec = jt.random([100, ])
    short_prod = one * short_vec
    long_prod = one * long_vec
    jt.sync([short_prod, long_prod])
    np.testing.assert_allclose(short_vec.data, short_prod.data)
    np.testing.assert_allclose(long_vec.data, long_prod.data)
def test_longest_dis_fuse(self):
    """Run ``resnet_fake`` forward/backward, then inspect the dumped graph.

    After syncing the gradients of all ``resnet_fake`` variables, every
    ``Var`` node whose pointer field is not ``'0'`` must have a shape with
    at most five comma-separated components.
    """
    image = jt.array(np.random.rand(1, 3, 224, 224).astype(np.float32))
    loss = jt.sum(resnet_fake(image))
    params = jt.find_vars('resnet_fake')
    grads = jt.grad(loss, params)
    jt.sync(grads)

    # assert not alloc big tensor
    graph = jt.dump_all_graphs()
    for node in graph.nodes_info:
        if not node.startswith("Var"):
            continue
        # Shape is the text inside the first [...], pointer is the last
        # comma-separated item inside the first (...).
        dims = node.split("[")[1].split("]")[0].split(",")
        pointer = node.split("(")[1].split(")")[0].split(",")[-1]
        if pointer == '0':
            continue
        assert len(dims) <= 5, node
def step(self, loss):
    """Apply one SGD update (momentum, weight decay, optional Nesterov).

    For each parameter the velocity buffer in ``self.values`` is updated in
    place, the parameter is moved by the learning rate, and it is detached
    from the previous graph.  ``self.no_grad_parameters`` are synced at the
    end.
    """
    params = self.parameters
    grads = jt.grad(loss, params)
    for param, grad, velocity in zip(params, grads, self.values):
        delta = param * self.weight_decay + grad
        velocity.assign(self.momentum * velocity + delta * (1 - self.dampening))
        if not self.nesterov:
            param -= velocity * self.lr
        else:
            param -= (delta + self.momentum * velocity) * self.lr
        # Cut the link to the previous graph to reduce memory consumption.
        param.detach_inplace()
    # Sync parameters that carry no gradient (e.g. moving_mean and
    # moving_var in batch_norm) to reduce memory consumption.
    jt.sync(self.no_grad_parameters)
def test():
    """Exercise a custom ``Function`` that has a non-variable middle output.

    ``MyFunc.execute`` returns ``(x*y, "test", x/y)``; its ``grad`` must
    receive ``None`` for the string output and returns ``None`` for the
    string input.
    """

    class MyFunc(Function):
        def execute(self, x, z, y):
            # Keep both operands for the backward pass; ``z`` is not used.
            self.x = x
            self.y = y
            return x*y, "test", x/y

        def grad(self, grad0, _, grad1):
            assert _ is None
            return (grad0 * self.y, None, grad1 * self.x)

    lhs = jt.array(3.0)
    rhs = jt.array(4.0)
    product, _, quotient = MyFunc()(lhs, "a", rhs)
    grads = jt.grad(product + quotient * 3, [lhs, rhs])
    jt.sync(grads)
def test_stop_fuse2(self):
    """Profile a 2000-term accumulation where both seed vars call ``stop_fuse``.

    Every entry of the profile report after the first must contain fewer
    than 7 occurrences of ``"opkey"`` in its first field.
    """
    with jt.profile_scope() as report:
        a = jt.float32(0).stop_fuse()
        c = jt.float32(0).stop_fuse()
        bs = [c]
        for i in range(2000):
            b = jt.float32(i) * 2 * c
            bs.append(b)
            a += b
        a = a * 2
        dbs = jt.grad(a, bs)
        jt.sync(dbs + [a])
    # report[0] is skipped; each remaining entry is checked via how many
    # pieces its first field splits into on "opkey".
    for a in report[1:]:
        assert len(a[0].split("opkey")) < 8
def step(self, loss):
    """Apply one Adam update, averaging across MPI ranks when ``jt.mpi`` is set.

    With MPI active, every gradient is replaced by its all-reduce mean, and
    every ``self.param_sync_iter`` steps the parameters themselves are
    averaged as well.  The Adam moment buffers (``self.m`` / ``self.values``)
    are updated in place, each parameter is detached in place, and
    ``self.no_grad_parameters`` are synced at the end.
    """
    self.adam_step += 1
    params = self.parameters
    grads = jt.grad(loss, params)

    if jt.mpi:
        for grad in grads:
            grad.assign(grad.mpi_all_reduce("mean"))
        if self.adam_step % self.param_sync_iter == 0:
            for param in params:
                param.assign(param.mpi_all_reduce("mean"))

    step_count = float(self.adam_step)
    beta1, beta2 = self.betas
    for param, grad, second_moment, first_moment in zip(params, grads, self.values, self.m):
        first_moment.assign(beta1 * first_moment + (1 - beta1) * grad)
        second_moment.assign(beta2 * second_moment + (1 - beta2) * grad * grad)
        # Bias-corrected step size for the current step count.
        step_size = self.lr * jt.sqrt(1 - beta2**step_count) / (1 - beta1**step_count)
        param -= first_moment * step_size / (jt.sqrt(second_moment) + self.eps)
        param.detach_inplace()
    jt.sync(self.no_grad_parameters)
def pre_step(self, loss):
    """Compute the gradients of ``loss`` and store them in each param group.

    Work that should happen before an optimizer ``step``: gradient
    computation, MPI synchronisation, and so on.

    For every group in ``self.param_groups`` the ``"grads"`` entry is reset
    to a list with one slot per parameter.  Parameters for which
    ``is_stop_grad()`` is true keep ``None`` in their slot; every other slot
    receives the gradient of ``loss`` w.r.t. that parameter.  When ``jt.mpi``
    is set, gradients are replaced by their all-reduce mean, and every
    ``self.param_sync_iter`` steps the parameters are averaged too.
    ``self.n_step`` is incremented once per call.

    Example::

        class MyOptimizer(Optimizer):
            def step(self, loss):
                self.pre_step(loss)
                ...
    """
    # Clean previous grads and collect the parameters of all groups.
    params = []
    params_has_grad = []
    for pg in self.param_groups:
        pg["grads"] = [None] * len(pg['params'])
        for p in pg['params']:
            params.append(p)
            if not p.is_stop_grad():
                params_has_grad.append(p)

    # Sync params first to reduce the computing graph size.
    jt.sync(params)

    # Gradients come back in the same order as ``params_has_grad``.
    grads = jt.grad(loss, params_has_grad)

    # Sync grads and model if running under MPI.
    if jt.mpi:
        for g in grads:
            g.assign(g.mpi_all_reduce("mean"))
        if self.n_step % self.param_sync_iter == 0:
            for p in params:
                p.assign(p.mpi_all_reduce("mean"))
    self.n_step += 1

    # Hand the gradients out to the groups; ``pid`` walks ``grads`` in the
    # same order in which ``params_has_grad`` was built above.
    pid = 0
    for pg in self.param_groups:
        pg_grads = pg["grads"]
        for i, p in enumerate(pg['params']):
            if not p.is_stop_grad():
                pg_grads[i] = grads[pid]
                pid += 1
def test_stop_fuse(self):
    """Profile a 2000-term accumulation where only the accumulator calls ``stop_fuse``.

    Every entry of the profile report after the first must contain fewer
    than 109 occurrences of ``"opkey"`` in its first field.
    """
    with jt.profile_scope() as report:
        a = jt.float32(0).stop_fuse()
        c = jt.float32(0)
        bs = [c]
        for i in range(2000):
            b = jt.float32(i) * 2 * c
            bs.append(b)
            a += b
        a = a * 2
        dbs = jt.grad(a, bs)
        jt.sync(dbs + [a])
    # report[0] is skipped; each remaining entry is checked via how many
    # pieces its first field splits into on "opkey".
    for a in report[1:]:
        # The observed count was originally 50;
        # after the queue update it increased to 102.
        assert len(a[0].split("opkey")) < 110, len(a[0].split("opkey"))
def test_reduce_opt(self):
    """Check results and profiled time of a channel-wise L2 normalisation.

    The forward value and the gradient computed inside the profile scope
    are compared with a recomputation under ``use_cuda=0``, and the value
    in ``rep[1][3]`` must stay below ``15e6``.
    """
    a = jt.random((16, 512, 38, 38))
    b = jt.random((16, 512, 38, 38))
    # Materialise the inputs before the profile scope is entered.
    jt.sync([a, b])
    with jt.profile_scope(rerun=10, warmup=10) as rep:
        norm = a.sqr().sum(1, keepdims=True).sqrt()
        c = a / norm
        da = jt.grad(c * b, a)
        jt.sync([c, da])
    # Snapshot the profiled results before the names are rebound below.
    # NOTE(review): the "gpu_" prefix assumes the enclosing test enables
    # CUDA; this function does not set use_cuda=1 itself -- confirm.
    gpu_c = c.numpy()
    gpu_da = da.numpy()
    with jt.flag_scope(use_cuda=0):
        norm = a.sqr().sum(1, keepdims=True).sqrt()
        c = a / norm
        da = jt.grad(c * b, a)
        # The reference values are read via .data inside this scope.
        assert np.allclose(gpu_c, c.data, 1e-3)
        assert (np.abs(gpu_da - da.data).max() < 1e-6)
    assert float(rep[1][3]) < 15e6, float(rep[1][3])  # 15ms(about 8ms)
def check(data_shape, weights_shape, stride=1, dilation=1):
    """Compare ``jt.nn.conv_transpose2d`` with ``torch.nn.ConvTranspose2d``.

    Both sides use the same random image, weights and output mask.  The
    masked outputs and the gradients w.r.t. input and weight must agree,
    and the captured Jittor log must contain exactly three entries
    matching ``"conv"``.
    """
    batch, channels, height, width = data_shape
    in_ch, out_ch, kernel_h, kernel_w = weights_shape
    image = np.random.rand(batch, channels, height, width).astype("float32")
    kernel = np.random.rand(in_ch, out_ch, kernel_h, kernel_w).astype("float32")

    jt_layer = jt.nn.ConvTranspose(in_ch, out_ch, kernel_h, stride=stride,
                                   dilation=dilation, bias=False)
    torch_layer = torch.nn.ConvTranspose2d(in_ch, out_ch, kernel_h, stride=stride,
                                           dilation=dilation, bias=False)
    jt_layer.weight.data = kernel
    torch_layer.weight.data = torch.Tensor(kernel)

    # Jittor side: the functional form is called with the layer's weight
    # (instead of calling the layer itself).
    jt_input = jt.array(image)
    jt_out = jt.nn.conv_transpose2d(jt_input, jt_layer.weight, stride=stride,
                                    dilation=dilation, bias=False)
    mask = jt.random(jt_out.shape)
    jt_out = jt_out * mask

    # Torch side, masked with the very same values.
    torch_input = torch.Tensor(image)
    torch_input.requires_grad = True
    torch_out = torch_layer(torch_input) * torch.Tensor(mask.data)

    with jt.log_capture_scope(
            log_silent=1,
            log_vprefix="var_re=0,conv=0,op.cc=100") as captured:
        assert np.allclose(jt_out.data, torch_out.data)
        grad_input, grad_weight = jt.grad(jt_out, [jt_input, jt_layer.weight])
        jt.sync([grad_input, grad_weight])
        torch_out.sum().backward()
        assert np.allclose(grad_weight.data, torch_layer.weight.grad.numpy(), 1e-3)
        assert np.allclose(grad_input.data, torch_input.grad.numpy())
    assert len(find_log_with_re(captured, "conv")) == 3
def test_memcopy_overlap(self):
    """Time Resnet34 inference with and without per-iteration fetch callbacks.

    Ten forward passes on a pre-built input are timed first; then ten
    passes that rebuild the input from numpy and register a ``fetch``
    callback are timed.  The second timing may exceed the first by less
    than 10 ms, and every fetched result must match the plain output.
    """
    import time
    from jittor.models import resnet

    batch = np.random.rand(100, 3, 224, 224).astype(np.float32)
    model = resnet.Resnet34()
    model.eval()

    # warm up
    inp = jt.array(batch).stop_grad()
    for _ in range(10):
        plain_out = model(inp)
        plain_out.sync()
    jt.sync(device_sync=True)

    # pure compute
    started = time.time()
    inp = jt.array(batch).stop_grad()
    for _ in range(10):
        plain_out = model(inp)
        plain_out.sync()
    jt.sync(device_sync=True)
    compute_time = time.time() - started

    # warm up
    for _ in range(3):
        inp = jt.array(batch)
        fetched_out = model(inp)
        fetched_out.fetch(lambda b: None)
        fetched_out.sync()
    jt.sync(device_sync=True)

    # overlap
    started = time.time()
    results = []
    for _ in range(10):
        inp = jt.array(batch)
        fetched_out = model(inp)
        fetched_out.fetch(lambda b: results.append(b))
        fetched_out.sync()
    jt.sync(device_sync=True)
    overlap_time = time.time() - started

    assert overlap_time - compute_time < 0.010, (
        overlap_time, compute_time, overlap_time - compute_time)
    assert np.allclose(plain_out.data, fetched_out.data)
    assert len(results) == 10
    for fetched in results:
        assert np.allclose(plain_out.data, fetched), (fetched.shape, plain_out.data.shape)
    jt.LOG.v(f"pure compute: {compute_time}, overlap: {overlap_time}")
def train():
    """Run the training loop configured by the module-level ``args`` and ``cfg``.

    Steps, in order:

    * enable CUDA if requested and make sure the save folder exists;
    * build the training dataset (and the validation dataset when
      ``args.validation_epoch > 0``);
    * build the network, optionally resume from a checkpoint, and create
      the SGD optimizer and the ``MultiBoxLoss`` criterion;
    * iterate over epochs/batches, applying delayed config changes,
      learning-rate warm-up and step decay, then one optimizer step;
    * periodically print progress, log, save checkpoints and validate.

    A ``KeyboardInterrupt`` saves an ``_interrupt`` checkpoint (when
    ``args.interrupt`` is set) and exits; otherwise the final weights are
    saved when the loop ends.
    """
    if args.cuda:
        jt.flags.use_cuda = 1

    if not os.path.exists(args.save_folder):
        os.mkdir(args.save_folder)

    # An SSDAugmentation(MEANS) transform variant is disabled here;
    # BaseTransform is used instead.
    dataset = COCODetection(image_path=cfg.dataset.train_images,
                            info_file=cfg.dataset.train_info,
                            transform=BaseTransform(MEANS))

    if args.validation_epoch > 0:
        setup_eval()
        val_dataset = EvalCOCODetection(image_path=cfg.dataset.valid_images,
                                        info_file=cfg.dataset.valid_info,
                                        transform=BaseTransform(MEANS))

    # Parallel wraps the underlying module, but when saving and loading we don't want that
    yolact_net = Yolact()
    net = yolact_net
    net.train()

    if args.log:
        log = Log(cfg.name, args.log_folder, dict(args._get_kwargs()),
                  overwrite=(args.resume is None), log_gpu_stats=args.log_gpu)

    # I don't use the timer during training (I use a different timing method).
    # Apparently there's a race condition with multiple GPUs, so disable it just to be safe.
    timer.disable_all()

    # Both of these can set args.resume to None, so do them before the check
    if args.resume == 'interrupt':
        args.resume = SavePath.get_interrupt(args.save_folder)
    elif args.resume == 'latest':
        args.resume = SavePath.get_latest(args.save_folder, cfg.name)

    if args.resume is not None:
        print('Resuming training, loading {}..'.format(args.resume))
        yolact_net.load_weights(args.resume)

        if args.start_iter == -1:
            args.start_iter = SavePath.from_str(args.resume).iteration
    else:
        print('Initializing weights..')
        yolact_net.init_weights(backbone_path=args.save_folder + cfg.backbone.path)

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.decay)
    criterion = MultiBoxLoss(num_classes=cfg.num_classes,
                             pos_threshold=cfg.positive_iou_threshold,
                             neg_threshold=cfg.negative_iou_threshold,
                             negpos_ratio=cfg.ohem_negpos_ratio)

    if args.batch_alloc is not None:
        args.batch_alloc = [int(x) for x in args.batch_alloc.split(',')]
        if sum(args.batch_alloc) != args.batch_size:
            print('Error: Batch allocation (%s) does not sum to batch size (%s).'
                  % (args.batch_alloc, args.batch_size))
            exit(-1)

    net = NetLoss(net, criterion)

    # Initialize everything
    if not cfg.freeze_bn:
        yolact_net.freeze_bn()  # Freeze bn so we don't kill our means
    yolact_net(jt.zeros((1, 3, cfg.max_size, cfg.max_size)))
    if not cfg.freeze_bn:
        yolact_net.freeze_bn(True)

    iteration = max(args.start_iter, 0)
    last_time = time.time()

    epoch_size = len(dataset) // args.batch_size
    num_epochs = math.ceil(cfg.max_iter / epoch_size)

    # Which learning rate adjustment step are we on? lr' = lr * gamma ^ step_index
    step_index = 0

    # The dataset object itself is iterated as the data loader.
    dataset.set_attrs(batch_size=args.batch_size,
                      num_workers=args.num_workers,
                      shuffle=False)
    dataset.collate_batch = detection_collate
    data_loader = dataset

    save_path = lambda epoch, iteration: SavePath(
        cfg.name, epoch, iteration).get_path(root=args.save_folder)
    time_avg = MovingAverage()

    global loss_types  # Forms the print order
    loss_avgs = {k: MovingAverage(100) for k in loss_types}

    print('Begin training!')
    print()
    # try-except so you can use ctrl+c to save early and stop training
    try:
        # Counts processed batches; the loops below stop once it exceeds 100.
        i = 0
        for epoch in range(num_epochs):
            # Resume from start_iter
            if (epoch + 1) * epoch_size < iteration:
                continue

            for datum in data_loader:
                # Stop if we've reached an epoch if we're resuming from start_iter
                if iteration == (epoch + 1) * epoch_size:
                    break

                # Stop at the configured number of iterations even if mid-epoch
                if iteration == cfg.max_iter:
                    break

                # Change a config setting if we've reached the specified iteration
                changed = False
                for change in cfg.delayed_settings:
                    if iteration >= change[0]:
                        changed = True
                        cfg.replace(change[1])

                        # Reset the loss averages because things might have changed.
                        # Iterate the MovingAverage objects: iterating the dict
                        # itself yields its string keys, which have no reset().
                        for avg in loss_avgs.values():
                            avg.reset()

                # If a config setting was changed, remove it from the list so we don't keep checking
                if changed:
                    cfg.delayed_settings = [
                        x for x in cfg.delayed_settings if x[0] > iteration
                    ]

                # Warm up by linearly interpolating the learning rate from some smaller value
                if cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until:
                    set_lr(optimizer,
                           (args.lr - cfg.lr_warmup_init) *
                           (iteration / cfg.lr_warmup_until) + cfg.lr_warmup_init)

                # Adjust the learning rate at the given iterations, but also if we resume from past that iteration
                while step_index < len(cfg.lr_steps) and iteration >= cfg.lr_steps[step_index]:
                    step_index += 1
                    set_lr(optimizer, args.lr * (args.gamma**step_index))

                # Forward Pass + Compute loss at the same time (see NetLoss)
                splits = prepare_data(datum)
                losses = net(*splits)

                losses = {k: (v).mean() for k, v in losses.items()}  # Mean here because Dataparallel
                loss = sum([losses[k] for k in losses])

                # Backprop: the optimizer computes the gradients from the loss itself.
                loss.sync()
                optimizer.step(loss)
                jt.sync(optimizer.param_groups[0]['params'])

                # Add the loss to the moving average for bookkeeping
                for k in losses:
                    loss_avgs[k].add(losses[k].item())

                cur_time = time.time()
                elapsed = cur_time - last_time
                last_time = cur_time

                # Exclude graph setup from the timing information
                if iteration != args.start_iter:
                    time_avg.add(elapsed)

                if iteration % 10 == 0:
                    eta_str = str(datetime.timedelta(
                        seconds=(cfg.max_iter - iteration) * time_avg.get_avg())).split('.')[0]

                    total = sum([loss_avgs[k].get_avg() for k in losses])
                    loss_labels = sum([[k, loss_avgs[k].get_avg()]
                                       for k in loss_types if k in losses], [])

                    print(('[%3d] %7d ||' + (' %s: %.3f |' * len(losses)) +
                           ' T: %.3f || ETA: %s || timer: %.3f')
                          % tuple([epoch, iteration] + loss_labels + [total, eta_str, elapsed]),
                          flush=True)

                if args.log:
                    precision = 5
                    # Zero placeholders are logged instead of the actual loss values.
                    loss_info = {k: round(float(0), precision) for k in losses}
                    loss_info['T'] = round(float(0), precision)

                    if args.log_gpu:
                        log.log_gpu_stats = (iteration % 10 == 0)  # nvidia-smi is sloooow

                    # NOTE(review): cur_lr is not assigned in this function; it is
                    # presumably a module-level value maintained by set_lr -- confirm.
                    log.log('train', loss=loss_info, epoch=epoch, iter=iteration,
                            lr=round(cur_lr, 10), elapsed=elapsed)

                    log.log_gpu_stats = args.log_gpu

                iteration += 1

                if iteration % args.save_interval == 0 and iteration != args.start_iter:
                    # Look up the previous checkpoint before writing the new one.
                    latest = None
                    if args.keep_latest:
                        latest = SavePath.get_latest(args.save_folder, cfg.name)

                    print('Saving state, iter:', iteration)
                    yolact_net.save_weights(save_path(epoch, iteration))

                    if args.keep_latest and latest is not None:
                        if (args.keep_latest_interval <= 0
                                or iteration % args.keep_latest_interval != args.save_interval):
                            print('Deleting old save..')
                            os.remove(latest)

                i += 1
                if i > 100:
                    break

            if i > 100:
                break

            # This is done per epoch
            if args.validation_epoch > 0:
                if epoch % args.validation_epoch == 0 and epoch > 0:
                    compute_validation_map(epoch, iteration, yolact_net, val_dataset,
                                           log if args.log else None)

        # The validation mAP pass after training is disabled here.
    except KeyboardInterrupt:
        if args.interrupt:
            print('Stopping early. Saving network..')

            # Delete previous copy of the interrupted network so we don't spam the weights folder
            SavePath.remove_interrupt(args.save_folder)

            yolact_net.save_weights(
                save_path(epoch, repr(iteration) + '_interrupt'))
        exit()

    yolact_net.save_weights(save_path(epoch, iteration))
def filter_results(self, boxlist, num_classes):
    """Threshold detections on score and apply per-class NMS.

    Boxes and scores are reshaped to one row per proposal.  For every
    class except index 0 (the background class) the proposals scoring
    above ``self.score_thresh`` are kept, run through ``boxlist_nms`` with
    ``self.nms`` and labelled with the class index.  The per-class results
    are concatenated, and if more than ``self.detections_per_img``
    detections remain (and that limit is positive) only those scoring at
    least the k-th value threshold are kept.
    """
    # Unwrap the boxlist to avoid additional overhead; with a multi-class
    # NMS this could be performed directly on the boxlist.
    boxes = boxlist.bbox.reshape(-1, num_classes * 4)
    scores = boxlist.get_field("scores").reshape(-1, num_classes)

    above_thresh = scores > self.score_thresh
    # Compute the candidate indices of all foreground classes up front
    # and sync them in one call.
    inds_nonzeros = [above_thresh[:, j].nonzero() for j in range(1, num_classes)]
    jt.sync(inds_nonzeros)

    per_class = []
    # Class 0 is the background class, so labels start at 1.
    for cls_id, inds in enumerate(inds_nonzeros, start=1):
        if inds.shape[0] == 0:
            continue
        inds = inds.squeeze(1)
        cls_scores_raw = scores[inds, cls_id]
        cls_boxes = boxes[inds, cls_id * 4: (cls_id + 1) * 4]
        cls_boxlist = BoxList(cls_boxes, boxlist.size, mode="xyxy")
        cls_boxlist.add_field("scores", cls_scores_raw)
        cls_boxlist = boxlist_nms(cls_boxlist, self.nms)
        kept = len(cls_boxlist)
        cls_boxlist.add_field("labels", jt.full((kept,), cls_id).int32())
        per_class.append(cls_boxlist)

    result = cat_boxlist(per_class)
    # Make sure both fields exist even when nothing was detected.
    if not result.has_field('labels'):
        result.add_field('labels', jt.empty((0,)))
    if not result.has_field('scores'):
        result.add_field('scores', jt.empty((0,)))

    number_of_detections = len(result)
    # Limit to max_per_image detections **over all classes**
    if number_of_detections > self.detections_per_img > 0:
        cls_scores = result.get_field("scores")
        image_thresh, _ = jt.kthvalue(
            cls_scores, number_of_detections - self.detections_per_img + 1
        )
        keep = jt.nonzero(cls_scores >= image_thresh).squeeze(1)
        result = result[keep]
    return result
def test_multioutput(self):
    """``jt.index([2, 2])`` must yield the row and column index grids."""
    row_idx, col_idx = jt.index([2, 2])
    jt.sync([row_idx, col_idx])
    expected_rows = [[0, 0], [1, 1]]
    expected_cols = [[0, 1], [0, 1]]
    assert (row_idx.data == expected_rows).all()
    assert (col_idx.data == expected_cols).all(), col_idx.data