def worker(rank, world_size, args):
    """Per-process training entry point for DeepLabV3Plus segmentation.

    Initializes the (optional) distributed process group, builds the data
    loader, model and SGD optimizer, then runs the epoch loop with a
    polynomial learning-rate decay, checkpointing once per epoch on rank 0.

    NOTE(review): uses the legacy MegEngine API (jit.trace with symbolic
    mode, tensor.set_value, optimizer.backward) — confirm the installed
    MegEngine version still supports it.
    """
    cfg = import_config_from_file(args.config)
    if world_size > 1:
        # One process per GPU; rank doubles as the device index here.
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )
        logger.info("Init process group done")
    logger.info("Prepare dataset")
    train_loader, epoch_size = build_dataloader(cfg.BATCH_SIZE, args.dataset_dir, cfg)
    # Number of iterations per epoch across all workers.
    batch_iter = epoch_size // (cfg.BATCH_SIZE * world_size)
    net = DeepLabV3Plus(class_num=cfg.NUM_CLASSES, pretrained=args.weight_file)
    # Linear LR scaling with the number of workers.
    base_lr = cfg.LEARNING_RATE * world_size
    optimizer = optim.SGD(
        net.parameters(requires_grad=True),
        lr=base_lr,
        momentum=0.9,
        weight_decay=0.00004,
    )

    @jit.trace(symbolic=True, opt_level=2)
    def train_func(data, label, net=None, optimizer=None):
        # Traced forward + backward; gradients are accumulated into the
        # optimizer via optimizer.backward (legacy API).
        net.train()
        pred = net(data)
        loss = softmax_cross_entropy(pred, label, ignore_index=cfg.IGNORE_INDEX)
        optimizer.backward(loss)
        return pred, loss

    begin_epoch = 0
    end_epoch = cfg.EPOCHS
    if args.resume is not None:
        # Resume from a checkpoint saved by this same script (see mge.save below).
        pretrained = mge.load(args.resume)
        begin_epoch = pretrained["epoch"] + 1
        net.load_state_dict(pretrained["state_dict"])
        logger.info("load success: epoch %d", begin_epoch)
    itr = begin_epoch * batch_iter
    max_itr = end_epoch * batch_iter
    # Pre-allocated input tensors, refilled in-place each iteration.
    image = mge.tensor(
        np.zeros([cfg.BATCH_SIZE, 3, cfg.IMG_HEIGHT, cfg.IMG_WIDTH]).astype(np.float32),
        dtype="float32",
    )
    label = mge.tensor(
        np.zeros([cfg.BATCH_SIZE, cfg.IMG_HEIGHT, cfg.IMG_WIDTH]).astype(np.int32),
        dtype="int32",
    )
    # Experiment name = directory containing this script, used in log lines.
    exp_name = os.path.abspath(os.path.dirname(__file__)).split("/")[-1]
    for epoch in range(begin_epoch, end_epoch):
        for i_batch, sample_batched in enumerate(train_loader):

            def adjust_lr(optimizer, itr, max_itr):
                # Polynomial ("poly") LR schedule with power 0.9.
                now_lr = base_lr * (1 - itr / (max_itr + 1)) ** 0.9
                for param_group in optimizer.param_groups:
                    param_group["lr"] = now_lr
                return now_lr

            now_lr = adjust_lr(optimizer, itr, max_itr)
            inputs_batched, labels_batched = sample_batched
            # Labels arrive as (N, 1, H, W); squeeze to (N, H, W) int32.
            labels_batched = np.squeeze(labels_batched, axis=1).astype(np.int32)
            image.set_value(inputs_batched)
            label.set_value(labels_batched)
            optimizer.zero_grad()
            _, loss = train_func(image, label, net=net, optimizer=optimizer)
            optimizer.step()
            running_loss = loss.numpy()[0]
            if rank == 0:
                logger.info(
                    "%s epoch:%d/%d\tbatch:%d/%d\titr:%d\tlr:%g\tloss:%g",
                    exp_name,
                    epoch,
                    end_epoch,
                    i_batch,
                    batch_iter,
                    itr + 1,
                    now_lr,
                    running_loss,
                )
            itr += 1
        if rank == 0:
            # Checkpoint once per epoch; only rank 0 writes to disk.
            save_path = os.path.join(cfg.MODEL_SAVE_DIR, "epoch%d.pkl" % (epoch))
            mge.save({"epoch": epoch, "state_dict": net.state_dict()}, save_path)
            logger.info("save epoch%d", epoch)
def test_fill_():
    """In-place fill_ must overwrite every element with the given value."""
    target = tensor(np.zeros((2, 3, 4)), dtype=np.float32)
    fill_(target, 5.0)
    expected = np.full(shape=(2, 3, 4), fill_value=5.0, dtype=np.float32)
    np.testing.assert_array_equal(target.numpy(), expected)
import numpy as np
import megengine as mge  # we conventionally abbreviate MegEngine as mge
import megengine.functional as F  # we conventionally abbreviate functional as F
from megengine import tensor
from megengine import Tensor

# 1. Build a Python list, then convert it into a MegEngine Tensor
py_list = range(5)
print(mge.tensor(py_list))

# 2. Build a NumPy ndarray, then convert it into a MegEngine Tensor
np_ndarray = np.arange(5).astype("float32")
print(mge.tensor(np_ndarray))

# 3. Use the functional module to create a MegEngine Tensor directly
mge_tensor = F.arange(5)
print(mge_tensor)
print(mge_tensor.dtype)
print(type(mge_tensor))
new_tensor = mge_tensor.astype("float16")
print(new_tensor)
print(type(mge_tensor.numpy()))
print(tensor([1, 2, 3]))  # in practice we usually prefer float32 tensors,
print(Tensor([1., 2., 3.]))  # so we habitually add a dot to mark the values as floats
matrix_tensor = mge.tensor([[1., 2., 3.],
                            [4., 5., 6.]])
def worker(world_size, args):  # pylint: disable=too-many-statements
    """Per-process entry point for (optionally quantization-aware) ImageNet
    finetuning.

    Builds the model, optionally converts it to QAT, loads a checkpoint,
    then runs a fixed number of steps with periodic logging, checkpointing
    and validation on rank 0.
    """
    rank = dist.get_rank()
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))

    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_finetune_config(args.arch)

    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training
    total_batch_size = cfg.BATCH_SIZE * world_size
    # 1280000 ~= number of ImageNet training images.
    steps_per_epoch = 1280000 // total_batch_size
    total_steps = steps_per_epoch * cfg.EPOCHS

    if args.mode != "normal":
        # Insert fake-quant observers for quantization-aware training.
        quantize_qat(model, qconfig=Q.ema_fakequant_qconfig)

    if args.checkpoint:
        logger.info("Load pretrained weights from %s", args.checkpoint)
        ckpt = mge.load(args.checkpoint)
        # Accept both raw state dicts and {"state_dict": ...} checkpoints.
        ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        model.load_state_dict(ckpt, strict=False)

    if args.mode == "quantized":
        raise ValueError("mode = quantized only used during inference")

    if world_size > 1:
        # Sync parameters
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("MEAN") if world_size > 1 else None,
    )

    optimizer = optim.SGD(
        get_parameters(model, cfg),
        lr=cfg.LEARNING_RATE,
        momentum=cfg.MOMENTUM,
    )

    # Define train and valid graph
    def train_func(image, label):
        with gm:
            model.train()
            logits = model(image)
            loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
            acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
            gm.backward(loss)
            optimizer.step().clear_grad()
        return loss, acc1, acc5

    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    # Infinite sampler: the training loop below pulls a fixed step count
    # with next() rather than iterating epochs.
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=cfg.BATCH_SIZE, drop_last=True)
    )
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose(
            [
                T.RandomResizedCrop(224),
                T.RandomHorizontalFlip(),
                cfg.COLOR_JITTOR,
                T.Normalize(mean=128),
                T.ToMode("CHW"),
            ]
        ),
        num_workers=args.workers,
    )
    train_queue = iter(train_queue)
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(
        valid_dataset, batch_size=100, drop_last=False
    )
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose(
            [T.Resize(256), T.CenterCrop(224), T.Normalize(mean=128), T.ToMode("CHW")]
        ),
        num_workers=args.workers,
    )

    def adjust_learning_rate(step, epoch):
        # Linear decay over total_steps, or multi-step decay by epoch.
        learning_rate = cfg.LEARNING_RATE
        if cfg.SCHEDULER == "Linear":
            learning_rate *= 1 - float(step) / total_steps
        elif cfg.SCHEDULER == "Multistep":
            learning_rate *= cfg.SCHEDULER_GAMMA ** bisect.bisect_right(
                cfg.SCHEDULER_STEPS, epoch
            )
        else:
            raise ValueError(cfg.SCHEDULER)
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate
        return learning_rate

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")
    t = time.time()
    for step in range(0, total_steps):
        # Linear learning rate decay
        epoch = step // steps_per_epoch
        learning_rate = adjust_learning_rate(step, epoch)

        image, label = next(train_queue)
        n = image.shape[0]
        image = mge.tensor(image, dtype="float32")
        label = mge.tensor(label, dtype="int32")

        loss, acc1, acc5 = train_func(image, label)

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()

        if step % args.report_freq == 0 and rank == 0:
            logger.info(
                "TRAIN e%d %06d %f %s %s %s %s",
                epoch,
                step,
                learning_rate,
                objs,
                top1,
                top5,
                total_time,
            )
            # Meters are reset after each report, so the numbers are
            # averages over the reporting window only.
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()

        if step != 0 and step % 10000 == 0 and rank == 0:
            # Periodic checkpoint into a per-step subdirectory (rank 0 only).
            logger.info("SAVING %06d", step)
            save_path = os.path.join(save_dir, str(step))
            if not os.path.exists(save_path):
                os.makedirs(save_path, exist_ok=True)
            mge.save(
                {"step": step, "state_dict": model.state_dict()},
                os.path.join(save_path, "checkpoint.pkl"),
            )
        if step % 10000 == 0 and step != 0:
            # Periodic validation; note this runs on every rank.
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)

    # Final checkpoint and evaluation after the last step.
    mge.save(
        {"step": step, "state_dict": model.state_dict()},
        os.path.join(save_dir, "checkpoint-final.pkl"),
    )
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)
def __init__(self):
    """Initialize with a fixed-shape random float32 data tensor."""
    super().__init__()
    random_block = np.random.random((1, 1, 4, 4))
    self.data = megengine.tensor(random_block, dtype=np.float32)
def worker(args):  # pylint: disable=too-many-statements
    """Per-process entry point for training the UNetD denoiser.

    The model predicts a residual (noise) map: the denoised output is
    ``image - model(image)`` and the loss is L1 against the clean target.
    Runs ``args.epochs * args.steps_per_epoch`` steps with a cosine LR
    schedule, validating and checkpointing at each epoch boundary.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))
    # init process group
    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = args.steps_per_epoch
    # build model
    model = UNetD(3)
    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)
    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )
    # Optimizer
    opt = optim.Adam(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # mixup
    def preprocess(image, label):
        if args.dnd:
            image, label = MixUp_AUG(image, label)
        return image, label

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            # Residual prediction: subtract the predicted noise.
            logits = image - logits
            loss = F.nn.l1_loss(logits, label)
            gm.backward(loss)
            opt.step().clear_grad()
        return loss

    def valid_step(image, label):
        pred = model(image)
        pred = image - pred
        mae_iter = F.nn.l1_loss(pred, label)
        psnr_it = batch_PSNR(pred, label)
        #print(psnr_it.item())
        if world_size > 1:
            # Average metrics across workers.
            mae_iter = F.distributed.all_reduce_sum(mae_iter) / world_size
            psnr_it = F.distributed.all_reduce_sum(psnr_it) / world_size
        return mae_iter, psnr_it

    # multi-step learning rate scheduler with warmup
    def adjust_learning_rate(step):
        #lr = 1e-6 + 0.5 * (args.lr - 1e-6)*(1 + np.cos(step/(args.epochs*steps_per_epoch) * np.pi))
        # Cosine annealing from args.lr down to 0 over the whole run.
        lr = args.lr * (np.cos(step / (steps_per_epoch * args.epochs) * np.pi) + 1) / 2
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    for step in range(0, int(args.epochs * steps_per_epoch)):
        #print(step)
        lr = adjust_learning_rate(step)
        t_step = time.time()

        image, label = next(train_queue)
        # Mixup is only enabled after the first epoch.
        if step > steps_per_epoch:
            image, label = preprocess(image, label)
        image = megengine.tensor(image)
        label = megengine.tensor(label)
        t_data = time.time() - t_step

        loss = train_step(image, label)
        t_train = time.time() - t_step
        speed = 1. / t_train
        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch {} Step {}, Speed={:.2g} mb/s, dp_cost={:.2g}, Loss={:5.2e}, lr={:.2e}".format(
                    step // int(steps_per_epoch),
                    step,
                    speed,
                    t_data/t_train,
                    loss.item(),
                    lr
                ))
        #print(steps_per_epoch)
        if (step + 1) % steps_per_epoch == 0:
            # End of epoch: evaluate and (on rank 0) checkpoint.
            model.eval()
            loss, psnr_v = valid(valid_step, valid_dataloader)
            model.train()
            logging.info(
                "Epoch {} Test mae {:.3f}, psnr {:.3f}".format(
                    (step + 1) // steps_per_epoch,
                    loss.item(),
                    psnr_v.item(),
                ))
            # NOTE(review): conditional expression used purely for the
            # side effect of saving on rank 0 — a plain `if rank == 0:`
            # statement would be clearer.
            megengine.save(
                {
                    "epoch": (step + 1) // steps_per_epoch,
                    "state_dict": model.state_dict(),
                },
                os.path.join(args.save, args.arch, "checkpoint.pkl"),
            ) if rank == 0 else None
def test_quantize_batchmatmul_activation():
    """End-to-end QAT test for BatchMatMulActivation.

    Checks that (1) the QAT net with fake-quant disabled matches the float
    net exactly, in both train and eval mode; (2) the fully quantized net
    matches the fake-quant QAT net; and (3) tracing + dumping the quantized
    net (with NCHW4 layout conversion enabled) reproduces the same output.
    """
    batch = 4
    in_features = 8
    out_features = 4

    class TestNet(Module):
        def __init__(self, bias):
            super().__init__()
            # QuantStub/DequantStub mark the quantized region boundaries.
            self.quant = QuantStub()
            self.dequant = DequantStub()
            self.batch_mm = BatchMatMulActivation(
                batch, in_features, out_features, bias=bias
            )

        def forward(self, inp):
            out = self.quant(inp)
            out = self.batch_mm(out)
            out = expand_dims(out, -1)
            out = self.dequant(out)
            return out

    inputs = tensor(
        np.random.randn(batch, in_features, out_features).astype(np.float32)
    )
    # Exercise both the biased and bias-free variants.
    for bias in (True, False):
        net = TestNet(bias)
        net.train()
        qat_net = quantize_qat(net, inplace=False)
        # With fake-quant disabled the QAT net must match the float net bitwise.
        disable_fake_quant(qat_net)
        normal_outputs = net(inputs)
        qat_outputs = qat_net(inputs)
        np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())

        net.eval()
        normal_outputs = net(inputs)
        qat_net.eval()
        qat_outputs = qat_net(inputs)
        np.testing.assert_allclose(normal_outputs.numpy(), qat_outputs.numpy())

        # Re-enable fake quant, convert to a truly quantized net and compare.
        enable_fake_quant(qat_net)
        qat_outputs = qat_net(inputs)
        qnet = quantize(qat_net, inplace=False)
        qnet.eval()
        quantize_outputs = qnet(inputs)
        np.testing.assert_allclose(
            qat_outputs.numpy(), quantize_outputs.numpy(), atol=1e-6
        )

        # Trace, dump with NCHW4 conversion, and run through graph inference.
        @jit.trace(capture_as_const=True)
        def f(x):
            qnet.eval()
            return qnet(x)

        f(inputs)
        file = io.BytesIO()
        f.dump(file, enable_nchw4=True)
        file.seek(0)
        infer_cg = cgtools.GraphInference(file)[0]
        dumped_outputs = list(infer_cg.run(inputs.numpy()).values())[0]
        np.testing.assert_allclose(
            quantize_outputs.numpy(), dumped_outputs, atol=1e-6
        )
def test_func(
    N,
    IC,
    IH,
    IW,
    OC,
    KH,
    KW,
    SH,
    SW,
    PH,
    PW,
    DH,
    DW,
    groups=1,
    has_bias=True,
    conv_mode: str = "cross_correlation",
    compute_mode: str = "default",
):
    """Compare a quantized ConvTranspose2d module against a float reference.

    Random float inputs/weights/bias are quantized to qint8/qint32, then
    dequantized back to float for a reference F.conv_transpose2d; the
    reference result is round-tripped through qint8 so both paths carry the
    same quantization error. The module's qint8 output must match within
    one output scale.

    NOTE(review): relies on an ``rng`` object from the enclosing scope —
    presumably a numpy Generator; confirm in the surrounding test setup.
    """
    # Random per-tensor scales in a narrow band around 0.05.
    inp_scale = np.float32(rng.uniform(low=0.04, high=0.06))
    weight_scale = np.float32(rng.uniform(low=0.04, high=0.06))
    bias_scale = inp_scale * weight_scale
    out_scale = np.float32(rng.uniform(low=0.04, high=0.06))

    inp_dtype = dtype.qint8(inp_scale)
    weight_dtype = dtype.qint8(weight_scale)
    bias_dtype = dtype.qint32(bias_scale)
    out_dtype = dtype.qint8(out_scale)

    inp_fp32 = rng.uniform(low=-1, high=1, size=(N, IC, IH, IW)).astype(np.float32)
    # conv_transpose weight layout is (IC, OC, KH, KW).
    weight_fp32 = rng.uniform(low=-1, high=1, size=(IC, OC, KH, KW)).astype(np.float32)
    bias_fp32 = rng.uniform(low=-1, high=1, size=(1, OC, 1, 1)).astype(np.float32)

    inp_int8 = dtype.convert_to_qint8(inp_fp32, inp_dtype)
    weight_int8 = dtype.convert_to_qint8(weight_fp32, weight_dtype)
    bias_int32 = dtype.convert_to_qint32(bias_fp32, bias_dtype)

    inp_int8 = mge.tensor(inp_int8, dtype=inp_dtype)
    weight_int8 = mge.Parameter(weight_int8, dtype=weight_dtype)
    bias_int32 = mge.Parameter(bias_int32, dtype=bias_dtype)

    # Dequantize so the float reference sees the same (quantized) values.
    inp_fp32 = inp_int8.astype("float32")
    weight_fp32 = weight_int8.astype("float32")
    bias_fp32 = bias_int32.astype("float32")

    expected = F.conv_transpose2d(
        inp_fp32,
        weight_fp32,
        bias_fp32 if has_bias else None,
        stride=(SH, SW),
        padding=(PH, PW),
        dilation=(DH, DW),
        groups=groups,
        conv_mode=conv_mode,
        compute_mode=compute_mode,
    )
    # Round-trip through qint8 so the reference carries output quantization error.
    expected = dtype.convert_to_qint8(expected.numpy(), out_dtype)
    expected = dtype.convert_from_qint8(expected)

    conv_transpose2d = ConvTranspose2d(
        in_channels=IC,
        out_channels=OC,
        kernel_size=(KH, KW),
        stride=(SH, SW),
        padding=(PH, PW),
        dilation=(DH, DW),
        groups=groups,
        bias=has_bias,
        conv_mode=conv_mode,
        compute_mode=compute_mode,
        dtype=out_dtype,
    )
    conv_transpose2d.weight = mge.Parameter(weight_int8)
    if has_bias:
        conv_transpose2d.bias = mge.Parameter(bias_int32)
    result = conv_transpose2d.forward(inp_int8).numpy()
    result = dtype.convert_from_qint8(result)
    np.testing.assert_allclose(result, expected, atol=out_scale)
def roi_pool(rpn_fms, rois, stride, pool_shape, roi_type='roi_align',
             labels=None, bbox_targets=None):
    """Pool ROI features from the matching FPN level of each proposal.

    Each ROI is assigned to a pyramid level from its box size (FPN paper
    heuristic: canonical level 4 at canonical box size 224), pooled from
    that level's feature map, and the results are re-assembled. One dummy
    ROI per level is appended so no level's batch is empty; the dummies are
    filtered back out via ``available_masks`` before returning.

    Args:
        rpn_fms: list of feature maps, one per FPN level.
        rois: (R, 5) proposals as (batch_idx, x1, y1, x2, y2).
        stride: per-level strides, same length as ``rpn_fms``.
        pool_shape: output spatial size of the pooled features.
        roi_type: 'roi_align' (default) or 'roi_pool'.
        labels, bbox_targets: optional per-ROI training targets, reordered
            alongside the ROIs and returned detached.

    Returns:
        (pooled_features, rois, labels_or_None, bbox_targets_or_None)

    Raises:
        ValueError: if ``roi_type`` is not a recognized mode.
    """
    assert len(stride) == len(rpn_fms)
    canonical_level = 4
    canonical_box_size = 224
    min_level = math.log2(stride[0])
    max_level = math.log2(stride[-1])

    num_fms = len(rpn_fms)
    box_sizes = F.sqrt((rois[:, 3] - rois[:, 1]) * (rois[:, 4] - rois[:, 2]))
    level_assignments = F.floor(
        canonical_level + F.log(box_sizes / canonical_box_size) / np.log(2)
    )
    # Clamp to the visible pyramid range, then shift to 0-based indices.
    level_assignments = F.minimum(level_assignments, max_level)
    level_assignments = F.maximum(level_assignments, min_level)
    level_assignments = level_assignments - min_level

    # Append one dummy ROI per level (marked unavailable) so every level
    # pools at least one box; dummies are removed after reassembly.
    available_masks = F.concat(
        [F.ones(level_assignments.shape[0]), F.zeros(num_fms)], axis=0)
    level_assignments = F.concat(
        [level_assignments, mge.tensor(np.arange(num_fms, dtype=np.int32))], axis=0)
    rois = F.concat([rois, F.zeros((num_fms, rois.shape[-1]))], axis=0)
    if labels is not None and bbox_targets is not None:
        labels = F.concat([labels, F.ones((num_fms, labels.shape[-1]))], axis=0)
        bbox_targets = F.concat(
            [bbox_targets, F.zeros((num_fms, bbox_targets.shape[-1]))], axis=0)

    pool_list, inds_list = [], []
    for i in range(len(rpn_fms)):
        # Select the ROIs assigned to this pyramid level.
        mask = F.equal(level_assignments, i)
        _, inds = F.cond_take(mask > 0, mask)
        rois_fm = rois[inds.astype(np.int32)]
        if roi_type == 'roi_pool':
            pool_fm = F.nn.roi_pooling(
                rpn_fms[i], rois_fm, pool_shape, mode='max',
                scale=1.0 / stride[i])
        elif roi_type == 'roi_align':
            pool_fm = F.nn.roi_align(
                rpn_fms[i], rois_fm, pool_shape, mode='average',
                spatial_scale=1.0 / stride[i], sample_points=2, aligned=True)
        else:
            # Previously an unknown roi_type fell through and crashed with
            # an obscure NameError on pool_fm; fail fast with a clear error.
            raise ValueError("unknown roi_type: {!r}".format(roi_type))
        pool_list.append(pool_fm)
        inds_list.append(inds)

    # fm_order maps pooled rows back to original ROI order.
    fm_order = F.concat(inds_list, axis=0)
    pool_feature = F.concat(pool_list, axis=0)

    # Drop the dummy per-level ROIs.
    ordered_available_masks = available_masks[fm_order]
    _, available_inds = F.cond_take(
        ordered_available_masks > 0, ordered_available_masks)
    available_inds = available_inds.astype(np.int32)
    pool_feature = pool_feature[available_inds.astype(np.int32)]
    rois = rois[fm_order, :][available_inds.astype(np.int32)]
    if labels is not None:
        labels = labels[fm_order][available_inds]
        bbox_targets = bbox_targets[fm_order][available_inds]
        return pool_feature, rois, labels.detach(), bbox_targets.detach()
    else:
        return pool_feature, rois, None, None
def test_tensor_routine():
    """Smoke-test tensor construction from ndarray, list, and scalar inputs."""
    for value in (np.zeros((1, 2), dtype=np.int32), [1], 1.5):
        mge.tensor(value)
return mean, var, decode def get_net(): c = 8 net = AE([8 * c, 16 * c, 24 * c]) return net if __name__ == '__main__': net = get_net() net.eval() #x = mge.tensor(np.random.randn(2, 1, 28, 28).astype(np.float32)) x = np.random.randn(2, 1, 28, 28).astype(np.float32) gaussian = np.random.normal(size=(2, net.layers[-1], 3, 3)) target = np.array([1, 3]) onehot = np.broadcast_to( np.eye(10)[target][:, :, np.newaxis, np.newaxis], (2, 10, 3, 3)) print(onehot, onehot.shape) t_x = mge.tensor() t_x.set_value(x) t_gaussian = mge.tensor() t_gaussian.set_value(gaussian) t_onehot = mge.tensor() t_onehot.set_value(onehot) _, _, out = net(t_x, t_gaussian, t_onehot) print(out.shape) print(out) #net = ResNet(Bottleneck, [3,3,3,3]
def test_wrong_dtype():
    """Unsupported 64-bit dtypes must be rejected at construction time."""
    f64_array = np.zeros((5, 5), dtype=np.float64)
    with pytest.raises(TypeError):
        mge.tensor(f64_array)

    i64_array = np.zeros((5, 5), dtype=np.int64)
    with pytest.raises(TypeError):
        mge.Parameter(i64_array)
def __init__(self, cfg, batch_size):
    """Build a RetinaNet-style detector: ResNet backbone + FPN + head.

    Also pre-allocates random dummy input tensors (``self.inputs``), which
    are presumably used for tracing/dumping the network — confirm against
    the caller.
    """
    super().__init__()
    self.cfg = cfg
    self.batch_size = batch_size
    self.anchor_gen = layers.DefaultAnchorGenerator(
        base_size=4,
        anchor_scales=self.cfg.anchor_scales,
        anchor_ratios=self.cfg.anchor_ratios,
    )
    self.box_coder = layers.BoxCoder(cfg.reg_mean, cfg.reg_std)
    self.stride_list = np.array(cfg.stride).astype(np.float32)
    # FPN output levels consumed by the head.
    self.in_features = ["p3", "p4", "p5", "p6", "p7"]

    # ----------------------- build the backbone ------------------------ #
    bottom_up = getattr(resnet, cfg.backbone)(
        norm=layers.get_norm(cfg.resnet_norm), pretrained=cfg.backbone_pretrained)

    # ------------ freeze the weights of resnet stage1 and stage 2 ------ #
    if self.cfg.backbone_freeze_at >= 1:
        for p in bottom_up.conv1.parameters():
            p.requires_grad = False
    if self.cfg.backbone_freeze_at >= 2:
        for p in bottom_up.layer1.parameters():
            p.requires_grad = False

    # ----------------------- build the FPN ----------------------------- #
    # P6/P7 are generated from the 2048-channel res5 output.
    in_channels_p6p7 = 2048
    out_channels = 256
    self.backbone = layers.FPN(
        bottom_up=bottom_up,
        in_features=["res3", "res4", "res5"],
        out_channels=out_channels,
        norm=cfg.fpn_norm,
        top_block=layers.LastLevelP6P7(in_channels_p6p7, out_channels),
    )
    backbone_shape = self.backbone.output_shape()
    feature_shapes = [backbone_shape[f] for f in self.in_features]

    # ----------------------- build the RetinaNet Head ------------------ #
    self.head = layers.RetinaNetHead(cfg, feature_shapes)

    # Dummy batch of 2 images with im_info and ground-truth boxes.
    self.inputs = {
        "image": mge.tensor(
            np.random.random([2, 3, 224, 224]).astype(np.float32), dtype="float32",
        ),
        "im_info": mge.tensor(
            np.random.random([2, 5]).astype(np.float32), dtype="float32",
        ),
        "gt_boxes": mge.tensor(
            np.random.random([2, 100, 5]).astype(np.float32), dtype="float32",
        ),
    }
    self.loss_normalizer = mge.tensor(100.0)
mnist_train_dataloader = DataLoader(
    dataset=mnist_train_dataset,
    sampler=random_sampler,
    transform=Compose([
        RandomResizedCrop(output_size=28),
        # mean and std below are the mean / standard deviation of the MNIST
        # data; pixel values are in the range 0~255
        #Normalize(mean=0.1307*255, std=0.3081*255),
        #Pad(2),
        # 'CHW' converts images from (height, width, channel) format to
        # (channel, height, width) format
        #ToMode('CHW'),
    ])
)
mnist_test_dataloader = DataLoader(
    dataset=mnist_test_dataset,
    sampler=sequential_sampler,
)
# Pre-allocated tensors, refilled in-place per batch (legacy set_value API).
data = mge.tensor()
code = mge.tensor()
# Run the autoencoder over the test set and save input/reconstruction grids.
for step, batch_sample in enumerate(mnist_test_dataloader):
    imgs, _ = batch_sample
    # (N, H, W, C) -> (N, C, H, W)
    imgs = imgs.transpose((0,3,1,2))
    data.set_value(imgs)
    reconstruct = (model(data).numpy() * 255).astype('uint8')
    # Interleave originals and reconstructions into one image stack.
    all_img = np.concatenate([imgs, reconstruct], axis=1).reshape(-1, 1, 28, 28)
    show = torchvision.utils.make_grid(torch.tensor(all_img), 32)
    cv2.imwrite(str(step) + '.png', show.numpy().transpose(1,2,0))
    print(type(show))
    print(show.shape)
def test_error_shape_linear_interpolate():
    """LINEAR interpolation on a 4-D input must raise ValueError."""
    four_dim = np.arange(1, 5, dtype=np.float32).reshape(1, 1, 2, 2)
    inp = tensor(four_dim)
    with pytest.raises(ValueError):
        F.interpolate(inp, scale_factor=2.0, mode="LINEAR")
def fpn_roi_target(rpn_rois, im_info, gt_boxes,
                   fg_threshold=config.fg_threshold, top_k=1):
    """Assign RCNN training targets (labels and bbox regression deltas) to
    FPN proposals, sampling a fixed number of foreground/background ROIs
    per image.

    Args:
        rpn_rois: (R, 5) proposals as (batch_idx, x1, y1, x2, y2).
        im_info: per-image metadata; im_info[bid, 5] holds the number of
            ground-truth boxes for image ``bid``.
        gt_boxes: (B, G, 5) padded ground-truth boxes with class label.
        fg_threshold: IoU threshold for a proposal to count as foreground.
        top_k: number of best-overlapping GTs kept per proposal.

    Returns:
        (rois, labels, bbox_targets), all detached from the graph.
    """
    return_rois, return_labels = [], []
    return_bbox_targets = []
    # get per image proposals and gt_boxes
    batch_per_gpu = im_info.shape[0]
    sampling = True
    # is_sample = True if top_k < 2 else False
    for bid in range(batch_per_gpu):
        # Valid GT count for this image is stored in im_info column 5.
        gt_boxes_perimg = gt_boxes[bid, :im_info[bid, 5].astype(np.int32), :]
        # Dummy GT guards against images with zero ground-truth boxes.
        dummy_gt = F.ones([1, gt_boxes_perimg.shape[1]])
        batch_inds = F.ones((gt_boxes_perimg.shape[0], 1)) * bid
        #if config.proposal_append_gt:
        # Append the GT boxes themselves as extra proposals.
        gt_rois = F.concat([batch_inds, gt_boxes_perimg[:, :4]], axis=1)
        batch_rois_mask = F.equal(rpn_rois[:, 0], bid) > 0
        _, batch_rois_index = F.cond_take(batch_rois_mask, batch_rois_mask)
        # batch_roi_mask = rpn_rois[:, 0] == bid
        # batch_roi_inds = mask_to_inds(batch_roi_mask)
        all_rois = F.concat([rpn_rois[batch_rois_index], gt_rois], axis=0) if sampling \
            else rpn_rois[batch_rois_index]
        # all_rois = F.concat([rpn_rois.ai[batch_roi_inds], gt_rois], axis=0)
        gt_boxes_perimg = F.concat([gt_boxes_perimg, dummy_gt], axis=0)
        # Separate IoU against normal and ignore-labelled GT boxes.
        overlaps_normal, overlaps_ignore = box_overlap_ignore_opr(
            all_rois[:, 1:5], gt_boxes_perimg)
        # overlaps_normal, overlaps_normal_indices = F.argsort(overlaps_normal, descending=True)
        # overlaps_ignore, overlaps_ignore_indices = F.argsort(overlaps_ignore, descending=True)
        overlaps_normal_indices = F.argsort(overlaps_normal, descending=True)
        overlaps_normal = F.gather(overlaps_normal, 1, overlaps_normal_indices)
        # overlaps_normal = F.nn.indexing_one_hot(overlaps_normal, overlaps_normal_indices, 1)
        overlaps_ignore_indices = F.argsort(overlaps_ignore, descending=True)
        overlaps_ignore = F.gather(overlaps_ignore, 1, overlaps_ignore_indices)
        # overlaps_ignore = F.nn.indexing_one_hot(overlaps_ignore, overlaps_ignore_indices, 1)
        # gt max and indices, ignore max and indices
        max_overlaps_normal = overlaps_normal[:, :top_k].flatten()
        gt_assignment_normal = overlaps_normal_indices[:, :top_k].flatten()
        max_overlaps_ignore = overlaps_ignore[:, :top_k].flatten()
        gt_assignment_ignore = overlaps_ignore_indices[:, :top_k].flatten()
        # cons masks
        # Prefer the ignore assignment when a proposal is below the fg
        # threshold and overlaps an ignore region more than a normal GT.
        ignore_assign_mask = (max_overlaps_normal < fg_threshold).astype(
            np.float32) * (max_overlaps_ignore > max_overlaps_normal).astype(
                np.float32)
        max_overlaps = max_overlaps_normal * (1 - ignore_assign_mask).astype(np.float32) + \
            max_overlaps_ignore * ignore_assign_mask
        gt_assignment = gt_assignment_normal * (1 - ignore_assign_mask) + \
            gt_assignment_ignore * ignore_assign_mask
        gt_assignment = gt_assignment.astype(np.int32)
        labels = gt_boxes_perimg[gt_assignment, 4]
        fg_mask = (max_overlaps >= fg_threshold).astype(
            np.float32) * (1 - F.equal(labels, config.ignore_label))
        bg_mask = (max_overlaps < config.bg_threshold_high).astype(
            np.float32) * (max_overlaps >= config.bg_threshold_low).astype(
                np.float32)
        fg_mask = fg_mask.reshape(-1, top_k)
        bg_mask = bg_mask.reshape(-1, top_k)
        # Subsample to at most num_rois ROIs with the configured fg ratio.
        pos_max = config.num_rois * config.fg_ratio
        fg_inds_mask = _bernoulli_sample_masks(
            fg_mask[:, 0], pos_max, 1) if sampling else F.equal(fg_mask[:, 0], 0)
        neg_max = config.num_rois - fg_inds_mask.sum()
        bg_inds_mask = _bernoulli_sample_masks(
            bg_mask[:, 0], neg_max, 1) if sampling else F.equal(bg_mask[:, 0], 0)
        labels = labels * fg_mask.reshape(-1)
        keep_mask = fg_inds_mask + bg_inds_mask
        # If nothing was kept, keep everything to avoid an empty gather.
        keep_mask = keep_mask + F.equal(keep_mask.sum(), 0)
        # keep_inds = mask_to_inds(keep_mask)
        _, keep_inds = F.cond_take(keep_mask > 0, keep_mask)
        #keep_inds = keep_inds[:F.minimum(config.num_rois, keep_inds.shapeof()[0])]
        # labels
        labels = labels.reshape(-1, top_k)[keep_inds]
        gt_assignment = gt_assignment.reshape(
            -1, top_k)[keep_inds].reshape(-1).astype(np.int32)
        target_boxes = gt_boxes_perimg[gt_assignment, :4]
        # rois = all_rois.ai[keep_inds]
        rois = all_rois[keep_inds]
        # target_shape = (rois.shapeof()[0], top_k, rois.shapeof()[-1])
        # Repeat each ROI top_k times so it lines up with its k targets.
        n, c = rois.shape[0], rois.shape[1]
        target_rois = F.broadcast_to(F.expand_dims(rois, 1), (n, top_k, c)).reshape(-1, c)
        # target_rois = F.add_axis(rois, 1).broadcast(target_shape).reshape(-1, rois.shapeof()[-1])
        bbox_targets = bbox_transform_opr(target_rois[:, 1:5], target_boxes[:, :4])
        if config.rcnn_bbox_normalize_targets:
            # Normalize regression targets with the configured mean/std.
            std_opr = mge.tensor(config.bbox_normalize_stds[None, :]).to(
                rois.device)
            mean_opr = mge.tensor(config.bbox_normalize_means[None, :]).to(
                rois.device)
            minus_opr = mean_opr / std_opr
            bbox_targets = bbox_targets / std_opr - minus_opr
        bbox_targets = bbox_targets.reshape(-1, top_k * 4)
        return_rois.append(rois)
        return_labels.append(labels)
        return_bbox_targets.append(bbox_targets)
    if config.batch_per_gpu == 1:
        # Single-image batch: the per-image locals are the full result.
        rois, labels, bbox_targets = rois.detach(), labels.detach(
        ), bbox_targets.detach()
        return rois, labels, bbox_targets
        # return F.zero_grad(rois), F.zero_grad(labels), F.zero_grad(bbox_targets)
    else:
        return_rois = F.concat(return_rois, axis=0)
        return_labels = F.concat(return_labels, axis=0)
        return_bbox_targets = F.concat(return_bbox_targets, axis=0)
        return_rois = return_rois.detach()
        return_labels = return_labels.detach()
        return_bbox_targets = return_bbox_targets.detach()
        return return_rois, return_labels, return_bbox_targets
def test_inappropriate_scale_linear_interpolate():
    """Two scale factors with LINEAR mode on a 3-D input must raise ValueError."""
    one_dim = np.arange(1, 3, dtype=np.float32).reshape(1, 1, 2)
    inp = tensor(one_dim)
    with pytest.raises(ValueError):
        F.interpolate(inp, scale_factor=[2.0, 3.0], mode="LINEAR")
def run(
    N,
    IC,
    OC,
    IH,
    IW,
    KH,
    KW,
    PH,
    PW,
    SH,
    SW,
    has_bias=True,
    nonlinear_mode="IDENTITY",
):
    """Compare quantized conv_bias_activation against a float conv2d
    reference (legacy MegEngine graph API).

    Random float input/weight/bias are quantized, dequantized back to float
    for a traced reference conv2d (+optional ReLU), and the fused quantized
    kernel must match after rounding the reference through out_dtype.
    On CUDA the quantized path runs in NCHW4 layout.
    """
    inp_v = np.random.normal(size=(N, IC, IH, IW))
    # Fix: the weight was previously sampled with size=(OC, IC, KW, KW),
    # silently ignoring KH for non-square kernels (compare the sibling
    # new-API `run` in this file, which uses (OC, IC, KH, KW)).
    w_v = np.random.normal(size=(OC, IC, KH, KW))
    b_v = np.random.normal(size=(1, OC, 1, 1))
    inp_scale = mgb.dtype.get_scale(inp_dtype)
    w_scale = mgb.dtype.get_scale(w_dtype)
    b_scale = mgb.dtype.get_scale(b_dtype)

    inpv = mgb.dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype)
    wv = mgb.dtype.convert_to_qint8(w_v * w_scale, w_dtype)
    bv = mgb.dtype.convert_to_qint32(b_v * b_scale, b_dtype)

    inp_int8 = tensor(inpv, dtype=inp_dtype)
    w_int8 = Parameter(wv, dtype=w_dtype)
    b_int32 = Parameter(bv, dtype=b_dtype)

    # Dequantized copies so the float reference sees the quantized values.
    inp_fp32 = inp_int8.astype("float32")
    w_fp32 = w_int8.astype("float32")
    b_fp32 = b_int32.astype("float32")

    jit.trace.enabled = True
    b_symbolic = True

    def convert_to_nchw4(var):
        # (N, C, H, W) -> (N, C//4, H, W, 4)
        return var.reshape(
            var.shapeof(0), var.shapeof(1) // 4, 4, var.shapeof(2), var.shapeof(3)
        ).dimshuffle(0, 1, 3, 4, 2)

    @jit.trace(symbolic=b_symbolic)
    def run_conv2d(inp, w, b):
        O = F.conv2d(
            inp, w, b if has_bias else None, stride=(SH, SW), padding=(PH, PW),
        )
        if nonlinear_mode == "RELU":
            return F.relu(O)
        else:
            return O

    @jit.trace(symbolic=b_symbolic)
    def run_conv_bias(inp, w, b, format="NCHW"):
        b = b if has_bias else np.zeros_like(b)
        if format == "NCHW4":
            inp = convert_to_nchw4(inp)
            w = convert_to_nchw4(w)
            b = F.flatten(b)
        return F.conv_bias_activation(
            inp,
            w,
            b,
            stride=(SH, SW),
            padding=(PH, PW),
            dtype=out_dtype,
            nonlinear_mode=nonlinear_mode,
        )

    # NCHW4 is the layout required by the quantized CUDA kernels.
    format = "NCHW4" if is_cuda_available() else "NCHW"

    expected = run_conv2d(inp_fp32, w_fp32, b_fp32)
    # Round-trip through out_dtype so the reference carries quantization error.
    expected = expected.astype(out_dtype).astype("float32")
    result = run_conv_bias(inp_int8, w_int8, b_int32, format=format).astype("float32")
    if format == "NCHW4":
        result = result.dimshuffle(0, 1, 4, 2, 3)
    expected = F.flatten(expected)
    result = F.flatten(result)
    assertTensorClose(result.numpy(), expected.numpy())
def f():
    """Broadcast the closed-over tensor `a` to shape (1, 10)."""
    shape_spec = tensor([1, 10], dtype=np.int32)
    return a._broadcast(shape_spec)
def _mean(inp):
    """Mean of `inp` computed via MegEngine, returned as a numpy value."""
    as_tensor = mge.tensor(inp)
    return F.mean(as_tensor).numpy()
def get_ground_truth(self, anchors_list, batched_gt_boxes, batched_num_gts):
    """Compute FCOS-style training targets for each anchor point.

    For every image in the batch, each anchor point is matched to the
    smallest-area ground-truth box that contains it (subject to the
    per-level size-of-interest range and optional center sampling), and
    per-point classification labels, regression offsets and centerness
    targets are produced.

    Returns:
        (labels, offsets, ctrness) stacked over the batch, all detached.
    """
    labels_list = []
    offsets_list = []
    ctrness_list = []

    all_level_anchors = F.concat(anchors_list, axis=0)
    for bid in range(batched_gt_boxes.shape[0]):
        gt_boxes = batched_gt_boxes[bid, :batched_num_gts[bid]]

        # (num_gt, num_anchors, 4) distances from each point to each box side.
        offsets = self.point_coder.encode(
            all_level_anchors, F.expand_dims(gt_boxes[:, :4], axis=1))

        # Per-level (min, max) regression range, expanded per anchor.
        object_sizes_of_interest = F.concat([
            F.broadcast_to(
                F.expand_dims(mge.tensor(size, dtype="float32"), axis=0),
                (anchors_i.shape[0], 2))
            for anchors_i, size in zip(
                anchors_list, self.cfg.object_sizes_of_interest)
        ], axis=0)
        max_offsets = F.max(offsets, axis=2)
        # A GT is "cared about" at a level only if its largest side-offset
        # falls inside that level's size-of-interest range.
        is_cared_in_the_level = (
            (max_offsets >= F.expand_dims(object_sizes_of_interest[:, 0], axis=0))
            & (max_offsets <= F.expand_dims(object_sizes_of_interest[:, 1], axis=0)))

        if self.cfg.center_sampling_radius > 0:
            # Center sampling: a point counts as inside a GT only if it is
            # within `radius * stride` of the GT center (clipped to the box).
            gt_centers = (gt_boxes[:, :2] + gt_boxes[:, 2:4]) / 2
            is_in_boxes = []
            for stride, anchors_i in zip(self.cfg.stride, anchors_list):
                radius = stride * self.cfg.center_sampling_radius
                center_boxes = F.concat([
                    F.maximum(gt_centers - radius, gt_boxes[:, :2]),
                    F.minimum(gt_centers + radius, gt_boxes[:, 2:4]),
                ], axis=1)
                center_offsets = self.point_coder.encode(
                    anchors_i, F.expand_dims(center_boxes, axis=1))
                is_in_boxes.append(F.min(center_offsets, axis=2) > 0)
            is_in_boxes = F.concat(is_in_boxes, axis=1)
        else:
            # All four side-offsets positive <=> the point lies inside the box.
            is_in_boxes = F.min(offsets, axis=2) > 0

        gt_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])
        # FIXME: use repeat instead of broadcast_to
        areas = F.broadcast_to(F.expand_dims(gt_area, axis=1), offsets.shape[:2])
        # Invalid matches get infinite area so argmin never selects them.
        areas[~is_cared_in_the_level] = float("inf")
        areas[~is_in_boxes] = float("inf")
        # Ties/overlaps resolve to the smallest-area GT.
        match_indices = F.argmin(areas, axis=0)
        gt_boxes_matched = gt_boxes[match_indices]
        anchor_min_area = F.indexing_one_hot(areas, match_indices, axis=0)

        labels = gt_boxes_matched[:, 4].astype("int32")
        # Points matched to no valid GT become background (label 0).
        labels[anchor_min_area == float("inf")] = 0
        offsets = self.point_coder.encode(all_level_anchors, gt_boxes_matched[:, :4])

        # Centerness = sqrt((min_lr/max_lr) * (min_tb/max_tb)), clamped >= 0.
        left_right = offsets[:, [0, 2]]
        top_bottom = offsets[:, [1, 3]]
        ctrness = F.sqrt(
            F.maximum(
                F.min(left_right, axis=1) / F.max(left_right, axis=1), 0)
            * F.maximum(
                F.min(top_bottom, axis=1) / F.max(top_bottom, axis=1), 0))

        labels_list.append(labels)
        offsets_list.append(offsets)
        ctrness_list.append(ctrness)

    return (
        F.stack(labels_list, axis=0).detach(),
        F.stack(offsets_list, axis=0).detach(),
        F.stack(ctrness_list, axis=0).detach(),
    )
def _std(inp):
    """Standard deviation of `inp` computed via MegEngine, as a numpy value."""
    as_tensor = mge.tensor(inp)
    return F.std(as_tensor).numpy()
def forward(self, a):
    """Apply the elementwise operation selected by ``self.mode`` to ``a``.

    Most branches combine ``a`` with constant tensors built from
    ``self.data`` / ``self.data1`` / ``self.data2`` so that the resulting
    graph exercises the corresponding (possibly fused) elemwise kernel.

    Raises:
        NotImplementedError: for an unrecognized mode string.
    """
    # add
    if self.mode == "add":
        x = a + mge.tensor(np.float32(10))
        y = a + mge.tensor(self.data1)
        z = x + y
    # sub
    elif self.mode == "sub":
        x = a - mge.tensor(np.float32(10))
        y = a - mge.tensor(self.data1)
        z = x - y
    # mul
    elif self.mode == "mul":
        x = a * mge.tensor(np.float32(10))
        y = mge.tensor(self.data1) * a
        z = x * y
    # max (NOTE: this branch was mislabeled "# div" in the original comments)
    elif self.mode == "max":
        x = a + mge.tensor(self.data)
        y = a + mge.tensor(self.data2)
        z = F.maximum(x, y)
    # min
    elif self.mode == "min":
        x = a + mge.tensor(self.data)
        y = a + mge.tensor(self.data2)
        z = F.minimum(x, y)
    # pow
    elif self.mode == "pow":
        z = a**2
    # ceil
    elif self.mode == "ceil":
        z = F.ceil(a)
    # floor
    elif self.mode == "floor":
        z = F.floor(a)
    # div
    elif self.mode == "div":
        y = mge.tensor(self.data1) / a
        x = a / mge.tensor(np.float32(2))
        z = y / x
    # cycle_div
    elif self.mode == "cycle_div":
        z = a / mge.tensor(self.data1)
    # abs
    elif self.mode == "abs":
        z = F.abs(a)
    # exp
    elif self.mode == "exp":
        z = F.exp(a)
    # log
    elif self.mode == "log":
        z = F.log(a)
    # fused add + relu
    elif self.mode == "fuse_add_relu":
        y = a + mge.tensor(self.data2)
        z = F.relu(y)
    # fused mul + add (3-operand)
    elif self.mode == "fuse_mul_add3":
        y = a * mge.tensor(self.data1)
        z = y + mge.tensor(self.data2)
    # fused add + sigmoid
    elif self.mode == "fuse_add_sigmoid":
        y = a + mge.tensor(self.data2)
        z = F.sigmoid(y)
    else:
        raise NotImplementedError('no such elemwise mode "%s"' % self.mode)
    return z
def run(
    N, IC, OC, IH, IW, KH, KW, PH, PW, SH, SW,
    has_bias=True,
    nonlinear_mode="IDENTITY",
):
    """Compare quantized conv_bias_activation against a float conv2d reference.

    Builds random fp32 conv inputs/weights/bias, quantizes them with the
    enclosing scope's dtypes, then asserts that the quantized fused op matches
    the float reference (quantized-then-dequantized) within one output scale.

    NOTE(review): `inp_dtype`, `w_dtype`, `b_dtype`, `out_dtype` and
    `outp_scale` are closure variables from the enclosing scope — confirm
    where they are defined before moving this function.
    """
    inp_v = np.random.normal(size=(N, IC, IH, IW))
    w_v = np.random.normal(size=(OC, IC, KH, KW))
    b_v = np.random.normal(size=(1, OC, 1, 1))
    inp_scale = dtype.get_scale(inp_dtype)
    w_scale = dtype.get_scale(w_dtype)
    b_scale = dtype.get_scale(b_dtype)
    # Quantize: values are pre-multiplied by the scale, then converted.
    inpv = dtype.convert_to_qint8(inp_v * inp_scale, inp_dtype)
    wv = dtype.convert_to_qint8(w_v * w_scale, w_dtype)
    bv = dtype.convert_to_qint32(b_v * b_scale, b_dtype)
    inp_int8 = tensor(inpv, dtype=inp_dtype)
    w_int8 = Parameter(wv, dtype=w_dtype)
    b_int32 = Parameter(bv, dtype=b_dtype)
    # Dequantized copies feed the float reference path.
    inp_fp32 = inp_int8.astype("float32")
    w_fp32 = w_int8.astype("float32")
    b_fp32 = b_int32.astype("float32")

    def convert_to_nchw4(var):
        # NCHW -> NCHW4: split channels into groups of 4, move the 4 last.
        var = F.reshape(var, (var.shape[0], var.shape[1] // 4, 4, var.shape[2], var.shape[3]))
        var = F.transpose(var, (0, 1, 3, 4, 2))
        return var

    def run_conv2d(inp, w, b):
        # Float reference convolution (optionally fused with ReLU).
        O = F.conv2d(
            inp, w, b if has_bias else None,
            stride=(SH, SW),
            padding=(PH, PW),
        )
        if nonlinear_mode == "RELU":
            return F.relu(O)
        else:
            return O

    def run_conv_bias(inp, w, b, format="NCHW"):
        # The fused quantized op always takes a bias, so substitute zeros
        # when has_bias is False.
        b = b if has_bias else Parameter(np.zeros_like(b.numpy()))
        if format == "NCHW4":
            inp = convert_to_nchw4(inp)
            w = convert_to_nchw4(w)
            b = convert_to_nchw4(b)
        return F.quantized.conv_bias_activation(
            inp, w, b,
            stride=(SH, SW),
            padding=(PH, PW),
            dtype=out_dtype,
            nonlinear_mode=nonlinear_mode,
        )

    # CUDA backend requires the NCHW4 layout for the quantized kernel.
    format = "NCHW4" if is_cuda_available() else "NCHW"
    expected = run_conv2d(inp_fp32, w_fp32, b_fp32)
    # Round-trip through out_dtype so the reference carries the same
    # quantization error as the fused result.
    expected = expected.astype(out_dtype).astype("float32")
    result = run_conv_bias(inp_int8, w_int8, b_int32, format=format).astype("float32")
    if format == "NCHW4":
        # NCHW4 -> NCHW-like ordering before comparison.
        result = F.transpose(result, (0, 1, 4, 2, 3))
    expected = F.flatten(expected)
    result = F.flatten(result)
    # One output quantization step is the tightest tolerance we can demand.
    np.testing.assert_allclose(result.numpy(), expected.numpy(), atol=outp_scale)
def worker(rank, world_size, ngpus_per_node, args):
    """Per-process training entry point (one process per GPU).

    Initializes the distributed group, builds data/model/optimizer, then runs
    a step-based training loop with periodic logging, per-epoch validation,
    and checkpointing on rank 0.
    """
    # pylint: disable=too-many-statements
    if rank == 0:
        # Only rank 0 owns the save directory and the log file.
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(
            os.path.join(args.save, args.arch, "log.txt"))

    # init process group
    if world_size > 1:
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info("init process group rank %d / %d", dist.get_rank(),
                     dist.get_world_size())

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    # 1280000: ImageNet train-set size — TODO confirm against build_dataset.
    steps_per_epoch = 1280000 // (world_size * args.batch_size)

    # build model
    model = snet_model.__dict__[args.arch]()

    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer: weights (rank > 1, name contains "weight") get weight decay;
    # everything else (biases, norm params) is exempt.
    params_wd = []
    params_nwd = []
    for n, p in model.named_parameters():
        if n.find("weight") >= 0 and len(p.shape) > 1:
            print("include ", n, p.shape)
            params_wd.append(p)
        else:
            print("NOT include ", n, p.shape)
            params_nwd.append(p)
    opt = optim.SGD(
        [
            {"params": params_wd},
            {"params": params_nwd, "weight_decay": 0},
        ],
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # train and valid func
    def train_step(image, label):
        # One optimization step: forward, backward, update.
        with gm:
            logits = model(image)
            loss = F.nn.cross_entropy(logits, label, label_smooth=0.1)
            acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
            gm.backward(loss)
            opt.step().clear_grad()
        return loss, acc1, acc5

    def valid_step(image, label):
        # Forward only; metrics are averaged across workers.
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    # linear learning rate scheduler
    def adjust_learning_rate(step):
        # Linearly decay lr from args.lr to 0 over the whole run.
        lr = args.lr * (1 - step / (args.epochs * steps_per_epoch))
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")
    for step in range(0, args.epochs * steps_per_epoch):
        lr = adjust_learning_rate(step)

        t = time.time()
        image, label = next(train_queue)
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")
        loss, acc1, acc5 = train_step(image, label)
        objs.update(loss.item())
        top1.update(100 * acc1.item())
        top5.update(100 * acc5.item())
        clck.update(time.time() - t)

        # Meters are reset after each report, so they show per-window averages.
        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch %d Step %d, LR %.4f, %s %s %s %s",
                step // steps_per_epoch,
                step,
                lr,
                objs,
                top1,
                top5,
                clck,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            clck.reset()

        # Validate and checkpoint at every epoch boundary.
        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args)
            model.train()
            logging.info(
                "Epoch %d Test Acc@1 %.3f, Acc@5 %.3f",
                (step + 1) // steps_per_epoch,
                valid_acc1,
                valid_acc5,
            )
            megengine.save(
                {
                    "epoch": (step + 1) // steps_per_epoch,
                    "state_dict": model.state_dict(),
                },
                os.path.join(args.save, args.arch, "checkpoint.pkl"),
            )
def test_dropout():
    """Dropout in inference mode (training=False) must be the identity.

    The previous assertion (`out.numpy().sum() >= 0.0`) was vacuously true for
    an all-ones input and therefore tested nothing; pin the actual contract:
    with training=False no elements are dropped and no rescaling is applied.
    """
    data = tensor(np.ones(10, dtype=np.float32))
    out = F.dropout(data, 1.0 / 3.0, training=False)
    # Inference dropout returns the input unchanged.
    np.testing.assert_allclose(out.numpy(), data.numpy())
def worker():
    """Feed this rank's slice of x into a synced observer and check extrema."""
    rank = dist.get_rank()
    observer = SyncMinMaxObserver()
    local_slice = mge.tensor(x[rank * 3:(rank + 1) * 3])
    observer(local_slice)
    # After sync, every rank must see the global min/max.
    assert observer.min_val == np_min and observer.max_val == np_max
def test_xornet_trace_dump():
    """Train a small XOR net under trace, validate, predict, and dump the graph.

    Bug fixed: the validation branch drew a minibatch from `val_dataset` but
    then called `val_fun(data, label)` with the *training* tensors, so the
    freshly drawn validation batch was never used (and it also shadowed the
    `minibatch` loop variable). Validation now runs on the validation batch.
    """
    net = XORNet()
    opt = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
    gm = GradManager().attach(net.parameters())
    batch_size = 64
    train_dataset = minibatch_generator(batch_size)
    val_dataset = minibatch_generator(batch_size)

    @trace
    def train_fun(data, label):
        # Forward + backward inside the GradManager scope.
        with gm:
            net.train()
            pred = net(data)
            loss = F.nn.cross_entropy(pred, label)
            gm.backward(loss)
        return pred, loss

    @trace
    def val_fun(data, label):
        # Forward only, eval mode.
        net.eval()
        pred = net(data)
        loss = F.nn.cross_entropy(pred, label)
        return pred, loss

    @trace(symbolic=True, capture_as_const=True)
    def pred_fun(data):
        # Inference graph to be dumped: logits -> softmax probabilities.
        net.eval()
        pred = net(data)
        pred_normalized = F.softmax(pred)
        return pred_normalized

    train_loss = []
    val_loss = []
    for step, minibatch in enumerate(train_dataset):
        if step > 100:
            break
        data = tensor(minibatch["data"])
        label = tensor(minibatch["label"])
        opt.clear_grad()
        _, loss = train_fun(data, label)
        train_loss.append((step, loss.numpy()))
        if step % 50 == 0:
            # Evaluate on a *validation* batch, not the training tensors.
            val_batch = next(val_dataset)
            val_data = tensor(val_batch["data"])
            val_label = tensor(val_batch["label"])
            _, loss = val_fun(val_data, val_label)
            loss = loss.numpy()[0]
            val_loss.append((step, loss))
            print("Step: {} loss={}".format(step, loss))
        opt.step()

    test_data = np.array([
        (0.5, 0.5),
        (0.3, 0.7),
        (0.1, 0.9),
        (-0.5, -0.5),
        (-0.3, -0.7),
        (-0.9, -0.1),
        (0.5, -0.5),
        (0.3, -0.7),
        (0.9, -0.1),
        (-0.5, 0.5),
        (-0.3, 0.7),
        (-0.1, 0.9),
    ])
    data = tensor(test_data.astype(np.float32))
    out = pred_fun(data)
    pred_output = out.numpy()
    pred_label = np.argmax(pred_output, 1)
    with np.printoptions(precision=4, suppress=True):
        print("Predicated probability:")
        print(pred_output)

    # Dump the captured inference graph to a temp file.
    with mkstemp() as out:
        pred_fun.dump(out, arg_names=["data"], output_names=["label"])
def make_dev_tensor(value, dtype=None, device=None):
    """Wrap *value* as a tensor and return its underlying device tensor."""
    wrapped = tensor(value, dtype=dtype, device=device)
    return wrapped._dev_tensor()
import megengine
import numpy as np

# Basic-indexing smoke check: picking a single index on axis 1 of a
# (3, 3, 3, 3) tensor should drop that axis, printing (3, 3, 3).
a = megengine.tensor(np.zeros((3, 3, 3, 3), dtype=np.float32))
print(a[:, 0, ...].shape)