def worker(rank, world_size, ngpus_per_node, args):
    """Per-process validation worker.

    Optionally joins an NCCL process group (one process per GPU), builds the
    validation dataloader and model, restores weights when ``args.model`` is
    given, and logs Top-1 / Top-5 accuracy over the validation set.
    """
    if world_size > 1:
        # init process group
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info(
            "init process group rank %d / %d", dist.get_rank(), dist.get_world_size()
        )

    # build dataset
    _, valid_dataloader = build_dataset(args)

    # build model; download pretrained weights only when no checkpoint is given
    model = resnet_model.__dict__[args.arch](pretrained=args.model is None)
    if args.model is not None:
        logging.info("load from checkpoint %s", args.model)
        checkpoint = megengine.load(args.model)
        # fix: fall back to the raw checkpoint when it is not wrapped in a
        # {"state_dict": ...} dict; previously `state_dict` was left unbound
        # for unwrapped checkpoints and raised NameError
        state_dict = checkpoint
        if "state_dict" in checkpoint:
            state_dict = checkpoint["state_dict"]
        model.load_state_dict(state_dict)

    def valid_step(image, label):
        # one forward pass; returns loss and Top-1 / Top-5 accuracy
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values across workers
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    model.eval()
    _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args)
    logging.info(
        "Test Acc@1 %.3f, Acc@5 %.3f",
        valid_acc1,
        valid_acc5,
    )
def test_save_load():
    """Round-trip a module and optimizer through mge.save / mge.load.

    Trains one step, checkpoints net + optimizer state, reloads both onto
    cpu0, and verifies a further training step runs on the restored objects.
    """
    net = Simple()

    optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
    optim.clear_grad()
    gm = ad.GradManager().attach(net.parameters())

    data = tensor([2.34])
    with gm:
        loss = net(data)
        gm.backward(loss)
    optim.step()

    model_name = "simple.pkl"
    mge.save(
        {
            "name": "simple",
            "state_dict": net.state_dict(),
            "opt_state": optim.state_dict(),
        },
        model_name,
    )

    # Load param to cpu
    checkpoint = mge.load(model_name, map_location="cpu0")
    device_save = mge.get_default_device()
    mge.set_default_device("cpu0")
    net = Simple()
    net.load_state_dict(checkpoint["state_dict"])
    optim = optimizer.SGD(net.parameters(), lr=1.0, momentum=0.9)
    optim.load_state_dict(checkpoint["opt_state"])
    # fix: remove the file we actually wrote, via the same variable
    os.remove(model_name)

    # fix: the previous GradManager was attached to the discarded net's
    # parameters; re-attach so backward/step act on the restored tensors
    gm = ad.GradManager().attach(net.parameters())
    with gm:
        # fix: feed a tensor, consistent with the first training step
        loss = net(tensor([1.23]))
        gm.backward(loss)
    optim.step()

    # Restore device
    mge.set_default_device(device_save)
def worker(
    net_file, model_file, data_dir, worker_id, total_worker, result_queue,
):
    """
    :param net_file: network description file
    :param model_file: file of dump weights
    :param data_dir: the dataset directory
    :param worker_id: the index of the worker (also selects the visible GPU)
    :param total_worker: number of gpu for evaluation
    :param result_queue: processing queue
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = str(worker_id)

    @jit.trace(symbolic=True, opt_level=2)
    def val_func():
        # closes over `model`, which is assigned below before the first call
        pred = model(model.inputs)
        return pred

    # make the network description module importable from net_file's directory
    sys.path.insert(0, os.path.dirname(net_file))
    current_network = importlib.import_module(os.path.basename(net_file).split(".")[0])
    model = current_network.Net(current_network.Cfg(), batch_size=1)
    model.eval()
    evaluator = DetEvaluator(model)
    # NOTE(review): assumes the checkpoint wraps weights under "state_dict" — confirm
    model.load_state_dict(mge.load(model_file)["state_dict"])

    loader = build_dataloader(worker_id, total_worker, data_dir)
    for data_dict in loader:
        data, im_info = DetEvaluator.process_inputs(
            data_dict[0][0],
            model.cfg.test_image_short_size,
            model.cfg.test_image_max_size,
        )
        model.inputs["im_info"].set_value(im_info)
        model.inputs["image"].set_value(data.astype(np.float32))
        pred_res = evaluator.predict(val_func)
        result_queue.put_nowait(
            {
                "det_res": pred_res,
                # image id parsed from the file name; presumably names end in
                # "_<id>.<ext>" (COCO-style) — verify against the dataset
                "image_id": int(data_dict[1][2][0].split(".")[0].split("_")[-1]),
            }
        )
def inference(args):
    """Detect objects in one image and write the visualization to disk.

    Loads the checkpoint selected by ``args.resume_weights``, runs the traced
    network on ``args.img_path``, thresholds with ``args.thresh``, applies NMS
    and draws the surviving boxes onto the original image.
    """
    @jit.trace(symbolic=False)
    def val_func():
        # closes over `net`, which is assigned below before the first call
        pred_boxes = net(net.inputs)
        return pred_boxes

    # model path
    saveDir = config.model_dir
    evalDir = config.eval_dir
    misc_utils.ensure_dir(evalDir)
    model_file = os.path.join(saveDir,
                              'epoch_{}.pkl'.format(args.resume_weights))
    assert os.path.exists(model_file)
    # load model
    net = network.Network()
    net.eval()
    check_point = mge.load(model_file)
    net.load_state_dict(check_point['state_dict'])

    ori_image, image, im_info = get_data(args.img_path)
    net.inputs["image"].set_value(image.astype(np.float32))
    net.inputs["im_info"].set_value(im_info)
    pred_boxes = val_func().numpy()
    # one output row per (proposal, class); the tag column cycles 1..num_tag
    # so each row is labelled with its class id (background excluded)
    num_tag = config.num_classes - 1
    target_shape = (pred_boxes.shape[0] // num_tag, 1)
    pred_tags = (np.arange(num_tag) + 1).reshape(-1, 1)
    pred_tags = np.tile(pred_tags, target_shape).reshape(-1, 1)
    # nms: user-chosen score threshold, then IoU-0.5 NMS
    from set_nms_utils import cpu_nms
    keep = pred_boxes[:, -1] > args.thresh
    pred_boxes = pred_boxes[keep]
    pred_tags = pred_tags[keep]
    keep = cpu_nms(pred_boxes, 0.5)
    pred_boxes = pred_boxes[keep]
    pred_tags = pred_tags[keep]
    pred_tags = pred_tags.astype(np.int32).flatten()
    pred_tags_name = np.array(config.class_names)[pred_tags]
    visual_utils.draw_boxes(ori_image, pred_boxes[:, :-1], pred_boxes[:, -1],
                            pred_tags_name)
    name = args.img_path.split('/')[-1].split('.')[-2]
    # NOTE(review): output directory is hard-coded — consider making it configurable
    fpath = '/data/jupyter/{}.png'.format(name)
    cv2.imwrite(fpath, ori_image)
def initDetector(self):
    """Build the custom object detector.

    Imports the network description referenced by ``self.detector_model``,
    loads the weights from ``self.detector_weight`` and wraps the model in a
    ``DetEvaluator``.

    :return: tuple ``(detector, short_size, max_size)`` where the two sizes
        are the test-time image resize parameters from the model config.
    """
    current_network = import_from_file(self.detector_model)
    cfg = current_network.Cfg()
    # weights come from the checkpoint, so skip downloading a pretrained backbone
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(self.detector_weight)
    # checkpoint may either be a raw state dict or wrap it under "state_dict"
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    detector = DetEvaluator(model)
    short_size = model.cfg.test_image_short_size
    max_size = model.cfg.test_image_max_size
    return detector, short_size, max_size
def test_snpe_model_8f():
    """Check conversion of a quint8 TracedModule backbone.

    Loads a dumped TracedModule, evaluates it on random quint8 inputs to get
    reference outputs, then asks ``_test_convert_result`` to verify the
    converted model agrees within ``max_err``.
    """
    model = "8w16f_backbone.tm"
    net = mge.load(model)
    print(net.flatten().graph)
    # asymmetric uint8 input quantization: scale 16/128, zero point 128
    inp_dtype = dtype.quint8(16.0 / 128.0, 128)
    inps = get_qat_inputs_quint8(inp_dtype, num_inp=2, shape=(1, 16, 384, 512))
    # reference outputs keyed by the graph's output nodes
    tm_result = dict(zip(net.graph.outputs, net(*inps)))
    _test_convert_result(
        inps,
        net,
        tm_result,
        max_err,  # NOTE(review): tolerance defined elsewhere in this file — confirm
        input_data_type="quint8",
        input_scales=inps[0].qparams.scale,
        input_zero_points=inps[0].qparams.zero_point,
        require_quantize=False,
        param_fake_quant=True,
        split_conv_relu=True,
        input_name=["inp", "prev"],
    )
def worker(
    current_network, weight_file, dataset_dir, result_list,
    master_ip=None, port=None, world_size=None, rank=None
):
    """Evaluation worker: run the model over the whole test set.

    In distributed mode (``world_size > 1``) the process joins the process
    group and ``result_list`` is a queue (results are ``put_nowait``-ed);
    otherwise ``result_list`` is a plain list that is appended to.
    """
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    cfg = current_network.Cfg()
    # weights come from the checkpoint, so skip downloading a pretrained backbone
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    # checkpoint may either be a raw state dict or wrap it under "state_dict"
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    def pred_func(data):
        pred = model(data)
        return pred

    test_loader = build_dataloader(dataset_dir, model.cfg)
    if dist.get_world_size() == 1:
        # progress bar only in the single-process case
        test_loader = tqdm(test_loader)

    for data in test_loader:
        img = data[0].squeeze()
        label = data[1].squeeze()
        im_info = data[2]
        pred = evaluate(pred_func, img, model.cfg)
        result = {"pred": pred, "gt": label, "name": im_info[2]}
        if dist.get_world_size() > 1:
            result_list.put_nowait(result)
        else:
            result_list.append(result)
def main():
    """Run single-image detection with a trained model and save the visualization.

    Loads the network description from ``args.file``, restores weights from
    ``args.weight_file``, runs traced inference on ``args.image`` and writes
    the rendered detections to ``results.jpg``.
    """
    parser = make_parser()
    args = parser.parse_args()

    @jit.trace(symbolic=True)
    def val_func():
        # closes over `model`, which is assigned below before the first call
        pred = model(model.inputs)
        return pred

    # make the network description module importable from args.file's directory
    sys.path.insert(0, os.path.dirname(args.file))
    current_network = importlib.import_module(
        os.path.basename(args.file).split(".")[0])
    cfg = current_network.Cfg()
    # weights come from the checkpoint, so skip downloading a pretrained backbone
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg, batch_size=1)
    model.eval()

    state_dict = mge.load(args.weight_file)
    # checkpoint may either be a raw state dict or wrap it under "state_dict"
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)
    # fix: log completion only after the weights have actually been loaded
    # (previously this was logged before the model even existed)
    logger.info("Load Model : %s completed", args.weight_file)

    evaluator = DetEvaluator(model)

    ori_img = cv2.imread(args.image)
    data, im_info = DetEvaluator.process_inputs(
        ori_img.copy(),
        model.cfg.test_image_short_size,
        model.cfg.test_image_max_size,
    )
    model.inputs["im_info"].set_value(im_info)
    model.inputs["image"].set_value(data.astype(np.float32))

    pred_res = evaluator.predict(val_func)
    res_img = DetEvaluator.vis_det(
        ori_img,
        pred_res,
        is_show_label=True,
        classes=COCO.class_names,
    )
    cv2.imwrite("results.jpg", res_img)
def main():
    """Evaluate a DeepLabV3+ checkpoint on a segmentation test set.

    Parses CLI args (config file, dataset dir, model path), runs inference on
    every test image, optionally saves per-image results, and prints metrics.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", type=str, required=True,
                        help="configuration file")
    parser.add_argument(
        "-d",
        "--dataset_dir",
        type=str,
        default="/data/datasets/VOC2012",
    )
    parser.add_argument("-m", "--model_path", type=str, default=None,
                        help="eval model file")
    args = parser.parse_args()

    cfg = import_config_from_file(args.config)

    test_loader, test_size = build_dataloader(args.dataset_dir, cfg)
    print("number of test images: %d" % (test_size))
    net = DeepLabV3Plus(class_num=cfg.NUM_CLASSES)
    model_dict = mge.load(args.model_path)
    # NOTE(review): assumes the checkpoint wraps weights under "state_dict" — confirm
    net.load_state_dict(model_dict["state_dict"])
    print("load model %s" % (args.model_path))
    net.eval()

    result_list = []
    for sample_batched in tqdm(test_loader):
        img = sample_batched[0].squeeze()
        label = sample_batched[1].squeeze()
        im_info = sample_batched[2]
        pred = evaluate(net, img, cfg)
        result_list.append({"pred": pred, "gt": label, "name": im_info[2]})
    if cfg.VAL_SAVE:
        save_results(result_list, cfg.VAL_SAVE, cfg)
    compute_metric(result_list, cfg)
def test_optimizer_serialization():
    """Verify SGD-with-momentum state survives save/load of the state dict.

    Shadows the optimizer's momentum buffers ("slots") by hand; after loading
    the saved optimizer state into a new SGD instance (whose constructor args
    deliberately differ), one more step must match the hand-computed update —
    proving lr/momentum and the buffers were restored from the state dict.
    """
    data, data_shape, label, label_shape = get_input()
    mlp = MLP()
    opt = SGD(mlp.parameters(), lr=0.01, momentum=0.9)
    # hand-maintained momentum buffers, one per parameter
    slots = TensorDict()
    for param in mlp.parameters():
        slots[param] = np.zeros(param.shape).astype(np.float32)

    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt.zero_grad()
    opt.backward(loss)
    opt.step()
    # mirror the SGD-with-momentum update: slot = 0.9*slot - lr*grad
    for param in mlp.parameters():
        slot = slots[param]
        slot *= 0.9
        slot -= param.grad.numpy() * 0.01

    with BytesIO() as fout:
        save(opt.state_dict(), fout)
        fout.seek(0)
        state_dict = load(fout)
    # different lr/momentum on purpose; load_state_dict must restore 0.01/0.9
    opt1 = SGD(mlp.parameters(), lr=0.02, momentum=0.8)
    opt1.load_state_dict(state_dict)

    data.set_value(np.random.random(data_shape).astype(np.float32))
    label.set_value(np.random.randint(0, 10, label_shape))
    pred = mlp(data)
    loss = F.square_loss(pred, label.reshape(-1, 1))
    opt1.zero_grad()
    opt1.backward(loss)
    orig_params = TensorDict()
    for param in mlp.parameters():
        orig_params[param] = np.copy(param.numpy())
    opt1.step()
    # restored optimizer must reproduce the hand-computed momentum update
    for param in mlp.parameters():
        orig_param = orig_params[param]
        slot = slots[param]
        slot *= 0.9
        slot -= param.grad.numpy() * 0.01
        assertTensorClose(param.numpy(), orig_param + slot)
def main():
    """Dump a quantized ShuffleNet inference graph.

    Builds the architecture, inserts QAT fake-quant observers, optionally
    restores a QAT checkpoint, converts to a truly quantized model, traces it
    once on dummy data and dumps the optimized graph under the arch name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--arch", default="shufflenet_v1_x0_5_g3_int8", type=str)
    parser.add_argument("-m", "--model", default=None, type=str)
    args = parser.parse_args()

    model = getattr(M, args.arch)(pretrained=(args.model is None))
    # insert fake-quant observers so the checkpoint's qparams can be loaded
    quantize_qat(model, qconfig=ema_fakequant_qconfig)

    if args.model:
        state_dict = mge.load(args.model)
        # strict=False: QAT wrapping may add/remove keys vs. the checkpoint
        model.load_state_dict(state_dict, strict=False)

    # convert the fake-quantized (QAT) model into a real quantized model
    quantize(model)

    data = mge.tensor(np.zeros((10, 3, 224, 224), dtype="float32"))
    # trace once with dummy data, then dump the optimized inference graph
    infer_func(data, model=model)
    infer_func.dump(args.arch, arg_names=["data"], optimize_for_inference=True)
def test_state_dict():
    """Round-trip a module's state dict and exercise strict/non-strict loading.

    A state dict saved from one MLP must restore an identical forward pass on
    a fresh MLP; unknown extra keys are tolerated only with ``strict=False``,
    and both extra and missing keys raise ``KeyError`` in strict mode.
    """
    sample_shape = (2, 28)
    sample = tensor(np.random.random(sample_shape))

    source_net = MLP()
    expected = source_net(sample)

    # serialize the weights through an in-memory buffer and read them back
    with BytesIO() as buffer:
        mge.save(source_net.state_dict(), buffer)
        buffer.seek(0)
        state_dict = mge.load(buffer)

    # a stray key must be ignored when loading non-strictly
    state_dict["extra"] = None
    restored_net = MLP()
    restored_net.load_state_dict(state_dict, strict=False)
    actual = restored_net(sample)
    np.testing.assert_allclose(expected.numpy(), actual.numpy(), atol=5e-6)

    # strict mode rejects the unknown key...
    with pytest.raises(KeyError):
        restored_net.load_state_dict(state_dict)

    # ...and also rejects a state dict with a parameter missing
    del state_dict["extra"]
    del state_dict["dense0.bias"]
    with pytest.raises(KeyError):
        restored_net.load_state_dict(state_dict)
def update_model(model_path):
    """
    Update the dumped model with test cases for new reference values.

    The model with pre-trained weights is trained for one iter with the test
    data attached. The loss and updated net state dict is dumped.

    .. code-block:: python

        from test_dp_correctness import update_model
        update_model('mnist_model_with_test.mge') # for gpu
        update_model('mnist_model_with_test_cpu.mge') # for cpu

    """
    net = MnistNet(has_bn=True)
    checkpoint = mge.load(model_path)
    net.load_state_dict(checkpoint["net_init"])
    lr = checkpoint["sgd_lr"]
    opt = SGD(net.parameters(), lr=lr)

    # MEAN-allreduce gradients so the references match data-parallel training
    gm = ad.GradManager().attach(
        net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)])

    data = Tensor(checkpoint["data"], dtype=np.float32)
    label = Tensor(checkpoint["label"], dtype=np.int32)

    opt.clear_grad()
    loss = train(data, label, net=net, opt=opt)
    opt.step()

    xpu_name = get_xpu_name()

    # store the one-step reference values alongside the original test inputs
    checkpoint.update({
        "net_updated": net.state_dict(),
        "loss": loss.numpy(),
        "xpu": xpu_name
    })
    # fix: use the public mge.save API, consistent with the sibling
    # update_model helper, instead of the internal mge.serialization.save
    mge.save(checkpoint, model_path)
def inference(model_file, device, records, result_queue):
    """Run detection over ``records`` and push one result dict per image.

    :param model_file: checkpoint path containing a "state_dict" entry
    :param device: device identifier forwarded to ``get_data``
    :param records: dataset records to evaluate
    :param result_queue: multiprocessing queue receiving per-image results
    """
    @jit.trace(symbolic=False)
    def val_func():
        # closes over `net`, which is assigned below before the first call
        pred_boxes = net(net.inputs)
        return pred_boxes

    net = network.Network()
    net.eval()
    check_point = mge.load(model_file)
    net.load_state_dict(check_point['state_dict'])
    for record in records:
        np.set_printoptions(precision=2, suppress=True)
        net.eval()
        image, gt_boxes, im_info, ID = get_data(record, device)
        net.inputs["image"].set_value(image.astype(np.float32))
        net.inputs["im_info"].set_value(im_info)
        pred_boxes = val_func().numpy()
        # nms
        # NOTE(review): `if_set_nms` is not defined in this function —
        # presumably a module-level flag selecting set-NMS vs. plain NMS; confirm
        if if_set_nms:
            from set_nms_utils import set_cpu_nms
            # boxes come in pairs; tag each pair with a shared identity so
            # set-NMS keeps a pair's two boxes together during suppression
            n = pred_boxes.shape[0] // 2
            idents = np.tile(np.arange(n)[:, None], (1, 2)).reshape(-1, 1)
            pred_boxes = np.hstack((pred_boxes, idents))
            keep = pred_boxes[:, -2] > 0.05
            pred_boxes = pred_boxes[keep]
            keep = set_cpu_nms(pred_boxes, 0.5)
            pred_boxes = pred_boxes[keep][:, :-1]  # drop the identity column
        else:
            from set_nms_utils import cpu_nms
            keep = pred_boxes[:, -1] > 0.05
            pred_boxes = pred_boxes[keep]
            keep = cpu_nms(pred_boxes, 0.5)
            pred_boxes = pred_boxes[keep]
        result_dict = dict(ID=ID, height=int(im_info[0, -2]),
                           width=int(im_info[0, -1]),
                           dtboxes=boxes_dump(pred_boxes, False),
                           gtboxes=boxes_dump(gt_boxes, True))
        result_queue.put_nowait(result_dict)
def main():
    """Two-stage keypoint demo: detect persons, predict keypoints, draw skeletons."""
    parser = make_parser()
    args = parser.parse_args()

    detector = getattr(Det, args.detector)(pretrained=True)
    detector.eval()
    logger.info("Load Model : %s completed", args.detector)

    keypoint_model = getattr(M, args.arch)()
    # NOTE(review): assumes the checkpoint wraps weights under "state_dict" — confirm
    keypoint_model.load_state_dict(mge.load(args.model)["state_dict"])
    keypoint_model.eval()
    logger.info("Load Model : %s completed", args.arch)

    @jit.trace(symbolic=True)
    def det_func():
        pred = detector(detector.inputs)
        return pred

    @jit.trace(symbolic=True)
    def keypoint_func():
        pred = keypoint_model.predict()
        return pred

    evaluator = KeypointEvaluator(detector, det_func, keypoint_model,
                                  keypoint_func)

    image = cv2.imread(args.image)

    logger.info("Detecting Humans")
    person_boxes = evaluator.detect_persons(image)

    logger.info("Detecting Keypoints")
    all_keypoints = evaluator.predict(image, person_boxes)

    logger.info("Visualizing")
    canvas = evaluator.vis_skeletons(image, all_keypoints)
    cv2.imwrite("vis_skeleton.jpg", canvas)
def worker(
    arch, model_file, data_root, ann_file, worker_id, total_worker, result_queue,
):
    """
    :param arch: keypoint model architecture name (attribute of module ``M``)
    :param model_file: file of dump weights
    :param data_root: image root directory of the dataset
    :param ann_file: annotation file path
    :param worker_id: the index of the worker (also selects the visible GPU)
    :param total_worker: number of gpu for evaluation
    :param result_queue: processing queue
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = str(worker_id)

    @jit.trace(symbolic=True, opt_level=2)
    def val_func():
        # closes over `model`, which is assigned below before the first call
        pred = model.predict()
        return pred

    model = getattr(M, arch)()
    model.eval()
    model.load_state_dict(mge.load(model_file)["state_dict"])

    loader = build_dataloader(worker_id, total_worker, data_root, ann_file)
    for data_dict in loader:
        img, bbox, info = data_dict
        # flip-augmented inference: batch the image with its horizontal mirror;
        # subtracting zeros materializes the negative-stride view as a new array
        fliped_img = img[:, :, :, ::-1] - np.zeros_like(img)
        data = np.concatenate([img, fliped_img], 0)
        model.inputs["image"].set_value(
            np.ascontiguousarray(data).astype(np.float32))
        instance = find_results(val_func, img, bbox[0, 0], info)
        result_queue.put_nowait(instance)
def run_test(model_path, use_jit, use_symbolic):
    """
    Load the model with test cases and run the training for one iter.
    The loss and updated weights are compared with reference value to verify
    the correctness.

    Dump a new file with updated result by calling update_model
    if you think the test fails due to numerical rounding errors instead
    of bugs. Please think twice before you do so.

    :param model_path: path of the dumped checkpoint with attached test data
    :param use_jit: wrap the train step in ``jit.trace`` when True
    :param use_symbolic: forwarded to ``jit.trace(symbolic=...)``
    """
    net = MnistNet(has_bn=True)
    checkpoint = mge.load(model_path)
    net.load_state_dict(checkpoint["net_init"])
    lr = checkpoint["sgd_lr"]
    opt = SGD(net.parameters(), lr=lr)

    data = tensor(dtype=np.float32)
    label = tensor(dtype=np.int32)
    data.set_value(checkpoint["data"])
    label.set_value(checkpoint["label"])

    # tolerance for comparing against the dumped reference values
    max_err = 1e-5

    train_func = train
    if use_jit:
        train_func = jit.trace(train_func, symbolic=use_symbolic)

    opt.zero_grad()
    loss = train_func(data, label, net=net, opt=opt)
    opt.step()

    assertTensorClose(loss.numpy(), checkpoint["loss"], max_err=max_err)

    # compare every parameter; names must match the reference state dict order
    for param, param_ref in zip(
            net.state_dict().items(), checkpoint["net_updated"].items()):
        assert param[0] == param_ref[0]
        assertTensorClose(param[1], param_ref[1], max_err=max_err)
def update_model(model_path):
    """
    Update the dumped model with test cases for new reference values.

    The model with pre-trained weights is trained for one iter with the test
    data attached. The loss and updated net state dict is dumped.

    .. code-block:: python

        from test_correctness import update_model
        update_model('mnist_model_with_test.mge') # for gpu
        update_model('mnist_model_with_test_cpu.mge') # for cpu

    """
    net = MnistNet(has_bn=True)
    checkpoint = mge.load(model_path)
    net.load_state_dict(checkpoint["net_init"])
    lr = checkpoint["sgd_lr"]
    opt = SGD(net.parameters(), lr=lr)

    data = tensor(dtype=np.float32)
    label = tensor(dtype=np.int32)
    data.set_value(checkpoint["data"])
    label.set_value(checkpoint["label"])

    opt.zero_grad()
    loss = train(data, label, net=net, opt=opt)
    opt.step()

    xpu_name = get_xpu_name()

    # store the one-step reference values alongside the original test inputs
    checkpoint.update({
        "net_updated": net.state_dict(),
        "loss": loss.numpy(),
        "xpu": xpu_name
    })
    mge.save(checkpoint, model_path)
def inference(args):
    """Detect objects in one image and dump the results as JSON lines.

    Loads the checkpoint selected by ``args.resume_weights``, runs the traced
    network on ``args.img_path``, applies a fixed 0.05 score threshold and
    IoU-0.5 NMS, and writes ``<image-name>.json``.
    """
    @jit.trace(symbolic=False)
    def val_func():
        # closes over `net`, which is assigned below before the first call
        pred_boxes = net(net.inputs)
        return pred_boxes

    # model path
    saveDir = config.model_dir
    evalDir = config.eval_dir
    misc_utils.ensure_dir(evalDir)
    model_file = os.path.join(saveDir,
                              'epoch_{}.pkl'.format(args.resume_weights))
    assert os.path.exists(model_file)
    # load model
    net = network.Network()
    net.eval()
    check_point = mge.load(model_file)
    net.load_state_dict(check_point['state_dict'])

    image, im_info = get_data(args.img_path)
    net.inputs["image"].set_value(image.astype(np.float32))
    net.inputs["im_info"].set_value(im_info)
    pred_boxes = val_func().numpy()
    # one output row per (proposal, class); the tag column cycles 1..num_tag
    # so each row is labelled with its class id (background excluded)
    num_tag = config.num_classes - 1
    target_shape = (pred_boxes.shape[0]//num_tag, 1)
    pred_tags = (np.arange(num_tag) + 1).reshape(-1,1)
    pred_tags = np.tile(pred_tags, target_shape).reshape(-1,1)
    # nms: fixed 0.05 score threshold, then IoU-0.5 NMS
    from set_nms_utils import cpu_nms
    keep = pred_boxes[:, -1] > 0.05
    pred_boxes = pred_boxes[keep]
    pred_tags = pred_tags[keep]
    keep = cpu_nms(pred_boxes, 0.5)
    pred_boxes = pred_boxes[keep]
    pred_tags = pred_tags[keep].flatten()
    result_dict = dict(height=int(im_info[0, -2]), width=int(im_info[0, -1]),
                       dtboxes=boxes_dump(pred_boxes, pred_tags))
    name = args.img_path.split('/')[-1].split('.')[-2]
    misc_utils.save_json_lines([result_dict], '{}.json'.format(name))
def worker(args):
    """Training worker: build model/optimizer, optionally resume, train all epochs.

    Backbone parameters train at a 10x smaller learning rate than the head.
    Rank 0 logs config and saves a checkpoint (weights + optimizer) per epoch.
    """
    current_network = import_from_file(args.file)
    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    # split parameters: anything named "*backbone*" gets the reduced lr
    backbone_params = []
    head_params = []
    for name, param in model.named_parameters():
        if "backbone" in name:
            backbone_params.append(param)
        else:
            head_params.append(param)

    opt = SGD(
        [
            {
                "params": backbone_params,
                "lr": model.cfg.learning_rate * 0.1
            },
            {
                "params": head_params
            },
        ],
        lr=model.cfg.learning_rate,
        momentum=model.cfg.momentum,
        # gradients are SUM-reduced across ranks (see the callback below),
        # so weight decay is scaled by world size to keep it balanced
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(model.parameters(),
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(model.parameters())

    cur_epoch = 0
    if args.resume is not None:
        pretrained = mge.load(args.resume)
        cur_epoch = pretrained["epoch"] + 1
        model.load_state_dict(pretrained["state_dict"])
        opt.load_state_dict(pretrained["opt"])
        if dist.get_rank() == 0:
            logger.info("load success: epoch %d", cur_epoch)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(model.cfg.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(cur_epoch, model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch)
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict(),
                    "opt": opt.state_dict()
                }, save_path)
            logger.info("dump weights to %s", save_path)
def worker(rank, world_size, args):
    """Per-GPU training worker for the detection model (legacy dist API).

    Joins the process group when distributed, imports the network description
    from ``args.file``, optionally restores backbone weights, and trains for
    ``max_epoch`` epochs with a step-decay learning-rate schedule.
    """
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    # make the network description module importable from args.file's directory
    sys.path.insert(0, os.path.dirname(args.file))
    current_network = importlib.import_module(
        os.path.basename(args.file).split(".")[0])
    model = current_network.Net(current_network.Cfg(), batch_size=args.batch_size)
    params = model.parameters(requires_grad=True)
    model.train()

    if rank == 0:
        logger.info(get_config_info(model.cfg))

    # linear-scaling rule: lr grows with the total batch size across workers
    opt = optim.SGD(
        params,
        lr=model.cfg.basic_lr * world_size * model.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay,
    )

    if args.weight_file is not None:
        # only the backbone's bottom-up path is restored from pretrained weights
        weights = mge.load(args.weight_file)
        model.backbone.bottom_up.load_state_dict(weights)

    if rank == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(model.batch_size, args.dataset_dir, model.cfg))

    for epoch_id in range(model.cfg.max_epoch):
        # step decay: multiply lr by lr_decay_rate once per passed decay stage
        for param_group in opt.param_groups:
            param_group["lr"] = (model.cfg.basic_lr * world_size *
                                 model.batch_size *
                                 (model.cfg.lr_decay_rate**bisect.bisect_right(
                                     model.cfg.lr_decay_stages, epoch_id)))

        tot_steps = model.cfg.nr_images_epoch // (model.batch_size * world_size)
        train_one_epoch(
            model,
            train_loader,
            opt,
            tot_steps,
            rank,
            epoch_id,
            world_size,
            args.enable_sublinear,
        )
        if rank == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch_id)
            mge.save(
                {
                    "epoch": epoch_id,
                    "state_dict": model.state_dict()
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
def make_data_given_desc(args, inputs, shape0_multiply=1):
    """Build a name→ndarray input dict for the graph's input vars.

    Either loads user-provided data from ``args.load_input_data`` (repeating
    along axis 0 to reach ``args.batchsize``), or synthesizes random data per
    input using shapes/dtypes from ``inputs``, optionally overridden by the
    ``name:shape`` pairs in ``args.input_desc`` and the ``name:rng-expr``
    pairs in ``args.rng``.

    :param args: CLI namespace (load_input_data, batchsize, input_desc, rng, seed)
    :param inputs: graph input variables exposing ``.name``/``.shape``/``.dtype``
    :param shape0_multiply: factor applied to the leading dim of generated data
    """
    if args.load_input_data:
        logger.info("load data from {}".format(args.load_input_data))
        data = mge.load(args.load_input_data)
        data_names = [inp.name for inp in inputs]
        if isinstance(data, np.ndarray):
            # a bare array is only acceptable for a single-input graph
            assert len(data_names) == 1, (
                "data is given as a single numpy array, so there should be "
                "exactly one input in the graph; got: {}".format(data_names)
            )
            data = {data_names[0]: data}
        assert isinstance(data, dict)
        for v in data.values():
            assert isinstance(
                v, np.ndarray
            ), "data should provide ndarray; got {} instead".format(v)
        if args.batchsize:
            # tile each provided array along axis 0 up to the requested batch
            for k, v in list(data.items()):
                assert (
                    args.batchsize % v.shape[0] == 0
                ), "current batch size must divide given batch size: {} {}".format(
                    args.batchsize, v.shape[0]
                )
                data[k] = np.repeat(v, args.batchsize // v.shape[0], axis=0)
        return data

    def iter_inpdesc(desc):
        # yield (name, value) pairs from a ";"-separated "name:value" string;
        # closes over data_shapes, which is bound before any call below
        if not desc:
            return
        for pair in desc.split(";"):
            name, value = pair.split(":")
            if name not in data_shapes:
                logger.warning("rng name {} not in data provider".format(name))
            yield name, value

    rng = np.random.RandomState(args.seed)

    data_shapes = OrderedDict((inp.name, list(inp.shape)) for inp in inputs)
    data_dtypes = OrderedDict((inp.name, inp.dtype) for inp in inputs)

    for name, shape in iter_inpdesc(args.input_desc):
        data_shapes[name] = list(map(int, shape.split(",")))

    if args.batchsize:
        for i in data_shapes.values():
            i[0] = args.batchsize

    data_rngs = dict(iter_inpdesc(args.rng))

    result = OrderedDict()
    for name, shape in data_shapes.items():
        shape[0] *= shape0_multiply
        rng_expr = data_rngs.get(name)
        if rng_expr:
            # HACK: evaluates a user-supplied expression against the RNG —
            # acceptable only because this is a local debugging tool; never
            # feed untrusted input here
            value = eval("rng.{}".format(rng_expr).format(shape), {"rng": rng})
        else:
            value = rng.uniform(size=shape)
        value = np.ascontiguousarray(value, dtype=data_dtypes[name])
        assert value.shape == tuple(shape)
        result[name] = value
    return result
def worker(rank, world_size, args):
    """Per-GPU ImageNet training worker (legacy traced-function API).

    Joins the process group when distributed, optionally resumes from a
    checkpoint named ``*-<step>.*``, trains with linear lr decay, and on rank
    0 periodically logs, checkpoints, and evaluates.
    """
    # pylint: disable=too-many-statements
    mge.set_log_file(os.path.join(args.save, args.arch, "log.txt"))
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    save_dir = os.path.join(args.save, args.arch)

    model = getattr(M, args.arch)()
    step_start = 0
    if args.model:
        logger.info("load weights from %s", args.model)
        model.load_state_dict(mge.load(args.model))
        # resume step parsed from a checkpoint name like "checkpoint-012345.pkl"
        step_start = int(args.model.split("-")[1].split(".")[0])

    optimizer = optim.SGD(
        get_parameters(model),
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # Define train and valid graph
    @jit.trace(symbolic=True)
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=args.batch_size,
                           drop_last=True))
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100,
                                           drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    for step in range(step_start, args.steps + 1):
        # Linear learning rate decay
        decay = 1.0  # NOTE(review): dead assignment — overwritten on the next line
        decay = 1 - float(step) / args.steps if step < args.steps else 0
        for param_group in optimizer.param_groups:
            param_group["lr"] = args.learning_rate * decay

        image, label = next(train_queue)
        time_data = time.time() - t
        image = image.astype("float32")
        label = label.astype("int32")
        n = image.shape[0]
        optimizer.zero_grad()
        loss, acc1, acc5 = train_func(image, label)
        optimizer.step()
        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        time_iter = time.time() - t
        t = time.time()

        if step % args.report_freq == 0 and rank == 0:
            logger.info(
                "TRAIN Iter %06d: lr = %f,\tloss = %f,\twc_loss = 1,\tTop-1 err = %f,\tTop-5 err = %f,\tdata_time = %f,\ttrain_time = %f,\tremain_hours=%f",
                step,
                args.learning_rate * decay,
                # the meters format as "<name> <avg>"; parse the average back out
                float(objs.__str__().split()[1]),
                1 - float(top1.__str__().split()[1]) / 100,
                1 - float(top5.__str__().split()[1]) / 100,
                time_data,
                time_iter - time_data,
                time_iter * (args.steps - step) / 3600,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()
        if step % 10000 == 0 and rank == 0 and step != 0:
            logger.info("SAVING %06d", step)
            mge.save(
                model.state_dict(),
                os.path.join(save_dir, "checkpoint-{:06d}.pkl".format(step)),
            )
        if step % 50000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info(
                "TEST Iter %06d: loss = %f,\tTop-1 err = %f,\tTop-5 err = %f",
                step, _, 1 - valid_acc / 100, 1 - valid_acc5 / 100)

    # final checkpoint and evaluation after the loop completes
    mge.save(
        model.state_dict(),
        os.path.join(save_dir, "checkpoint-{:06d}.pkl".format(step)))
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST Iter %06d: loss=%f,\tTop-1 err = %f,\tTop-5 err = %f",
                step, _, 1 - valid_acc / 100, 1 - valid_acc5 / 100)
mge.tensor(t) for t in batch ) batch_size = input_ids.shape[0] loss, logits, label_ids = net_eval( input_ids, segment_ids, input_mask, label_ids, net=net ) sum_loss += loss.mean().item() sum_accuracy += accuracy(logits, label_ids) total_examples += batch_size total_steps += 1 result = { "eval_loss": sum_loss / total_steps, "eval_accuracy": sum_accuracy / total_examples, } logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info("%s = %s", key, str(result[key])) if __name__ == "__main__": bert, config, vocab_file = create_hub_bert(args.pretrained_bert, pretrained=False) args.vocab_file = vocab_file model = BertForSequenceClassification(config, num_labels=2, bert=bert) mrpc_dataset = MRPCDataset(args) model.load_state_dict(mge.load(args.load_model_path)) mrpc_dataset = MRPCDataset(args) eval_dataloader, eval_size = mrpc_dataset.get_eval_dataloader() eval(eval_dataloader, model)
# Inference script: load a trained ReverseString model and, for each test
# batch, print the input string, the ground truth and the model's prediction.
import megengine as mge
from model import ReverseString
from dataset import get_dataloader, make_string_from_tensor, MAXLEN

model = ReverseString()
model.load_state_dict(mge.load('transformer.60.mge'))
model.eval()

test_data = get_dataloader()
data = mge.tensor()  # placeholder tensor, filled per batch via set_value
for idx, (batch_data, batch_label, batch_mask) in enumerate(test_data):
    data.set_value(batch_data)
    prob = model(data)
    # per-position distribution over the 28-symbol vocabulary
    prob = prob.reshape(-1, MAXLEN + 1, 28)
    predicted = prob.numpy().argmax(axis=2)
    inp_str = make_string_from_tensor(batch_data)
    pred_str = make_string_from_tensor(predicted)
    gt_str = make_string_from_tensor(batch_label)
    for i in range(len(inp_str)):
        print(inp_str[i], gt_str[i], pred_str[i], batch_mask[i])
def worker(master_ip, port, rank, world_size, args):
    """Per-process training entry point for the keypoint model.

    Initializes distributed training when ``world_size > 1``, builds the model
    (optionally resuming from a checkpoint), the Adam optimizer, the COCO
    keypoint dataloader with its augmentation pipeline, then runs the epoch
    loop, checkpointing on rank 0.
    """
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,  # one device per process, indexed by rank
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        # resume both the weights and the epoch counter from the checkpoint
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        # gradients are summed across workers during backward
        gm.attach(
            model.parameters(),
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    # Augmentation pipeline: normalize + flip, then the optional half-body and
    # box-extension transforms, then a random affine, finally layout conversion.
    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]

    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )

    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order,),
        # the collator renders keypoints into target heatmaps; kernel list
        # depends on whether multi-scale supervision is requested
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:  # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
def worker(rank, world_size, args):  # pylint: disable=too-many-statements
    """Per-process entry point for (QAT-)finetuning an ImageNet classifier.

    When ``world_size > 1`` a distributed process group is created first.
    For ``args.mode != "normal"`` the model is wrapped with fake-quantization
    ops before training.  Runs a step-based loop with periodic reporting,
    checkpointing (rank 0) and validation.
    """
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,  # one device per process, indexed by rank
        )

    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_finetune_config(args.arch)

    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training

    total_batch_size = cfg.BATCH_SIZE * world_size
    steps_per_epoch = 1280000 // total_batch_size  # 1.28M = ImageNet train-set size
    total_steps = steps_per_epoch * cfg.EPOCHS

    if args.mode != "normal":
        # insert observers / fake-quant modules for quantization-aware training
        Q.quantize_qat(model, Q.ema_fakequant_qconfig)

    if args.checkpoint:
        logger.info("Load pretrained weights from %s", args.checkpoint)
        ckpt = mge.load(args.checkpoint)
        # accept both wrapped ({"state_dict": ...}) and bare state dicts
        ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        model.load_state_dict(ckpt, strict=False)

    if args.mode == "quantized":
        raise ValueError("mode = quantized only used during inference")
        Q.quantize(model)  # NOTE(review): unreachable after the raise above

    optimizer = optim.SGD(
        get_parameters(model, cfg),
        lr=cfg.LEARNING_RATE,
        momentum=cfg.MOMENTUM,
    )

    # Define train and valid graph
    @jit.trace(symbolic=True)
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    # Infinite sampler: the loop below draws batches with next() forever
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=cfg.BATCH_SIZE, drop_last=True))
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            cfg.COLOR_JITTOR,
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    train_queue = iter(train_queue)
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100, drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    def adjust_learning_rate(step, epoch):
        """Compute the scheduled lr, write it into the optimizer, return it."""
        learning_rate = cfg.LEARNING_RATE
        if cfg.SCHEDULER == "Linear":
            learning_rate *= 1 - float(step) / total_steps
        elif cfg.SCHEDULER == "Multistep":
            learning_rate *= cfg.SCHEDULER_GAMMA**bisect.bisect_right(
                cfg.SCHEDULER_STEPS, epoch)
        else:
            raise ValueError(cfg.SCHEDULER)
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate
        return learning_rate

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")
    t = time.time()
    for step in range(0, total_steps):
        # Linear learning rate decay
        epoch = step // steps_per_epoch
        learning_rate = adjust_learning_rate(step, epoch)

        image, label = next(train_queue)
        image = image.astype("float32")
        label = label.astype("int32")

        n = image.shape[0]  # actual batch size for meter weighting
        optimizer.zero_grad()
        loss, acc1, acc5 = train_func(image, label)
        optimizer.step()

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()

        if step % args.report_freq == 0 and rank == 0:
            logger.info("TRAIN e%d %06d %f %s %s %s %s", epoch, step,
                        learning_rate, objs, top1, top5, total_time)
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()
        # periodic checkpoint by rank 0 (note: also fires at step 0)
        if step % 10000 == 0 and rank == 0:
            logger.info("SAVING %06d", step)
            mge.save(
                {
                    "step": step,
                    "state_dict": model.state_dict()
                },
                os.path.join(save_dir, "checkpoint.pkl"),
            )
        # periodic validation (step 0 is skipped)
        if step % 10000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)

    # final checkpoint and evaluation after the training loop
    mge.save({
        "step": step,
        "state_dict": model.state_dict()
    }, os.path.join(save_dir, "checkpoint-final.pkl"))
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)
def worker(world_size, args):  # pylint: disable=too-many-statements
    """Post-training calibration quantization of an ImageNet classifier.

    Loads a pretrained float model, collects activation statistics over the
    validation set (calibration), converts the model to a quantized one,
    evaluates it, and saves the quantized checkpoint.
    """
    rank = dist.get_rank()
    if world_size > 1:
        # Initialize distributed process group
        # NOTE(review): only logs here -- presumably the process group itself
        # is created by the launcher before worker() runs; confirm with caller.
        logger.info("init distributed process group {} / {}".format(rank, world_size))

    save_dir = os.path.join(args.save, args.arch + "." + "calibration")
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()

    # load calibration model (accepts wrapped or bare state dicts)
    assert args.checkpoint
    logger.info("Load pretrained weights from %s", args.checkpoint)
    ckpt = mge.load(args.checkpoint)
    ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
    model.load_state_dict(ckpt, strict=False)

    # Build valid datasets
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(
        valid_dataset, batch_size=100, drop_last=False
    )
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose(
            [T.Resize(256), T.CenterCrop(224), T.Normalize(mean=128), T.ToMode("CHW")]
        ),
        num_workers=args.workers,
    )

    # calibration: keep the final classifier in float, fake-quantize the rest
    model.fc.disable_quantize()
    model = quantize_qat(model, qconfig=Q.calibration_qconfig)

    # calculate scale: run validation once with observers enabled so the
    # quantization ranges can be collected
    def calculate_scale(image, label):
        model.eval()
        enable_observer(model)
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = dist.functional.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = dist.functional.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    infer(calculate_scale, valid_queue, args)

    # quantized: convert the calibrated fake-quant model to a true quantized one
    model = quantize(model)

    # eval quantized model
    def eval_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = dist.functional.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = dist.functional.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    _, valid_acc, valid_acc5 = infer(eval_func, valid_queue, args)
    logger.info("TEST %f, %f", valid_acc, valid_acc5)

    # save quantized model
    mge.save(
        {"step": -1, "state_dict": model.state_dict()},
        os.path.join(save_dir, "checkpoint-calibration.pkl"),
    )
    logger.info(
        "save in {}".format(os.path.join(save_dir, "checkpoint-calibration.pkl"))
    )
def tracedmodule_to_caffe(
    traced_module,
    prototxt="out.prototxt",
    caffemodel="out.caffemodel",
    outspec=None,
    use_empty_blobs=False,
    input_data_type: str = None,
    input_scales: Union[float, List[float]] = None,
    input_zero_points: Union[int, List[int]] = None,
    require_quantize=False,
    param_fake_quant=False,
    split_conv_relu=False,
    fuse_bn=False,
    quantize_file_path="quant_params.json",
    convert_backend: BackEnd = BackEnd.CAFFE,
):
    """
    Convert TracedModule model to Caffe, and save caffe model to `prototxt` and `caffemodel`.

    :param traced_module: a TracedModule instance, or the file path of a saved
        TracedModule model (loaded via ``mge.load``).
    :type traced_module: str
    :param prototxt: the filename used for saved model definition.
    :type prototxt: str
    :param caffemodel: the filename used for saved model weights.
    :type caffemodel: str
    :param outspec: specify the end points of the model, expect the full names of nodes.
    :type outspec: list
    :param use_empty_blobs: forwarded to :class:`CaffeConverter`.
    :param input_data_type: override for the data type of the model inputs;
        forwarded to ``_update_inputs_qparams``.
    :param input_scales: override for the quantization scale(s) of the inputs,
        one value or one per input; forwarded to ``_update_inputs_qparams``.
    :param input_zero_points: override for the zero point(s) of the inputs,
        one value or one per input; forwarded to ``_update_inputs_qparams``.
    :param require_quantize: must stay ``False`` -- Caffe does not support
        quantized models (asserted below).
    :param param_fake_quant: forwarded to :class:`IRQuantizer`.
    :param split_conv_relu: when True, adds the ``REMOVE_RELU`` transformer rule.
    :param fuse_bn: when True, fuses BatchNorm into the preceding linear/conv
        layers (``FUSE_LINEAR_BN`` / ``FUSE_CONV_BN`` rules).
    :param quantize_file_path: where quantization parameters are dumped when
        the traced module carries QAT information.
    :param convert_backend: target backend; ``BackEnd.NNIE`` additionally
        applies ``REMOVE_FLATTEN_BEFORE_LINEAR``.
    """
    if isinstance(traced_module, str):
        traced_module = mge.load(traced_module)
    assert isinstance(
        traced_module, TracedModule
    ), "Input should be a traced module or a path of traced module."
    assert not require_quantize, "Caffe do not support quantize model."

    # optionally overwrite input dtype / scale / zero-point before resolving
    _update_inputs_qparams(
        traced_module, input_data_type, input_scales, input_zero_points
    )
    tm_resolver = TM_FrontEnd(traced_module, outspec=outspec)
    irgraph = tm_resolver.resolve()

    # base graph cleanup rules, extended below according to the options
    transformer_options = [
        TransformerRule.REMOVE_DROPOUT,
        TransformerRule.REMOVE_RESHAPE_REALTED_OP,
        TransformerRule.REMOVE_UNRELATED_IROP,
        TransformerRule.ADD_FAKE_HSIGMOID_OUT,
        TransformerRule.EXPAND_CONVRELU,
    ]
    if fuse_bn:
        transformer_options += [
            TransformerRule.FUSE_LINEAR_BN,
            TransformerRule.FUSE_CONV_BN,
        ]
    if convert_backend == BackEnd.NNIE:
        transformer_options.extend(
            [TransformerRule.REMOVE_FLATTEN_BEFORE_LINEAR,]
        )
    if split_conv_relu:
        transformer_options += [TransformerRule.REMOVE_RELU]
    transformer = IRTransform(transformer_options)
    transformed_irgraph = transformer.transform(irgraph)

    quantizer = IRQuantizer(
        require_quantize=require_quantize, param_fake_quant=param_fake_quant
    )

    # collect quantization params only when the source module was QAT-traced
    if tm_resolver.has_qat:
        quantizer.save_quantize_params(transformed_irgraph)

    converter = CaffeConverter(
        transformed_irgraph, quantizer, use_empty_blobs, convert_backend
    )
    converter.convert()

    if tm_resolver.has_qat:
        quantizer.dump_quant_param(path=quantize_file_path)

    assert isinstance(prototxt, str) and isinstance(
        caffemodel, str
    ), "'prototxt' and 'caffemodel' must be string"
    converter.dump(prototxt, caffemodel)
import megengine as mge
from model import ReverseString
from dataset import get_dataloader, make_string_from_tensor, MAXLEN
import sys

# Restore the model from the checkpoint path given on the command line.
net = ReverseString()
net.load_state_dict(mge.load(sys.argv[1]))
net.eval()

loader = get_dataloader()
inp = mge.tensor()
pos_t = mge.tensor()

# Running accuracy counters over all evaluated samples.
total = 0
correct = 0

for batch_data, batch_label, pos in loader:
    # Copy the batch and its positional encoding into the reusable tensors.
    inp.set_value(batch_data)
    pos_t.set_value(pos)
    scores = net(inp, pos_t).reshape(-1, MAXLEN, 26)
    predicted = scores.numpy().argmax(axis=2)

    # Decode tensors back to strings for exact-match comparison.
    src = make_string_from_tensor(batch_data)
    pred = make_string_from_tensor(predicted)
    truth = make_string_from_tensor(batch_label)

    n = len(src)
    total += n
    correct += sum(truth[i] == pred[i] for i in range(n))

# Final exact-match tally.
print(correct, total)