def worker(current_network, weight_file, dataset_dir, master_ip, port, world_size, rank, result_list):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        mge.device.set_default_device("gpu{}".format(rank))

    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    evaluator = DetEvaluator(model)

    test_loader = build_dataloader(rank, world_size, dataset_dir, model.cfg)
    if world_size == 1:
        test_loader = tqdm(test_loader)

    for data in test_loader:
        image, im_info = DetEvaluator.process_inputs(
            data[0][0],
            model.cfg.test_image_short_size,
            model.cfg.test_image_max_size,
        )
        pred_res = evaluator.predict(image=mge.tensor(image), im_info=mge.tensor(im_info))
        result = {
            "det_res": pred_res,
            "image_id": int(data[1][2][0].split(".")[0].split("_")[-1]),
        }
        if world_size > 1:
            result_list.put_nowait(result)
        else:
            result_list.append(result)
def worker(rank, world_size, cfg):
    logger = get_root_logger()  # each process creates its own logger
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23333,
            world_size=world_size,
            rank=rank,
            dev=rank % 8,
        )
    model = build_model(cfg.model, train_cfg=cfg.train_cfg, eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
def worker(rank, world_size, ngpus_per_node, args):
    if world_size > 1:
        # init process group
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info("init process group rank %d / %d", dist.get_rank(), dist.get_world_size())

    # build dataset
    _, valid_dataloader = build_dataset(args)

    # build model
    model = resnet_model.__dict__[args.arch](pretrained=args.model is None)
    if args.model is not None:
        logging.info("load from checkpoint %s", args.model)
        checkpoint = megengine.load(args.model)
        # fall back to the raw checkpoint when it has no "state_dict" key,
        # so state_dict is always defined
        state_dict = checkpoint["state_dict"] if "state_dict" in checkpoint else checkpoint
        model.load_state_dict(state_dict)

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    model.eval()
    _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args)
    logging.info("Test Acc@1 %.3f, Acc@5 %.3f", valid_acc1, valid_acc5)
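# The workers in this file are normally spawned one process per GPU by a small
# launcher. A minimal sketch of that pattern follows, assuming the
# multiprocessing-based launch these snippets imply; `main` and the
# `args.ngpus_per_node` attribute are illustrative, not from the original file.
import multiprocessing as mp

def main(args):
    world_size = args.ngpus_per_node  # assumed: one process per local GPU
    if world_size > 1:
        procs = []
        for rank in range(world_size):
            # signature matches the evaluation worker directly above
            p = mp.Process(target=worker, args=(rank, world_size, world_size, args))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
    else:
        worker(0, 1, 1, args)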
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    if cfg.dynamic:
        trace.enabled = False
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
    log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    # parameters are already randomly initialized at this point
    model = build_model(cfg.model, train_cfg=cfg.train_cfg, eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.train)]
    train(model, datasets, cfg, rank)
def worker(
    current_network,
    weight_file,
    dataset_dir,
    result_list,
    master_ip=None,
    port=None,
    world_size=None,
    rank=None,
):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    def pred_func(data):
        pred = model(data)
        return pred

    test_loader = build_dataloader(dataset_dir, model.cfg)
    if dist.get_world_size() == 1:
        test_loader = tqdm(test_loader)

    for data in test_loader:
        img = data[0].squeeze()
        label = data[1].squeeze()
        im_info = data[2]
        pred = evaluate(pred_func, img, model.cfg)
        result = {"pred": pred, "gt": label, "name": im_info[2]}
        if dist.get_world_size() > 1:
            result_list.put_nowait(result)
        else:
            result_list.append(result)
def worker(rank, world_size, cfg, gpu_id="0", port=23333):
    if cfg.dynamic:
        trace.enabled = False
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=port,
            world_size=world_size,
            rank=rank,
            device=int(gpu_id) % 10,
        )
    log_file = os.path.join(cfg.work_dir, 'rank{}_root.log'.format(rank))
    # create a root logger for each process; only rank 0 writes a log file,
    # the others create no file and log at error level
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    # eval cfg can provide some useful info, e.g. the padding multiple
    model = build_model(cfg.model, eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)
def worker(rank, world_size, cfg):
    logger = get_root_logger()  # each process creates its own logger
    # set dynamic graph for debug
    if cfg.dynamic:
        trace.enabled = False
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23333,
            world_size=world_size,
            rank=rank,
            dev=rank % 8,
        )
    # eval cfg can provide some useful info, e.g. the padding multiple
    model = build_model(cfg.model, eval_cfg=cfg.eval_cfg)
    datasets = [build_dataset(cfg.data.test)]
    test(model, datasets, cfg, rank)
def worker(rank, world_size, args):
    # pylint: disable=too-many-statements
    if rank == 0:
        save_dir = os.path.join(args.save, args.arch, "b{}".format(args.batch_size * world_size))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        log_format = '%(asctime)s %(message)s'
        logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                            format=log_format, datefmt='%m/%d %I:%M:%S %p')
        fh = logging.FileHandler(os.path.join(save_dir, 'log.txt'))
        fh.setFormatter(logging.Formatter(log_format))
        logging.getLogger().addHandler(fh)

    if world_size > 1:
        # Initialize distributed process group
        logging.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    save_dir = os.path.join(args.save, args.arch)

    if rank == 0:
        prefixs = ['train', 'valid']
        writers = {prefix: SummaryWriter(os.path.join(args.output, prefix)) for prefix in prefixs}

    model = getattr(M, args.arch)()

    step_start = 0
    # if args.model:
    #     logging.info("load weights from %s", args.model)
    #     model.load_state_dict(mge.load(args.model))
    #     step_start = int(args.model.split("-")[1].split(".")[0])

    optimizer = optim.SGD(
        get_parameters(model),
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # Define train and valid graph
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logging.info("preparing dataset..")
    transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    train_dataset = datasets.ImageNet(split='train', transform=transform)
    train_sampler = torch.utils.data.RandomSampler(train_dataset)
    train_queue = torch.utils.data.DataLoader(train_dataset,
                                              batch_size=args.batch_size,
                                              sampler=train_sampler,
                                              shuffle=False,
                                              drop_last=True,
                                              pin_memory=True,
                                              num_workers=args.workers)
    train_queue = iter(train_queue)

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    valid_dataset = datasets.ImageNet(split='val', transform=transform)
    valid_sampler = torch.utils.data.SequentialSampler(valid_dataset)
    valid_queue = torch.utils.data.DataLoader(valid_dataset,
                                              batch_size=100,
                                              sampler=valid_sampler,
                                              shuffle=False,
                                              drop_last=False,
                                              num_workers=args.workers)

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    best_valid_acc = 0
    for step in range(step_start, args.steps + 1):
        # Linear learning rate decay
        decay = 1.0
        decay = 1 - float(step) / args.steps if step < args.steps else 0
        for param_group in optimizer.param_groups:
            param_group["lr"] = args.learning_rate * decay

        image, label = next(train_queue)
        time_data = time.time() - t
        # image = image.astype("float32")
        # label = label.astype("int32")

        n = image.shape[0]
        optimizer.zero_grad()
        loss, acc1, acc5 = train_func(image, label)
        optimizer.step()

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        time_iter = time.time() - t
        t = time.time()

        if step % args.report_freq == 0 and rank == 0:
            logging.info(
                "TRAIN Iter %06d: lr = %f,\tloss = %f,\twc_loss = 1,\tTop-1 err = %f,\tTop-5 err = %f,\tdata_time = %f,\ttrain_time = %f,\tremain_hours=%f",
                step,
                args.learning_rate * decay,
                float(objs.__str__().split()[1]),
                1 - float(top1.__str__().split()[1]) / 100,
                1 - float(top5.__str__().split()[1]) / 100,
                time_data,
                time_iter - time_data,
                time_iter * (args.steps - step) / 3600,
            )
            writers['train'].add_scalar('loss', float(objs.__str__().split()[1]), global_step=step)
            writers['train'].add_scalar('top1_err', 1 - float(top1.__str__().split()[1]) / 100, global_step=step)
            writers['train'].add_scalar('top5_err', 1 - float(top5.__str__().split()[1]) / 100, global_step=step)
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()

        if step % 10000 == 0 and step != 0:
            loss, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logging.info(
                "TEST Iter %06d: loss = %f,\tTop-1 err = %f,\tTop-5 err = %f",
                step, loss, 1 - valid_acc / 100, 1 - valid_acc5 / 100)
            is_best = valid_acc > best_valid_acc
            best_valid_acc = max(valid_acc, best_valid_acc)
            if rank == 0:
                writers['valid'].add_scalar('loss', loss, global_step=step)
                writers['valid'].add_scalar('top1_err', 1 - valid_acc / 100, global_step=step)
                writers['valid'].add_scalar('top5_err', 1 - valid_acc5 / 100, global_step=step)
                logging.info("SAVING %06d", step)
                save_checkpoint(
                    save_dir,
                    {
                        'step': step + 1,
                        'model': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_valid_acc,
                        'optimizer': optimizer.state_dict(),
                    },
                    is_best)
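# Many workers here log AverageMeter objects and parse their string form as
# "Name value" (see float(objs.__str__().split()[1]) above). A minimal sketch
# of such a meter, assuming this common implementation; the original helper
# may differ in detail.
class AverageMeter:
    """Track the running average of a metric."""

    def __init__(self, name):
        self.name = name
        self.reset()

    def reset(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    def __str__(self):
        # "Name value" -- callers split on whitespace and parse index 1
        return "{} {:.3f}".format(self.name, self.avg)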
def run_perf(
    batch_size=64,
    warm_up=True,
    dump_prof=None,
    opt_level=2,
    conv_fastrun=False,
    run_step=True,
    track_bn_stats=True,
    warm_up_iter=20,
    run_iter=100,
    num_gpu=None,
    device=0,
    server=None,
    port=None,
    scale_batch_size=False,
    eager=False,
):
    if conv_fastrun:
        set_conv_execution_strategy("PROFILE")

    if num_gpu:
        # use the `server`/`port` parameters; the original referenced a
        # non-existent `args` here
        dist.init_process_group(server, port, num_gpu, device, device)
        if scale_batch_size:
            batch_size = batch_size // num_gpu
        print("Run with data parallel, batch size = {} per GPU".format(batch_size))

    data = tensor(np.random.randn(batch_size, 3, 224, 224).astype("float32"))
    label = tensor(np.random.randint(1000, size=[batch_size,], dtype=np.int32))

    net = Resnet50(track_bn_stats=track_bn_stats)
    opt = SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4)

    def train_func(data, label):
        logits = net(data)
        loss = F.cross_entropy_with_softmax(logits, label)
        if num_gpu:
            loss = loss / num_gpu
        opt.zero_grad()
        opt.backward(loss)
        return loss

    train_func = trace(
        train_func,
        symbolic=(not eager),
        opt_level=opt_level,
        profiling=not (dump_prof is None),
    )

    if warm_up:
        print("Warm up ...")
        for _ in range(warm_up_iter):
            opt.zero_grad()
            train_func(data, label)
            if run_step:
                opt.step()
    print_gpu_usage()
    print("Running train ...")

    start = time.time()
    for _ in range(run_iter):
        opt.zero_grad()
        train_func(data, label)
        if run_step:
            opt.step()
    time_used = time.time() - start

    if dump_prof:
        with open(dump_prof, "w") as fout:
            json.dump(train_func.get_profile(), fout, indent=2)

    return time_used / run_iter
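# Hypothetical driver for run_perf above: compare traced vs. eager
# per-iteration time. The flag combination is illustrative; only arguments
# already defined in run_perf's signature are used.
if __name__ == "__main__":
    avg_traced = run_perf(batch_size=64, warm_up=True)
    avg_eager = run_perf(batch_size=64, warm_up=True, eager=True)
    print("traced: {:.4f} s/iter, eager: {:.4f} s/iter".format(avg_traced, avg_eager))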
def worker(rank, world_size, args):
    # pylint: disable=too-many-statements
    mge.set_log_file(os.path.join(args.save, args.arch, "log.txt"))

    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    save_dir = os.path.join(args.save, args.arch)

    model = getattr(M, args.arch)()
    step_start = 0
    if args.model:
        logger.info("load weights from %s", args.model)
        model.load_state_dict(mge.load(args.model))
        step_start = int(args.model.split("-")[1].split(".")[0])

    optimizer = optim.SGD(
        get_parameters(model),
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # Define train and valid graph
    @jit.trace(symbolic=True)
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=args.batch_size, drop_last=True))
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100, drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    for step in range(step_start, args.steps + 1):
        # Linear learning rate decay
        decay = 1.0
        decay = 1 - float(step) / args.steps if step < args.steps else 0
        for param_group in optimizer.param_groups:
            param_group["lr"] = args.learning_rate * decay

        image, label = next(train_queue)
        time_data = time.time() - t
        image = image.astype("float32")
        label = label.astype("int32")

        n = image.shape[0]
        optimizer.zero_grad()
        loss, acc1, acc5 = train_func(image, label)
        optimizer.step()

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        time_iter = time.time() - t
        t = time.time()

        if step % args.report_freq == 0 and rank == 0:
            logger.info(
                "TRAIN Iter %06d: lr = %f,\tloss = %f,\twc_loss = 1,\tTop-1 err = %f,\tTop-5 err = %f,\tdata_time = %f,\ttrain_time = %f,\tremain_hours=%f",
                step,
                args.learning_rate * decay,
                float(objs.__str__().split()[1]),
                1 - float(top1.__str__().split()[1]) / 100,
                1 - float(top5.__str__().split()[1]) / 100,
                time_data,
                time_iter - time_data,
                time_iter * (args.steps - step) / 3600,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()

        if step % 10000 == 0 and rank == 0 and step != 0:
            logger.info("SAVING %06d", step)
            mge.save(
                model.state_dict(),
                os.path.join(save_dir, "checkpoint-{:06d}.pkl".format(step)),
            )
        if step % 50000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info(
                "TEST Iter %06d: loss = %f,\tTop-1 err = %f,\tTop-5 err = %f",
                step, _, 1 - valid_acc / 100, 1 - valid_acc5 / 100)

    mge.save(model.state_dict(),
             os.path.join(save_dir, "checkpoint-{:06d}.pkl".format(step)))
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST Iter %06d: loss=%f,\tTop-1 err = %f,\tTop-5 err = %f",
                step, _, 1 - valid_acc / 100, 1 - valid_acc5 / 100)
def worker(master_ip, port, world_size, rank, configs):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("init process group for gpu{} done".format(rank))

    # set up logger
    os.makedirs(configs["base_dir"], exist_ok=True)
    worklog_path = os.path.join(configs["base_dir"], "worklog.txt")
    mge.set_log_file(worklog_path)

    # prepare model-related components
    model = FaceRecognitionModel(configs)

    # prepare data-related components
    preprocess = T.Compose([T.Normalize(mean=127.5, std=128), T.ToMode("CHW")])
    augment = T.Compose([T.RandomHorizontalFlip()])

    train_dataset = get_train_dataset(configs["dataset"], dataset_dir=configs["dataset_dir"])
    train_sampler = data.RandomSampler(train_dataset, batch_size=configs["batch_size"], drop_last=True)
    train_queue = data.DataLoader(train_dataset, sampler=train_sampler,
                                  transform=T.Compose([augment, preprocess]))

    # prepare optimize-related components
    configs["learning_rate"] = configs["learning_rate"] * dist.get_world_size()
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters())
        gm = ad.GradManager().attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("mean")])
    else:
        gm = ad.GradManager().attach(model.parameters())
    opt = optim.SGD(
        model.parameters(),
        lr=configs["learning_rate"],
        momentum=configs["momentum"],
        weight_decay=configs["weight_decay"],
    )

    # try to load checkpoint
    model, start_epoch = try_load_latest_checkpoint(model, configs["base_dir"])

    # do training
    def train_one_epoch():
        def train_func(images, labels):
            opt.clear_grad()
            with gm:
                loss, accuracy, _ = model(images, labels)
                gm.backward(loss)
                if dist.is_distributed():  # all_reduce_mean
                    loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
                    accuracy = dist.functional.all_reduce_sum(accuracy) / dist.get_world_size()
            opt.step()
            return loss, accuracy

        model.train()

        average_loss = AverageMeter("loss")
        average_accuracy = AverageMeter("accuracy")
        data_time = AverageMeter("data_time")
        train_time = AverageMeter("train_time")

        total_step = len(train_queue)
        data_iter = iter(train_queue)
        for step in range(total_step):
            # get next batch of data
            data_tic = time.time()
            images, labels = next(data_iter)
            data_toc = time.time()

            # forward pass & backward pass
            train_tic = time.time()
            images = mge.tensor(images, dtype="float32")
            labels = mge.tensor(labels, dtype="int32")
            loss, accuracy = train_func(images, labels)
            train_toc = time.time()

            # do the statistics and logging
            n = images.shape[0]
            average_loss.update(loss.item(), n)
            average_accuracy.update(accuracy.item() * 100, n)
            data_time.update(data_toc - data_tic)
            train_time.update(train_toc - train_tic)
            if step % configs["log_interval"] == 0 and dist.get_rank() == 0:
                logger.info(
                    "epoch: %d, step: %d, %s, %s, %s, %s",
                    epoch, step, average_loss, average_accuracy, data_time, train_time,
                )

    for epoch in range(start_epoch, configs["num_epoch"]):
        adjust_learning_rate(opt, epoch, configs)
        train_one_epoch()

        if dist.get_rank() == 0:
            checkpoint_path = os.path.join(configs["base_dir"], f"epoch-{epoch+1}-checkpoint.pkl")
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                checkpoint_path,
            )
def worker(master_ip, port, world_size, rank, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu{} done".format(rank))

    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    backbone_params = []
    head_params = []
    for name, param in model.named_parameters():
        if "backbone" in name:
            backbone_params.append(param)
        else:
            head_params.append(param)

    opt = SGD(
        [
            {"params": backbone_params, "lr": model.cfg.learning_rate * 0.1},
            {"params": head_params},
        ],
        lr=model.cfg.learning_rate,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(model.parameters(),
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(model.parameters())

    cur_epoch = 0
    if args.resume is not None:
        pretrained = mge.load(args.resume)
        cur_epoch = pretrained["epoch"] + 1
        model.load_state_dict(pretrained["state_dict"])
        opt.load_state_dict(pretrained["opt"])
        if dist.get_rank() == 0:
            logger.info("load success: epoch %d", cur_epoch)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(model.cfg.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(cur_epoch, model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch)
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {"epoch": epoch, "state_dict": model.state_dict(), "opt": opt.state_dict()},
                save_path)
            logger.info("dump weights to %s", save_path)
def worker(rank, world_size, args):
    # pylint: disable=too-many-statements
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    model = models.__dict__[args.arch]()

    if args.mode != "normal":
        Q.quantize_qat(model, Q.ema_fakequant_qconfig)

    if args.checkpoint:
        logger.info("Load pretrained weights from %s", args.checkpoint)
        ckpt = mge.load(args.checkpoint)
        ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        model.load_state_dict(ckpt, strict=False)

    if args.mode == "quantized":
        Q.quantize(model)

    # Define valid graph
    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build valid datasets
    logger.info("preparing dataset..")
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100, drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %f, %f", valid_acc, valid_acc5)
def worker(rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(M, args.arch)(pretrained=args.pretrained)
    model.train()
    start_epoch = 0
    if args.c is not None:
        file = mge.load(args.c)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(requires_grad=True),
        lr=args.lr,
        weight_decay=cfg.weight_decay,
    )

    # Build train datasets
    logger.info("preparing dataset..")
    train_dataset = COCOJoints(
        args.data_root,
        args.ann_file,
        image_set="train",
        order=("image", "keypoints", "boxes", "info"),
    )
    train_sampler = data.RandomSampler(train_dataset, batch_size=args.batch_size, drop_last=True)

    transforms = [T.Normalize(mean=cfg.IMG_MEAN, std=cfg.IMG_STD)]
    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body))
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0]))
    transforms += [
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]
    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(
            transforms=transforms,
            order=train_dataset.order,
        ),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thre,
            cfg.heat_kernel if args.multi_scale_supervision else cfg.heat_kernel[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, args.epochs):
        loss = train(model, train_queue, optimizer, args, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0:  # save checkpoint
            mge.save(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                },
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
def worker(rank, world_size, args):
    # pylint: disable=too-many-statements
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_finetune_config(args.arch)

    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training
    total_batch_size = cfg.BATCH_SIZE * world_size
    steps_per_epoch = 1280000 // total_batch_size
    total_steps = steps_per_epoch * cfg.EPOCHS

    # load calibration model
    assert args.checkpoint
    logger.info("Load pretrained weights from %s", args.checkpoint)
    ckpt = mge.load(args.checkpoint)
    ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
    model.load_state_dict(ckpt, strict=False)

    # Build valid datasets
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    # valid_dataset = ImageNetNoriDataset(args.data)
    valid_sampler = data.SequentialSampler(
        valid_dataset, batch_size=100, drop_last=False
    )
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose(
            [
                T.Resize(256),
                T.CenterCrop(224),
                T.Normalize(mean=128),
                T.ToMode("CHW"),
            ]
        ),
        num_workers=args.workers,
    )

    # calibration
    model.fc.disable_quantize()
    model = quantize_qat(model, qconfig=Q.calibration_qconfig)

    # calculate scale
    @jit.trace(symbolic=True)
    def calculate_scale(image, label):
        model.eval()
        enable_observer(model)
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # model.fc.disable_quantize()
    infer(calculate_scale, valid_queue, args)

    # quantized
    model = quantize(model)

    # eval quantized model
    @jit.trace(symbolic=True)
    def eval_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    _, valid_acc, valid_acc5 = infer(eval_func, valid_queue, args)
    logger.info("TEST %f, %f", valid_acc, valid_acc5)

    # save quantized model
    mge.save(
        {"step": -1, "state_dict": model.state_dict()},
        os.path.join(save_dir, "checkpoint-calibration.pkl")
    )
    logger.info("save in {}".format(os.path.join(save_dir, "checkpoint-calibration.pkl")))
def worker(rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    save_dir = os.path.join(args.save, args.arch)

    model = getattr(M, args.arch)()

    optimizer = optim.SGD(
        model.parameters(requires_grad=True),
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )
    scheduler = optim.MultiStepLR(optimizer, [30, 60, 80])

    # Define train and valid graph
    @jit.trace(symbolic=True)
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.RandomSampler(train_dataset, batch_size=args.batch_size, drop_last=True)
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([  # Baseline Augmentation for small models
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            T.Normalize(mean=[103.530, 116.280, 123.675],
                        std=[57.375, 57.120, 58.395]),  # BGR
            T.ToMode("CHW"),
        ]) if args.arch in ("resnet18", "resnet34")
        else T.Compose([  # Facebook Augmentation for large models
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            T.Lighting(0.1),
            T.Normalize(mean=[103.530, 116.280, 123.675],
                        std=[57.375, 57.120, 58.395]),  # BGR
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100, drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=[103.530, 116.280, 123.675],
                        std=[57.375, 57.120, 58.395]),  # BGR
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    # Start training
    top1_acc = 0
    for epoch in range(0, args.epochs):
        logger.info("Epoch %d LR %.3e", epoch, scheduler.get_lr()[0])
        _, train_acc, train_acc5 = train(train_func, train_queue, optimizer, args, epoch=epoch)
        logger.info("Epoch %d Train %.3f / %.3f", epoch, train_acc, train_acc5)
        _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args, epoch=epoch)
        logger.info("Epoch %d Valid %.3f / %.3f", epoch, valid_acc, valid_acc5)
        scheduler.step()

        if rank == 0:  # save checkpoint
            mge.save(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "accuracy": valid_acc,
                },
                os.path.join(save_dir, "checkpoint.pkl"),
            )
            if valid_acc > top1_acc:
                top1_acc = valid_acc
                shutil.copy(
                    os.path.join(save_dir, "checkpoint.pkl"),
                    os.path.join(save_dir, "model_best.pkl"),
                )
def worker(rank, world_size, args):
    # pylint: disable=too-many-statements
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_finetune_config(args.arch)

    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training
    total_batch_size = cfg.BATCH_SIZE * world_size
    steps_per_epoch = 1280000 // total_batch_size
    total_steps = steps_per_epoch * cfg.EPOCHS

    if args.mode != "normal":
        Q.quantize_qat(model, Q.ema_fakequant_qconfig)

    if args.checkpoint:
        logger.info("Load pretrained weights from %s", args.checkpoint)
        ckpt = mge.load(args.checkpoint)
        ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        model.load_state_dict(ckpt, strict=False)

    if args.mode == "quantized":
        raise ValueError("mode = quantized only used during inference")
        Q.quantize(model)

    optimizer = optim.SGD(
        get_parameters(model, cfg),
        lr=cfg.LEARNING_RATE,
        momentum=cfg.MOMENTUM,
    )

    # Define train and valid graph
    @jit.trace(symbolic=True)
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset, batch_size=cfg.BATCH_SIZE, drop_last=True))
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            cfg.COLOR_JITTOR,
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    train_queue = iter(train_queue)
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100, drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    def adjust_learning_rate(step, epoch):
        learning_rate = cfg.LEARNING_RATE
        if cfg.SCHEDULER == "Linear":
            learning_rate *= 1 - float(step) / total_steps
        elif cfg.SCHEDULER == "Multistep":
            learning_rate *= cfg.SCHEDULER_GAMMA ** bisect.bisect_right(cfg.SCHEDULER_STEPS, epoch)
        else:
            raise ValueError(cfg.SCHEDULER)
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate
        return learning_rate

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    for step in range(0, total_steps):
        # Linear learning rate decay
        epoch = step // steps_per_epoch
        learning_rate = adjust_learning_rate(step, epoch)

        image, label = next(train_queue)
        image = image.astype("float32")
        label = label.astype("int32")

        n = image.shape[0]
        optimizer.zero_grad()
        loss, acc1, acc5 = train_func(image, label)
        optimizer.step()

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()

        if step % args.report_freq == 0 and rank == 0:
            logger.info("TRAIN e%d %06d %f %s %s %s %s",
                        epoch, step, learning_rate, objs, top1, top5, total_time)
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()
        if step % 10000 == 0 and rank == 0:
            logger.info("SAVING %06d", step)
            mge.save(
                {"step": step, "state_dict": model.state_dict()},
                os.path.join(save_dir, "checkpoint.pkl"),
            )
        if step % 10000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)

    mge.save(
        {"step": step, "state_dict": model.state_dict()},
        os.path.join(save_dir, "checkpoint-final.pkl"))
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)
def worker(rank, slc):
    dist.init_process_group("localhost", port, world_size, rank, rank)
    m = ob.SyncMinMaxObserver()
    y = mge.tensor(x[slc])
    m(y)
    assert m.min_val == np_min and m.max_val == np_max
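# The observer test worker above closes over `port`, `world_size`, `x`,
# `np_min` and `np_max` from its enclosing test. A sketch of that context
# (an assumption, not the original test body): each rank observes only a
# slice of `x`, yet the synchronized observer should report the global
# min/max on every rank.
import multiprocessing as mp
import numpy as np

world_size = 2
port = 23333  # assumed free port; rank 0 would host a dist.Server on it
x = np.random.rand(6, 3).astype("float32")
np_min, np_max = x.min(), x.max()
slices = [slice(0, 3), slice(3, 6)]

procs = [mp.Process(target=worker, args=(rank, slices[rank])) for rank in range(world_size)]
for p in procs:
    p.start()
for p in procs:
    p.join()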
def worker(master_ip, port, rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(),
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order),
    ]
    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body)
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )
    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:  # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
def worker(master_ip, port, world_size, rank, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu{} done".format(rank))

    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            params_with_grad, callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)]
        )
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        weights = mge.load(args.weight_file)
        model.backbone.bottom_up.load_state_dict(weights, strict=False)
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch
            )
            mge.save(
                {"epoch": epoch, "state_dict": model.state_dict()}, save_path,
            )
            logger.info("dump weights to %s", save_path)
def worker(rank, world_size, ngpus_per_node, args):
    # pylint: disable=too-many-statements
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))

    # init process group
    if world_size > 1:
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info("init process group rank %d / %d", dist.get_rank(), dist.get_world_size())

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = 1280000 // (world_size * args.batch_size)

    # build model
    model = resnet_model.__dict__[args.arch]()

    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            loss = F.nn.cross_entropy(logits, label)
            acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
            gm.backward(loss)
            opt.step().clear_grad()
        return loss, acc1, acc5

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    # multi-step learning rate scheduler with warmup
    def adjust_learning_rate(step):
        lr = args.lr * 0.1 ** bisect.bisect_right(
            [30 * steps_per_epoch, 60 * steps_per_epoch, 80 * steps_per_epoch], step)
        if step < 5 * steps_per_epoch:  # warmup
            lr = args.lr * (step / (5 * steps_per_epoch))
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")

    for step in range(0, args.epochs * steps_per_epoch):
        lr = adjust_learning_rate(step)

        t = time.time()
        image, label = next(train_queue)
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")

        loss, acc1, acc5 = train_step(image, label)

        objs.update(loss.item())
        top1.update(100 * acc1.item())
        top5.update(100 * acc5.item())
        clck.update(time.time() - t)

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch %d Step %d, LR %.4f, %s %s %s %s",
                step // steps_per_epoch, step, lr, objs, top1, top5, clck,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            clck.reset()

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader, args)
            model.train()
            logging.info(
                "Epoch %d Test Acc@1 %.3f, Acc@5 %.3f",
                (step + 1) // steps_per_epoch, valid_acc1, valid_acc5,
            )
            megengine.save(
                {
                    "epoch": (step + 1) // steps_per_epoch,
                    "state_dict": model.state_dict(),
                },
                os.path.join(args.save, args.arch, "checkpoint.pkl"),
            )
def worker(rank, world_size, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )
        logger.info("Init process group done")

    logger.info("Prepare dataset")
    train_loader, epoch_size = build_dataloader(args.batch_size, args.dataset_dir)
    batch_iter = epoch_size // (args.batch_size * world_size)

    net = DeepLabV3Plus(class_num=cfg.NUM_CLASSES, pretrained=args.weight_file)
    base_lr = args.base_lr * world_size
    optimizer = optim.SGD(
        net.parameters(requires_grad=True),
        lr=base_lr,
        momentum=0.9,
        weight_decay=0.00004,
    )

    @jit.trace(symbolic=True, opt_level=2)
    def train_func(data, label, net=None, optimizer=None):
        net.train()
        pred = net(data)
        loss = softmax_cross_entropy(pred, label, ignore_index=cfg.IGNORE_INDEX)
        optimizer.backward(loss)
        return pred, loss

    begin_epoch = 0
    end_epoch = args.train_epochs
    if args.resume is not None:
        pretrained = mge.load(args.resume)
        begin_epoch = pretrained["epoch"] + 1
        net.load_state_dict(pretrained["state_dict"])
        logger.info("load success: epoch %d", begin_epoch)

    itr = begin_epoch * batch_iter
    max_itr = end_epoch * batch_iter

    image = mge.tensor(
        np.zeros([args.batch_size, 3, cfg.IMG_SIZE, cfg.IMG_SIZE]).astype(np.float32),
        dtype="float32",
    )
    label = mge.tensor(
        np.zeros([args.batch_size, cfg.IMG_SIZE, cfg.IMG_SIZE]).astype(np.int32),
        dtype="int32",
    )
    exp_name = os.path.abspath(os.path.dirname(__file__)).split("/")[-1]

    for epoch in range(begin_epoch, end_epoch):
        for i_batch, sample_batched in enumerate(train_loader):

            def adjust_lr(optimizer, itr, max_itr):
                now_lr = base_lr * (1 - itr / (max_itr + 1)) ** 0.9
                for param_group in optimizer.param_groups:
                    param_group["lr"] = now_lr
                return now_lr

            now_lr = adjust_lr(optimizer, itr, max_itr)
            inputs_batched, labels_batched = sample_batched
            labels_batched = np.squeeze(labels_batched, axis=1).astype(np.int32)
            image.set_value(inputs_batched)
            label.set_value(labels_batched)

            optimizer.zero_grad()
            _, loss = train_func(image, label, net=net, optimizer=optimizer)
            optimizer.step()

            running_loss = loss.numpy()[0]

            if rank == 0:
                logger.info(
                    "%s epoch:%d/%d\tbatch:%d/%d\titr:%d\tlr:%g\tloss:%g",
                    exp_name, epoch, end_epoch, i_batch, batch_iter,
                    itr + 1, now_lr, running_loss,
                )
            itr += 1

        if rank == 0:
            save_path = os.path.join(cfg.MODEL_SAVE_DIR, "epoch%d.pkl" % (epoch))
            mge.save({"epoch": epoch, "state_dict": net.state_dict()}, save_path)
            logger.info("save epoch%d", epoch)
def worker(rank, world_size, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    sys.path.insert(0, os.path.dirname(args.file))
    current_network = importlib.import_module(os.path.basename(args.file).split(".")[0])

    model = current_network.Net(current_network.Cfg(), batch_size=args.batch_size)
    params = model.parameters(requires_grad=True)
    model.train()

    if rank == 0:
        logger.info(get_config_info(model.cfg))

    opt = optim.SGD(
        params,
        lr=model.cfg.basic_lr * world_size * model.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay,
    )

    if args.weight_file is not None:
        weights = mge.load(args.weight_file)
        model.backbone.bottom_up.load_state_dict(weights)

    if rank == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(model.batch_size, args.dataset_dir, model.cfg))

    for epoch_id in range(model.cfg.max_epoch):
        for param_group in opt.param_groups:
            param_group["lr"] = (
                model.cfg.basic_lr * world_size * model.batch_size
                * (model.cfg.lr_decay_rate
                   ** bisect.bisect_right(model.cfg.lr_decay_stages, epoch_id))
            )

        tot_steps = model.cfg.nr_images_epoch // (model.batch_size * world_size)
        train_one_epoch(
            model, train_loader, opt, tot_steps, rank, epoch_id, world_size,
            args.enable_sublinear,
        )
        if rank == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch_id)
            mge.save(
                {"epoch": epoch_id, "state_dict": model.state_dict()},
                save_path,
            )
            logger.info("dump weights to %s", save_path)
def worker(rank, gpu_num, args):
    # enable sublinear memory optimization
    os.environ["MGB_COMP_GRAPH_OPT"] = "enable_sublinear_memory_opt=1;seq_opt.enable_seq_comp_node_opt=0"
    os.environ["MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER"] = '10'
    os.environ['MGB_CUDA_RESERVE_MEMORY'] = '1'

    # establish the server if this is the master process
    dist_port = args.port
    if rank == 0:
        dist.Server(port=dist_port)
    if gpu_num > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=dist_port,
            world_size=gpu_num,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    model = network.Network()
    params = model.parameters(requires_grad=True)
    model.train()

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=allreduce_cb,
    )

    opt = optim.SGD(
        params,
        lr=cfg.basic_lr * gpu_num * cfg.batch_per_gpu,
        momentum=cfg.momentum,
        weight_decay=cfg.weight_decay,
    )

    if cfg.pretrain_weight is not None:
        weights = mge.load(cfg.pretrain_weight)
        del weights['fc.weight']
        del weights['fc.bias']
        model.resnet50.load_state_dict(weights)

    start_epoch = 0
    if args.resume_weights is not None:
        assert osp.exists(args.resume_weights)
        model_file = args.resume_weights
        print('Loading {} to initialize FPN...'.format(model_file))
        model_dict = mge.load(model_file)
        start_epoch, weights = model_dict['epoch'] + 1, model_dict['state_dict']
        model.load_state_dict(weights, strict=False)

    logger.info("Prepare dataset")
    # train_loader = dataset.train_dataset(rank)
    train_dataset = CrowdHuman(cfg, if_train=True)
    train_sampler = data.Infinite(data.RandomSampler(
        train_dataset,
        batch_size=cfg.batch_per_gpu,
        drop_last=True,
        world_size=gpu_num,
        rank=rank,
    ))
    train_loader = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        collator=train_dataset,
        num_workers=4,
    )
    train_loader = iter(train_loader)

    logger.info("Training...")
    for epoch_id in range(start_epoch, cfg.max_epoch):
        for param_group in opt.param_groups:
            param_group["lr"] = (
                cfg.basic_lr * gpu_num * cfg.batch_per_gpu
                * (cfg.lr_decay_rate ** bisect.bisect_right(cfg.lr_decay_sates, epoch_id))
            )

        max_steps = cfg.nr_images_epoch // (cfg.batch_per_gpu * gpu_num)
        train_one_epoch(model, gm, train_loader, opt, max_steps, rank, epoch_id, gpu_num)
        if rank == 0:
            save_path = osp.join(cfg.model_dir, 'epoch-{}.pkl'.format(epoch_id + 1))
            state_dict = model.state_dict()
            names = [k for k, _ in state_dict.items()]
            for name in names:
                if name.startswith('inputs.'):
                    del state_dict[name]
            mge.save(
                {"epoch": epoch_id, "state_dict": state_dict},
                save_path,
            )
            logger.info("dump weights to %s", save_path)
def worker(rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip="localhost",
            master_port=23456,
            world_size=world_size,
            rank=rank,
            dev=rank,
        )

    save_dir = os.path.join(args.save, args.arch)

    model = getattr(M, args.arch)()

    optimizer = optim.SGD(
        get_parameters(model),
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # Define train and valid graph
    @jit.trace(symbolic=True)
    def train_func(image, label):
        model.train()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        optimizer.backward(loss)  # compute gradients
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "train_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "train_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "train_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    @jit.trace(symbolic=True)
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.cross_entropy_with_softmax(logits, label, label_smooth=0.1)
        acc1, acc5 = F.accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.all_reduce_sum(loss, "valid_loss") / dist.get_world_size()
            acc1 = dist.all_reduce_sum(acc1, "valid_acc1") / dist.get_world_size()
            acc5 = dist.all_reduce_sum(acc5, "valid_acc5") / dist.get_world_size()
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.RandomSampler(train_dataset, batch_size=args.batch_size, drop_last=True)
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            T.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4),
            T.Normalize(mean=[103.530, 116.280, 123.675],
                        std=[57.375, 57.120, 58.395]),  # BGR
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    train_queue = infinite_iter(train_queue)
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100, drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=[103.530, 116.280, 123.675],
                        std=[57.375, 57.120, 58.395]),  # BGR
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    for step in range(0, args.steps + 1250 + 1):
        # Linear learning rate decay
        decay = 1.0
        decay = 1 - float(step) / args.steps if step < args.steps else 0.0
        for param_group in optimizer.param_groups:
            param_group["lr"] = args.learning_rate * decay

        image, label = next(train_queue)
        image = image.astype("float32")
        label = label.astype("int32")

        n = image.shape[0]
        optimizer.zero_grad()
        loss, acc1, acc5 = train_func(image, label)
        optimizer.step()

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()

        if step % args.report_freq == 0 and rank == 0:
            logger.info(
                "TRAIN %06d %f %s %s %s %s",
                step, args.learning_rate * decay, objs, top1, top5, total_time,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()
        if step % 10000 == 0 and rank == 0:
            logger.info("SAVING %06d", step)
            mge.save(
                model.state_dict(),
                os.path.join(save_dir, "checkpoint-{:06d}.pkl".format(step)),
            )
        if step % 10000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)

    mge.save(model.state_dict(),
             os.path.join(save_dir, "checkpoint-{:06d}.pkl".format(step)))
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)
def worker(
    arch,
    model_file,
    data_root,
    ann_file,
    master_ip,
    port,
    rank,
    world_size,
    result_queue,
):
    """
    :param arch: network architecture name
    :param model_file: file of dumped weights
    :param data_root: the dataset directory
    :param ann_file: path to the annotation file
    :param master_ip: master address of the distributed group
    :param port: port used by the distributed group
    :param rank: the index of this worker
    :param world_size: number of gpus for evaluation
    :param result_queue: processing queue
    """
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        mge.device.set_default_device("gpu{}".format(rank))

    model = getattr(kpm, arch)()
    model.eval()
    weight = mge.load(model_file)
    weight = weight["state_dict"] if "state_dict" in weight.keys() else weight
    model.load_state_dict(weight)

    loader = build_dataloader(rank, world_size, data_root, ann_file)
    for data_dict in loader:
        img, bbox, info = data_dict

        fliped_img = img[:, :, :, ::-1] - np.zeros_like(img)
        data = np.concatenate([img, fliped_img], 0)
        data = np.ascontiguousarray(data).astype(np.float32)

        outs = model.predict(mge.tensor(data)).numpy()
        preds = outs[:img.shape[0]]
        preds_fliped = outs[img.shape[0]:, cfg.keypoint_flip_order, :, ::-1]
        preds = (preds + preds_fliped) / 2

        for i in range(preds.shape[0]):
            results = find_keypoints(preds[i], bbox[i, 0])
            final_score = float(results[:, -1].mean() * info[-1][i])
            image_id = int(info[-2][i])

            keypoints = results.copy()
            keypoints[:, -1] = 1
            keypoints = keypoints.reshape(-1,).tolist()

            instance = {
                "image_id": image_id,
                "category_id": 1,
                "score": final_score,
                "keypoints": keypoints,
            }
            result_queue.put(instance)
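# A sketch (assumed, not from the original script) of how the parent process
# might drain `result_queue` into a COCO-style result file once the workers
# are running; `total_instances` is hypothetical and would come from the
# dataset size.
import json

def collect_results(result_queue, total_instances, out_path="keypoint_results.json"):
    all_results = []
    for _ in range(total_instances):
        all_results.append(result_queue.get())  # blocks until a worker puts one
    with open(out_path, "w") as f:
        json.dump(all_results, f)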