Example #1
    def worker(max_err):
        net = MnistNet(has_bn=True)
        net.load_state_dict(checkpoint["net_init"])
        lr = checkpoint["sgd_lr"]
        opt = SGD(net.parameters(), lr=lr)

        gm = ad.GradManager().attach(
            net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
        )

        # use the same data and label on every GPU
        # so that the result does not depend on the number of GPUs
        data_train = Tensor(data)
        label_train = Tensor(label)

        loss = train(data_train, label_train, net, opt, gm)

        np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err)

        if dist.get_rank():
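            # non-zero ranks skip the comparison below; only rank 0 checks
            # the updated parameters against the reference checkpoint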
            return
        for param, param_ref in zip(
            net.state_dict().items(), checkpoint["net_updated"].items()
        ):
            assert param[0] == param_ref[0]
            if "bn" in param[0]:
                ref = param_ref[1].reshape(param[1].shape)
                np.testing.assert_allclose(param[1], ref, atol=max_err)
            else:
                np.testing.assert_allclose(param[1], param_ref[1], atol=max_err)
Example #2
def train_and_evaluate(model, manager):
    rank = dist.get_rank()

    # reload weights from restore_file if specified
    if args.restore_file is not None:
        manager.load_checkpoints()

    world_size = dist.get_world_size()
    if world_size > 1:
        dist.bcast_list_(model.parameters())
        dist.bcast_list_(model.buffers())

    gm = GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    for epoch in range(manager.params.num_epochs):
        # train for one epoch (one full pass over the training set)
        train(model, manager, gm)

        # Evaluate for one epoch on validation set
        evaluate(model, manager)

        # Save the best model weights according to params.major_metric
        if rank == 0:
            manager.check_best_save_last_checkpoints(latest_freq=5)
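The `train(model, manager, gm)` helper above is defined elsewhere in that project. As a hedged sketch (not the original implementation; `opt` and `criterion` are hypothetical names), its per-batch step under this GradManager would look roughly like:

    def train_batch(model, opt, gm, criterion, images, labels):
        with gm:                      # record the forward pass for autodiff
            logits = model(images)
            loss = criterion(logits, labels)
            gm.backward(loss)         # gradients are all-reduced by the attached "SUM" callback
        opt.step().clear_grad()       # apply the update, then reset gradients
        return loss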
Example #3
def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(params_with_grad,
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        # model.backbone.bottom_up.load_state_dict(weights, strict=False)
        logger.info("Loading Base-Pretrain weights...")
        weights = mge.load(args.weight_file)
        weight_new = {k: v for k, v in weights.items() if 'pred_' not in k}
        model.load_state_dict(weight_new, strict=False)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            save_path = "logs/{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict()
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
Example #4
def run_syncbn(trace_mode):
    x = F.ones([2, 16, 4, 4], dtype="float32")

    net = Sequential(
        Conv2d(16, 16, 1), SyncBatchNorm(16), Conv2d(16, 16, 1), SyncBatchNorm(16),
    )

    gm = ad.GradManager().attach(
        net.parameters(), callbacks=dist.make_allreduce_cb("MEAN")
    )
    opt = optimizer.SGD(net.parameters(), 1e-3)

    def train_func(x):
        with gm:
            y = net(x)
            loss = y.mean()
            gm.backward(loss)
            opt.step().clear_grad()
        return loss

    if trace_mode is not None:
        train_func = trace(train_func, symbolic=trace_mode)

    for _ in range(3):
        loss = train_func(x)
        loss.numpy()
Example #5
    def worker():
        net = Simple()
        opt = SGD(net.parameters(), lr=0.1)

        gm = ad.GradManager().attach(
            net.parameters(),
            callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)])

        opt.clear_grad()
        with gm:
            x = tensor(data)
            loss = net(x)
            loss = loss.sum()
            gm.backward(loss)
        for p in net.params:
            np.testing.assert_equal(p.grad.numpy(), 1)
Example #6
def update_model(model_path):
    """
    Update the dumped model with test cases for new reference values.

    The model with pre-trained weights is trained for one iteration on the test data attached to the checkpoint.
    The loss and the updated net state dict are then dumped.

    .. code-block:: python

        from test_dp_correctness import update_model
        update_model('mnist_model_with_test.mge') # for gpu
        update_model('mnist_model_with_test_cpu.mge') # for cpu

    """
    net = MnistNet(has_bn=True)
    checkpoint = mge.load(model_path)
    net.load_state_dict(checkpoint["net_init"])
    lr = checkpoint["sgd_lr"]
    opt = SGD(net.parameters(), lr=lr)

    gm = ad.GradManager().attach(
        net.parameters(),
        callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)])

    data = Tensor(checkpoint["data"], dtype=np.float32)
    label = Tensor(checkpoint["label"], dtype=np.int32)

    opt.clear_grad()
    loss = train(data, label, net=net, opt=opt)
    opt.step()

    xpu_name = get_xpu_name()

    checkpoint.update({
        "net_updated": net.state_dict(),
        "loss": loss.numpy(),
        "xpu": xpu_name
    })
    mge.serialization.save(checkpoint, model_path)
Example #7
    def worker():
        net = Simple(param_shape)
        opt = SGD(net.parameters(), lr=0.1)

        allreduce_cb = dist.make_allreduce_cb("MEAN", dist.WORLD)
        if threshold is not None:
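            # _param_pack_thd is assumed to be the size threshold used when
            # packing gradients into larger buffers before all-reduce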
            allreduce_cb._param_pack_thd = threshold
        gm = ad.GradManager().attach(net.parameters(), callbacks=[allreduce_cb])

        def run():
            opt.clear_grad()
            with gm:
                x = tensor(data)
                loss = net(x)
                loss = loss.sum()
                gm.backward(loss)

        for i in range(n_iters):
            run()

        for p in net.params:
            np.testing.assert_equal(p.grad.numpy(), np.ones_like(p.grad.numpy()))
Example #8
def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    backbone_params = []
    head_params = []
    for name, param in model.named_parameters():
        if "backbone" in name:
            backbone_params.append(param)
        else:
            head_params.append(param)

    opt = SGD(
        [
            {
                "params": backbone_params,
                "lr": model.cfg.learning_rate * 0.1
            },
            {
                "params": head_params
            },
        ],
        lr=model.cfg.learning_rate,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(model.parameters(),
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(model.parameters())

    cur_epoch = 0
    if args.resume is not None:
        pretrained = mge.load(args.resume)
        cur_epoch = pretrained["epoch"] + 1
        model.load_state_dict(pretrained["state_dict"])
        opt.load_state_dict(pretrained["opt"])
        if dist.get_rank() == 0:
            logger.info("load success: epoch %d", cur_epoch)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(model.cfg.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(cur_epoch, model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch)
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict(),
                    "opt": opt.state_dict()
                }, save_path)
            logger.info("dump weights to %s", save_path)
Example #9
def build_gradmanager(module):
    world_size = dist.get_world_size()
    gm = GradManager().attach(
        module.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None)
    return gm
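A hedged usage sketch for `build_gradmanager`; the tiny model and random data below are hypothetical and only show how the returned GradManager is driven together with an optimizer:

    import numpy as np
    import megengine.functional as F
    import megengine.module as M
    import megengine.optimizer as optim
    from megengine import Tensor

    class TinyNet(M.Module):  # toy model, for illustration only
        def __init__(self):
            super().__init__()
            self.fc = M.Linear(16, 4)

        def forward(self, x):
            return self.fc(x)

    net = TinyNet()
    gm = build_gradmanager(net)  # all-reduce callback is attached only when world_size > 1
    opt = optim.SGD(net.parameters(), lr=0.01)

    x = Tensor(np.random.randn(8, 16).astype("float32"))
    y = Tensor(np.random.randint(0, 4, size=(8,)).astype("int32"))
    with gm:
        loss = F.nn.cross_entropy(net(x), y)
        gm.backward(loss)
    opt.step().clear_grad()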
Example #10
def worker(rank, world_size, ngpus_per_node, args):
    # pylint: disable=too-many-statements
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(
            os.path.join(args.save, args.arch, "log.txt"))
    # init process group
    if world_size > 1:
        dist.init_process_group(
            master_ip=args.dist_addr,
            port=args.dist_port,
            world_size=world_size,
            rank=rank,
            device=rank % ngpus_per_node,
            backend="nccl",
        )
        logging.info("init process group rank %d / %d", dist.get_rank(),
                     dist.get_world_size())

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = 1280000 // (world_size * args.batch_size)

    # build model
    model = resnet_model.__dict__[args.arch]()

    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.SGD(
        model.parameters(),
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            loss = F.nn.cross_entropy(logits, label)
            acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
            gm.backward(loss)
            opt.step().clear_grad()
        return loss, acc1, acc5

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if world_size > 1:
            loss = F.distributed.all_reduce_sum(loss) / world_size
            acc1 = F.distributed.all_reduce_sum(acc1) / world_size
            acc5 = F.distributed.all_reduce_sum(acc5) / world_size
        return loss, acc1, acc5

    # multi-step learning rate scheduler with warmup
    def adjust_learning_rate(step):
        lr = args.lr * 0.1**bisect.bisect_right(
            [30 * steps_per_epoch, 60 * steps_per_epoch, 80 * steps_per_epoch],
            step)
        if step < 5 * steps_per_epoch:  # warmup
            lr = args.lr * (step / (5 * steps_per_epoch))
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")

    for step in range(0, args.epochs * steps_per_epoch):
        lr = adjust_learning_rate(step)

        t = time.time()

        image, label = next(train_queue)
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")

        loss, acc1, acc5 = train_step(image, label)

        objs.update(loss.item())
        top1.update(100 * acc1.item())
        top5.update(100 * acc5.item())
        clck.update(time.time() - t)

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch %d Step %d, LR %.4f, %s %s %s %s",
                step // steps_per_epoch,
                step,
                lr,
                objs,
                top1,
                top5,
                clck,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            clck.reset()

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader,
                                              args)
            model.train()
            logging.info(
                "Epoch %d Test Acc@1 %.3f, Acc@5 %.3f",
                (step + 1) // steps_per_epoch,
                valid_acc1,
                valid_acc5,
            )
            megengine.save(
                {
                    "epoch": (step + 1) // steps_per_epoch,
                    "state_dict": model.state_dict(),
                },
                os.path.join(args.save, args.arch, "checkpoint.pkl"),
            )
Example #11
def worker(master_ip, port, rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]

    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )

    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order,),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:  # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
Example #12
def worker(master_ip, port, world_size, rank, configs):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("init process group for gpu{} done".format(rank))

    # set up logger
    os.makedirs(configs["base_dir"], exist_ok=True)
    worklog_path = os.path.join(configs["base_dir"], "worklog.txt")
    mge.set_log_file(worklog_path)

    # prepare model-related components
    model = FaceRecognitionModel(configs)

    # prepare data-related components
    preprocess = T.Compose([T.Normalize(mean=127.5, std=128), T.ToMode("CHW")])
    augment = T.Compose([T.RandomHorizontalFlip()])

    train_dataset = get_train_dataset(configs["dataset"],
                                      dataset_dir=configs["dataset_dir"])
    train_sampler = data.RandomSampler(train_dataset,
                                       batch_size=configs["batch_size"],
                                       drop_last=True)
    train_queue = data.DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  transform=T.Compose([augment, preprocess]))

    # prepare optimize-related components
    configs["learning_rate"] = configs["learning_rate"] * dist.get_world_size()
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters())
        gm = ad.GradManager().attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("mean")])
    else:
        gm = ad.GradManager().attach(model.parameters())
    opt = optim.SGD(
        model.parameters(),
        lr=configs["learning_rate"],
        momentum=configs["momentum"],
        weight_decay=configs["weight_decay"],
    )

    # try to load checkpoint
    model, start_epoch = try_load_latest_checkpoint(model, configs["base_dir"])

    # do training
    def train_one_epoch():
        def train_func(images, labels):
            opt.clear_grad()
            with gm:
                loss, accuracy, _ = model(images, labels)
                gm.backward(loss)
                if dist.is_distributed():
                    # all_reduce_mean
                    loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
                    accuracy = dist.functional.all_reduce_sum(accuracy) / dist.get_world_size()
            opt.step()
            return loss, accuracy

        model.train()

        average_loss = AverageMeter("loss")
        average_accuracy = AverageMeter("accuracy")
        data_time = AverageMeter("data_time")
        train_time = AverageMeter("train_time")

        total_step = len(train_queue)
        data_iter = iter(train_queue)
        for step in range(total_step):
            # get next batch of data
            data_tic = time.time()
            images, labels = next(data_iter)
            data_toc = time.time()

            # forward pass & backward pass
            train_tic = time.time()
            images = mge.tensor(images, dtype="float32")
            labels = mge.tensor(labels, dtype="int32")
            loss, accuracy = train_func(images, labels)
            train_toc = time.time()

            # do the statistics and logging
            n = images.shape[0]
            average_loss.update(loss.item(), n)
            average_accuracy.update(accuracy.item() * 100, n)
            data_time.update(data_toc - data_tic)
            train_time.update(train_toc - train_tic)
            if step % configs["log_interval"] == 0 and dist.get_rank() == 0:
                logger.info(
                    "epoch: %d, step: %d, %s, %s, %s, %s",
                    epoch,
                    step,
                    average_loss,
                    average_accuracy,
                    data_time,
                    train_time,
                )

    for epoch in range(start_epoch, configs["num_epoch"]):
        adjust_learning_rate(opt, epoch, configs)
        train_one_epoch()

        if dist.get_rank() == 0:
            checkpoint_path = os.path.join(configs["base_dir"],
                                           f"epoch-{epoch+1}-checkpoint.pkl")
            mge.save(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict()
                },
                checkpoint_path,
            )
Example #13
def worker(world_size, args):
    # pylint: disable=too-many-statements

    rank = dist.get_rank()
    if world_size > 1:
        logger.info("init distributed process group {} / {}".format(
            rank, world_size))

    save_dir = os.path.join(args.save, args.arch + "." + args.mode)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    model = models.__dict__[args.arch]()
    cfg = config.get_config(args.arch)

    cfg.LEARNING_RATE *= world_size  # scale learning rate in distributed training
    total_batch_size = cfg.BATCH_SIZE * world_size
    steps_per_epoch = 1280000 // total_batch_size
    total_steps = steps_per_epoch * cfg.EPOCHS

    if args.mode != "normal":
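        # convert the model for quantization-aware training (fake-quant / observer
        # modules inserted according to the EMA qconfig)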
        quantize_qat(model, qconfig=Q.ema_fakequant_qconfig)

    if world_size > 1:
        # Sync parameters
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("MEAN") if world_size > 1 else None,
    )

    optimizer = optim.SGD(
        get_parameters(model, cfg),
        lr=cfg.LEARNING_RATE,
        momentum=cfg.MOMENTUM,
    )

    # Define train and valid graph
    def train_func(image, label):
        with gm:
            model.train()
            logits = model(image)
            loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
            acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
            gm.backward(loss)
            optimizer.step().clear_grad()
        return loss, acc1, acc5

    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        return loss, acc1, acc5

    # Build train and valid datasets
    logger.info("preparing dataset..")
    train_dataset = data.dataset.ImageNet(args.data, train=True)
    train_sampler = data.Infinite(
        data.RandomSampler(train_dataset,
                           batch_size=cfg.BATCH_SIZE,
                           drop_last=True))
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        transform=T.Compose([
            T.RandomResizedCrop(224),
            T.RandomHorizontalFlip(),
            cfg.COLOR_JITTOR,
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )
    train_queue = iter(train_queue)
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset,
                                           batch_size=100,
                                           drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=128),
            T.ToMode("CHW")
        ]),
        num_workers=args.workers,
    )

    def adjust_learning_rate(step, epoch):
        learning_rate = cfg.LEARNING_RATE
        if cfg.SCHEDULER == "Linear":
            learning_rate *= 1 - float(step) / total_steps
        elif cfg.SCHEDULER == "Multistep":
            learning_rate *= cfg.SCHEDULER_GAMMA**bisect.bisect_right(
                cfg.SCHEDULER_STEPS, epoch)
        else:
            raise ValueError(cfg.SCHEDULER)
        for param_group in optimizer.param_groups:
            param_group["lr"] = learning_rate
        return learning_rate

    # Start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    for step in range(0, total_steps):
        # Linear learning rate decay
        epoch = step // steps_per_epoch
        learning_rate = adjust_learning_rate(step, epoch)

        image, label = next(train_queue)
        image = mge.tensor(image, dtype="float32")
        label = mge.tensor(label, dtype="int32")

        n = image.shape[0]

        loss, acc1, acc5 = train_func(image, label)

        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        objs.update(loss.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()
        if step % args.report_freq == 0 and rank == 0:
            logger.info(
                "TRAIN e%d %06d %f %s %s %s %s",
                epoch,
                step,
                learning_rate,
                objs,
                top1,
                top5,
                total_time,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            total_time.reset()
        if step != 0 and step % 10000 == 0 and rank == 0:
            logger.info("SAVING %06d", step)
            mge.save(
                {
                    "step": step,
                    "state_dict": model.state_dict()
                },
                os.path.join(save_dir, "checkpoint.pkl"),
            )
        if step % 10000 == 0 and step != 0:
            _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
            logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)

    mge.save(
        {
            "step": step,
            "state_dict": model.state_dict()
        },
        os.path.join(save_dir, "checkpoint-final.pkl"),
    )
    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    logger.info("TEST %06d %f, %f", step, valid_acc, valid_acc5)
Example #14
def worker(args):
    # pylint: disable=too-many-statements
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    if rank == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(os.path.join(args.save, args.arch, "log.txt"))
    # the process group is assumed to be initialized by the launcher before this worker runs

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = args.steps_per_epoch

    # build model
    model = UNetD(3)
    # Sync parameters
    if world_size > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    # Optimizer
    opt = optim.Adam(
        model.parameters(),
        lr=args.lr,
        weight_decay=args.weight_decay * world_size,  # scale weight decay in "SUM" mode
    )

    # mixup
    def preprocess(image, label):
        if args.dnd:
            image, label = MixUp_AUG(image, label)
        return image, label

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            logits = image - logits
            loss = F.nn.l1_loss(logits, label)
            gm.backward(loss)
            opt.step().clear_grad()
        return loss

    def valid_step(image, label):
        pred = model(image)
        pred = image - pred
        mae_iter = F.nn.l1_loss(pred, label)
        psnr_it = batch_PSNR(pred, label)
        #print(psnr_it.item())
        if world_size > 1:
            mae_iter = F.distributed.all_reduce_sum(mae_iter) / world_size
            psnr_it = F.distributed.all_reduce_sum(psnr_it) / world_size

        return mae_iter, psnr_it

    # multi-step learning rate scheduler with warmup
    def adjust_learning_rate(step):
        #lr = 1e-6 + 0.5 * (args.lr - 1e-6)*(1 + np.cos(step/(args.epochs*steps_per_epoch) * np.pi))
        lr = args.lr * (np.cos(step / (steps_per_epoch * args.epochs) * np.pi) + 1) / 2
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    for step in range(0, int(args.epochs * steps_per_epoch)):
        #print(step)
        lr = adjust_learning_rate(step)

        t_step = time.time()

        image, label = next(train_queue)
        if step > steps_per_epoch:
            image, label = preprocess(image, label)
        image = megengine.tensor(image)
        label = megengine.tensor(label)
        t_data = time.time() - t_step
        loss = train_step(image, label)
        t_train = time.time() - t_step
        speed = 1. / t_train
        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch {} Step {}, Speed={:.2g} mb/s, dp_cost={:.2g}, Loss={:5.2e}, lr={:.2e}".format(
                    step // int(steps_per_epoch),
                    step,
                    speed,
                    t_data / t_train,
                    loss.item(),
                    lr,
                )
            )
        #print(steps_per_epoch)
        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            loss, psnr_v = valid(valid_step, valid_dataloader)
            model.train()
            logging.info(
                "Epoch {} Test mae {:.3f}, psnr {:.3f}".format(
                    (step + 1) // steps_per_epoch,
                    loss.item(),
                    psnr_v.item(),
                )
            )
            if rank == 0:
                megengine.save(
                    {
                        "epoch": (step + 1) // steps_per_epoch,
                        "state_dict": model.state_dict(),
                    },
                    os.path.join(args.save, args.arch, "checkpoint.pkl"),
                )
Example #15
def worker(master_ip, port, world_size, rank, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu{} done".format(rank))

    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(params_with_grad,
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        weights = mge.load(args.weight_file)
        model.backbone.bottom_up.load_state_dict(weights, strict=False)
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            # save_path = "log-of-{}/epoch_{}.pkl".format(
            #     os.path.basename(args.file).split(".")[0], epoch
            # )
            save_path = os.path.join(args.log_dir,
                                     "epoch_{}.pkl".format(epoch))
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict()
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
Example #16
def worker(args):
    # pylint: disable=too-many-statements
    if dist.get_rank() == 0:
        os.makedirs(os.path.join(args.save, args.arch), exist_ok=True)
        megengine.logger.set_log_file(
            os.path.join(args.save, args.arch, "log.txt"))

    # build dataset
    train_dataloader, valid_dataloader = build_dataset(args)
    train_queue = iter(train_dataloader)  # infinite
    steps_per_epoch = 1280000 // (dist.get_world_size() * args.batch_size)

    # build model
    model = snet_model.__dict__[args.arch]()

    # Sync parameters and buffers
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters())
        dist.bcast_list_(model.buffers())

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("mean")
        if dist.get_world_size() > 1 else None,
    )

    # Optimizer
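    # apply weight decay only to multi-dimensional "weight" tensors;
    # biases and BN parameters go into the zero-weight-decay group below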
    params_wd = []
    params_nwd = []
    for n, p in model.named_parameters():
        if n.find("weight") >= 0 and len(p.shape) > 1:
            print("include ", n, p.shape)
            params_wd.append(p)
        else:
            print("NOT include ", n, p.shape)
            params_nwd.append(p)
    opt = optim.SGD(
        [
            {
                "params": params_wd
            },
            {
                "params": params_nwd,
                "weight_decay": 0
            },
        ],
        lr=args.lr * dist.get_world_size(),
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    # train and valid func
    def train_step(image, label):
        with gm:
            logits = model(image)
            loss = F.nn.cross_entropy(logits, label, label_smooth=0.1)
            acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
            gm.backward(loss)
            opt.step().clear_grad()
        return loss, acc1, acc5

    def valid_step(image, label):
        logits = model(image)
        loss = F.nn.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, topk=(1, 5))
        # calculate mean values
        if dist.get_world_size() > 1:
            loss = F.distributed.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = F.distributed.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = F.distributed.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    # linear learning rate scheduler
    def adjust_learning_rate(step):
        lr = args.lr * (1 - step / (args.epochs * steps_per_epoch))
        for param_group in opt.param_groups:
            param_group["lr"] = lr
        return lr

    # start training
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")

    for step in range(0, args.epochs * steps_per_epoch):
        lr = adjust_learning_rate(step)

        t = time.time()

        image, label = next(train_queue)
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")

        loss, acc1, acc5 = train_step(image, label)

        objs.update(loss.item())
        top1.update(100 * acc1.item())
        top5.update(100 * acc5.item())
        clck.update(time.time() - t)

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info(
                "Epoch %d Step %d, LR %.4f, %s %s %s %s",
                step // steps_per_epoch,
                step,
                lr,
                objs,
                top1,
                top5,
                clck,
            )
            objs.reset()
            top1.reset()
            top5.reset()
            clck.reset()

        if (step + 1) % steps_per_epoch == 0:
            model.eval()
            _, valid_acc1, valid_acc5 = valid(valid_step, valid_dataloader,
                                              args)
            model.train()
            logging.info(
                "Epoch %d Test Acc@1 %.3f, Acc@5 %.3f",
                (step + 1) // steps_per_epoch,
                valid_acc1,
                valid_acc5,
            )
            if dist.get_rank() == 0:
                megengine.save(
                    {
                        "epoch": (step + 1) // steps_per_epoch,
                        "state_dict": model.state_dict(),
                    },
                    os.path.join(args.save, args.arch, "checkpoint.pkl"),
                )