Example 1
def main():
    # is_chief indicates that this process handles shared tasks for the cluster,
    # such as logging and checkpointing.
    # is_chief must be true for at most one process in the training cluster.
    # RANK is set by torch.distributed.launch:
    # https://github.com/pytorch/pytorch/blob/db6e4576dab097abf01d032c3326e4b285eb8499/torch/distributed/launch.py#L193
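    # A hedged example of how such a script is typically launched (script name,
    # flags and addresses below are placeholders, not taken from this code):
    #   python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 \
    #       --master_addr=10.0.0.1 --master_port=6006 train_imagenet.py ...
    # The launcher exports RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT for each
    # spawned process and passes --local_rank on the command line.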
    global is_chief, event_writer, global_example_count, last_recv_bytes, last_transmit_bytes, last_log_time

    is_chief = (not args.distributed) or (int(os.environ['RANK'])==0)

    global_example_count = 0
    if is_chief:
      print(f"Logging to {args.logdir}")
      event_writer = SummaryWriter(args.logdir)
      log_tb("first", time.time())
    else:
      event_writer = NoOp()

    # baseline number for network bytes
    last_recv_bytes, last_transmit_bytes = network_bytes()
    last_log_time = time.time()
    
    print(args)
    print("~~epoch\thours\ttop1Accuracy\n")

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data+'/validation')
    
    global reduce_function
    if args.c10d:
        print('Distributed: loading c10d process group')
        # https://github.com/pytorch/pytorch/blob/master/torch/lib/c10d/TCPStore.hpp
        torch.cuda.set_device(args.local_rank)
        rank = int(os.environ['RANK'])
        store = c10d.TCPStore(os.environ['MASTER_ADDR'], int(os.environ['MASTER_PORT']), rank==0) # (masterAddr, masterPort, isServer) 
        process_group = c10d.ProcessGroupNCCL(store, rank, args.world_size) # (store, rank, size)
        reduce_function = lambda t: process_group.allreduce(t, c10d.AllreduceOptions().reduceOp)
    elif args.distributed:
        print('Distributed: initializing process group')
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
        assert(args.world_size == dist.get_world_size())
        reduce_function = lambda t: dist.all_reduce(t, op=dist.reduce_op.SUM)
        print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

    if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    print("Loading model")
    if args.factorized_resnet: model = resnet.resnet50factorized(pretrained=args.pretrained)
    else: model = resnet.resnet50(pretrained=args.pretrained)

    model = model.cuda()
    if args.init_bn0: resnet.init_dist_weights(model) # zero-initializes BatchNorm scale (gamma), the 'bn0' init
    if args.fp16: model = network_to_half(model)
    best_prec5 = 93 # only save models with top-5 above this threshold; otherwise training pauses to write a checkpoint every epoch

    # Load model weights from checkpoint. This must happen before the DistributedDataParallel wrapper is applied, since the model is saved without it
    if args.resume:
        checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_prec5 = checkpoint['best_prec5']

    if args.c10d:
        model = distributed_c10d._DistributedDataParallelC10d(model, process_group, device_ids=[args.local_rank], output_device=args.local_rank)
        c10d_sanity_check()
    elif args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    global model_params, master_params
    if args.fp16: model_params, master_params = prep_param_lists(model)
    else: model_params = master_params = model.parameters()
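    # Note: with fp16, prep_param_lists keeps an fp32 "master" copy of every fp16
    # model parameter; the optimizer updates the fp32 copies, and the results are
    # copied back into the fp16 weights each step (standard mixed-precision
    # bookkeeping, presumably handled inside train(), which is not shown here).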

    optim_params = experimental_utils.bnwd_optim_params(model, model_params, master_params) if args.no_bn_wd else master_params

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(optim_params, 0, momentum=args.momentum, weight_decay=args.weight_decay) # start with 0 lr. Scheduler will change this later
    if args.resume: # we must resume optimizer params separately
        checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.local_rank))
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Build the data manager and LR scheduler from the phase schedule
    phases = eval(args.phases)
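    # args.phases is expected to be a Python literal along these (hypothetical) lines;
    # entries carrying 'bs'/'sz' feed the DataManager and entries carrying 'lr' feed
    # the Scheduler below:
    #   "[{'ep': 0, 'sz': 128, 'bs': 256},
    #     {'ep': (0, 5), 'lr': (0.1, 0.2)},
    #     {'ep': 5, 'lr': 0.1}]"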
    print("Creating data loaders (this could take 6-12 minutes)")
    dm = DataManager([p for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer, [p for p in phases if 'lr' in p], args.scale_lr)

    start_time = datetime.now() # start the clock only after everything is loaded
    if args.evaluate: return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        print('Syncing machines before training')
        sum_tensor(torch.tensor([1.0]).float().cuda())
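        # all-reducing a dummy tensor acts as a barrier: every worker blocks here
        # until the whole cluster is ready, so the timed training loop starts together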

    print("Begin training")
    estart = time.time()
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        estart = time.time()
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        if args.prof: break
        prec5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        is_best = prec5 > best_prec5
        best_prec5 = max(prec5, best_prec5)
        if args.local_rank == 0:
            if is_best: save_checkpoint(epoch, model, best_prec5, optimizer, is_best=True, filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase: save_checkpoint(epoch, model, best_prec5, optimizer, filename=f'sz{phase["bs"]}_checkpoint.path.tar')

    event_writer.export_scalars_to_json(args.logdir+'/scalars.json')
    event_writer.close()
Example 2
def main():
    # os.system('sudo shutdown -c')  # cancel previous shutdown command
    log.console(args)
    tb.log('sizes/world', dist_utils.env_world_size())

    print(args.data)
    assert os.path.exists(args.data)

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/val')

    if args.distributed:
        log.console('Distributed initializing process group')
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=dist_utils.env_world_size())
        assert (dist_utils.env_world_size() == dist.get_world_size())
        # todo(y): use global_rank instead of local_rank here
        log.console("Distributed: success (%d/%d)" %
                    (args.local_rank, dist.get_world_size()))

    log.console("Loading model")
    #from mobilenetv3 import MobileNetV3
    #model = MobileNetV3(mode='small', num_classes=1000).cuda()
    if args.network == 'resnet50':
        model = resnet.resnet50(bn0=args.init_bn0).cuda()
    elif args.network == 'resnet50friendlyv1':
        model = resnet.resnet50friendly(bn0=args.init_bn0, hybrid=True).cuda()
    elif args.network == 'resnet50friendlyv2':
        model = resnet.resnet50friendly2(bn0=args.init_bn0, hybrid=True).cuda()
    elif args.network == 'resnet50friendlyv3':
        model = resnet.resnet50friendly3(bn0=args.init_bn0, hybrid=True).cuda()
    elif args.network == 'resnet50friendlyv4':
        model = resnet.resnet50friendly4(bn0=args.init_bn0, hybrid=True).cuda()
    #import resnet_friendly
    #model = resnet_friendly.ResNet50Friendly().cuda()
    #model = torchvision.models.mobilenet_v2(pretrained=False).cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = dist_utils.DDP(model,
                               device_ids=[args.local_rank],
                               output_device=args.local_rank)
    best_top5 = 93  # only save models with top-5 above 93%; otherwise training pauses to write a checkpoint every epoch

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        model_params = master_params = model.parameters()

    optim_params = experimental_utils.bnwd_optim_params(
        model, model_params, master_params) if args.no_bn_wd else master_params

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        optim_params,
        0,
        momentum=args.momentum,
        weight_decay=args.weight_decay
    )  # start with 0 lr. Scheduler will change this later

    if args.resume:
        checkpoint = torch.load(
            args.resume,
            map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        current_phase = checkpoint['current_phase']
        best_top5 = checkpoint['best_top5']
        optimizer.load_state_dict(checkpoint['optimizer'])

    # save script so we can reproduce from logs
    shutil.copy2(os.path.realpath(__file__), f'{args.logdir}')

    log.console(
        "Creating data loaders (this could take up to 10 minutes if volume needs to be warmed up)"
    )
    # phases = util.text_unpickle(args.phases)
    lr = 0.9
    # LR scale factors are batch-size ratios relative to the base bs of 512;
    # the names refer to the image size of the corresponding phase
    # (the sz=288 phase runs with bs=128, hence 128 / 512)
    scale_224 = 224 / 512
    scale_288 = 128 / 512
    one_machine = [
        {'ep': 0, 'sz': 128, 'bs': 512, 'trndir': ''},  # Will this work? -- No idea! Should we try with mv2 baseline?
        {'ep': (0, 5), 'lr': (lr, lr * 2)},  # lr warmup is better with --init-bn0
        {'ep': 5, 'lr': lr},
        {'ep': 14, 'sz': 224, 'bs': 224, 'lr': lr * scale_224},
        {'ep': 16, 'lr': lr / 10 * scale_224},
        {'ep': 32, 'lr': lr / 100 * scale_224},
        {'ep': 37, 'lr': lr / 100 * scale_224},
        {'ep': 39, 'sz': 288, 'bs': 128, 'min_scale': 0.5, 'rect_val': True, 'lr': lr / 100 * scale_288},
        {'ep': (40, 44), 'lr': lr / 1000 * scale_288},
        #{'ep': (36, 40), 'lr': lr / 1000 * scale_288},
        {'ep': (45, 48), 'lr': lr / 10000 * scale_288},
        {'ep': (49, 52), 'sz': 288, 'bs': 224, 'lr': lr / 10000 * scale_224},
        #{'ep': (46, 50), 'sz': 320, 'bs': 64,  'lr': lr / 10000 * scale_320}
    ]
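    # Sanity check on the scaling above (linear LR scaling in the batch size,
    # base bs = 512): at ep 14 the batch size drops to 224, so the LR becomes
    # 0.9 * 224 / 512 = 0.39375; at ep 39 (sz 288, bs 128) it is
    # 0.9 / 100 * 128 / 512 = 0.00225.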
    # round-trip through text_pickle/text_unpickle to exercise the same path
    # phases normally take via args.phases (see the commented line above)
    phases = util.text_pickle(one_machine)
    phases = util.text_unpickle(phases)
    dm = DataManager([copy.deepcopy(p) for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer,
                          [copy.deepcopy(p) for p in phases if 'lr' in p])

    start_time = datetime.now()  # start the clock only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        log.console('Syncing machines before training')
        dist_utils.sum_tensor(torch.tensor([1.0]).float().cuda())

    log.event("~~epoch\thours\ttop1\ttop5\n")
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        print("Start epoch:", args.start_epoch)
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        top1, top5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        time_diff = (datetime.now() - start_time).total_seconds() / 3600.0
        log.event(f'~~{epoch}\t{time_diff:.5f}\t\t{top1:.3f}\t\t{top5:.3f}\n')

        is_best = top5 > best_top5
        best_top5 = max(top5, best_top5)
        phase_save = dm.get_phase(epoch)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(phase_save,
                                epoch,
                                model,
                                best_top5,
                                optimizer,
                                is_best=True,
                                filename='model_best_' + args.network +
                                args.name + '.pth.tar')
            else:
                save_checkpoint(phase_save,
                                epoch,
                                model,
                                top5,
                                optimizer,
                                is_best=False,
                                filename='model_epoch_latest_' + args.network +
                                args.name + '.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(
                    phase_save,
                    epoch,
                    model,
                    best_top5,
                    optimizer,
                    filename=f'sz{phase["bs"]}_checkpoint.path.tar')
Example 3
def main():
    os.system('shutdown -c')  # cancel previous shutdown command
    log.console(args)
    tb.log('sizes/world', dist_utils.env_world_size())

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/validation')

    if args.distributed:
        log.console('Distributed initializing process group')
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=dist_utils.env_world_size())
        assert (dist_utils.env_world_size() == dist.get_world_size())
        log.console("Distributed: success (%d/%d)" %
                    (args.local_rank, dist.get_world_size()))

    log.console("Loading model")
    model = resnet.resnet50(bn0=args.init_bn0).cuda()
    if args.fp16: model = network_to_half(model)
    if args.distributed:
        model = dist_utils.DDP(model,
                               device_ids=[args.local_rank],
                               output_device=args.local_rank)
    best_top5 = 93  # only save models with top-5 above 93%; otherwise training pauses to write a checkpoint every epoch

    global model_params, master_params
    if args.fp16: model_params, master_params = prep_param_lists(model)
    else: model_params = master_params = model.parameters()

    optim_params = experimental_utils.bnwd_optim_params(
        model, model_params, master_params) if args.no_bn_wd else master_params

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        optim_params,
        0,
        momentum=args.momentum,
        weight_decay=args.weight_decay
    )  # start with 0 lr. Scheduler will change this later

    if args.resume:
        checkpoint = torch.load(
            args.resume,
            map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top5 = checkpoint['best_top5']
        optimizer.load_state_dict(checkpoint['optimizer'])

    # save script so we can reproduce from logs
    shutil.copy2(os.path.realpath(__file__), f'{args.logdir}')

    log.console(
        "Creating data loaders (this could take up to 10 minutes if volume needs to be warmed up)"
    )
    phases = eval(args.phases)
    dm = DataManager([copy.deepcopy(p) for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer,
                          [copy.deepcopy(p) for p in phases if 'lr' in p])

    start_time = datetime.now()  # start the clock only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        log.console('Syncing machines before training')
        dist_utils.sum_tensor(torch.tensor([1.0]).float().cuda())

    log.event("~~epoch\thours\ttop1\ttop5\n")
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        top1, top5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        time_diff = (datetime.now() - start_time).total_seconds() / 3600.0
        log.event(f'~~{epoch}\t{time_diff:.5f}\t\t{top1:.3f}\t\t{top5:.3f}\n')

        is_best = top5 > best_top5
        best_top5 = max(top5, best_top5)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(epoch,
                                model,
                                best_top5,
                                optimizer,
                                is_best=True,
                                filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(
                    epoch,
                    model,
                    best_top5,
                    optimizer,
                    filename=f'sz{phase["bs"]}_checkpoint.path.tar')
Example 4
def main():
    # os.system('shutdown -c')  # cancel previous shutdown command
    log.console(args)
    tb.log('sizes/world', bps.size())

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/validation')

    # if args.distributed:
    # log.console('Distributed initializing process group')
    torch.cuda.set_device(bps.local_rank())
    print(f'cuda device set to {bps.local_rank()}')
    log.console("cuda initialized (rank=%d)" % (bps.local_rank()))
    # dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=bps.size())
    log.console("Distributed: success (%d/%d)" % (bps.rank(), bps.size()))

    log.console("Loading model (rank=%d)" % (bps.rank()))
    model = resnet.resnet50(bn0=args.init_bn0).cuda()

    # reuse the validate tensor
    global validate_tensor, dist_validate_tensor
    validate_tensor = torch.tensor([0, 0, 0, 0]).float().cuda()
    dist_validate_tensor = torch.tensor([0, 0, 0, 0, 0]).float().cuda()

    if args.fp16: model = network_to_half(model)
    best_top5 = 93  # only save models with top-5 above 93%; otherwise training pauses to write a checkpoint every epoch

    global model_params, master_params
    if args.fp16: model_params, master_params = prep_param_lists(model)
    else: model_params = master_params = model.parameters()

    # note: this unpacking assumes args.no_bn_wd is set, since name_list (needed
    # below to name the BytePS parameters) is only returned by bnwd_optim_params
    optim_params, name_list = experimental_utils.bnwd_optim_params(
        model, model_params, master_params) if args.no_bn_wd else master_params

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        optim_params,
        0,
        momentum=args.momentum,
        weight_decay=args.weight_decay
    )  # start with 0 lr. Scheduler will change this later

    named_param = []
    for p in optim_params:
        tensors = p['params']
        for tensor in tensors:
            named_param.append(tensor)

    # create bps_param: a list of (name, tensor) pairs
    bps_param = []
    for i, tensor in enumerate(named_param):
        name = name_list[i]
        bps_param.append((name, tensor))
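    # bps_param pairs every flattened optimizer parameter with its name so BytePS
    # can key its push/pull operations; this relies on name_list (returned by
    # bnwd_optim_params) being in exactly the same order as the parameters that
    # optim_params yields above.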

    # wrap with byteps optimizer
    optimizer = DistributedOptimizer(
        optimizer,
        named_parameters=bps_param,
        backward_passes_per_step=args.batches_per_pushpull,
        half=True,
        model=model,
        fp16_params=model_params,
        fp32_params=master_params,
        loss_scale=args.loss_scale)

    if args.resume:
        checkpoint = torch.load(
            args.resume,
            map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top5 = checkpoint['best_top5']
        optimizer.load_state_dict(checkpoint['optimizer'])

    log.console(
        "Creating data loaders (this could take up to 10 minutes if volume needs to be warmed up)"
    )
    num_machines = (bps.size() - 1) // 8 + 1  # ceil(world size / 8); assumes 8 GPUs per machine
    assert (num_machines in schedules)
    phases = schedules[num_machines]
    dm = DataManager([copy.deepcopy(p) for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer,
                          [copy.deepcopy(p) for p in phases if 'lr' in p])

    # BytePS: broadcast parameters & optimizer state.
    broadcast_parameters([(name, p.detach()) for name, p in bps_param],
                         root_rank=0)
    broadcast_optimizer_state(optimizer, root_rank=0)
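    # Every process built and initialized its own copy of the model, so broadcasting
    # from root_rank=0 makes all workers start from identical weights and optimizer state.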

    start_time = datetime.now()  # start the clock only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        log.console('Global Barrier: Syncing machines before training')
        tensor = torch.tensor([1.0]).float().cuda()
        barrier_handler = push_pull_async_inplace(tensor,
                                                  average=True,
                                                  name="init.barrier")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
        # do broadcast for validate tensor
        log.console('Broadcasting validate tensor')
        barrier_handler = push_pull_async_inplace(validate_tensor,
                                                  average=True,
                                                  name="validation_tensor")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
        barrier_handler = push_pull_async_inplace(
            dist_validate_tensor,
            average=True,
            name="distributed_validation_tensor")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
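    # The three poll/synchronize busy-waits above share one pattern; a small
    # (hypothetical) helper along these lines could replace them, assuming
    # push_pull_async_inplace/poll/synchronize behave exactly as used above:
    #
    #   def push_pull_blocking(tensor, name):
    #       handle = push_pull_async_inplace(tensor, average=True, name=name)
    #       while not poll(handle):
    #           pass
    #       synchronize(handle)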

    log.event("~~epoch\thours\ttop1\ttop5\n")
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        top1, top5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        time_diff = (datetime.now() - start_time).total_seconds() / 3600.0
        log.event(f'~~{epoch}\t{time_diff:.5f}\t\t{top1:.3f}\t\t{top5:.3f}\n')

        is_best = top5 > best_top5
        best_top5 = max(top5, best_top5)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(epoch,
                                model,
                                best_top5,
                                optimizer,
                                is_best=True,
                                filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(
                    epoch,
                    model,
                    best_top5,
                    optimizer,
                    filename=f'sz{phase["bs"]}_checkpoint.path.tar')