def main():
    # is_chief indicates this machine will do shared tasks for the cluster,
    # such as logging and checkpointing.
    # is_chief must be True for at most one process in the training cluster.
    # $RANK is set by torch.distributed.launch:
    # https://github.com/pytorch/pytorch/blob/db6e4576dab097abf01d032c3326e4b285eb8499/torch/distributed/launch.py#L193
    global is_chief, event_writer, global_example_count, last_recv_bytes, last_transmit_bytes, last_log_time

    is_chief = (not args.distributed) or (int(os.environ['RANK']) == 0)
    global_example_count = 0
    if is_chief:
        print(f"Logging to {args.logdir}")
        event_writer = SummaryWriter(args.logdir)
        log_tb("first", time.time())
    else:
        event_writer = NoOp()

    # baseline numbers for network bytes
    last_recv_bytes, last_transmit_bytes = network_bytes()
    last_log_time = time.time()

    print(args)
    print("~~epoch\thours\ttop1Accuracy\n")

    # need to index the validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/validation')

    global reduce_function
    if args.c10d:
        print('Distributed: loading c10d process group')
        # https://github.com/pytorch/pytorch/blob/master/torch/lib/c10d/TCPStore.hpp
        torch.cuda.set_device(args.local_rank)
        rank = int(os.environ['RANK'])
        store = c10d.TCPStore(os.environ['MASTER_ADDR'], int(os.environ['MASTER_PORT']), rank == 0)  # (masterAddr, masterPort, isServer)
        process_group = c10d.ProcessGroupNCCL(store, rank, args.world_size)  # (store, rank, size)
        reduce_function = lambda t: process_group.allreduce(t, c10d.AllreduceOptions().reduceOp)
    elif args.distributed:
        print('Distributed: initializing process group')
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
        assert args.world_size == dist.get_world_size()
        reduce_function = lambda t: dist.all_reduce(t, op=dist.reduce_op.SUM)
        print("Distributed: success (%d/%d)" % (args.local_rank, args.world_size))

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    print("Loading model")
    if args.factorized_resnet:
        model = resnet.resnet50factorized(pretrained=args.pretrained)
    else:
        model = resnet.resnet50(pretrained=args.pretrained)

    model = model.cuda()
    if args.init_bn0:
        resnet.init_dist_weights(model)  # zero-init the final BatchNorm weight in each residual block
    if args.fp16:
        model = network_to_half(model)

    best_prec5 = 93  # only save models with top-5 above 93%; otherwise a checkpoint is written every epoch

    # Load the model from a checkpoint. This must happen before the model is wrapped
    # in DistributedDataParallel, since the checkpoint was saved without the wrapper.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_prec5 = checkpoint['best_prec5']

    if args.c10d:
        model = distributed_c10d._DistributedDataParallelC10d(model, process_group, device_ids=[args.local_rank], output_device=args.local_rank)
        c10d_sanity_check()
    elif args.distributed:
        model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        model_params = master_params = model.parameters()

    optim_params = experimental_utils.bnwd_optim_params(model, model_params, master_params) if args.no_bn_wd else master_params

    # define the loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(optim_params, 0, momentum=args.momentum, weight_decay=args.weight_decay)  # start with 0 lr; the scheduler changes it later

    if args.resume:  # optimizer state must be resumed separately
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.local_rank))
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Build the data manager and lr scheduler from the phase schedule
    phases = eval(args.phases)
    print("Creating data loaders (this could take 6-12 minutes)")
    dm = DataManager([p for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer, [p for p in phases if 'lr' in p], args.scale_lr)

    start_time = datetime.now()  # start timing only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        print('Syncing machines before training')
        sum_tensor(torch.tensor([1.0]).float().cuda())

    print("Begin training")
    estart = time.time()
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        estart = time.time()
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        if args.prof:
            break
        prec5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        is_best = prec5 > best_prec5
        best_prec5 = max(prec5, best_prec5)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(epoch, model, best_prec5, optimizer, is_best=True, filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(epoch, model, best_prec5, optimizer, filename=f'sz{phase["bs"]}_checkpoint.pth.tar')

    event_writer.export_scalars_to_json(args.logdir + '/scalars.json')
    event_writer.close()
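# The main() above relies on two helpers defined elsewhere in the repo: NoOp (a
# do-nothing stand-in for SummaryWriter on non-chief workers) and network_bytes()
# (a baseline reading of received/transmitted bytes used for throughput logging).
# The sketch below shows one plausible shape for them; the /proc/net/dev parsing
# is an assumption, not necessarily the repo's actual implementation.

class NoOp:
    """Swallows any method call, so non-chief workers can call event_writer.* freely."""
    def __getattr__(self, name):
        def no_op(*args, **kwargs):
            pass
        return no_op


def network_bytes():
    """Sketch: return (recv_bytes, transmit_bytes) summed over all interfaces (Linux only)."""
    recv, transmit = 0, 0
    with open('/proc/net/dev') as f:
        for line in f.readlines()[2:]:      # skip the two header lines
            fields = line.split(':')[1].split()
            recv += int(fields[0])          # bytes received on this interface
            transmit += int(fields[8])      # bytes transmitted on this interface
    return recv, transmit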
def main():
    # os.system('sudo shutdown -c')  # cancel a previously scheduled shutdown
    log.console(args)
    tb.log('sizes/world', dist_utils.env_world_size())

    print(args.data)
    assert os.path.exists(args.data)

    # need to index the validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/val')

    if args.distributed:
        log.console('Distributed: initializing process group')
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=dist_utils.env_world_size())
        assert dist_utils.env_world_size() == dist.get_world_size()
        # todo(y): use global_rank instead of local_rank here
        log.console("Distributed: success (%d/%d)" % (args.local_rank, dist.get_world_size()))

    log.console("Loading model")
    # from mobilenetv3 import MobileNetV3
    # model = MobileNetV3(mode='small', num_classes=1000).cuda()
    if args.network == 'resnet50':
        model = resnet.resnet50(bn0=args.init_bn0).cuda()
    elif args.network == 'resnet50friendlyv1':
        model = resnet.resnet50friendly(bn0=args.init_bn0, hybrid=True).cuda()
    elif args.network == 'resnet50friendlyv2':
        model = resnet.resnet50friendly2(bn0=args.init_bn0, hybrid=True).cuda()
    elif args.network == 'resnet50friendlyv3':
        model = resnet.resnet50friendly3(bn0=args.init_bn0, hybrid=True).cuda()
    elif args.network == 'resnet50friendlyv4':
        model = resnet.resnet50friendly4(bn0=args.init_bn0, hybrid=True).cuda()
    # import resnet_friendly
    # model = resnet_friendly.ResNet50Friendly().cuda()
    # model = torchvision.models.mobilenet_v2(pretrained=False).cuda()

    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = dist_utils.DDP(model, device_ids=[args.local_rank], output_device=args.local_rank)
    best_top5 = 93  # only save models with top-5 above 93%; otherwise a checkpoint is written every epoch

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        model_params = master_params = model.parameters()

    optim_params = experimental_utils.bnwd_optim_params(model, model_params, master_params) if args.no_bn_wd else master_params

    # define the loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(optim_params, 0, momentum=args.momentum, weight_decay=args.weight_decay)  # start with 0 lr; the scheduler changes it later

    if args.resume:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        current_phase = checkpoint['current_phase']
        best_top5 = checkpoint['best_top5']
        optimizer.load_state_dict(checkpoint['optimizer'])

    # save the script so we can reproduce from logs
    shutil.copy2(os.path.realpath(__file__), f'{args.logdir}')

    log.console("Creating data loaders (this could take up to 10 minutes if the volume needs to be warmed up)")
    # phases = util.text_unpickle(args.phases)
    lr = 0.9
    scale_224 = 224 / 512  # lr scale for the bs=224 phases
    scale_288 = 128 / 512  # lr scale for the bs=128 phases at size 288
    one_machine = [
        {'ep': 0, 'sz': 128, 'bs': 512, 'trndir': ''},  # Will this work? -- No idea! Should we try with the mv2 baseline?
        {'ep': (0, 5), 'lr': (lr, lr * 2)},  # lr warmup works better with --init-bn0
        {'ep': 5, 'lr': lr},
        {'ep': 14, 'sz': 224, 'bs': 224, 'lr': lr * scale_224},
        {'ep': 16, 'lr': lr / 10 * scale_224},
        {'ep': 32, 'lr': lr / 100 * scale_224},
        {'ep': 37, 'lr': lr / 100 * scale_224},
        {'ep': 39, 'sz': 288, 'bs': 128, 'min_scale': 0.5, 'rect_val': True, 'lr': lr / 100 * scale_288},
        {'ep': (40, 44), 'lr': lr / 1000 * scale_288},
        # {'ep': (36, 40), 'lr': lr / 1000 * scale_288},
        {'ep': (45, 48), 'lr': lr / 10000 * scale_288},
        {'ep': (49, 52), 'sz': 288, 'bs': 224, 'lr': lr / 10000 * scale_224},
        # {'ep': (46, 50), 'sz': 320, 'bs': 64, 'lr': lr / 10000 * scale_320},
    ]

    # round-trip the schedule through text_pickle/text_unpickle (see the sketch after this function)
    phases = util.text_pickle(one_machine)
    phases = util.text_unpickle(phases)

    dm = DataManager([copy.deepcopy(p) for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer, [copy.deepcopy(p) for p in phases if 'lr' in p])

    start_time = datetime.now()  # start timing only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        log.console('Syncing machines before training')
        dist_utils.sum_tensor(torch.tensor([1.0]).float().cuda())

    log.event("~~epoch\thours\ttop1\ttop5\n")
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        print("The start epoch:", args.start_epoch)
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        top1, top5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        time_diff = (datetime.now() - start_time).total_seconds() / 3600.0
        log.event(f'~~{epoch}\t{time_diff:.5f}\t\t{top1:.3f}\t\t{top5:.3f}\n')

        is_best = top5 > best_top5
        best_top5 = max(top5, best_top5)
        phase_save = dm.get_phase(epoch)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(phase_save, epoch, model, best_top5, optimizer, is_best=True,
                                filename='model_best_' + args.network + args.name + '.pth.tar')
            else:
                save_checkpoint(phase_save, epoch, model, top5, optimizer, is_best=False,
                                filename='model_epoch_latest_' + args.network + args.name + '.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(phase_save, epoch, model, best_top5, optimizer,
                                filename=f'sz{phase["bs"]}_checkpoint.pth.tar')
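# The main() above round-trips the phase schedule through util.text_pickle /
# util.text_unpickle before using it. A plausible sketch of those helpers is below
# (pickle + base64, so arbitrary Python objects can travel as plain text, e.g. on a
# command line or in a log); the actual util implementation may differ.

import base64
import pickle


def text_pickle(obj) -> str:
    """Sketch: pickle obj and encode it as an ASCII string."""
    return base64.b64encode(pickle.dumps(obj)).decode('ascii')


def text_unpickle(text: str):
    """Sketch: inverse of text_pickle."""
    return pickle.loads(base64.b64decode(text.encode('ascii')))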
def main():
    os.system('shutdown -c')  # cancel a previously scheduled shutdown
    log.console(args)
    tb.log('sizes/world', dist_utils.env_world_size())

    # need to index the validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/validation')

    if args.distributed:
        log.console('Distributed: initializing process group')
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=dist_utils.env_world_size())
        assert dist_utils.env_world_size() == dist.get_world_size()
        log.console("Distributed: success (%d/%d)" % (args.local_rank, dist.get_world_size()))

    log.console("Loading model")
    model = resnet.resnet50(bn0=args.init_bn0).cuda()
    if args.fp16:
        model = network_to_half(model)
    if args.distributed:
        model = dist_utils.DDP(model, device_ids=[args.local_rank], output_device=args.local_rank)
    best_top5 = 93  # only save models with top-5 above 93%; otherwise a checkpoint is written every epoch

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        model_params = master_params = model.parameters()

    optim_params = experimental_utils.bnwd_optim_params(model, model_params, master_params) if args.no_bn_wd else master_params

    # define the loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(optim_params, 0, momentum=args.momentum, weight_decay=args.weight_decay)  # start with 0 lr; the scheduler changes it later

    if args.resume:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top5 = checkpoint['best_top5']
        optimizer.load_state_dict(checkpoint['optimizer'])

    # save the script so we can reproduce from logs
    shutil.copy2(os.path.realpath(__file__), f'{args.logdir}')

    log.console("Creating data loaders (this could take up to 10 minutes if the volume needs to be warmed up)")
    phases = eval(args.phases)
    dm = DataManager([copy.deepcopy(p) for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer, [copy.deepcopy(p) for p in phases if 'lr' in p])

    start_time = datetime.now()  # start timing only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        log.console('Syncing machines before training')
        dist_utils.sum_tensor(torch.tensor([1.0]).float().cuda())

    log.event("~~epoch\thours\ttop1\ttop5\n")
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        top1, top5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        time_diff = (datetime.now() - start_time).total_seconds() / 3600.0
        log.event(f'~~{epoch}\t{time_diff:.5f}\t\t{top1:.3f}\t\t{top5:.3f}\n')

        is_best = top5 > best_top5
        best_top5 = max(top5, best_top5)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(epoch, model, best_top5, optimizer, is_best=True, filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(epoch, model, best_top5, optimizer, filename=f'sz{phase["bs"]}_checkpoint.pth.tar')
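# The main() above builds its schedule from `eval(args.phases)`, so --phases is
# expected to be a Python literal: a list of dicts where entries carrying 'bs'/'sz'
# drive the DataManager and entries carrying 'lr' drive the Scheduler. An
# illustrative value (modelled on the one_machine schedule above; the numbers are
# examples only, not a recommended schedule) would be passed like:
#
#   --phases "[{'ep': 0, 'sz': 128, 'bs': 512}, {'ep': (0, 5), 'lr': (0.9, 1.8)}, {'ep': 5, 'lr': 0.9}]"
#
# Since the string only needs to contain Python literals, ast.literal_eval is a
# safer drop-in for eval here; a sketch of that alternative:

import ast


def parse_phases(phases_str: str):
    """Sketch: parse a --phases string into a list of phase dicts without running arbitrary code."""
    return ast.literal_eval(phases_str)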
def main():
    # os.system('shutdown -c')  # cancel a previously scheduled shutdown
    log.console(args)
    tb.log('sizes/world', bps.size())

    # need to index the validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/validation')

    # if args.distributed:
    #     log.console('Distributed initializing process group')
    torch.cuda.set_device(bps.local_rank())
    print(f'cuda device set to {bps.local_rank()}')
    log.console("cuda initialized (rank=%d)" % (bps.local_rank()))
    # dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=bps.size())
    log.console("Distributed: success (%d/%d)" % (bps.rank(), bps.size()))

    log.console("Loading model (rank=%d)" % (bps.rank()))
    model = resnet.resnet50(bn0=args.init_bn0).cuda()

    # reuse the validation tensors across epochs
    global validate_tensor, dist_validate_tensor
    validate_tensor = torch.tensor([0, 0, 0, 0]).float().cuda()
    dist_validate_tensor = torch.tensor([0, 0, 0, 0, 0]).float().cuda()

    if args.fp16:
        model = network_to_half(model)
    best_top5 = 93  # only save models with top-5 above 93%; otherwise a checkpoint is written every epoch

    global model_params, master_params
    if args.fp16:
        model_params, master_params = prep_param_lists(model)
    else:
        model_params = master_params = model.parameters()

    # note: the else branch yields a plain parameter iterable that cannot be unpacked
    # into (optim_params, name_list), so this script appears to require --no_bn_wd
    optim_params, name_list = experimental_utils.bnwd_optim_params(model, model_params, master_params) if args.no_bn_wd else master_params

    # define the loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(optim_params, 0, momentum=args.momentum, weight_decay=args.weight_decay)  # start with 0 lr; the scheduler changes it later

    # flatten the optimizer param groups into a single list of tensors
    named_param = []
    for p in optim_params:
        tensors = p['params']
        for tensor in tensors:
            named_param.append(tensor)

    # create bps_param, a list of (name, tensor) tuples
    bps_param = []
    for i, tensor in enumerate(named_param):
        name = name_list[i]
        bps_param.append((name, tensor))

    # wrap with the BytePS optimizer
    optimizer = DistributedOptimizer(
        optimizer,
        named_parameters=bps_param,
        backward_passes_per_step=args.batches_per_pushpull,
        half=True,
        model=model,
        fp16_params=model_params,
        fp32_params=master_params,
        loss_scale=args.loss_scale)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top5 = checkpoint['best_top5']
        optimizer.load_state_dict(checkpoint['optimizer'])

    log.console("Creating data loaders (this could take up to 10 minutes if the volume needs to be warmed up)")
    num_machines = (bps.size() - 1) // 8 + 1  # ceil(size / 8): assumes 8 workers per machine
    assert num_machines in schedules
    phases = schedules[num_machines]
    dm = DataManager([copy.deepcopy(p) for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer, [copy.deepcopy(p) for p in phases if 'lr' in p])

    # BytePS: broadcast parameters & optimizer state from rank 0
    broadcast_parameters([(name, p.detach()) for name, p in bps_param], root_rank=0)
    broadcast_optimizer_state(optimizer, root_rank=0)

    start_time = datetime.now()  # start timing only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        log.console('Global barrier: syncing machines before training')
        tensor = torch.tensor([1.0]).float().cuda()
        barrier_handler = push_pull_async_inplace(tensor, average=True, name="init.barrier")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
        # broadcast the validation tensors
        log.console('Broadcasting validate tensor')
        barrier_handler = push_pull_async_inplace(validate_tensor, average=True, name="validation_tensor")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
        barrier_handler = push_pull_async_inplace(dist_validate_tensor, average=True, name="distributed_validation_tensor")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break

    log.event("~~epoch\thours\ttop1\ttop5\n")
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        top1, top5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        time_diff = (datetime.now() - start_time).total_seconds() / 3600.0
        log.event(f'~~{epoch}\t{time_diff:.5f}\t\t{top1:.3f}\t\t{top5:.3f}\n')

        is_best = top5 > best_top5
        best_top5 = max(top5, best_top5)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(epoch, model, best_top5, optimizer, is_best=True, filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(epoch, model, best_top5, optimizer, filename=f'sz{phase["bs"]}_checkpoint.pth.tar')
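# The BytePS main() above repeats the same poll/synchronize busy-wait three times
# for its barrier and validation-tensor broadcasts. A small helper along these
# lines would remove the duplication; it is a sketch that assumes the same
# byteps.torch push_pull_async_inplace / poll / synchronize calls already used by
# this script.

def push_pull_and_wait(tensor, name):
    """Sketch: average `tensor` across all BytePS workers in place and block until done."""
    handle = push_pull_async_inplace(tensor, average=True, name=name)
    while not poll(handle):
        pass  # busy-wait, matching the original code's behavior
    return synchronize(handle)

# Usage, mirroring the original barrier section:
#   push_pull_and_wait(torch.tensor([1.0]).float().cuda(), name="init.barrier")
#   push_pull_and_wait(validate_tensor, name="validation_tensor")
#   push_pull_and_wait(dist_validate_tensor, name="distributed_validation_tensor")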