def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False):
    r"""Main logic."""
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(resnet_size=20, bottleneck=False, num_classes=10,
                        version=1)

    optimizer = CentralizedSGD(world_size=world_size, model=model, lr=0.1,
                               momentum=0.9, weight_decay=1e-4,
                               nesterov=False)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_parallel_workers,
                              pin_memory=use_cuda, drop_last=False)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                            num_workers=num_parallel_workers,
                            pin_memory=use_cuda, drop_last=False)

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal
        else:
            goal = task1_time_to_accuracy_goal

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            train_round(train_loader, model, optimizer, loss_function,
                        metrics, scheduler, 'fp32', schedule_per='epoch',
                        transform_target_type=None, use_cuda=use_cuda,
                        max_batch_per_epoch=max_batch_per_epoch,
                        tracker=tracker)

            is_best = validation_round(val_loader, model, loss_function,
                                       metrics, run_id, rank, 'fp32',
                                       transform_target_type=None,
                                       use_cuda=use_cuda,
                                       max_batch_per_epoch=max_batch_per_epoch,
                                       tracker=tracker)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(ckpt_dir=ckpt_run_dir,
                                                rank=rank,
                                                world_size=world_size,
                                                checkpointer=checkpointer,
                                                model=model,
                                                epochs=train_epochs,
                                                loss_function=loss_function,
                                                metrics=metrics,
                                                use_cuda=use_cuda,
                                                dtype='fp32',
                                                max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), 'w') as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), 'w') as f:
            json.dump(val_stats, f)
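# `partition_dataset_by_rank` above comes from mlbench_core.dataset; the
# sketch below is only a hypothetical illustration of the idea -- give each
# rank a disjoint, deterministically shuffled shard of the dataset -- and is
# not the library's actual implementation (e.g. remainder handling may
# differ). The function name and seed default here are assumptions.
import random

from torch.utils.data import Subset


def partition_by_rank_sketch(dataset, rank, world_size, seed=42):
    """Return the shard of `dataset` owned by `rank` (illustrative only)."""
    indices = list(range(len(dataset)))
    # Same seed on every rank => identical permutation, so shards are disjoint.
    random.Random(seed).shuffle(indices)
    shard_size = len(indices) // world_size
    start = rank * shard_size
    return Subset(dataset, indices[start:start + shard_size])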
def main(run_id, dataset_dir, ckpt_run_dir, output_dir, validation_only=False):
    r"""Main logic."""
    num_parallel_workers = 2
    use_cuda = True
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128

    initialize_backends(comm_backend='mpi',
                        logging_level='INFO',
                        logging_file=os.path.join(output_dir, 'mlbench.log'),
                        use_cuda=use_cuda,
                        seed=42,
                        cudnn_deterministic=False,
                        ckpt_run_dir=ckpt_run_dir,
                        delete_existing_ckpts=not validation_only)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(resnet_size=20, bottleneck=False, num_classes=10,
                        version=1)

    optimizer = SSGDWM(model, world_size=world_size, num_coordinates=1,
                       lr=0.1, weight_decay=0)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        optimizer = optimizer.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_parallel_workers,
                              pin_memory=use_cuda, drop_last=False)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                            num_workers=num_parallel_workers,
                            pin_memory=use_cuda, drop_last=False)

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        controlflow = TrainValidation(model=model,
                                      optimizer=optimizer,
                                      loss_function=loss_function,
                                      metrics=metrics,
                                      scheduler=scheduler,
                                      batch_size=batch_size,
                                      train_epochs=train_epochs,
                                      rank=rank,
                                      world_size=world_size,
                                      run_id=run_id,
                                      dtype='fp32',
                                      validate=True,
                                      schedule_per='epoch',
                                      checkpoint=checkpointer,
                                      transform_target_type=None,
                                      average_models=True,
                                      use_cuda=use_cuda,
                                      max_batch_per_epoch=max_batch_per_epoch)

        controlflow.run(dataloader_train=train_loader,
                        dataloader_val=val_loader,
                        dataloader_train_fn=None,
                        dataloader_val_fn=None,
                        resume=False,
                        repartition_per_epoch=False)
    else:
        cecf = CheckpointsEvaluationControlFlow(ckpt_dir=ckpt_run_dir,
                                                rank=rank,
                                                world_size=world_size,
                                                checkpointer=checkpointer,
                                                model=model,
                                                epochs=train_epochs,
                                                loss_function=loss_function,
                                                metrics=metrics,
                                                use_cuda=use_cuda,
                                                dtype='fp32',
                                                max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), 'w') as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), 'w') as f:
            json.dump(val_stats, f)
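# `SSGDWM` is imported from mlbench_core and its exact semantics are not shown
# here. As a rough conceptual sketch of sparsified SGD with (error-feedback)
# memory -- apply only the k largest gradient coordinates and carry the
# dropped mass forward -- the snippet below uses plain PyTorch. Every name in
# it is hypothetical, and it omits the distributed exchange of the sparse
# update that the real optimizer performs.
import torch


def sparsified_step_sketch(param, memory, lr, k=1):
    """Apply the k largest-magnitude gradient coordinates; remember the rest."""
    grad = param.grad.flatten() + memory  # add back previously dropped mass
    _, idx = torch.topk(grad.abs(), k)
    sparse = torch.zeros_like(grad)
    sparse[idx] = grad[idx]  # in a distributed run, only this part is exchanged
    memory.copy_(grad - sparse)  # error feedback: keep what was not applied
    param.data.add_(sparse.view_as(param), alpha=-lr)


# Usage sketch: keep one flat residual buffer per parameter tensor, e.g.
# memory = torch.zeros(param.numel()), and call after loss.backward().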
def main(run_id):
    checkpoint_dir = os.path.join(config['checkpoint_root'], run_id)

    rank, world_size, _ = initialize_backends(
        comm_backend=config['comm_backend'],
        logging_level=config['logging_level'],
        logging_file=config['logging_file'],
        use_cuda=config['use_cuda'],
        seed=config['seed'],
        ckpt_run_dir=checkpoint_dir)

    os.makedirs(config['dataset_root'], exist_ok=True)

    train_set = CIFAR10V1(config['dataset_root'], train=True, download=True)
    val_set = CIFAR10V1(config['dataset_root'], train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)

    train_loader = DataLoader(train_set,
                              batch_size=config['batch_size'],
                              shuffle=True,
                              num_workers=config['num_parallel_workers'],
                              pin_memory=config['use_cuda'],
                              drop_last=False)
    val_loader = DataLoader(val_set,
                            batch_size=config['batch_size'],
                            shuffle=False,
                            num_workers=config['num_parallel_workers'],
                            pin_memory=config['use_cuda'],
                            drop_last=False)

    model = get_resnet_model('resnet20', 2, 'fp32',
                             num_classes=config['num_classes'], use_cuda=True)
    if config['use_cuda']:
        model.cuda()

    lr = config['lr_per_sample'] * config['batch_size']

    optimizer = optim.SGD(model.parameters(),
                          lr=lr,
                          momentum=config['momentum'],
                          weight_decay=config['weight_decay'],
                          nesterov=config['nesterov'])

    scheduler = multistep_learning_rates_with_warmup(
        optimizer, world_size, lr, config['multisteplr_gamma'],
        config['multisteplr_milestones'],
        warmup_duration=config['warmup_duration'],
        warmup_linear_scaling=config['warmup_linear_scaling'],
        warmup_lr=lr)

    loss_function = CrossEntropyLoss()
    if config['use_cuda']:
        loss_function.cuda()

    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    checkpointer = Checkpointer(checkpoint_dir, rank)

    controlflow = TrainValidation(model, optimizer, loss_function, metrics,
                                  scheduler, config['batch_size'],
                                  config['train_epochs'], rank, world_size,
                                  run_id, dtype=config['dtype'],
                                  checkpoint=checkpointer,
                                  use_cuda=config['use_cuda'])

    controlflow.run(dataloader_train=train_loader, dataloader_val=val_loader)
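# `multistep_learning_rates_with_warmup` is an mlbench_core helper. A
# comparable schedule can be sketched with torch's stock LambdaLR: ramp the
# learning rate linearly for `warmup_epochs`, then multiply by `gamma` at each
# milestone. This is an approximation for illustration, not the helper's
# actual code, and the function name below is hypothetical.
from torch.optim.lr_scheduler import LambdaLR


def warmup_multistep_sketch(optimizer, warmup_epochs, milestones, gamma):
    def factor(epoch):
        if epoch < warmup_epochs:
            return (epoch + 1) / warmup_epochs  # linear ramp-up
        # One decay factor per milestone already passed.
        decays = sum(1 for m in milestones if epoch >= m)
        return gamma ** decays

    return LambdaLR(optimizer, lr_lambda=factor)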
def main(run_id, validation_only=False):
    r"""Main logic."""
    num_parallel_workers = 2
    dataset_root = '/datasets/torch/cifar10'
    ckpt_run_dir = '/checkpoints/decentralized/cifar_resnet20'
    use_cuda = True
    train_epochs = 164

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file='/mlbench.log',
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=ckpt_run_dir,
        delete_existing_ckpts=not validation_only)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    batch_size = 256 // world_size

    model = ResNetCIFAR(
        resnet_size=20,
        bottleneck=False,
        num_classes=10,
        version=1)

    optimizer = optim.SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=True)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(
        optimizer,
        milestones=[82, 109],
        gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [
        TopKAccuracy(topk=1),
        TopKAccuracy(topk=5)
    ]

    train_set = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir,
        rank=rank,
        checkpoint_all=True)

    if not validation_only:
        # Aggregation over a ring topology: each rank averages with its two
        # immediate neighbors.
        ring_neighbors = [(rank + 1) % world_size, (rank - 1) % world_size]
        agg_fn = DecentralizedAggregation(
            rank=rank, neighbors=ring_neighbors).agg_model

        controlflow = TrainValidation(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=metrics,
            scheduler=scheduler,
            batch_size=batch_size,
            train_epochs=train_epochs,
            rank=rank,
            world_size=world_size,
            run_id=run_id,
            dtype='fp32',
            validate=True,
            schedule_per='epoch',
            checkpoint=checkpointer,
            transform_target_type=None,
            average_models=True,
            use_cuda=use_cuda,
            max_batch_per_epoch=None,
            agg_fn=agg_fn)

        controlflow.run(
            dataloader_train=train_loader,
            dataloader_val=val_loader,
            dataloader_train_fn=None,
            dataloader_val_fn=None,
            resume=False,
            repartition_per_epoch=False)
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(ckpt_run_dir, "train_stats.json"), 'w') as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(ckpt_run_dir, "val_stats.json"), 'w') as f:
            json.dump(val_stats, f)
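# `DecentralizedAggregation.agg_model` comes from mlbench_core.aggregation.
# The sketch below only illustrates the idea behind aggregation on a ring --
# average your parameters with your two neighbors -- using plain
# torch.distributed point-to-point ops. The function name and structure are
# hypothetical, and a production version would need to worry about backend
# support (e.g. CUDA tensors over MPI) and message ordering.
import torch
import torch.distributed as dist


def ring_average_sketch(model, neighbors):
    """Average each parameter tensor with the given neighbor ranks in place."""
    for param in model.parameters():
        received = [torch.empty_like(param.data) for _ in neighbors]
        # Non-blocking send/recv pairs avoid the deadlock a blocking exchange
        # could cause on a ring.
        sends = [dist.isend(param.data, dst=n) for n in neighbors]
        recvs = [dist.irecv(buf, src=n) for buf, n in zip(received, neighbors)]
        for work in sends + recvs:
            work.wait()
        param.data = (param.data + sum(received)) / (len(neighbors) + 1)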
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
):
    """Train loop"""
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128
    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # LR = 0.1 / 256 / sample
    lr = 0.02
    scaled_lr = lr * world_size
    by_layer = False

    # Create Model
    model = ResNetCIFAR(resnet_size=20, bottleneck=False, num_classes=10, version=1)

    # Create optimizer
    optimizer = CentralizedSGD(
        world_size=world_size,
        model=model,
        lr=lr,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=False,
        use_cuda=use_cuda,
        by_layer=by_layer,
    )

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    # Create train/validation sets and loaders
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    # Create a learning rate scheduler for an optimizer
    scheduler = ReduceLROnPlateauWithWarmup(
        optimizer.optimizer,
        warmup_init_lr=lr,
        scaled_lr=scaled_lr,
        warmup_epochs=int(math.log(world_size, 2)),  # Adaptive warmup period
        factor=0.5,
        threshold_mode="abs",
        threshold=0.01,
        patience=1,
        verbose=True,
        min_lr=lr,
    )

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE
    )

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        num_batches_per_device_train = len(train_loader)

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(
                    data,
                    target,
                    dtype=dtype,
                    transform_target_dtype=False,
                    use_cuda=use_cuda,
                )
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply
                # updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )
                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            # Scheduler per epoch
            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype=dtype,
                tracker=tracker,
                transform_target_type=False,
                use_cuda=use_cuda,
                max_batches=max_batch_per_epoch,
            )
            scheduler.step(loss)

            # Record validation stats
            is_best = record_validation_stats(
                metrics_values=metrics_values, loss=loss, tracker=tracker, rank=rank
            )

            checkpointer.save(
                tracker, model, optimizer, scheduler, tracker.current_epoch, is_best
            )

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
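# `ReduceLROnPlateauWithWarmup` is an mlbench_core scheduler. Roughly the same
# behavior can be sketched by combining a manual linear warmup with torch's
# stock ReduceLROnPlateau; this illustration is an assumption about the
# scheduler's intent, not its implementation, and the class name below is
# hypothetical.
from torch.optim.lr_scheduler import ReduceLROnPlateau


class WarmupThenPlateauSketch:
    def __init__(self, optimizer, warmup_init_lr, scaled_lr, warmup_epochs,
                 **plateau_kwargs):
        self.optimizer = optimizer
        self.warmup_init_lr = warmup_init_lr
        self.scaled_lr = scaled_lr
        self.warmup_epochs = warmup_epochs
        self.epoch = 0
        self.plateau = ReduceLROnPlateau(optimizer, **plateau_kwargs)

    def step(self, val_loss):
        self.epoch += 1
        if self.epoch <= self.warmup_epochs:
            # Ramp linearly from the single-worker LR to the scaled LR.
            frac = self.epoch / self.warmup_epochs
            lr = self.warmup_init_lr + frac * (self.scaled_lr - self.warmup_init_lr)
            for group in self.optimizer.param_groups:
                group["lr"] = lr
        else:
            # After warmup, fall back to plateau-based decay on the
            # validation loss.
            self.plateau.step(val_loss)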
def train_loop(
    run_id,
    dataset_dir,
    ckpt_run_dir,
    output_dir,
    validation_only=False,
    use_cuda=False,
    light_target=False,
    by_layer=False,
):
    r"""Main logic."""
    num_parallel_workers = 2
    train_epochs = 164
    batch_size = 128

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    current_device = cuda.current_device()

    local_model = ResNetCIFAR(
        resnet_size=20, bottleneck=False, num_classes=10, version=1
    ).to(current_device)
    model = DDP(local_model, device_ids=[current_device])

    optimizer = SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
    )

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )
    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False,
    )

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir, rank=rank, freq=CheckpointFreq.NONE
    )

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            model.train()
            tracker.train()

            data_iter = iterate_dataloader(
                train_loader, dtype="fp32", use_cuda=use_cuda
            )
            num_batches_per_device_train = len(train_loader)

            for batch_idx, (data, target) in enumerate(data_iter):
                tracker.batch_start()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply
                # updates to model
                optimizer.step()
                tracker.record_batch_opt_step()

                metrics_results = compute_train_batch_metrics(
                    output,
                    target,
                    metrics,
                )
                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx,
                    loss.item(),
                    output,
                    metrics_results,
                    tracker,
                    num_batches_per_device_train,
                )

            tracker.epoch_end()

            metrics_values, loss = validation_round(
                val_loader,
                model=model,
                loss_function=loss_function,
                metrics=metrics,
                dtype="fp32",
                tracker=tracker,
                use_cuda=use_cuda,
            )
            scheduler.step()

            # Record validation stats
            is_best = record_validation_stats(
                metrics_values=metrics_values, loss=loss, tracker=tracker, rank=rank
            )

            checkpointer.save(
                tracker, model, optimizer, scheduler, tracker.current_epoch, is_best
            )

            if tracker.goal_reached:
                print("Goal Reached!")
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype="fp32",
            max_batch_per_epoch=None,
        )

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
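# The DDP variant above assumes torch.distributed has already been initialized
# and a CUDA device bound to the process before `train_loop` runs. A typical
# entrypoint sketch, assuming a launch via
# `torchrun --nproc_per_node=N this_script.py` (the paths and run_id below are
# hypothetical placeholder values):
import os

import torch
import torch.distributed as dist


def ddp_entrypoint_sketch():
    # torchrun exports RANK/WORLD_SIZE/LOCAL_RANK, which the default env://
    # rendezvous reads.
    dist.init_process_group(backend="nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    train_loop(
        run_id="1",
        dataset_dir="/datasets/torch/cifar10",
        ckpt_run_dir="/checkpoints",
        output_dir="/output",
        use_cuda=True,
    )
    dist.destroy_process_group()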
def main(run_id):
    r"""Main logic."""
    num_parallel_workers = 2
    dataset_root = '/datasets/torch/cifar10'
    use_cuda = True
    batch_size = 128

    initialize_backends(comm_backend='mpi',
                        logging_level='INFO',
                        logging_file='/mlbench.log',
                        use_cuda=use_cuda,
                        seed=42,
                        cudnn_deterministic=False,
                        ckpt_run_dir='/checkpoints',
                        delete_existing_ckpts=False)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(resnet_size=20, bottleneck=False, num_classes=10,
                        version=1)

    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9,
                          weight_decay=1e-4, nesterov=True)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)

    train_loader = DataLoader(
        train_set, batch_size=batch_size, shuffle=True,
        num_workers=num_parallel_workers, pin_memory=use_cuda,
        drop_last=False)

    val_loader = DataLoader(
        val_set, batch_size=batch_size, shuffle=False,
        num_workers=num_parallel_workers, pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir='/checkpoints', rank=rank, checkpoint_all=True)

    controlflow = TrainValidation(
        model=model,
        optimizer=optimizer,
        loss_function=loss_function,
        metrics=metrics,
        scheduler=scheduler,
        batch_size=batch_size,
        train_epochs=164,
        rank=rank,
        world_size=world_size,
        run_id=run_id,
        dtype='fp32',
        validate=True,
        schedule_per='epoch',
        checkpoint=checkpointer,
        transform_target_type=None,
        average_models=True,
        use_cuda=True,
        max_batch_per_epoch=None)

    controlflow.run(
        dataloader_train=train_loader,
        dataloader_val=val_loader,
        dataloader_train_fn=None,
        dataloader_val_fn=None,
        resume=False,
        repartition_per_epoch=False)
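# This variant uses a plain local optim.SGD and relies on the control flow
# (`average_models=True`) to keep workers synchronized. The core idea of
# synchronous data-parallel SGD can be sketched as an all-reduce over the
# gradients before the local step; this is an illustration of the technique,
# not mlbench's actual control-flow code, and the function name is
# hypothetical.
import torch.distributed as dist


def allreduce_gradients_sketch(model, world_size):
    """Average gradients across all workers in place (call after backward())."""
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad, op=dist.ReduceOp.SUM)
            param.grad.div_(world_size)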