def train_and_eval(self, initial_epoch=0, lr_tensor_name=None):
    """Train and evaluate the model, one epoch at a time.

    Args:
        initial_epoch (int, optional): Defaults to 0. Initial epoch of training.
        lr_tensor_name (:obj:`tf.Tensor`, optional): Defaults to None. The
            (scalar) float tensor holding the current learning rate.
    """
    # Tracker for stats
    tracker = Tracker()
    tracker.current_epoch = 0
    tracker.best_epoch = 0
    tracker.best_epoch_value = 0
    tracker.records = []
    tracker.cumu_time_train = []

    final_epoch = min(self.max_train_steps, self.train_epochs)
    for i_epoch in range(initial_epoch, final_epoch):
        logging.debug("=> Epoch {}".format(i_epoch))
        tracker.current_epoch = i_epoch

        if self.lr_scheduler_level == "epoch" and lr_tensor_name is not None:
            lr = self.sess.run(lr_tensor_name)
            logging.debug("Epoch {} Learning Rate : {:10.3e}".format(i_epoch, lr))

        self.train_one_epoch(tracker)
        self.valid_one_epoch(tracker)

    return tracker
def test_tracker_goal_times(mocker):
    patched = mocker.patch("mlbench_core.utils.tracker.LogMetrics")
    metric = TopKAccuracy(1)
    tracker = Tracker([metric], 1, 0, task1_time_to_accuracy_light_goal())
    tracker.start()
    assert tracker.start_time is not None
    tracker.train()

    with freeze_time(datetime.datetime.now()) as frozen:
        _do_batch(tracker, frozen)

        assert abs(tracker.get_total_preprocess_time() - 0.5) < 0.01
        assert abs(tracker.get_total_communication_time() - 0.5) < 0.01
        assert abs(tracker.get_total_compute_time() - 2.0) < 0.01
        assert abs(tracker.get_total_metrics_time() - 0.5) < 0.01

        _do_batch(tracker, frozen)

        assert abs(tracker.get_total_preprocess_time() - 1.0) < 0.01
        assert abs(tracker.get_total_communication_time() - 1.0) < 0.01
        assert abs(tracker.get_total_compute_time() - 4.0) < 0.01
        assert abs(tracker.get_total_metrics_time() - 1.0) < 0.01

    tracker.validation()
    tracker.record_stat("global_Prec@1", 70, log_to_api=True)

    assert tracker.goal_reached
    assert any(filter(lambda c: c[1][3] == "TaskResult", patched.method_calls))
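# The `_do_batch` helper used in the test above is not part of this listing.
# A minimal sketch consistent with the timings asserted above (per batch:
# 0.5 s preprocessing, 2.0 s compute, 0.5 s communication, 0.5 s metrics);
# the exact step names, in particular "load" and "comp_metrics", are
# assumptions modelled on the older test further below.
def _do_batch(tracker, frozen):
    tracker.batch_start()
    for step in ("load", "init", "fwd_pass", "comp_loss",
                 "backprop", "opt_step", "comp_metrics"):
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step(step)
    tracker.batch_end()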
def __init__(self, model, optimizer, loss_function, metrics, scheduler,
             batch_size, train_epochs, rank, world_size, run_id, dtype,
             validate=True, schedule_per='epoch', checkpoint=None,
             transform_target_type=None, average_models=False, use_cuda=False,
             max_batch_per_epoch=None, tracker=None):
    self.batch_size = batch_size
    self.train_epochs = train_epochs
    self.model = model
    self.optimizer = optimizer
    self.scheduler = scheduler
    self.schedule_per = schedule_per
    self.perform_validation = validate
    self.checkpoint = checkpoint
    self.loss_function = loss_function
    self.metrics = metrics
    self.rank = rank
    self.run_id = run_id
    self.dtype = dtype
    self.transform_target_type = transform_target_type
    self.use_cuda = use_cuda
    self.max_batch_per_epoch = max_batch_per_epoch

    if tracker:
        self.tracker = tracker
    else:
        self.tracker = Tracker(metrics, run_id, rank)
def test_tracker_goal(mocker):
    patched = mocker.patch('mlbench_core.utils.tracker.LogMetrics')
    metric = TopKAccuracy(1)
    tracker = Tracker([metric], 1, 0, task1_time_to_accuracy_light_goal)
    tracker.start()
    assert tracker.start_time is not None
    tracker.train()

    tracker.record_stat('global_Prec@1', 69, log_to_api=True)
    tracker.batch_end()
    assert not tracker.goal_reached

    tracker.record_stat('global_Prec@1', 70, log_to_api=True)
    tracker.batch_end()
    assert not tracker.goal_reached

    tracker.validation()

    tracker.record_stat('global_Prec@1', 69, log_to_api=True)
    tracker.batch_end()
    assert not tracker.goal_reached

    tracker.record_stat('global_Prec@1', 70, log_to_api=True)
    assert tracker.goal_reached
def test_tracker_goal_times(mocker):
    patched = mocker.patch('mlbench_core.utils.tracker.LogMetrics')
    metric = TopKAccuracy(1)
    tracker = Tracker([metric], 1, 0, task1_time_to_accuracy_light_goal)
    tracker.start()
    assert tracker.start_time is not None
    tracker.train()

    with freeze_time(datetime.datetime.now()) as frozen:
        tracker.batch_start()
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('init')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('fwd_pass')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('comp_loss')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('backprop')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('opt_step')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.batch_end()

        assert abs(tracker.get_total_communication_time() - 0.5) < 0.01
        assert abs(tracker.get_total_compute_time() - 1.5) < 0.01

        tracker.batch_start()
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('init')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('fwd_pass')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('comp_loss')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('backprop')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.record_batch_step('opt_step')
        frozen.tick(delta=datetime.timedelta(seconds=0.5))
        tracker.batch_end()

        assert abs(tracker.get_total_communication_time() - 1.0) < 0.01
        assert abs(tracker.get_total_compute_time() - 3.0) < 0.01

    tracker.validation()
    tracker.record_stat('global_Prec@1', 70, log_to_api=True)

    assert tracker.goal_reached
    assert any(filter(lambda c: c[1][3] == 'TaskResult', patched.method_calls))
def __init__(self, train_op, sess, loss, metrics, max_train_steps, train_epochs,
             batch_size, num_batches_per_epoch_for_train,
             num_batches_per_epoch_for_validation, train_set_init_op,
             validation_set_init_op, run_id, rank, lr_scheduler_level="epoch",
             tracker=None):
    """
    Args:
        train_op (:obj:`tf.Operation`): An operation for training models.
        sess (:obj:`tf.Session`): The session through which the control flow communicates.
        loss (:obj:`tf.Tensor`): The loss tensor.
        metrics (list of :obj:`tf.Tensor`): A list of metrics tensors.
        max_train_steps (int): Maximum number of training steps (independent of the lr schedule).
        train_epochs (int): Number of training epochs (may be tied to the lr schedule).
        batch_size (int): Size of a batch.
        num_batches_per_epoch_for_train (int): Number of batches in one training epoch.
        num_batches_per_epoch_for_validation (int): Number of batches in one validation epoch.
        train_set_init_op (:obj:`tf.Operation`): Op for initializing the training dataset.
        validation_set_init_op (:obj:`tf.Operation`): Op for initializing the validation dataset.
        run_id (str): The id of the run in the dashboard.
        rank (int): The rank of the current worker.
        lr_scheduler_level (str): Learning rate is updated per `epoch` or per `batch`.
        tracker (:obj:`Tracker`, optional): Tracker to reuse; a new one is created if None.
    """
    self.batch_size = batch_size
    self.num_batches_per_epoch_for_train = num_batches_per_epoch_for_train
    self.num_batches_per_epoch_for_validation = num_batches_per_epoch_for_validation
    self.sess = sess
    self.loss = loss
    self.metrics = metrics
    self.train_op = train_op
    self.lr_scheduler_level = lr_scheduler_level
    self.max_train_steps = max_train_steps
    self.train_epochs = train_epochs
    self.train_set_init_op = train_set_init_op
    self.validation_set_init_op = validation_set_init_op
    self.run_id = run_id
    self.rank = rank

    if tracker:
        self.tracker = tracker
    else:
        self.tracker = Tracker(metrics, run_id, rank)
def train_loop(run_id, use_horovod=False, gpu=False):
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    # Define size range and number of samples to gather
    size_range = np.logspace(0, 8, num=80)
    num_samples = 100

    do_fp16 = dist.get_backend() != dist.Backend.MPI or use_horovod
    is_nccl = dist.get_backend() == dist.Backend.NCCL

    logger.info("Using {}".format(dist.get_backend()))

    # Verify that communication works
    verify_communication(use_horovod, world_size)

    tracker = Tracker([], run_id, rank)
    dist.barrier()
    tracker.start()
    tracker.validation()
    tracker.validation_start()

    # Perform benchmark on both GPU and CPU (except for NCCL)
    if is_nccl and not gpu:
        raise ValueError("Cannot run NCCL without GPU")

    for j, size in enumerate(size_range):
        size = int(size)
        avg = get_communication_average(size, torch.float32, gpu, num_samples,
                                        use_horovod)

        tracker.record_stat("tensor_size", size, log_to_api=True)
        tracker.record_stat("dtype", 32, log_to_api=True)
        tracker.record_stat("cuda", 1 if gpu else 0, log_to_api=True)
        tracker.record_stat("avg_time", avg, log_to_api=True)

        logger.info("Size={}, dtype=float32, use_cuda={}, avg_time={}".format(
            size, gpu, avg))

        if do_fp16:
            avg = get_communication_average(size, torch.float16, gpu,
                                            num_samples, use_horovod)

            tracker.record_stat("tensor_size", size, log_to_api=True)
            tracker.record_stat("dtype", 16, log_to_api=True)
            tracker.record_stat("cuda", 1 if gpu else 0, log_to_api=True)
            tracker.record_stat("avg_time", avg, log_to_api=True)

            logger.info("Size={}, dtype=float16, use_cuda={}, avg_time={}".format(
                size, gpu, avg))

    tracker.validation_end()
    time.sleep(10)
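# `get_communication_average` and `verify_communication` are defined outside
# this listing. A minimal sketch of the former, assuming it simply times an
# all-reduce of a tensor with `size` elements over `num_samples` repetitions
# and returns the mean duration in seconds (relies on the module-level
# imports used above: torch, torch.distributed as dist, horovod.torch as hvd,
# and time):
def get_communication_average(size, dtype, gpu, num_samples, use_horovod=False):
    tensor = torch.ones(int(size), dtype=dtype)
    if gpu:
        tensor = tensor.cuda()
    total = 0.0
    for _ in range(num_samples):
        start = time.time()
        if use_horovod:
            hvd.allreduce_(tensor)  # in-place Horovod all-reduce
        else:
            dist.all_reduce(tensor)
        total += time.time() - start
    return total / num_samples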
def test_update_best_runtime_metric(mocker):
    tracker = Tracker([TopKAccuracy(5)], 1, 0)
    # tracker = mocker.patch('mlbench_core.utils.pytorch.helpers.Tracker')

    is_best, best_metric_name = update_best_runtime_metric(tracker, 10.0, "prec")
    assert is_best
    assert best_metric_name == "best_prec"

    is_best, best_metric_name = update_best_runtime_metric(tracker, 11.0, "prec")
    assert is_best
    assert best_metric_name == "best_prec"

    is_best, best_metric_name = update_best_runtime_metric(tracker, 9.0, "prec")
    assert not is_best
    assert best_metric_name == "best_prec"
def test_update_best_runtime_metric(mocker):
    tracker = Tracker()
    tracker.records = {}
    tracker.current_epoch = 1
    tracker.best_metric_value = 0
    # tracker = mocker.patch('mlbench_core.utils.pytorch.helpers.Tracker')

    is_best, best_metric_name = update_best_runtime_metric(tracker, 10.0, 'prec')
    assert is_best
    assert best_metric_name == "best_prec"

    is_best, best_metric_name = update_best_runtime_metric(tracker, 11.0, 'prec')
    assert is_best
    assert best_metric_name == "best_prec"

    is_best, best_metric_name = update_best_runtime_metric(tracker, 9.0, 'prec')
    assert not is_best
    assert best_metric_name == "best_prec"
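# `update_best_runtime_metric` comes from mlbench_core.utils; a minimal sketch
# consistent with the behaviour exercised by the two tests above (the best
# value so far is kept on the tracker, and a value that matches or improves on
# it counts as a new best). The attribute name `best_metric_value` mirrors the
# older test above and is otherwise an assumption.
def update_best_runtime_metric(tracker, metric_value, metric_name):
    best_name = "best_{}".format(metric_name)
    best_so_far = getattr(tracker, "best_metric_value", 0)
    is_best = metric_value >= best_so_far
    if is_best:
        tracker.best_metric_value = metric_value
    return is_best, best_name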
def __init__(self, train_op, sess, loss, metrics, max_train_steps, train_epochs,
             batch_size, num_batches_per_epoch_for_train,
             num_batches_per_epoch_for_validation, train_set_init_op,
             validation_set_init_op, run_id, rank, lr_scheduler_level="epoch",
             tracker=None):
    self.batch_size = batch_size
    self.num_batches_per_epoch_for_train = num_batches_per_epoch_for_train
    self.num_batches_per_epoch_for_validation = num_batches_per_epoch_for_validation
    self.sess = sess
    self.loss = loss
    self.metrics = metrics
    self.train_op = train_op
    self.lr_scheduler_level = lr_scheduler_level
    self.max_train_steps = max_train_steps
    self.train_epochs = train_epochs
    self.train_set_init_op = train_set_init_op
    self.validation_set_init_op = validation_set_init_op
    self.run_id = run_id
    self.rank = rank

    if tracker:
        self.tracker = tracker
    else:
        self.tracker = Tracker(metrics, run_id, rank)
def test_tracker():
    tracker = Tracker([TopKAccuracy(5)], 1, 0)
    assert tracker is not None
def main(is_ps, run_id, rank, world_size, cluster_spec, batch_size,
         replicas_to_aggregate, light_target=False):
    logging.info("Initial.")

    job_name = "ps" if is_ps else "worker"

    cluster = tf.train.ClusterSpec(cluster_spec)

    gpu_options = tf.GPUOptions(allow_growth=True,
                                per_process_gpu_memory_fraction=0.2)

    session_conf = tf.ConfigProto(gpu_options=gpu_options,
                                  allow_soft_placement=True,
                                  log_device_placement=False)

    server = tf.train.Server(cluster, job_name=job_name, task_index=rank,
                             config=session_conf)

    if is_ps:
        server.join()
    else:
        # Pin variables to parameter server.
        device_fn = tf.train.replica_device_setter(
            ps_tasks=None,
            ps_device="/job:ps",
            worker_device="/job:{}/task:{}/device:GPU:{}".format(
                job_name, rank, rank),
            merge_devices=True,
            cluster=cluster,
            ps_ops=None,
            ps_strategy=None)

        with tf.Graph().as_default():
            with tf.device(device_fn):
                data_loader = DatasetCifar(dataset='cifar-10',
                                           dataset_root='/datasets',
                                           batch_size=batch_size,
                                           world_size=world_size,
                                           rank=rank,
                                           seed=42,
                                           tf_dtype=tf.float32)

                train_op, loss, metrics, hooks = define_graph(
                    data_loader.inputs, data_loader.labels,
                    data_loader.training, batch_size, replicas_to_aggregate)

                local_init_op = tf.group(tf.local_variables_initializer(),
                                         data_loader.train_init_op,
                                         data_loader.validation_init_op)

                scaffold = tf.train.Scaffold(init_op=None,
                                             init_feed_dict=None,
                                             init_fn=None,
                                             ready_op=None,
                                             ready_for_local_init_op=None,
                                             local_init_op=local_init_op)

                lr_tensor_name = tf.get_default_graph().get_tensor_by_name(
                    "learning_rate:0")

                with tf.train.MonitoredTrainingSession(
                        config=session_conf,
                        master=server.target,
                        scaffold=scaffold,
                        is_chief=(rank == 0),
                        checkpoint_dir=None,
                        save_checkpoint_secs=None,
                        save_summaries_steps=None,
                        stop_grace_period_secs=5,
                        hooks=hooks) as sess:
                    logging.info("Begin training.")

                    final_epoch = 164

                    if light_target:
                        goal = task1_time_to_accuracy_light_goal
                    else:
                        goal = task1_time_to_accuracy_goal

                    tracker = Tracker(metrics, run_id, rank, goal=goal)
                    tracker.start()

                    for i_epoch in range(final_epoch):
                        logging.debug("=> Epoch {}".format(i_epoch))

                        train_round(sess, data_loader.train_init_op, train_op,
                                    loss, metrics, batch_size,
                                    data_loader.num_batches_per_epoch_for_train,
                                    tracker, lr_tensor=lr_tensor_name,
                                    lr_scheduler_level='epoch')

                        validation_round(
                            sess, data_loader.validation_init_op, loss,
                            metrics, batch_size,
                            data_loader.num_batches_per_epoch_for_eval,
                            tracker)

                        tracker.epoch_end()

                        if tracker.goal_reached:
                            print("Goal Reached!")
                            return

        logging.info("Finish.")
def test_tracker():
    tracker = Tracker()
    assert tracker is not None
def run(rank, size, run_id):
    """Distributed Synchronous SGD Example"""
    torch.manual_seed(1234)

    logging.info("Loading Dataset")
    train_set, bsz = partition_dataset_train()
    val_set, bsz_val = partition_dataset_val()

    logging.info("Setting up models and training")
    model = Net()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    metrics = [
        TopKAccuracy(topk=1),
        TopKAccuracy(topk=5)
    ]

    loss_func = nn.NLLLoss()

    goal = task1_time_to_accuracy_goal()
    tracker = Tracker(metrics, run_id, rank, goal=goal)

    num_batches = ceil(len(train_set.dataset) / float(bsz))
    num_batches_val = ceil(len(val_set.dataset) / float(bsz_val))

    tracker.start()

    logging.info("Starting train loop")
    for epoch in range(10):
        tracker.train()
        epoch_loss = 0.0

        for i, (data, target) in enumerate(train_set):
            tracker.batch_start()

            optimizer.zero_grad()
            output = model(data)
            tracker.record_batch_step('forward')

            loss = loss_func(output, target)
            epoch_loss += loss.data.item()
            tracker.record_batch_step('loss')

            loss.backward()
            tracker.record_batch_step('backward')

            average_gradients(model)
            optimizer.step()
            tracker.batch_end()

            logging.info("Batch: {}, Loss: {}".format(i, loss.item()))

        tracker.record_loss(epoch_loss, num_batches, log_to_api=True)

        logging.debug('Rank %s, epoch %s: %s', dist.get_rank(), epoch,
                      epoch_loss / num_batches)

        metrics, loss = validation_round(val_set, model, loss_func, metrics,
                                         "fp32", tracker=tracker,
                                         transform_target_type=False,
                                         use_cuda=False,
                                         max_batches=num_batches_val)

        record_validation_stats(metrics, loss, tracker=tracker, rank=rank)
        tracker.epoch_end()

        if tracker.goal_reached:
            logging.debug("Goal Reached!")
            return
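# `average_gradients` is the usual synchronous-SGD helper from the PyTorch
# distributed tutorial that this example follows; a minimal sketch, assuming
# every worker calls it after backward() and before optimizer.step():
def average_gradients(model):
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        # Sum the gradient across workers, then divide so each worker applies
        # the same averaged update.
        dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
        param.grad.data /= world_size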
def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False):
    """Train loop"""
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128
    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # LR = 0.1 / 256 / sample
    lr = 0.02
    scaled_lr = lr * world_size
    by_layer = False

    # Create Model
    model = ResNetCIFAR(resnet_size=20, bottleneck=False, num_classes=10,
                        version=1)

    # Create optimizer
    optimizer = CentralizedSGD(world_size=world_size, model=model, lr=lr,
                               momentum=0.9, weight_decay=1e-4, nesterov=False,
                               use_cuda=use_cuda, by_layer=by_layer)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    # Create train/validation sets and loaders
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_parallel_workers,
                              pin_memory=use_cuda, drop_last=False)

    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                            num_workers=num_parallel_workers,
                            pin_memory=use_cuda, drop_last=False)

    # Create a learning rate scheduler for an optimizer
    scheduler = ReduceLROnPlateauWithWarmup(
        optimizer.optimizer,
        warmup_init_lr=lr,
        scaled_lr=scaled_lr,
        warmup_epochs=int(math.log(world_size, 2)),  # Adaptive warmup period
        factor=0.5,
        threshold_mode="abs",
        threshold=0.01,
        patience=1,
        verbose=True,
        min_lr=lr)

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        num_batches_per_device_train = len(train_loader)

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(data, target, dtype=dtype,
                                             transform_target_dtype=False,
                                             use_cuda=use_cuda)
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(output, target,
                                                              metrics)
                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(batch_idx, loss.item(), output,
                                         metrics_results, tracker,
                                         num_batches_per_device_train)

            # Scheduler per epoch
            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader, model=model, loss_function=loss_function,
                metrics=metrics, dtype=dtype, tracker=tracker,
                transform_target_type=False, use_cuda=use_cuda,
                max_batches=max_batch_per_epoch)
            scheduler.step(loss)

            # Record validation stats
            is_best = record_validation_stats(metrics_values=metrics_values,
                                              loss=loss, tracker=tracker,
                                              rank=rank)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir, rank=rank, world_size=world_size,
            checkpointer=checkpointer, model=model, epochs=train_epochs,
            loss_function=loss_function, metrics=metrics, use_cuda=use_cuda,
            dtype="fp32", max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False,
               seed=42):
    """Train loop"""
    train_epochs = 10
    math_mode = "fp16"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Dataset arguments
    train_global_batch_size = 2**17  # Global batch size
    max_bs = 2**13  # Max batch size for used hardware
    update_freq = int(max(1, train_global_batch_size // (max_bs * world_size)))
    max_tokens = int(train_global_batch_size // (world_size * update_freq))

    max_source_positions, max_target_positions = 80, 80
    seq_len_multiple = 2
    left_pad = (True, False)
    lang = ("en", "de")

    # specific arch
    model_args = deepcopy(DEFAULT_TRANSFORMER_ARCH)
    model_args["max_source_positions"] = max_source_positions
    model_args["max_target_positions"] = max_target_positions
    model_args["share_all_embeddings"] = True
    model_args["dropout"] = 0.1
    model_args["softmax_type"] = "fast_fill"

    lr = 1.976e-3
    optimizer_args = {"lr": lr, "eps": 1e-9, "betas": (0.9, 0.98)}

    scheduler_args = {"base_lr": lr, "warmup_init_lr": 0.0, "warmup_steps": 1000}

    loss_scaling_fp16 = {
        "init_scale": 2.0**7,
        "scale_factor": 2,
        "scale_window": 2000,
    }

    criterion_args = {"smoothing": 0.1, "fast_xentropy": True}

    # Horovod stuff
    use_horovod = (math_mode == "fp16") and dist.get_backend() == dist.Backend.MPI
    if use_horovod:
        hvd.init()
        logger.info("Using horovod rank={}".format(hvd.rank()))
        tensor = torch.tensor([1])
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size

    # Load train and validation datasets
    train_set = WMT17Dataset(
        dataset_dir, download=True, train=True, shuffle=True, lang=lang,
        left_pad=left_pad,
        max_positions=(max_source_positions, max_target_positions),
        seq_len_multiple=seq_len_multiple)

    validation_set = WMT17Dataset(
        dataset_dir, download=False, test=True, shuffle=True, lang=lang,
        left_pad=left_pad,
        max_positions=(max_source_positions, max_target_positions),
        seq_len_multiple=seq_len_multiple)

    src_dict, trg_dict = train_set.src_dict, train_set.trg_dict

    train_batches = get_batches(train_set, max_tokens=max_tokens, bsz_mult=8,
                                shuffle=True, seed=seed)
    val_batches = get_batches(validation_set, max_tokens=max_tokens, bsz_mult=8,
                              shuffle=False)

    train_batches = equalize_batches(train_batches, world_size, seed=seed)

    # Partition by rank
    train_batches = partition_dataset_by_rank(train_batches, rank, world_size)
    val_batches = partition_dataset_by_rank(val_batches, rank, world_size)

    total_train_points = sum(len(b) for b in train_batches)

    validate_every = update_freq * round(
        len(train_batches) * 0.30 / update_freq)  # Validate every 30%
    assert (validate_every % update_freq) == 0

    logger.info("Using {} total train points, {} batches".format(
        total_train_points, len(train_batches)))

    train_loader = DataLoader(train_set, num_workers=1, pin_memory=False,
                              collate_fn=train_set.collater,
                              batch_sampler=train_batches)

    val_loader = DataLoader(validation_set, num_workers=1, pin_memory=False,
                            collate_fn=validation_set.collater,
                            batch_sampler=val_batches)

    model = TransformerModel(Arguments(model_args), src_dict, trg_dict)
    criterion = LabelSmoothing(padding_idx=src_dict.pad(), **criterion_args)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    fp_optimizer, optimizer, model = build_optimizer(
        model, optimizer_args, math_mode=math_mode,
        scaling_args=loss_scaling_fp16, use_horovod=use_horovod,
        use_cuda=use_cuda)

    scheduler = SQRTTimeDecayLRWithWarmup(optimizer, **scheduler_args)

    metrics = [BLEUScore(use_raw=True)]

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.BEST)

    translator = SequenceGenerator(
        model, src_dict=deepcopy(src_dict), trg_dict=deepcopy(trg_dict),
        beam_size=4, stop_early=True, normalize_scores=True, len_penalty=0.6,
        sampling=False, sampling_topk=-1, minlen=1)

    if not validation_only:
        if light_target:
            goal = task4_time_to_bleu_goal(20)
        else:
            goal = task4_time_to_bleu_goal(25)

        num_batches_per_device_train = len(train_loader)
        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            model.train()
            tracker.train()

            iter_sample_size = 0
            for batch_idx, sample in enumerate(train_loader):
                tracker.batch_start()
                sample = prepare_batch(sample, use_cuda=use_cuda)
                tracker.record_batch_load()

                is_last = batch_idx == len(train_loader)
                update = (batch_idx % update_freq) == update_freq - 1
                init = (batch_idx % update_freq) == 0

                # Clear gradients in the optimizer.
                if init:
                    fp_optimizer.zero_grad()
                    iter_sample_size = 0
                    tracker.record_batch_init()

                # Compute the output
                output = model(**sample["net_input"])
                tracker.record_batch_fwd_pass()

                loss, sample_size = compute_loss(sample, output, criterion)
                loss_per_sample = loss.item() / sample_size
                iter_sample_size += sample_size
                tracker.record_batch_comp_loss()

                # Backprop
                fp_optimizer.backward_loss(loss)
                tracker.record_batch_backprop()

                if update or is_last:
                    # Get batch size over all workers
                    full_bs = get_full_batch_size(iter_sample_size,
                                                  world_size=world_size,
                                                  use_cuda=use_cuda)

                    updated = opt_step(fp_optimizer, tracker, full_bs,
                                       update_freq, math_mode, world_size)

                    if updated:
                        scheduler.step()

                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx=batch_idx,
                    loss=loss_per_sample,
                    output=torch.Tensor([0]),
                    metric_results={},
                    tracker=tracker,
                    num_batches_per_device_train=num_batches_per_device_train)

                if (batch_idx + 1) % validate_every == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    metric_values, loss = validation_round(
                        val_loader, metrics, criterion, translator,
                        tracker=tracker, use_cuda=use_cuda)
                    record_validation_stats(metric_values, loss, tracker, rank)

                    if tracker.goal_reached:
                        break

                    model.train()
                    tracker.train()

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            metric_values, loss = validation_round(
                val_loader, metrics, criterion, translator, tracker=tracker,
                use_cuda=use_cuda)

            is_best = record_validation_stats(metric_values, loss, tracker, rank)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir, rank=rank, world_size=world_size,
            checkpointer=checkpointer, model=model, epochs=train_epochs,
            loss_function=criterion, metrics=metrics, use_cuda=use_cuda,
            dtype="fp32", max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)
def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False,
               seed=42):
    """Train loop"""
    train_epochs = 750

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Using batch scaling
    train_batch_size = 80
    train_global_batch_size = train_batch_size * world_size
    val_batch_size = 10  # Define the batch sizes here

    # Dataset arguments
    bptt = 70
    min_seq_len = 5

    # Model Arguments
    model_args = {
        "ninp": 400,
        "nhid": 1150,
        "nlayers": 3,
        "dropout": 0.4,
        "dropouth": 0.2,
        "dropouti": 0.65,
        "dropoute": 0.1,
        "wdrop": 0.5,
        "tie_weights": True,
    }

    # Optimizer args
    lr = 30
    scaled_lr = lr * math.sqrt(world_size)
    warmup_epochs = 5 * world_size
    weight_decay = 1.2e-6
    grad_clip = 0.25
    alpha = 2
    beta = 1
    nonmono = 5

    # Load train/valid
    train_set = Wikitext2Dataset(dataset_dir, bptt=bptt, train=True,
                                 min_seq_len=min_seq_len)
    val_set = Wikitext2Dataset(dataset_dir, bptt=bptt, valid=True,
                               min_seq_len=min_seq_len)

    ntokens = len(train_set.dictionary)

    # Generate batches
    train_set.generate_batches(global_bsz=train_global_batch_size,
                               worker_bsz=train_batch_size, rank=rank)
    val_set.generate_batches(val_batch_size)
    val_set.generate_sequence_lengths()

    logger.info("Built dictionary of {} tokens".format(ntokens))

    model = LSTMLanguageModel(ntokens, **model_args)
    criterion = CrossEntropyLoss(reduction="mean")

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    optimizer = SGD(model.parameters(), lr=scaled_lr,
                    weight_decay=weight_decay)
    c_optimizer = CustomCentralizedOptimizer(model=model, optimizer=optimizer,
                                             use_cuda=use_cuda, agg_grad=True,
                                             grad_clip=grad_clip,
                                             world_size=world_size)

    scheduler = LRLinearWarmUp(optimizer, init_lr=lr / world_size,
                               scaled_lr=scaled_lr,
                               warmup_duration=warmup_epochs)

    metrics = [Perplexity()]

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.NONE)

    if light_target:
        goal = task3_time_to_perplexity_goal(90)
    else:
        goal = task3_time_to_perplexity_goal(70)

    tracker = Tracker(metrics, run_id, rank, goal=goal, minimize=True)

    dist.barrier()
    tracker.start()

    val_losses = []
    for epoch in range(0, train_epochs):
        model.train()
        tracker.train()

        # Init hidden state
        hidden = model.init_hidden(train_batch_size)

        # Set random sequence lengths for epoch
        set_sequence_lengths(train_set, random=True)
        num_batches_per_device_train = train_set.num_batches()

        for batch_idx in range(num_batches_per_device_train):
            tracker.batch_start()
            data, targets = train_set.get_batch(batch_idx, cuda=use_cuda)
            seq_len = data.size(0)

            lr_original = optimizer.param_groups[0]["lr"]
            batch_lr = lr_original * seq_len / bptt
            optimizer.param_groups[0]["lr"] = batch_lr

            hidden = repackage_hidden(hidden)

            c_optimizer.zero_grad()
            tracker.record_batch_init()

            output, hidden, raw_outputs, outputs = model(data, hidden,
                                                         return_h=True)
            tracker.record_batch_fwd_pass()

            loss = criterion(output, targets)

            # Activation regularization
            loss = loss + sum(alpha * dropped_rnn_h.pow(2).mean()
                              for dropped_rnn_h in outputs[-1:])
            # Temporal Activation Regularization (slowness)
            loss = loss + sum(beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                              for rnn_h in raw_outputs[-1:])
            tracker.record_batch_comp_loss()

            loss.backward()
            tracker.record_batch_backprop()

            c_optimizer.step(tracker=tracker)
            optimizer.param_groups[0]["lr"] = lr_original

            metrics_results = compute_train_batch_metrics(output, targets,
                                                          metrics)
            tracker.record_batch_comp_metrics()
            tracker.batch_end()

            record_train_batch_stats(batch_idx, loss.item(), output,
                                     metrics_results, tracker,
                                     num_batches_per_device_train)

        tracker.epoch_end()

        # Still in regular SGD
        if type(c_optimizer.optimizer) == SGD:
            metrics_values, loss = validation_round(
                val_set, model=model, batch_size=val_batch_size,
                metrics=metrics, loss_function=criterion, tracker=tracker,
                use_cuda=use_cuda)

            scheduler.step()
            logger.info("Using LR={}".format(scheduler.get_last_lr()))

            if len(val_losses) > nonmono and loss > min(val_losses[:-nonmono]):
                logger.info("Switching optimizer to ASGD")
                optimizer = ASGD(params=model.parameters(),
                                 lr=scheduler.get_last_lr()[0], lambd=0.0,
                                 t0=0, weight_decay=weight_decay)
                c_optimizer.optimizer = optimizer
        # Switched to ASGD, no scheduling
        else:
            tmp = {}
            for prm in model.parameters():
                tmp[prm] = prm.data.clone()
                prm.data = optimizer.state[prm]["ax"].clone()

            metrics_values, loss = validation_round(
                val_set, model=model, batch_size=val_batch_size,
                metrics=metrics, loss_function=criterion, tracker=tracker,
                use_cuda=use_cuda)

            for prm in model.parameters():
                prm.data = tmp[prm].clone()

        val_losses.append(loss)

        # Record validation stats
        is_best = record_validation_stats(metrics_values=metrics_values,
                                          loss=loss, tracker=tracker,
                                          rank=rank)

        checkpointer.save(tracker, model, optimizer, scheduler,
                          tracker.current_epoch, is_best)

        if tracker.goal_reached:
            print("Goal Reached!")
            dist.barrier()
            time.sleep(10)
            return
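# `repackage_hidden` is the standard truncated-BPTT helper used above; a
# minimal sketch, assuming the usual behaviour of detaching hidden states so
# gradients do not flow back into previous batches:
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)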
def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False):
    """Main logic."""
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 20
    batch_size = 100

    n_features = 2000

    l1_coef = 0.0
    l2_coef = 0.0000025  # Regularization 1 / train_size ( 1 / 400,000)
    dtype = "fp32"

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    lr = 4
    scaled_lr = lr * min(16, world_size)
    by_layer = False
    agg_grad = False  # According to paper, we aggregate weights after update

    model = LogisticRegression(n_features)

    # A loss_function for computing the loss
    loss_function = BCELossRegularized(l1=l1_coef, l2=l2_coef, model=model)

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    optimizer = CentralizedSGD(world_size=world_size, model=model,
                               lr=scaled_lr, use_cuda=use_cuda,
                               by_layer=by_layer, agg_grad=agg_grad)

    metrics = [
        TopKAccuracy(),  # Binary accuracy with threshold 0.5
        F1Score(),
        DiceCoefficient(),
    ]

    train_set = LMDBDataset(name="epsilon", data_type="train", root=dataset_dir)
    val_set = LMDBDataset(name="epsilon", data_type="test", root=dataset_dir)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_parallel_workers,
                              pin_memory=use_cuda, drop_last=False)

    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                            num_workers=num_parallel_workers,
                            pin_memory=use_cuda, drop_last=False)

    num_batches_per_device_train = len(train_loader)

    scheduler = ReduceLROnPlateau(optimizer.optimizer, factor=0.75, patience=0,
                                  verbose=True, threshold_mode="abs",
                                  threshold=0.01, min_lr=lr)

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task2_time_to_accuracy_light_goal()
        else:
            goal = task2_time_to_accuracy_goal()

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            # Set tracker and model in training mode
            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(data, target, dtype=dtype,
                                             transform_target_dtype=False,
                                             use_cuda=use_cuda)
                tracker.record_batch_load()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step(tracker=tracker)

                metrics_results = compute_train_batch_metrics(output, target,
                                                              metrics)
                tracker.record_batch_comp_metrics()

                # scheduler.batch_step()
                tracker.batch_end()

                record_train_batch_stats(batch_idx, loss.item(), output,
                                         metrics_results, tracker,
                                         num_batches_per_device_train)

            tracker.epoch_end()

            # Perform validation and gather results
            metrics_values, loss = validation_round(
                val_loader, model=model, loss_function=loss_function,
                metrics=metrics, dtype=dtype, tracker=tracker,
                transform_target_type=False, use_cuda=use_cuda,
                max_batches=max_batch_per_epoch)

            # Scheduler per epoch
            scheduler.step(loss)

            # Record validation stats
            is_best = record_validation_stats(metrics_values=metrics_values,
                                              loss=loss, tracker=tracker,
                                              rank=rank)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir, rank=rank, world_size=world_size,
            checkpointer=checkpointer, model=model, epochs=train_epochs,
            loss_function=loss_function, metrics=metrics, use_cuda=use_cuda,
            dtype="fp32", max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False):
    r"""Main logic."""
    num_parallel_workers = 2
    max_batch_per_epoch = None
    train_epochs = 164
    batch_size = 128

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    model = ResNetCIFAR(resnet_size=20, bottleneck=False, num_classes=10,
                        version=1)

    optimizer = CentralizedSGD(world_size=world_size, model=model, lr=0.1,
                               momentum=0.9, weight_decay=1e-4, nesterov=False)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_parallel_workers,
                              pin_memory=use_cuda, drop_last=False)

    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                            num_workers=num_parallel_workers,
                            pin_memory=use_cuda, drop_last=False)

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal
        else:
            goal = task1_time_to_accuracy_goal

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            train_round(train_loader, model, optimizer, loss_function, metrics,
                        scheduler, 'fp32', schedule_per='epoch',
                        transform_target_type=None, use_cuda=use_cuda,
                        max_batch_per_epoch=max_batch_per_epoch,
                        tracker=tracker)

            is_best = validation_round(val_loader, model, loss_function,
                                       metrics, run_id, rank, 'fp32',
                                       transform_target_type=None,
                                       use_cuda=use_cuda,
                                       max_batch_per_epoch=max_batch_per_epoch,
                                       tracker=tracker)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir, rank=rank, world_size=world_size,
            checkpointer=checkpointer, model=model, epochs=train_epochs,
            loss_function=loss_function, metrics=metrics, use_cuda=use_cuda,
            dtype='fp32', max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), 'w') as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), 'w') as f:
            json.dump(val_stats, f)
def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False):
    """Train loop"""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    train_epochs = 8
    train_min_len, train_max_len = 0, 75
    val_min_len, val_max_len = 0, 150

    math_mode = "fp16"  # One of `fp16`, `fp32`
    lang = ("en", "de")

    # Training
    train_global_batch_size = 2048  # Global batch size
    max_bs = 128  # Max batch size for used hardware
    update_freq = int(max(1, train_global_batch_size // (max_bs * world_size)))
    train_batch_size = int(train_global_batch_size // (world_size * update_freq))
    val_batch_size = 64

    # Model attributes
    model_args = {
        "hidden_size": 1024,
        "num_layers": 4,
        "dropout": 0.2,
        "share_embedding": True,
        "fusion": True,
    }

    # Criterion
    criterion_args = {"smoothing": 0.1, "fast_xentropy": True}

    # Loss scaling
    loss_scaling = {"init_scale": 1024, "upscale_interval": 128}

    # Optimizer
    optimizer_args = {"lr": 2e-3, "grad_clip": 5.0}

    # Scheduler
    scheduler_args = {
        "warmup_steps": 200,
        "remain_steps": 0.4,
        "decay_interval": 0.05,
        "decay_steps": 4,
        "decay_factor": 0.5,
    }

    # Translator
    translator_args = {
        "beam_size": 5,
        "len_norm_factor": 0.6,
        "cov_penalty_factor": 0.1,
        "len_norm_const": 5.0,
        "max_seq_len": 150,
    }

    # Build train/val datasets
    train_set = WMT16Dataset(dataset_dir, math_precision=math_mode, lang=lang,
                             train=True, download=True, preprocessed=True,
                             min_len=train_min_len, max_len=train_max_len)
    train_set.prepare()

    val_set = WMT16Dataset(dataset_dir, math_precision=math_mode, lang=lang,
                           validation=True, download=False,
                           min_len=val_min_len, max_len=val_max_len, sort=True)

    tokenizer = train_set.tokenizer

    # Build model
    model = GNMT(vocab_size=train_set.vocab_size, **model_args)

    # Build loss function
    criterion = LabelSmoothing(padding_idx=wmt16_config.PAD, **criterion_args)

    # Bilingual Evaluation Understudy Score
    metrics = [BLEUScore()]

    # Partition data
    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    collate_fn = build_collate_fn(sort=True)
    train_loader = DataLoader(train_set, batch_size=train_batch_size,
                              collate_fn=collate_fn, num_workers=2,
                              pin_memory=True, drop_last=False, shuffle=True)

    val_loader = DataLoader(val_set, batch_size=val_batch_size,
                            collate_fn=collate_fn, num_workers=2,
                            pin_memory=True, drop_last=False)

    validate_every = update_freq * round(
        len(train_loader) * 0.30 / update_freq)  # Validate every 30%

    # Build optimizer & scheduler
    total_train_iters = (len(train_loader) // update_freq) * train_epochs

    print("Number of batches per epoch {}".format(len(train_loader)))
    print("Train iterations per epoch {}".format(total_train_iters / train_epochs))

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    use_horovod = math_mode == "fp16" and dist.get_backend() == dist.Backend.MPI

    if use_horovod:
        hvd.init()
        logger.info("Using horovod rank={}".format(hvd.rank()))
        tensor = torch.tensor([1])
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size

    fp_optimizer, optimizer, model = build_optimizer(
        model=model, math=math_mode, loss_scaling=loss_scaling,
        use_cuda=use_cuda, use_horovod=use_horovod, **optimizer_args)

    # Create a learning rate scheduler for an optimizer
    scheduler = ExponentialWarmupMultiStepLR(optimizer, total_train_iters,
                                             **scheduler_args)

    # Translator
    translator = Translator(model=model, trg_tokenizer=tokenizer,
                            **translator_args)

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.BEST)

    if not validation_only:
        if light_target:
            goal = task4_time_to_bleu_goal(20)
        else:
            goal = task4_time_to_bleu_goal(24)

        num_batches_per_device_train = len(train_loader)
        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            model.train()
            tracker.train()

            for batch_idx, (data, target) in enumerate(train_loader):
                tracker.batch_start()
                data, target = prepare_batch(data, target, use_cuda=use_cuda)
                tracker.record_batch_load()

                is_last = batch_idx == len(train_loader)
                update = (batch_idx % update_freq) == update_freq - 1
                init = (batch_idx % update_freq) == 0

                # Clear gradients in the optimizer.
                if init:
                    fp_optimizer.zero_grad()
                    tracker.record_batch_init()

                # Compute the output
                output = compute_model_output(model, data, target)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss, loss_per_token = compute_loss(data, target, output,
                                                    criterion, update_freq)
                tracker.record_batch_comp_loss()

                # Backprop
                fp_optimizer.backward_loss(loss)
                tracker.record_batch_backprop()

                # Opt step
                if update or is_last:
                    # For this task, simply sum all gradients
                    updated = fp_optimizer.step(tracker=tracker, denom=1)

                    # Learning rate scheduler
                    if updated:
                        scheduler.step()

                tracker.batch_end()

                record_train_batch_stats(
                    batch_idx=batch_idx,
                    loss=loss_per_token,
                    output=target[0],  # Use target just for the size
                    metric_results={},
                    tracker=tracker,
                    num_batches_per_device_train=num_batches_per_device_train)

                # Validation during training
                if (batch_idx + 1) % validate_every == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    metrics_values, loss = validation_round(
                        val_loader, metrics, model, criterion, update_freq,
                        translator, tracker=tracker, use_cuda=use_cuda)
                    record_validation_stats(metrics_values, loss, tracker, rank)

                    if tracker.goal_reached:
                        break

                    model.train()
                    tracker.train()

            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            metrics_values, loss = validation_round(
                val_loader, metrics, model, criterion, update_freq, translator,
                use_cuda=use_cuda)

            is_best = record_validation_stats(metrics_values, loss, tracker,
                                              rank)

            checkpointer.save(tracker, model, fp_optimizer.optimizer, scheduler,
                              tracker.current_epoch, is_best)

            tracker.epoch_end()

            if tracker.goal_reached:
                print("Goal Reached!")
                dist.barrier()
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir, rank=rank, world_size=world_size,
            checkpointer=checkpointer, model=model, epochs=train_epochs,
            loss_function=criterion, metrics=metrics, use_cuda=use_cuda,
            dtype="fp32", max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
def train_loop(run_id, dataset_dir, ckpt_run_dir, output_dir,
               validation_only=False, use_cuda=False, light_target=False,
               by_layer=False):
    r"""Main logic."""
    num_parallel_workers = 2
    train_epochs = 164
    batch_size = 128

    rank = dist.get_rank()
    world_size = dist.get_world_size()
    current_device = cuda.current_device()

    local_model = ResNetCIFAR(resnet_size=20, bottleneck=False, num_classes=10,
                              version=1).to(current_device)
    model = DDP(local_model, device_ids=[current_device])

    optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(optimizer, milestones=[82, 109], gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [TopKAccuracy(topk=1), TopKAccuracy(topk=5)]

    train_set = CIFAR10V1(dataset_dir, train=True, download=True)
    val_set = CIFAR10V1(dataset_dir, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_parallel_workers,
                              pin_memory=use_cuda, drop_last=False)

    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False,
                            num_workers=num_parallel_workers,
                            pin_memory=use_cuda, drop_last=False)

    checkpointer = Checkpointer(ckpt_run_dir=ckpt_run_dir, rank=rank,
                                freq=CheckpointFreq.NONE)

    if not validation_only:
        if light_target:
            goal = task1_time_to_accuracy_light_goal()
        else:
            goal = task1_time_to_accuracy_goal()

        tracker = Tracker(metrics, run_id, rank, goal=goal)

        dist.barrier()
        tracker.start()

        for epoch in range(0, train_epochs):
            model.train()
            tracker.train()

            data_iter = iterate_dataloader(train_loader, dtype="fp32",
                                           use_cuda=use_cuda)
            num_batches_per_device_train = len(train_loader)

            for batch_idx, (data, target) in enumerate(data_iter):
                tracker.batch_start()

                # Clear gradients in the optimizer.
                optimizer.zero_grad()
                tracker.record_batch_init()

                # Compute the output
                output = model(data)
                tracker.record_batch_fwd_pass()

                # Compute the loss
                loss = loss_function(output, target)
                tracker.record_batch_comp_loss()

                # Backprop
                loss.backward()
                tracker.record_batch_backprop()

                # Aggregate gradients/parameters from all workers and apply updates to model
                optimizer.step()
                tracker.record_batch_opt_step()

                metrics_results = compute_train_batch_metrics(output, target,
                                                              metrics)
                tracker.record_batch_comp_metrics()
                tracker.batch_end()

                record_train_batch_stats(batch_idx, loss.item(), output,
                                         metrics_results, tracker,
                                         num_batches_per_device_train)

            tracker.epoch_end()

            metrics_values, loss = validation_round(
                val_loader, model=model, loss_function=loss_function,
                metrics=metrics, dtype="fp32", tracker=tracker,
                use_cuda=use_cuda)

            scheduler.step()

            # Record validation stats
            is_best = record_validation_stats(metrics_values=metrics_values,
                                              loss=loss, tracker=tracker,
                                              rank=rank)

            checkpointer.save(tracker, model, optimizer, scheduler,
                              tracker.current_epoch, is_best)

            if tracker.goal_reached:
                print("Goal Reached!")
                time.sleep(10)
                return
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir, rank=rank, world_size=world_size,
            checkpointer=checkpointer, model=model, epochs=train_epochs,
            loss_function=loss_function, metrics=metrics, use_cuda=use_cuda,
            dtype="fp32", max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(output_dir, "train_stats.json"), "w") as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(output_dir, "val_stats.json"), "w") as f:
            json.dump(val_stats, f)
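# `iterate_dataloader` used in the loop above is an mlbench_core helper. A
# simplified, hypothetical sketch, assuming it only casts inputs to the
# requested precision and moves tensors to the GPU when requested (the real
# helper supports more options):
def iterate_dataloader_sketch(loader, dtype="fp32", use_cuda=False):
    for data, target in loader:
        if dtype == "fp16":
            data = data.half()
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        yield data, target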