def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, AND FEATURE EXTRACTOR
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="trainval")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    test_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="test")
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )
    NUM_CLASSES = len(train_dataset.class_names)

    # Initialize from a checkpoint, but only keep the visual module.
    model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # last fc layer.
        model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    model = FeatureExtractor(model, layer_name=_A.layer, flatten_and_normalize=True)
    model = model.to(device).eval()

    # -------------------------------------------------------------------------
    #   EXTRACT FEATURES FOR TRAINING SVMs
    # -------------------------------------------------------------------------
    features_train: List[torch.Tensor] = []
    targets_train: List[torch.Tensor] = []

    features_test: List[torch.Tensor] = []
    targets_test: List[torch.Tensor] = []

    # VOC07 is small, extract all features and keep them in memory.
    with torch.no_grad():
        for batch in tqdm(train_dataloader, desc="Extracting train features:"):
            features = model(batch["image"].to(device))
            features_train.append(features.cpu())
            targets_train.append(batch["label"])

        # Similarly extract test features.
        for batch in tqdm(test_dataloader, desc="Extracting test features:"):
            features = model(batch["image"].to(device))
            features_test.append(features.cpu())
            targets_test.append(batch["label"])

    # Convert batches of features/targets to one large numpy array.
    features_train = torch.cat(features_train, dim=0).numpy()
    targets_train = torch.cat(targets_train, dim=0).numpy().astype(np.int32)

    features_test = torch.cat(features_test, dim=0).numpy()
    targets_test = torch.cat(targets_test, dim=0).numpy().astype(np.int32)

    # -------------------------------------------------------------------------
    #   TRAIN AND TEST SVMs WITH EXTRACTED FEATURES
    # -------------------------------------------------------------------------
    input_args: List[Any] = []

    # Iterate over all VOC07 classes and train one-vs-all linear SVMs.
    for cls_idx in range(NUM_CLASSES):
        # fmt: off
        input_args.append((
            features_train, targets_train[:, cls_idx],
            features_test, targets_test[:, cls_idx],
            train_dataset.class_names[cls_idx],
        ))
        # fmt: on

    pool = mp.Pool(processes=_A.cpu_workers)
    pool_output = pool.map(train_test_single_svm, input_args)

    # -------------------------------------------------------------------------
    #   TENSORBOARD LOGGING (RELEVANT MAINLY FOR weight_init=checkpoint)
    # -------------------------------------------------------------------------

    # Tensorboard writer for logging mAP scores. This is useful especially
    # when weight_init=checkpoint (which may be coming from a training job).
    tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Test set mAP for each class, for features from every layer.
    test_map = torch.tensor(pool_output).mean()
    logger.info(f"mAP: {test_map}")

    # Tensorboard logging only when _A.weight_init == "virtex".
    if _A.weight_init == "virtex":
        tensorboard_writer.add_scalars(
            "metrics/voc07_clf", {"mAP": test_map}, ITERATION
        )
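# NOTE: `train_test_single_svm` is referenced above but not shown here. The
# sketch below only illustrates what such a one-vs-all routine could look like,
# assuming scikit-learn's LinearSVC and per-class average precision as the
# metric. The repo's actual helper may sweep SVM cost values and handle VOC07
# "difficult" labels differently; every hyperparameter below is an assumption.
import numpy as np
from sklearn.metrics import average_precision_score
from sklearn.svm import LinearSVC


def train_test_single_svm(args):
    feats_train, y_train, feats_test, y_test, class_name = args

    # Treat non-positive labels as negatives for this one-vs-all problem.
    y_train_bin = (y_train > 0).astype(np.int32)
    y_test_bin = (y_test > 0).astype(np.int32)

    # Train a linear SVM on the frozen features for this single class.
    clf = LinearSVC(C=1.0, class_weight={0: 1, 1: 2}, max_iter=2000)
    clf.fit(feats_train, y_train_bin)

    # Score test examples and return the per-class average precision (AP);
    # the caller averages these APs into mAP.
    scores = clf.decision_function(feats_test)
    return average_precision_score(y_test_bin, scores)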
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device: Any = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(
        _C, split="val", all_captions=True)
    train_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="train", all_captions=True, include_image=False)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="val", all_captions=True, include_image=False)

    # Make `DistributedSampler`s to shard datasets across GPU processes.
    # Skip this if training on CPUs.
    train_sampler = (
        DistributedSampler(train_dataset, shuffle=True)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=train_sampler,
        shuffle=train_sampler is None,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    train_dataloader_no_image = DataLoader(
        train_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator = CiderEvaluator(train_dataloader_no_image, prefix='train')

    val_dataloader_no_image = DataLoader(
        val_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator_val = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Load supervised trained model.
    model = PretrainingModelFactory.from_config(_C).to(device)
    CheckpointManager(model=model).load(_A.start_checkpoint)

    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    if dist.is_master_process():
        print(
            'total parameters:',
            sum([
                np.prod(p.shape) for p in model.parameters() if p.requires_grad
            ]))
        print(
            f'train data: {len(train_dataloader)}, val data: {len(val_dataloader)}'
        )

    tokenizer = train_dataset.tokenizer

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create a gradient scaler for automatic mixed precision.
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap model in DDP if using more than one process.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = dist.DistributedDataParallel(model,
                                             device_ids=[device],
                                             find_unused_parameters=False)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)

    # Create tensorboard writer and checkpoint manager (only in master process).
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            # Decode a greedy (baseline) caption and several sampled captions
            # per image, without tracking gradients.
            model.sample_on()
            model.eval()
            with torch.no_grad():
                greedy_dec = model({"image": batch["image"]},
                                   sample_mode="greedy")['predictions']
                out = model({"image": batch["image"]},
                            sample_mode="sample",
                            n_samples_per_image=5)
                sample_dec, caption_lengths = (out['predictions'],
                                               out['caption_lengths'])
            model.train()
            model.sample_off()

            # Log-probabilities of the sampled captions under the model.
            sample_log_probs = -model(
                {
                    "image": batch["image"],
                    "caption_tokens": sample_dec,
                    "caption_lengths": caption_lengths,
                },
                loss_reduction='none')['loss']

            image_ids = batch['image_id'].tolist()
            reward = compute_scts_reward(image_ids, greedy_dec[:, 1:],
                                         sample_dec[:, 1:], tokenizer,
                                         evaluator)
            reward = torch.from_numpy(reward).to(device)

            # REINFORCE with the greedy caption as baseline, averaged over
            # non-padding tokens.
            mask = sample_dec[:, 1:] != tokenizer.pad_id
            loss = -sample_log_probs * reward * mask
            loss = loss.sum() / mask.sum()

        scaler.scale(loss).backward()

        # First clip norm of gradients, and then perform optimizer step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)

        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Reward {-loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                tensorboard_writer.add_scalars(
                    "learning_rate",
                    {
                        "visual": optimizer.param_groups[0]["lr"],
                        "common": optimizer.param_groups[-1]["lr"],
                    },
                    iteration,
                )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # All processes will wait till master process is done serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            predictions: List[Dict[str, Any]] = []
            if dist.is_master_process():
                pbar = tqdm(total=len(val_dataloader))

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                val_batch = {
                    'image_id': val_batch['image_id'].to(device),
                    'image': val_batch['image'].to(device),
                }
                output_dict = model(val_batch)
                for image_id, caption in zip(val_batch['image_id'],
                                             output_dict['predictions'][:, 1:]):
                    predictions.append({
                        'image_id': image_id.item(),
                        'caption': tokenizer.decode(caption.tolist()),
                    })
                if dist.is_master_process():
                    pbar.update(1)
            if dist.is_master_process():
                pbar.close()

            metrics = evaluator_val.evaluate(predictions)
            metrics = {
                k: torch.tensor(v, dtype=torch.float, device=device)
                for k, v in metrics.items()
            }
            dist.average_across_processes(metrics)
            metrics = {k: v.item() for k, v in metrics.items()}

            torch.set_grad_enabled(True)
            model.train()

            if dist.is_master_process():
                logger.info(f"Iteration: {iteration} | Metrics: {metrics}")
                tensorboard_writer.add_scalars("val", metrics, iteration)

        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            # Decode captions for a few val images and log them with the images.
            batch = next(iter(val_dataloader))
            batch = {"image": batch["image"][:8].to(device)}
            predictions = model(batch)["predictions"].cpu()

            captions = []
            for i in range(predictions.shape[0]):
                caption = tokenizer.decode(predictions[i].tolist())
                captions.append(caption)

            # Un-normalize images for visualization.
            mean = torch.tensor(IMAGENET_COLOR_MEAN, dtype=torch.float).view(1, 3, 1, 1)
            std = torch.tensor(IMAGENET_COLOR_STD, dtype=torch.float).view(1, 3, 1, 1)
            image = batch["image"].cpu() * std + mean

            if dist.is_master_process():
                logger.info("Sample Generated Captions:")
                log_text = ""
                for i, caption in enumerate(captions):
                    logger.info(f"\t{caption}")
                    log_text += f"{caption}\n\n"
                tensorboard_writer.add_text(f"samples_itr{iteration}", log_text, iteration)
                tensorboard_writer.add_images(f"samples_itr{iteration}", image, iteration)

            torch.set_grad_enabled(True)
            model.train()
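# NOTE: `compute_scts_reward` is not shown above. The sketch below only
# illustrates the self-critical baseline arithmetic it is assumed to perform
# once CIDEr scores for the greedy and sampled captions are available: each
# sampled caption is rewarded by how much its score exceeds the greedy caption
# of the same image. The name `self_critical_reward`, the argument layout, and
# the grouping of samples per image are assumptions for illustration.
import numpy as np


def self_critical_reward(sample_scores, greedy_scores, n_samples_per_image=5):
    # sample_scores: (B * n_samples,) CIDEr scores of sampled captions,
    #                grouped consecutively per image.
    # greedy_scores: (B,) CIDEr scores of greedy captions (the baseline).
    baseline = np.repeat(greedy_scores, n_samples_per_image)
    reward = sample_scores - baseline              # (B * n_samples,)
    # Add a trailing axis so the reward broadcasts over token positions.
    return reward[:, None].astype(np.float32)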
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # Get dataset name for tensorboard logging.
    DATASET = _DOWNC.DATA.ROOT.split("/")[-1]

    # Set number of output classes according to dataset:
    NUM_CLASSES_MAPPING = {"imagenet": 1000, "inaturalist": 8142}
    NUM_CLASSES = NUM_CLASSES_MAPPING[DATASET]

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="train")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            train_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=True,
        ),
        drop_last=False,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="val")
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            val_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
        ),
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    # Initialize model using pretraining config.
    pretrained_model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        CheckpointManager(model=pretrained_model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # last fc layer.
        pretrained_model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    # Pull out the CNN (torchvision-like) from our pretrained model and add
    # back the FC layer - this exists in torchvision models, and is set to
    # `nn.Identity()` during pretraining.
    model = pretrained_model.visual.cnn  # type: ignore
    model.fc = nn.Linear(_DOWNC.MODEL.VISUAL.FEATURE_SIZE, NUM_CLASSES).to(device)
    model = model.to(device)

    # Re-initialize the FC layer.
    torch.nn.init.normal_(model.fc.weight.data, mean=0.0, std=0.01)
    torch.nn.init.constant_(model.fc.bias.data, 0.0)

    # Freeze all layers except FC as per config param.
    if _DOWNC.MODEL.VISUAL.FROZEN:
        for name, param in model.named_parameters():
            if "fc" not in name:
                param.requires_grad = False

    # Cross entropy loss and accuracy meter.
    criterion = nn.CrossEntropyLoss()
    top1 = TopkAccuracy(top_k=1)

    optimizer = OptimizerFactory.from_config(_DOWNC, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_DOWNC, optimizer)
    del pretrained_model

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device)

    # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
    # NOTE: Always do this before wrapping model with DistributedDataParallel.
    if _DOWNC.FP16_OPT > 0:
        from apex import amp

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=f"O{_DOWNC.FP16_OPT}")

    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=1, total_iterations=_DOWNC.OPTIM.NUM_ITERATIONS)

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(1, _DOWNC.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        logits = model(batch["image"])
        loss = criterion(logits, batch["label"])

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if _DOWNC.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(
                f"{timer.stats} | Loss: {loss:.3f} | GPU: {dist.gpu_mem_usage()} MB"
            )
            tensorboard_writer.add_scalar(f"{DATASET}/train_loss", loss, iteration)
            tensorboard_writer.add_scalar(
                f"{DATASET}/learning_rate",
                optimizer.param_groups[0]["lr"],
                iteration,
            )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            total_val_loss = torch.tensor(0.0).to(device)

            for val_iteration, batch in enumerate(val_dataloader, start=1):
                for key in batch:
                    batch[key] = batch[key].to(device)
                logits = model(batch["image"])
                loss = criterion(logits, batch["label"])
                top1(logits, batch["label"])
                total_val_loss += loss

            # Divide each loss component by number of val batches per GPU.
            total_val_loss = total_val_loss / val_iteration
            dist.average_across_processes(total_val_loss)

            # Get accumulated Top-1 accuracy for logging across GPUs.
            acc = top1.get_metric(reset=True)
            dist.average_across_processes(acc)

            torch.set_grad_enabled(True)
            model.train()

            # Save recent checkpoint and best checkpoint based on accuracy.
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Top-1 accuracy: {acc}")
            tensorboard_writer.add_scalar(f"{DATASET}/val_loss", total_val_loss,
                                          iteration)
            # This name scoping will result in Tensorboard displaying all metrics
            # (VOC07, caption, etc.) together.
tensorboard_writer.add_scalars(f"metrics/{DATASET}", {"top1": acc}, iteration) # All processes will wait till master process is done logging. dist.synchronize()
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and backup in serialization directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, AND FEATURE EXTRACTOR
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC,
                                                         split="trainval",
                                                         csv=_A.csv)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        shuffle=False,
    )
    print(f"train dataset length {len(train_dataset)}")

    # Initialize from a checkpoint, but only keep the visual module.
    model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method, do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # last fc layer.
        model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    model = FeatureExtractor(model, layer_name=_A.layer, flatten_and_normalize=True)
    model = model.to(device).eval()

    # -------------------------------------------------------------------------
    #   EXTRACT AND SAVE FEATURES
    # -------------------------------------------------------------------------
    features_train: List[torch.Tensor] = []
    targets_train: List[torch.Tensor] = []
    print("input csv is {}".format(_A.csv))

    # Extract features for the whole split, saving them to disk in shards of
    # 1000 batches to keep memory bounded.
    count = 0
    with torch.no_grad():
        for batch in tqdm(train_dataloader, desc="Extracting train features:"):
            # if count <= 4000:
            #     count += 1
            #     continue
            features = model(batch["image"].to(device))
            count += 1
            print("train features shape {}, video_id {}".format(
                features.shape, batch['image_id']))
            if count % 1000 == 0:
                torch.save(
                    features_train,
                    "./features_phoenix/features_{}_{}.pt".format(_A.mode, count))
                features_train = []
            features_train.append(features.cpu())

    torch.save(features_train,
               "./features_phoenix/features_end_{}.pt".format(_A.mode))
    print('finished saving features')
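# NOTE: The loop above writes feature shards as lists of (batch, feature_dim)
# tensors. The helper below is an assumed usage example for loading those
# shards back into one matrix; the function name is hypothetical, and a plain
# lexicographic sort may order counts like 10000 before 2000, so callers may
# want a numeric sort instead.
import glob

import torch


def load_saved_features(mode: str, directory: str = "./features_phoenix"):
    paths = sorted(glob.glob(f"{directory}/features_{mode}_*.pt"))
    paths.append(f"{directory}/features_end_{mode}.pt")

    batches = []
    for path in paths:
        # Each file holds a list of per-batch feature tensors.
        batches.extend(torch.load(path, map_location="cpu"))
    return torch.cat(batches, dim=0)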
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device: Any = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(_C, split="val")

    # Make `DistributedSampler`s to shard datasets across GPU processes.
    # Skip this if training on CPUs.
    train_sampler = (
        DistributedSampler(train_dataset, shuffle=True)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=train_sampler,
        shuffle=train_sampler is None,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create a gradient scaler for automatic mixed precision.
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap model in DDP if using more than one process.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)

    # Create tensorboard writer and checkpoint manager (only in master process).
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            output_dict = model(batch)
            loss = output_dict["loss"]

        scaler.scale(loss).backward()

        # First clip norm of gradients, and then perform optimizer step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)

        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Loss {loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                tensorboard_writer.add_scalars(
                    "learning_rate",
                    {
                        "visual": optimizer.param_groups[0]["lr"],
                        "common": optimizer.param_groups[-1]["lr"],
                    },
                    iteration,
                )
                tensorboard_writer.add_scalars("train",
                                               output_dict["loss_components"],
                                               iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # All processes will wait till master process is done serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)
                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)

            torch.set_grad_enabled(True)
            model.train()

            logger.info(f"Iteration: {iteration} [Val loss: {val_loss_dict}]")
            if dist.is_master_process():
                tensorboard_writer.add_scalars("val", val_loss_dict, iteration)
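# NOTE: All training loops here draw batches from `cycle(...)`, which is not
# shown. The sketch below captures its assumed behavior: yield device-placed
# batches forever, re-seeding a DistributedSampler between epochs. The real
# utility may fast-forward based on `start_iteration` or handle non-tensor
# fields differently; treat this as an illustration only.
import torch


def cycle(dataloader, device, start_iteration: int = 0):
    iteration = start_iteration
    while True:
        if hasattr(dataloader.sampler, "set_epoch"):
            # Re-seed the DistributedSampler so shuffling differs each pass.
            dataloader.sampler.set_epoch(iteration)
        for batch in dataloader:
            yield {
                key: value.to(device) if isinstance(value, torch.Tensor) else value
                for key, value in batch.items()
            }
            iteration += 1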
def main(_A: argparse.Namespace):
    apex = False
    is_cpu = False

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
        is_cpu = True
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER
    # -------------------------------------------------------------------------
    tokenizer = TokenizerFactory.from_config(_C)
    train_dataset = PretrainingDatasetFactory.from_config(_C,
                                                          split="train",
                                                          csv=_A.train_csv)
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        csv=_A.val_csv)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        # sampler=Sampler(train_dataset),
        sampler=DistributedSampler(train_dataset, shuffle=True),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        # sampler=Sampler(val_dataset),
        sampler=DistributedSampler(val_dataset, shuffle=False),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(model=model,
                                            optimizer=optimizer,
                                            scheduler=scheduler).load(
                                                _A.resume_from)
    else:
        start_iteration = 0

    # Keep track of time per iteration and ETA.
    timer = Timer(
        start_from=start_iteration + 1,
        total_iterations=_C.OPTIM.NUM_ITERATIONS,
    )
    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    if not is_cpu:
        # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
        # NOTE: Always do this before wrapping model with DistributedDataParallel.
        if apex:
            if _C.FP16_OPT > 0:
                from apex import amp

                model, optimizer = amp.initialize(
                    model, optimizer, opt_level=f"O{_C.FP16_OPT}")

        # Wrap model in DDP if using more than one process.
        if dist.get_world_size() > 1:
            dist.synchronize()
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[device], find_unused_parameters=True)

    # Create checkpoint manager and tensorboard writer (only in master process).
    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch_loss = torch.tensor(0.0, device=device)

        batch = next(train_dataloader_iter)
        output_dict = model(batch)
        loss = output_dict["loss"]
        batch_loss += loss.item()

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if apex and _C.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # Clip norm of gradients before optimizer step.
        torch.nn.utils.clip_grad_norm_(
            amp.master_params(optimizer)
            if apex and _C.FP16_OPT > 0 else model.parameters(),
            _C.OPTIM.CLIP_GRAD_NORM,
        )
        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        # ---------------------------------------------------------------------
        #   TENSORBOARD LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(f"{timer.stats} | Loss: {batch_loss:.3f} | "
                        f"GPU mem: {dist.gpu_mem_usage()} MB")
            tensorboard_writer.add_scalars(
                "learning_rate",
                {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                },
                iteration,
            )
            tensorboard_writer.add_scalars("train",
                                           output_dict["loss_components"],
                                           iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)
                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)

            torch.set_grad_enabled(True)
            model.train()

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}")
            tensorboard_writer.add_scalars("val", val_loss_dict, iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
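# NOTE: `dist.average_across_processes` is used throughout but not shown. The
# sketch below shows the behavior it is assumed to have: all-reduce each tensor
# (or each value of a dict of tensors) and divide by the world size, so every
# process ends up with the mean. Built only from torch.distributed primitives;
# the repo's helper may differ in signature and edge-case handling.
import torch.distributed as torchdist


def average_across_processes(t):
    # No-op for single-process or CPU-only runs.
    if not (torchdist.is_available() and torchdist.is_initialized()):
        return
    tensors = list(t.values()) if isinstance(t, dict) else [t]
    for tensor in tensors:
        torchdist.all_reduce(tensor, op=torchdist.ReduceOp.SUM)
        tensor /= torchdist.get_world_size()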