def _get_captions(self, dataloader):
    if dist.is_master_process():
        pbar = tqdm(total=len(dataloader))
    annotations = []
    for batch in dataloader:
        annotations.extend([{
            'image_id': image_id.item(),
            'caption': c
        } for image_id, caption in zip(batch['image_id'], batch['caption'])
          for c in caption])
        if dist.is_master_process():
            pbar.update(1)
    if dist.is_master_process():
        pbar.close()
    return annotations
def __init__(self, val_dataloader, prefix):
    cache_filename = f'{prefix}_{val_dataloader.dataset.name}_cache.json'

    if not os.path.exists(cache_filename):
        annotations = self._get_captions(val_dataloader)

        self.ground_truth: Dict[int, List[str]] = defaultdict(list)
        for ann in annotations:
            self.ground_truth[ann["image_id"]].append(ann["caption"])
        self.ground_truth = tokenize(self.ground_truth)

        if dist.is_master_process():
            f = open(cache_filename, 'w')
            json.dump(self.ground_truth, f)
    else:
        f = open(cache_filename, 'r')
        self.ground_truth = json.load(f)
        self.ground_truth = {
            int(k): v for k, v in self.ground_truth.items()
        }
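# NOTE: The training and evaluation scripts below call `evaluate(predictions)`
# on this class, but the method is not part of this listing. Below is a minimal
# sketch of what it could look like, assuming `self.ground_truth` maps
# image_id -> list of reference captions and `predictions` is the list of
# {"image_id", "caption"} dicts built in the scripts. The pycocoevalcap Cider
# scorer is one possible backend; tokenization and score scaling details are
# left to the repo's actual implementation.
def evaluate(self, predictions):
    from pycocoevalcap.cider.cider import Cider

    # One predicted caption per image, keyed the same way as ground truth.
    results = {p["image_id"]: [p["caption"]] for p in predictions}
    # Restrict ground truth to the images we actually predicted for.
    gts = {k: self.ground_truth[k] for k in results}

    score, _ = Cider().compute_score(gts, results)
    return {"CIDEr": score}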
def common_setup(_C: Config, _A: argparse.Namespace, job_type: str = "pretrain"):
    r"""
    Setup common stuff at the start of every pretraining or downstream
    evaluation job, all listed here to avoid code duplication. Basic steps:

    1. Fix random seeds and other PyTorch flags.
    2. Set up a serialization directory and loggers.
    3. Log important stuff such as config, process info (useful during
       distributed training).
    4. Save a copy of the config to the serialization directory.

    .. note::

        It is assumed that multiple processes for distributed training have
        already been launched from outside. Functions from the
        :mod:`virtex.utils.distributed` module are used to get process info.

    Args:
        _C: Config object with all the parameters.
        _A: Argparse command line arguments.
        job_type: Type of job for which setup is to be done; one of
            ``{"pretrain", "downstream"}``.
    """
    # Get process rank and world size (assuming distributed is initialized).
    RANK = dist.get_rank()
    WORLD_SIZE = dist.get_world_size()

    # For reproducibility - refer
    # https://pytorch.org/docs/stable/notes/randomness.html
    torch.manual_seed(_C.RANDOM_SEED)
    torch.backends.cudnn.deterministic = _C.CUDNN_DETERMINISTIC
    torch.backends.cudnn.benchmark = _C.CUDNN_BENCHMARK
    random.seed(_C.RANDOM_SEED)
    np.random.seed(_C.RANDOM_SEED)

    # Create serialization directory and save config in it.
    os.makedirs(_A.serialization_dir, exist_ok=True)
    _C.dump(os.path.join(_A.serialization_dir, f"{job_type}_config.yaml"))

    # Remove default logger, create a logger for each process which writes to a
    # separate log-file. This makes changes in global scope.
    logger.remove(0)
    if dist.get_world_size() > 1:
        logger.add(
            os.path.join(_A.serialization_dir, f"log-rank{RANK}.txt"),
            format="{time} {level} {message}",
        )

    # Add a logger for stdout only for the master process.
    if dist.is_master_process():
        logger.add(
            sys.stdout, format="<g>{time}</g>: <lvl>{message}</lvl>", colorize=True
        )

    # Print process info, config and args.
    logger.info(f"Rank of current process: {RANK}. World size: {WORLD_SIZE}")
    logger.info(str(_C))

    logger.info("Command line args:")
    for arg in vars(_A):
        logger.info("{:<20}: {}".format(arg, getattr(_A, arg)))
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device: Any = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        all_captions=True)
    train_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="train", all_captions=True, include_image=False)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split="val", all_captions=True, include_image=False)

    # Make `DistributedSampler`s to shard datasets across GPU processes.
    # Skip this if training on CPUs.
    train_sampler = (
        DistributedSampler(train_dataset, shuffle=True)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=train_sampler,
        shuffle=train_sampler is None,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    train_dataloader_no_image = DataLoader(
        train_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator = CiderEvaluator(train_dataloader_no_image, prefix='train')

    val_dataloader_no_image = DataLoader(
        val_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator_val = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Load supervised trained model.
    model = PretrainingModelFactory.from_config(_C).to(device)
    CheckpointManager(model=model).load(_A.start_checkpoint)

    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    if dist.is_master_process():
        print(
            'total parameters:',
            sum([
                np.prod(p.shape) for p in model.parameters() if p.requires_grad
            ]))
        print(
            f'train data: {len(train_dataloader)}, val data: {len(val_dataloader)}'
        )

    tokenizer = train_dataset.tokenizer

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create a gradient scaler for automatic mixed precision.
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap model in DDP if using more than one processes.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = dist.DistributedDataParallel(model,
                                             device_ids=[device],
                                             find_unused_parameters=False)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)

    # Create tensorboard writer and checkpoint manager (only in master process).
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            model.sample_on()
            model.eval()
            with torch.no_grad():
                greedy_dec = model({"image": batch["image"]},
                                   sample_mode="greedy")['predictions']
                out = model({"image": batch["image"]},
                            sample_mode="sample",
                            n_samples_per_image=5)
                sample_dec, caption_lengths = out['predictions'], out[
                    'caption_lengths']
            model.train()
            model.sample_off()

            sample_log_probs = -model(
                {
                    "image": batch["image"],
                    "caption_tokens": sample_dec,
                    "caption_lengths": caption_lengths
                },
                loss_reduction='none')['loss']

            image_ids = batch['image_id'].tolist()
            reward = compute_scts_reward(image_ids, greedy_dec[:, 1:],
                                         sample_dec[:, 1:], tokenizer,
                                         evaluator)
            reward = torch.from_numpy(reward).to(device)

            mask = sample_dec[:, 1:] != tokenizer.pad_id
            loss = -sample_log_probs * reward * mask
            loss = loss.sum() / mask.sum()

        scaler.scale(loss).backward()

        # First clip norm of gradients, and then perform optimizer step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Reward {-loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                tensorboard_writer.add_scalars(
                    "learning_rate",
                    {
                        "visual": optimizer.param_groups[0]["lr"],
                        "common": optimizer.param_groups[-1]["lr"],
                    },
                    iteration,
                )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # All processes will wait till master process is done serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            predictions: List[Dict[str, Any]] = []
            if dist.is_master_process():
                pbar = tqdm(total=len(val_dataloader))
            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                val_batch = {
                    'image_id': val_batch['image_id'].to(device),
                    'image': val_batch['image'].to(device)
                }
                output_dict = model(val_batch)
                for image_id, caption in zip(val_batch['image_id'],
                                             output_dict['predictions'][:, 1:]):
                    predictions.append({
                        'image_id': image_id.item(),
                        'caption': tokenizer.decode(caption.tolist())
                    })
                if dist.is_master_process():
                    pbar.update(1)
            if dist.is_master_process():
                pbar.close()

            metrics = evaluator_val.evaluate(predictions)
            metrics = {
                k: torch.tensor(v, dtype=torch.float, device=device)
                for k, v in metrics.items()
            }
            dist.average_across_processes(metrics)
            metrics = {k: v.item() for k, v in metrics.items()}

            torch.set_grad_enabled(True)
            model.train()

            if dist.is_master_process():
                logger.info(f"Iteration: {iteration} | Metrics: {metrics}")
                tensorboard_writer.add_scalars("val", metrics, iteration)

        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            batch = next(iter(val_dataloader))
            batch = {"image": batch["image"][:8].to(device)}
            predictions = model(batch)["predictions"].cpu()

            captions = []
            for i in range(predictions.shape[0]):
                caption = tokenizer.decode(predictions[i].tolist())
                captions.append(caption)

            mean = torch.tensor(IMAGENET_COLOR_MEAN,
                                dtype=torch.float).view(1, 3, 1, 1)
            std = torch.tensor(IMAGENET_COLOR_STD,
                               dtype=torch.float).view(1, 3, 1, 1)
            image = batch["image"].cpu() * std + mean

            if dist.is_master_process():
                logger.info("Sample Generated Captions:")
                log_text = ""
                for i, caption in enumerate(captions):
                    logger.info(f"\t{caption}")
                    log_text += f"{caption}\n\n"
                tensorboard_writer.add_text(f"samples_itr{iteration}", log_text,
                                            iteration)
                tensorboard_writer.add_images(f"samples_itr{iteration}", image,
                                              iteration)

            torch.set_grad_enabled(True)
            model.train()
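# NOTE: `compute_scts_reward` is called in the training loop above but is not
# defined in this listing. Below is a minimal self-critical (SCST) reward
# sketch, assuming the evaluator exposes `ground_truth` (image_id -> reference
# captions) and using pycocoevalcap's CIDEr scorer; the names, shapes, and
# scorer here are illustrative assumptions, not the repo's actual implementation.
import numpy as np
from pycocoevalcap.cider.cider import Cider


def compute_scts_reward(image_ids, greedy_dec, sample_dec, tokenizer, evaluator):
    n_samples = sample_dec.shape[0] // len(image_ids)

    gts, greedy_res, sample_res = {}, {}, {}
    for i, image_id in enumerate(image_ids):
        gts[i] = evaluator.ground_truth[image_id]
        greedy_res[i] = [tokenizer.decode(greedy_dec[i].tolist())]
    for j in range(sample_dec.shape[0]):
        sample_res[j] = [tokenizer.decode(sample_dec[j].tolist())]

    # Score sampled captions against the references of their source image.
    gts_for_samples = {j: gts[j // n_samples] for j in sample_res}
    _, sample_scores = Cider().compute_score(gts_for_samples, sample_res)
    _, greedy_scores = Cider().compute_score(gts, greedy_res)

    # SCST reward: sampled CIDEr minus the greedy (baseline) CIDEr of the
    # corresponding image; the caller broadcasts it over time steps.
    reward = np.asarray(sample_scores) - np.repeat(np.asarray(greedy_scores),
                                                   n_samples)
    return reward[:, None].astype(np.float32)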
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a downstream config object (this will be immutable) and perform
    # common setup such as logging and setting up serialization directory.
    _DOWNC = Config(_A.down_config, _A.down_config_override)
    common_setup(_DOWNC, _A, job_type="downstream")

    # Create a (pretraining) config object and back it up in the serialization
    # directory.
    _C = Config(_A.config, _A.config_override)
    _C.dump(os.path.join(_A.serialization_dir, "pretrain_config.yaml"))

    # Get dataset name for tensorboard logging.
    DATASET = _DOWNC.DATA.ROOT.split("/")[-1]

    # Set number of output classes according to dataset:
    NUM_CLASSES_MAPPING = {"imagenet": 1000, "inaturalist": 8142}
    NUM_CLASSES = NUM_CLASSES_MAPPING[DATASET]

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="train")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            train_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=True,
        ),
        drop_last=False,
        pin_memory=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataset = DownstreamDatasetFactory.from_config(_DOWNC, split="val")
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_DOWNC.OPTIM.BATCH_SIZE // dist.get_world_size(),
        num_workers=_A.cpu_workers,
        sampler=DistributedSampler(
            val_dataset,
            num_replicas=dist.get_world_size(),
            rank=dist.get_rank(),
            shuffle=False,
        ),
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    # Initialize model using pretraining config.
    pretrained_model = PretrainingModelFactory.from_config(_C)

    # Load weights according to the init method; do nothing for `random`, and
    # `imagenet` is already taken care of.
    if _A.weight_init == "virtex":
        CheckpointManager(model=pretrained_model).load(_A.checkpoint_path)
    elif _A.weight_init == "torchvision":
        # Keep strict=False because this state dict may have weights for
        # the last fc layer.
        pretrained_model.visual.cnn.load_state_dict(
            torch.load(_A.checkpoint_path, map_location="cpu")["state_dict"],
            strict=False,
        )

    # Pull out the CNN (torchvision-like) from our pretrained model and add
    # back the FC layer - this exists in torchvision models, and is set to
    # `nn.Identity()` during pretraining.
    model = pretrained_model.visual.cnn  # type: ignore
    model.fc = nn.Linear(_DOWNC.MODEL.VISUAL.FEATURE_SIZE,
                         NUM_CLASSES).to(device)
    model = model.to(device)

    # Re-initialize the FC layer.
    torch.nn.init.normal_(model.fc.weight.data, mean=0.0, std=0.01)
    torch.nn.init.constant_(model.fc.bias.data, 0.0)

    # Freeze all layers except FC as per config param.
    if _DOWNC.MODEL.VISUAL.FROZEN:
        for name, param in model.named_parameters():
            if "fc" not in name:
                param.requires_grad = False

    # Cross entropy loss and accuracy meter.
    criterion = nn.CrossEntropyLoss()
    top1 = TopkAccuracy(top_k=1)

    optimizer = OptimizerFactory.from_config(_DOWNC, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_DOWNC, optimizer)
    del pretrained_model

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device)

    # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
    # NOTE: Always do this before wrapping model with DistributedDataParallel.
    if _DOWNC.FP16_OPT > 0:
        from apex import amp

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=f"O{_DOWNC.FP16_OPT}")

    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=1, total_iterations=_DOWNC.OPTIM.NUM_ITERATIONS)

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(1, _DOWNC.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        logits = model(batch["image"])
        loss = criterion(logits, batch["label"])

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if _DOWNC.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(
                f"{timer.stats} | Loss: {loss:.3f} | GPU: {dist.gpu_mem_usage()} MB"
            )
            tensorboard_writer.add_scalar(f"{DATASET}/train_loss", loss,
                                          iteration)
            tensorboard_writer.add_scalar(
                f"{DATASET}/learning_rate",
                optimizer.param_groups[0]["lr"],
                iteration,
            )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            torch.set_grad_enabled(False)
            model.eval()

            total_val_loss = torch.tensor(0.0).to(device)

            for val_iteration, batch in enumerate(val_dataloader, start=1):
                for key in batch:
                    batch[key] = batch[key].to(device)
                logits = model(batch["image"])
                loss = criterion(logits, batch["label"])
                top1(logits, batch["label"])
                total_val_loss += loss

            # Divide each loss component by number of val batches per GPU.
            total_val_loss = total_val_loss / val_iteration
            dist.average_across_processes(total_val_loss)

            # Get accumulated Top-1 accuracy for logging across GPUs.
            acc = top1.get_metric(reset=True)
            dist.average_across_processes(acc)

            torch.set_grad_enabled(True)
            model.train()

            # Save recent checkpoint and best checkpoint based on accuracy.
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Top-1 accuracy: {acc}")
            tensorboard_writer.add_scalar(f"{DATASET}/val_loss", total_val_loss,
                                          iteration)
            # This name scoping will result in Tensorboard displaying all
            # metrics (VOC07, caption, etc.) together.
            tensorboard_writer.add_scalars(f"metrics/{DATASET}", {"top1": acc},
                                           iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
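# NOTE: `TopkAccuracy` is used as a meter in the loop above but is not shown in
# this listing. A minimal sketch of such a meter, assuming the
# `__call__` / `get_metric(reset=...)` interface seen in the training loop;
# the repo's own implementation may differ (e.g. it may return a tensor so it
# can be averaged across processes).
import torch


class TopkAccuracy:
    def __init__(self, top_k: int = 1):
        self.top_k = top_k
        self.correct = 0
        self.total = 0

    def __call__(self, logits: torch.Tensor, labels: torch.Tensor):
        # Top-k predicted class indices per example: (batch_size, top_k).
        top_pred = logits.topk(self.top_k, dim=-1).indices
        self.correct += (top_pred == labels.unsqueeze(-1)).any(dim=-1).sum().item()
        self.total += labels.numel()

    def get_metric(self, reset: bool = False) -> float:
        acc = 100.0 * self.correct / max(self.total, 1)
        if reset:
            self.correct, self.total = 0, 0
        return acc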
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device: Any = torch.device("cpu")
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER, SCHEDULER
    # -------------------------------------------------------------------------
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train")
    val_dataset = PretrainingDatasetFactory.from_config(_C, split="val")

    # Make `DistributedSampler`s to shard datasets across GPU processes.
    # Skip this if training on CPUs.
    train_sampler = (
        DistributedSampler(train_dataset, shuffle=True)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)
    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)  # type: ignore
        if _A.num_gpus_per_machine > 0 else None)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=train_sampler,
        shuffle=train_sampler is None,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Create a gradient scaler for automatic mixed precision.
    scaler = amp.GradScaler(enabled=_C.AMP)

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    # Wrap model in DDP if using more than one processes.
    if dist.get_world_size() > 1:
        dist.synchronize()
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[device], find_unused_parameters=True)

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=start_iteration + 1,
                  total_iterations=_C.OPTIM.NUM_ITERATIONS)

    # Create tensorboard writer and checkpoint manager (only in master process).
    if dist.is_master_process():
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            scaler=scaler,
        )

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch = next(train_dataloader_iter)

        with amp.autocast(enabled=_C.AMP):
            output_dict = model(batch)
            loss = output_dict["loss"]

        scaler.scale(loss).backward()

        # First clip norm of gradients, and then perform optimizer step.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       _C.OPTIM.CLIP_GRAD_NORM)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        timer.toc()

        # ---------------------------------------------------------------------
        #   LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0:
            logger.info(
                f"{timer.stats} [Loss {loss:.3f}] [GPU {dist.gpu_mem_usage()} MB]"
            )
            if dist.is_master_process():
                tensorboard_writer.add_scalars(
                    "learning_rate",
                    {
                        "visual": optimizer.param_groups[0]["lr"],
                        "common": optimizer.param_groups[-1]["lr"],
                    },
                    iteration,
                )
                tensorboard_writer.add_scalars("train",
                                               output_dict["loss_components"],
                                               iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            # All processes will wait till master process is done serializing.
            dist.synchronize()

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)
                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)

            torch.set_grad_enabled(True)
            model.train()

            logger.info(f"Iteration: {iteration} [Val loss: {val_loss_dict}]")
            if dist.is_master_process():
                tensorboard_writer.add_scalars("val", val_loss_dict, iteration)
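# NOTE: `cycle` is used by the training scripts above to draw batches
# perpetually, but its definition is not part of this listing. A minimal sketch
# under the assumption that batches are dicts of tensors and that a
# DistributedSampler (if present) needs its epoch advanced for re-shuffling;
# the repo's actual helper may differ.
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler


def cycle(dataloader: DataLoader, device, start_iteration: int = 0):
    epoch = start_iteration // max(len(dataloader), 1)
    while True:
        if isinstance(dataloader.sampler, DistributedSampler):
            # Re-seed shuffling so every pass over the data has a new ordering.
            dataloader.sampler.set_epoch(epoch)
        for batch in dataloader:
            # Move every tensor in the batch to the target device.
            yield {
                key: value.to(device) if hasattr(value, "to") else value
                for key, value in batch.items()
            }
        epoch += 1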
def main(_A: argparse.Namespace):
    apex = False
    is_cpu = False
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
        is_cpu = True
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER
    # -------------------------------------------------------------------------
    tokenizer = TokenizerFactory.from_config(_C)
    train_dataset = PretrainingDatasetFactory.from_config(_C,
                                                          split="train",
                                                          csv=_A.train_csv)
    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split="val",
                                                        csv=_A.val_csv)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        # sampler=Sampler(train_dataset),
        sampler=DistributedSampler(train_dataset, shuffle=True),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        # sampler=Sampler(val_dataset),
        sampler=DistributedSampler(val_dataset, shuffle=False),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(model=model,
                                            optimizer=optimizer,
                                            scheduler=scheduler).load(
                                                _A.resume_from)
    else:
        start_iteration = 0

    # Keep track of time per iteration and ETA.
    timer = Timer(
        start_from=start_iteration + 1,
        total_iterations=_C.OPTIM.NUM_ITERATIONS,
    )
    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    if not is_cpu:
        # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
        # NOTE: Always do this before wrapping model with DistributedDataParallel.
        if apex:
            if _C.FP16_OPT > 0:
                from apex import amp

                model, optimizer = amp.initialize(
                    model, optimizer, opt_level=f"O{_C.FP16_OPT}")

        # Wrap model in DDP if using more than one processes.
        if dist.get_world_size() > 1:
            dist.synchronize()
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[device], find_unused_parameters=True)

    # Create checkpoint manager and tensorboard writer (only in master process).
    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()

        batch_loss = torch.tensor(0.0, device=device)
        batch = next(train_dataloader_iter)
        output_dict = model(batch)
        loss = output_dict["loss"]
        batch_loss += loss.item()

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if apex and _C.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # Clip norm of gradients before optimizer step.
        torch.nn.utils.clip_grad_norm_(
            amp.master_params(optimizer)
            if apex and _C.FP16_OPT > 0 else model.parameters(),
            _C.OPTIM.CLIP_GRAD_NORM,
        )
        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        # ---------------------------------------------------------------------
        #   TENSORBOARD LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(f"{timer.stats} | Loss: {batch_loss:.3f} | "
                        f"GPU mem: {dist.gpu_mem_usage()} MB")
            tensorboard_writer.add_scalars(
                "learning_rate",
                {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                },
                iteration,
            )
            tensorboard_writer.add_scalars("train",
                                           output_dict["loss_components"],
                                           iteration)

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)
                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration
                for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)

            torch.set_grad_enabled(True)
            model.train()

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}")
            tensorboard_writer.add_scalars("val", val_loss_dict, iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
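# NOTE: `dist.average_across_processes` is called above on scalar tensors and
# on dicts of values. A minimal sketch of such a helper built on
# torch.distributed.all_reduce, assuming the process group is initialized and
# all values are tensors; the repo's own utility may handle more cases
# (e.g. single-process runs or non-tensor values).
import torch.distributed as torch_dist


def average_across_processes(t):
    if torch_dist.is_available() and torch_dist.is_initialized():
        world_size = torch_dist.get_world_size()
        tensors = t.values() if isinstance(t, dict) else [t]
        for tensor in tensors:
            # Sum across all processes, then divide in place to get the mean.
            torch_dist.all_reduce(tensor, op=torch_dist.ReduceOp.SUM)
            tensor /= world_size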
def main(_A: argparse.Namespace):
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)
    tokenizer = TokenizerFactory.from_config(_C)

    if _A.data_root is None:
        _A.data_root = os.path.join(_C.DATA.ROOT, "val2017")

    val_dataset = PretrainingDatasetFactory.from_config(_C,
                                                        split='val',
                                                        all_captions=True)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split='val', all_captions=True, include_image=False)

    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)
        if _A.num_gpus_per_machine > 0 else None
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn
    )
    val_dataloader_no_image = DataLoader(
        val_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn
    )
    evaluator = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()

    # Make a list of predictions to evaluate.
    predictions: List[Dict[str, Any]] = []
    if dist.is_master_process():
        pbar = tqdm(total=len(val_dataloader))

    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch = {
            'image_id': val_batch['image_id'].to(device),
            'image': val_batch['image'].to(device)
        }
        with torch.no_grad():
            output_dict = model(val_batch)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(
            val_batch["image_id"], output_dict["predictions"][:, 1:]
        ):
            predictions.append(
                {
                    # Convert image id to int if possible (mainly for COCO eval).
                    "image_id": image_id.item(),
                    "caption": tokenizer.decode(caption.tolist()),
                }
            )
        if dist.is_master_process():
            pbar.update(1)
    if dist.is_master_process():
        pbar.close()

    # Save predictions as a JSON file if specified.
    if _A.output is not None:
        os.makedirs(os.path.dirname(_A.output), exist_ok=True)
        json.dump(predictions, open(_A.output, "w"))
        logger.info(f"Saved predictions to {_A.output}")

    # Calculate CIDEr and SPICE metrics using ground truth COCO Captions. This
    # should be skipped when running inference on arbitrary images.
    if _A.calc_metrics:
        metrics = evaluator.evaluate(predictions)
        metrics = {
            k: torch.tensor(v, dtype=torch.float, device=device)
            for k, v in metrics.items()
        }
        dist.average_across_processes(metrics)
        metrics = {k: v.item() for k, v in metrics.items()}

        if dist.is_master_process():
            logger.info(f"Iter: {ITERATION} | Metrics: {metrics}")
def main(_A: argparse.Namespace):
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)
    tokenizer = TokenizerFactory.from_config(_C)

    dataset = PretrainingDatasetFactory.from_config(_C, split='train')
    sampler = (
        DistributedSampler(dataset, shuffle=False)
        if _A.num_gpus_per_machine > 0 else None
    )
    val_dataloader = DataLoader(
        dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=dataset.collate_fn
    )

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()
    torch.set_grad_enabled(False)
    model.sample_on()

    captions = dict()
    if dist.is_master_process():
        pbar = tqdm(total=len(val_dataloader))

    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch = {
            'image_id': val_batch['image_id'].to(device),
            'image': val_batch['image'].to(device)
        }

        predictions = []
        for k in [1]:
            predictions.append(
                model(val_batch, sample_mode='beam',
                      n_samples_per_image=k)['predictions'][:, 1:])

        # Pad all predictions to the same length so they can be stacked.
        max_length = max([p.shape[1] for p in predictions])
        predictions = [
            torch.cat((p,
                       torch.zeros(p.shape[0],
                                   max_length - p.shape[1],
                                   device=device)),
                      dim=1) for p in predictions
        ]
        predictions = torch.stack(predictions, dim=1)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(val_batch["image_id"], predictions):
            captions[image_id.item()] = [
                tokenizer.decode(c.tolist()).strip() for c in caption
            ]
        if dist.is_master_process():
            pbar.update(1)
    if dist.is_master_process():
        pbar.close()

    # Save predictions as a JSON file; each process writes its own shard.
    folder = os.path.dirname(_A.output)
    os.makedirs(folder, exist_ok=True)
    filename = os.path.basename(_A.output)
    filepath = os.path.join(folder, f'{dist.get_rank()}_{filename}')
    json.dump(captions, open(filepath, "w"))
    logger.info(f"Saved predictions to {filepath}")
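# NOTE: Each process above writes its own `{rank}_{filename}` shard. Below is a
# minimal sketch (assuming that same naming scheme) for merging the shards back
# into a single JSON file once all processes finish; this helper is
# illustrative and not part of the original script.
import glob
import json
import os


def merge_caption_shards(output_path: str) -> None:
    folder = os.path.dirname(output_path)
    filename = os.path.basename(output_path)

    merged = {}
    for shard in sorted(glob.glob(os.path.join(folder, f"*_{filename}"))):
        with open(shard) as f:
            merged.update(json.load(f))

    with open(output_path, "w") as f:
        json.dump(merged, f)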
def main_worker(gpu, ngpus_per_node, _A):
    global best_acc1
    _A.gpu = gpu

    logger.info(f"Use GPU: {_A.gpu} for training")

    # For multiprocessing distributed training, rank needs to be the
    # global rank among all the processes.
    _A.rank = _A.gpu
    dist.init_process_group(
        backend=_A.dist_backend,
        init_method=_A.dist_url,
        world_size=_A.world_size,
        rank=_A.rank,
    )
    # Create model (pretrained or random init).
    model = models.resnet50(pretrained=True) if _A.pretrained else models.resnet50()

    # For multiprocessing distributed, DistributedDataParallel constructor
    # should always set the single device scope, otherwise,
    # DistributedDataParallel will use all available devices.
    torch.cuda.set_device(_A.gpu)
    model.cuda(_A.gpu)

    # When using a single GPU per process and per DistributedDataParallel,
    # we need to divide the batch size ourselves based on the total number
    # of GPUs we have.
    _A.batch_size = int(_A.batch_size / ngpus_per_node)
    _A.workers = int((_A.workers + ngpus_per_node - 1) / ngpus_per_node)
    model = nn.parallel.DistributedDataParallel(model, device_ids=[_A.gpu])

    # Define loss function (criterion) and optimizer.
    criterion = nn.CrossEntropyLoss().cuda(_A.gpu)
    optimizer = optim.SGD(
        model.parameters(), _A.lr, momentum=_A.momentum, weight_decay=_A.weight_decay
    )

    # Optionally resume from a checkpoint.
    if _A.resume:
        if os.path.isfile(_A.resume):
            logger.info(f"=> loading checkpoint '{_A.resume}'")

            # Map model to be loaded to specified single gpu.
            checkpoint = torch.load(_A.resume, map_location=f"cuda:{_A.gpu}")
            _A.start_epoch = checkpoint["epoch"]
            best_acc1 = checkpoint["best_acc1"]

            # best_acc1 may be from a checkpoint from a different GPU.
            best_acc1 = best_acc1.to(_A.gpu)
            model.load_state_dict(checkpoint["state_dict"])
            optimizer.load_state_dict(checkpoint["optimizer"])
            logger.info(
                f"=> loaded checkpoint '{_A.resume}' (epoch {checkpoint['epoch']})"
            )
        else:
            logger.info(f"=> no checkpoint found at '{_A.resume}'")

    cudnn.benchmark = True

    # -------------------------------------------------------------------------
    # We modify the data loading code to use our ImageNet dataset class and
    # transforms from albumentations (however, transformation steps are same).
    # -------------------------------------------------------------------------
    train_dataset = ImageNetDataset(
        root=_A.data, split="train", percentage=_A.data_percentage
    )
    logger.info(f"Size of dataset: {len(train_dataset)}")

    val_dataset = ImageNetDataset(root=_A.data, split="val")
    # Val dataset is used sparsely, don't keep it around in memory by caching.

    normalize = alb.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
        max_pixel_value=1.0,
        always_apply=True,
    )
    # Override image transform (class definition has transform according to
    # downstream linear classification protocol).
    # fmt: off
    train_dataset.image_transform = alb.Compose([
        alb.RandomResizedCrop(224, 224, always_apply=True),
        alb.HorizontalFlip(p=0.5),
        alb.ToFloat(max_value=255.0, always_apply=True),
        normalize,
    ])
    val_dataset.image_transform = alb.Compose([
        alb.Resize(256, 256, always_apply=True),
        alb.CenterCrop(224, 224, always_apply=True),
        alb.ToFloat(max_value=255.0, always_apply=True),
        normalize,
    ])

    train_sampler = DistributedSampler(train_dataset, shuffle=True)
    val_sampler = DistributedSampler(val_dataset)
    train_loader = DataLoader(
        train_dataset,
        batch_size=_A.batch_size,
        num_workers=_A.workers,
        pin_memory=True,
        sampler=train_sampler,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=_A.batch_size,
        num_workers=_A.workers,
        pin_memory=True,
        sampler=val_sampler,
    )
    # fmt: on
    # -------------------------------------------------------------------------

    # Keep track of time per iteration and ETA.
    timer = Timer(start_from=0, total_iterations=_A.epochs * len(train_loader))
    writer = SummaryWriter(log_dir=_A.serialization_dir)

    for epoch in range(_A.start_epoch, _A.epochs):
        train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, _A)

        train(train_loader, model, criterion, optimizer, epoch, timer, writer, _A)
        acc1 = validate(val_loader, model, criterion, writer, _A)

        # Remember best top-1 accuracy and save checkpoint.
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if vdist.is_master_process():
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "best_acc1": best_acc1,
                    "optimizer": optimizer.state_dict(),
                },
                is_best,
                _A.serialization_dir,
            )
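# NOTE: `adjust_learning_rate` is called every epoch above but is not shown in
# this listing. A minimal sketch assuming the standard torchvision ImageNet
# recipe (decay the initial LR by 10x every 30 epochs); the schedule actually
# used by this script may differ.
def adjust_learning_rate(optimizer, epoch, _A):
    lr = _A.lr * (0.1 ** (epoch // 30))
    for param_group in optimizer.param_groups:
        param_group["lr"] = lr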