def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)
    tokenizer = TokenizerFactory.from_config(_C)

    if _A.data_root is None:
        _A.data_root = os.path.join(_C.DATA.ROOT, "val2017")

    val_dataloader = DataLoader(
        ImageDirectoryDataset(_A.data_root),
        batch_size=_C.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()

    # Make a list of predictions to evaluate.
    predictions: List[Dict[str, Any]] = []

    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch["image"] = val_batch["image"].to(device)
        with torch.no_grad():
            output_dict = model(val_batch)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(val_batch["image_id"], output_dict["predictions"]):
            predictions.append({
                "image_id": image_id.item(),
                "caption": tokenizer.decode(caption.tolist()),
            })

    # Save predictions as a JSON file if specified.
    if _A.output is not None:
        os.makedirs(os.path.dirname(_A.output), exist_ok=True)
        json.dump(predictions, open(_A.output, "w"))
        logger.info(f"Saved predictions to {_A.output}")

    # Calculate CIDEr and SPICE metrics using ground truth COCO Captions. This
    # should be skipped when running inference on arbitrary images.
    if _A.calc_metrics:
        # Assume ground truth (COCO val2017 annotations) exist.
        gt = os.path.join(_C.DATA.ROOT, "annotations", "captions_val2017.json")
        metrics = CocoCaptionsEvaluator(gt).evaluate(predictions)
        logger.info(f"Iter: {ITERATION} | Metrics: {metrics}")
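# `ImageDirectoryDataset` is defined elsewhere in the repository; the loop above
# only relies on it yielding dicts with an "image_id" tensor and a preprocessed
# "image" tensor. A minimal sketch under that assumption follows (file-index ids
# and a simple resize/normalize transform with ImageNet statistics); it is a
# hypothetical stand-in, not the repository's implementation.
import os

import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms


class MinimalImageDirectoryDataset(Dataset):
    def __init__(self, data_root: str, image_size: int = 224):
        self.paths = sorted(
            os.path.join(data_root, name) for name in os.listdir(data_root)
        )
        self.transform = transforms.Compose([
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, index: int):
        image = Image.open(self.paths[index]).convert("RGB")
        return {"image_id": torch.tensor(index), "image": self.transform(image)}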
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)
    tokenizer = TokenizerFactory.from_config(_C)

    val_dataloader = DataLoader(
        CocoCaptionsEvalDataset(_C.DATA.ROOT),
        batch_size=_C.OPTIM.BATCH_SIZE,
        num_workers=_A.cpu_workers,
        pin_memory=True,
    )

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()

    # Make a list of predictions to evaluate.
    predictions: List[Dict[str, Any]] = []

    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        for key in val_batch:
            val_batch[key] = val_batch[key].to(device)

        with torch.no_grad():
            output_dict = model(val_batch)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(val_batch["image_id"], output_dict["predictions"]):
            predictions.append({
                "image_id": image_id.item(),
                "caption": tokenizer.decode(caption.tolist()),
            })

    # Assume ground truth (COCO val2017 annotations) exist.
    gt = os.path.join(_C.DATA.ROOT, "annotations", "captions_val2017.json")
    metrics = CocoCaptionsEvaluator(gt).evaluate(predictions)
    logger.info(f"Iter: {ITERATION} | Metrics: {metrics}")
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)
    tokenizer = TokenizerFactory.from_config(_C)

    if _A.data_root is None:
        _A.data_root = os.path.join(_C.DATA.ROOT, "val2017")

    val_dataloader = DataLoader(
        ImageDirectoryDataset(_A.data_root),
        batch_size=16,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        shuffle=True,
    )

    # Initialize model from a checkpoint and switch it to sampling mode.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()
    model.sample_on()

    # Take a single random batch and decode greedy captions for it.
    val_batch = next(iter(val_dataloader))
    val_batch['image'] = val_batch['image'].to(device)
    with torch.no_grad():
        output_dict = model(val_batch, sample_mode='greedy')

    # Drop the first time step (start token) before decoding.
    for caption in output_dict["predictions"][:, 1:]:
        print(tokenizer.decode(caption.tolist()))

    # Un-normalize images (ImageNet color statistics) and save them as a grid.
    mean = torch.tensor(IMAGENET_COLOR_MEAN, dtype=torch.float).view(1, 3, 1, 1)
    std = torch.tensor(IMAGENET_COLOR_STD, dtype=torch.float).view(1, 3, 1, 1)
    images = val_batch['image'].cpu() * std + mean
    save_image(images, 'images.png', nrow=4)
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)
    tokenizer = TokenizerFactory.from_config(_C)

    dataset = PretrainingDatasetFactory.from_config(_C, split='train')
    dataloader = DataLoader(
        dataset,
        batch_size=_C.OPTIM.BATCH_SIZE,
        shuffle=False,
        num_workers=_A.cpu_workers,
        drop_last=False,
        collate_fn=dataset.collate_fn,
    )
    print('dataloader size:', len(dataloader))

    # Build a mapping from image id to photo id over the whole train split.
    photoids = dict()
    pbar = tqdm(total=len(dataloader))
    for batch in dataloader:
        for image_id, photo_id in zip(batch["image_id"], batch["photo_id"]):
            photoids[image_id.item()] = photo_id.item()
        pbar.update(1)
    pbar.close()

    # Save the image id -> photo id mapping as a JSON file.
    os.makedirs(os.path.dirname(_A.output), exist_ok=True)
    json.dump(photoids, open(_A.output, "w"))
    logger.info(f"Saved photoids to {_A.output}")
def main(_A: argparse.Namespace):

    apex = False
    is_cpu = False
    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
        is_cpu = True
    else:
        # Get the current device as set for current distributed process.
        # Check `launch` function in `virtex.utils.distributed` module.
        device = torch.cuda.current_device()

    # Create a config object (this will be immutable) and perform common setup
    # such as logging and setting up serialization directory.
    _C = Config(_A.config, _A.config_override)
    common_setup(_C, _A)

    # -------------------------------------------------------------------------
    #   INSTANTIATE DATALOADER, MODEL, OPTIMIZER
    # -------------------------------------------------------------------------
    tokenizer = TokenizerFactory.from_config(_C)
    train_dataset = PretrainingDatasetFactory.from_config(_C, split="train", csv=_A.train_csv)
    val_dataset = PretrainingDatasetFactory.from_config(_C, split="val", csv=_A.val_csv)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=DistributedSampler(train_dataset, shuffle=True),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=True,
        collate_fn=train_dataset.collate_fn,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=DistributedSampler(val_dataset, shuffle=False),
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )

    model = PretrainingModelFactory.from_config(_C).to(device)
    optimizer = OptimizerFactory.from_config(_C, model.named_parameters())
    scheduler = LRSchedulerFactory.from_config(_C, optimizer)

    # -------------------------------------------------------------------------
    #   BEFORE TRAINING STARTS
    # -------------------------------------------------------------------------

    # Load checkpoint to resume training if specified.
    if _A.resume_from is not None:
        start_iteration = CheckpointManager(
            model=model, optimizer=optimizer, scheduler=scheduler
        ).load(_A.resume_from)
    else:
        start_iteration = 0

    # Keep track of time per iteration and ETA.
    timer = Timer(
        start_from=start_iteration + 1,
        total_iterations=_C.OPTIM.NUM_ITERATIONS,
    )
    # Create an iterator from dataloader to sample batches perpetually.
    train_dataloader_iter = cycle(train_dataloader, device, start_iteration)

    if not is_cpu:
        # Wrap model and optimizer using NVIDIA Apex for mixed precision training.
        # NOTE: Always do this before wrapping model with DistributedDataParallel.
        if apex and _C.FP16_OPT > 0:
            from apex import amp
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=f"O{_C.FP16_OPT}"
            )

        # Wrap model in DDP if using more than one processes.
        if dist.get_world_size() > 1:
            dist.synchronize()
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[device], find_unused_parameters=True
            )

    # Create checkpoint manager and tensorboard writer (only in master process).
    if dist.is_master_process():
        checkpoint_manager = CheckpointManager(
            _A.serialization_dir,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
        )
        tensorboard_writer = SummaryWriter(log_dir=_A.serialization_dir)
        tensorboard_writer.add_text("config", f"```\n{_C}\n```")

    # -------------------------------------------------------------------------
    #   TRAINING LOOP
    # -------------------------------------------------------------------------
    for iteration in range(start_iteration + 1, _C.OPTIM.NUM_ITERATIONS + 1):
        timer.tic()
        optimizer.zero_grad()
        batch_loss = torch.tensor(0.0, device=device)

        batch = next(train_dataloader_iter)
        output_dict = model(batch)
        loss = output_dict["loss"]
        batch_loss += loss.item()

        # Perform dynamic scaling of loss to adjust for mixed precision.
        if apex and _C.FP16_OPT > 0:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # Clip norm of gradients before optimizer step.
        torch.nn.utils.clip_grad_norm_(
            amp.master_params(optimizer)
            if apex and _C.FP16_OPT > 0
            else model.parameters(),
            _C.OPTIM.CLIP_GRAD_NORM,
        )
        optimizer.step()
        scheduler.step(iteration)
        timer.toc()

        # ---------------------------------------------------------------------
        #   TENSORBOARD LOGGING
        # ---------------------------------------------------------------------
        if iteration % _A.log_every == 0 and dist.is_master_process():
            logger.info(
                f"{timer.stats} | Loss: {batch_loss:.3f} | "
                f"GPU mem: {dist.gpu_mem_usage()} MB"
            )
            tensorboard_writer.add_scalars(
                "learning_rate",
                {
                    "visual": optimizer.param_groups[0]["lr"],
                    "common": optimizer.param_groups[-1]["lr"],
                },
                iteration,
            )
            tensorboard_writer.add_scalars(
                "train", output_dict["loss_components"], iteration
            )

        # ---------------------------------------------------------------------
        #   VALIDATION
        # ---------------------------------------------------------------------
        if iteration % _A.checkpoint_every == 0:
            if dist.is_master_process():
                checkpoint_manager.step(iteration)

            torch.set_grad_enabled(False)
            model.eval()

            # Accumulate different val loss components according to the type of
            # pretraining model.
            val_loss_counter: Counter = Counter()

            for val_iteration, val_batch in enumerate(val_dataloader, start=1):
                for key in val_batch:
                    val_batch[key] = val_batch[key].to(device)
                output_dict = model(val_batch)
                val_loss_counter.update(output_dict["loss_components"])

            # Divide each loss component by number of val batches per GPU.
            val_loss_dict = {
                k: v / val_iteration for k, v in dict(val_loss_counter).items()
            }
            dist.average_across_processes(val_loss_dict)
            torch.set_grad_enabled(True)
            model.train()

        if iteration % _A.checkpoint_every == 0 and dist.is_master_process():
            logger.info(f"Iter: {iteration} | Val loss: {val_loss_dict}")
            tensorboard_writer.add_scalars("val", val_loss_dict, iteration)

        # All processes will wait till master process is done logging.
        dist.synchronize()
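# The training loop above draws batches from `cycle(train_dataloader, device,
# start_iteration)`, a helper defined elsewhere in this repository. Below is a
# minimal sketch of such a helper: the signature is taken from the call above,
# but the body (re-seeding the DistributedSampler and moving tensors to
# `device`) is an assumption, not the repository's actual implementation.
from torch.utils.data import DistributedSampler


def cycle(dataloader, device, start_iteration: int = 0):
    r"""Yield batches from ``dataloader`` perpetually, moved to ``device``."""
    iteration = start_iteration
    while True:
        # Re-seed the sampler so shuffling differs every time the loader restarts.
        if isinstance(dataloader.sampler, DistributedSampler):
            dataloader.sampler.set_epoch(iteration)
        for batch in dataloader:
            for key in batch:
                batch[key] = batch[key].to(device)
            yield batch
            iteration += 1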
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)
    tokenizer = TokenizerFactory.from_config(_C)

    if _A.data_root is None:
        _A.data_root = os.path.join(_C.DATA.ROOT, "val2017")

    val_dataset = PretrainingDatasetFactory.from_config(_C, split='val', all_captions=True)
    val_dataset_no_image = PretrainingDatasetFactory.from_config(
        _C, split='val', all_captions=True, include_image=False
    )

    val_sampler = (
        DistributedSampler(val_dataset, shuffle=False)
        if _A.num_gpus_per_machine > 0
        else None
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=val_sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    val_dataloader_no_image = DataLoader(
        val_dataset_no_image,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        shuffle=False,
        drop_last=False,
        collate_fn=val_dataset.collate_fn,
    )
    evaluator = CiderEvaluator(val_dataloader_no_image, prefix='val')

    # Initialize model from a checkpoint.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()

    # Make a list of predictions to evaluate.
    predictions: List[Dict[str, Any]] = []

    if dist.is_master_process():
        pbar = tqdm(total=len(val_dataloader))

    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch = {
            'image_id': val_batch['image_id'].to(device),
            'image': val_batch['image'].to(device),
        }
        with torch.no_grad():
            output_dict = model(val_batch)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(
            val_batch["image_id"], output_dict["predictions"][:, 1:]
        ):
            predictions.append(
                {
                    # Convert image id to int if possible (mainly for COCO eval).
                    "image_id": image_id.item(),
                    "caption": tokenizer.decode(caption.tolist()),
                }
            )
        if dist.is_master_process():
            pbar.update(1)

    if dist.is_master_process():
        pbar.close()

    # Save predictions as a JSON file if specified.
    if _A.output is not None:
        os.makedirs(os.path.dirname(_A.output), exist_ok=True)
        json.dump(predictions, open(_A.output, "w"))
        logger.info(f"Saved predictions to {_A.output}")

    # Calculate CIDEr and SPICE metrics using ground truth COCO Captions. This
    # should be skipped when running inference on arbitrary images.
    if _A.calc_metrics:
        metrics = evaluator.evaluate(predictions)
        metrics = {
            k: torch.tensor(v, dtype=torch.float, device=device)
            for k, v in metrics.items()
        }
        dist.average_across_processes(metrics)
        metrics = {k: v.item() for k, v in metrics.items()}
        if dist.is_master_process():
            logger.info(f"Iter: {ITERATION} | Metrics: {metrics}")
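# The metric averaging above relies on `dist.average_across_processes`, a
# utility from this repository's distributed module. The sketch below assumes
# it is a thin wrapper around an all-reduce that sums each tensor across ranks
# in place and divides by the world size; it is not the repository's code.
import torch.distributed as torch_dist


def average_across_processes(t: dict) -> None:
    r"""Average every tensor value of ``t`` across distributed processes."""
    if torch_dist.is_initialized() and torch_dist.get_world_size() > 1:
        for name in t:
            torch_dist.all_reduce(t[name], op=torch_dist.ReduceOp.SUM)
            t[name] /= torch_dist.get_world_size()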
def main(_A: argparse.Namespace):

    if _A.num_gpus_per_machine == 0:
        # Set device as CPU if num_gpus_per_machine = 0.
        device = torch.device("cpu")
    else:
        # Get the current device (this will be zero here by default).
        device = torch.cuda.current_device()

    _C = Config(_A.config, _A.config_override)
    tokenizer = TokenizerFactory.from_config(_C)

    dataset = PretrainingDatasetFactory.from_config(_C, split='train')
    sampler = (
        DistributedSampler(dataset, shuffle=False)
        if _A.num_gpus_per_machine > 0
        else None
    )
    val_dataloader = DataLoader(
        dataset,
        batch_size=_C.OPTIM.BATCH_SIZE // dist.get_world_size(),
        sampler=sampler,
        shuffle=False,
        num_workers=_A.cpu_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=dataset.collate_fn,
    )

    # Initialize model from a checkpoint and switch it to sampling mode.
    model = PretrainingModelFactory.from_config(_C).to(device)
    ITERATION = CheckpointManager(model=model).load(_A.checkpoint_path)
    model.eval()
    torch.set_grad_enabled(False)
    model.sample_on()

    captions = dict()
    if dist.is_master_process():
        pbar = tqdm(total=len(val_dataloader))

    for val_iteration, val_batch in enumerate(val_dataloader, start=1):
        val_batch = {
            'image_id': val_batch['image_id'].to(device),
            'image': val_batch['image'].to(device),
        }

        # Sample beam-search captions per image (drop the start token).
        predictions = []
        for k in [1]:
            predictions.append(
                model(val_batch, sample_mode='beam', n_samples_per_image=k)['predictions'][:, 1:]
            )

        # Right-pad all samples to the same length, then stack them per image.
        max_length = max([p.shape[1] for p in predictions])
        predictions = [
            torch.cat(
                (p, torch.zeros(p.shape[0], max_length - p.shape[1], dtype=p.dtype, device=device)),
                dim=1,
            )
            for p in predictions
        ]
        predictions = torch.stack(predictions, dim=1)

        # Make a dictionary of predictions in COCO format.
        for image_id, caption in zip(val_batch["image_id"], predictions):
            captions[image_id.item()] = [
                tokenizer.decode(c.tolist()).strip() for c in caption
            ]
        if dist.is_master_process():
            pbar.update(1)

    if dist.is_master_process():
        pbar.close()

    # Save predictions as a JSON file, one file per process (rank prefix).
    folder = os.path.dirname(_A.output)
    os.makedirs(folder, exist_ok=True)
    filename = os.path.basename(_A.output)
    filepath = os.path.join(folder, f'{dist.get_rank()}_{filename}')
    json.dump(captions, open(filepath, "w"))
    logger.info(f"Saved predictions to {filepath}")
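# Each process above writes its own `{rank}_{filename}` shard. The helper below
# is a hypothetical post-processing step (not part of the repository) that
# merges those shards back into a single JSON file; it assumes the shard naming
# produced by the script above.
import json
import os
from typing import Any, Dict


def merge_rank_shards(output_path: str, world_size: int) -> None:
    r"""Merge per-rank caption shards into one JSON file at ``output_path``."""
    folder, filename = os.path.dirname(output_path), os.path.basename(output_path)
    merged: Dict[str, Any] = {}
    for rank in range(world_size):
        with open(os.path.join(folder, f"{rank}_{filename}")) as f:
            merged.update(json.load(f))
    with open(output_path, "w") as f:
        json.dump(merged, f)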