def save_config(self):
    if not is_main():
        return

    cfg_file = os.path.join(self.ckpt_foldername, "config.yaml")
    with PathManager.open(cfg_file, "w") as f:
        f.write(OmegaConf.to_yaml(self.config, resolve=True))
def flush_report(self):
    if not is_main():
        # Empty the report in all processes to avoid any leaks
        self.report = []
        return

    name = self.current_datamodule.dataset_name
    time_format = "%Y-%m-%dT%H:%M:%S"
    time = self.timer.get_time_hhmmss(None, format=time_format)

    filename = name + "_"
    if len(self.experiment_name) > 0:
        filename += self.experiment_name + "_"
    filename += self.dataset_type + "_"
    filename += time

    use_csv_writer = (
        self.config.evaluation.predict_file_format == "csv"
        or self.test_reporter_config.predict_file_format == "csv"
    )

    if use_csv_writer:
        filepath = os.path.join(self.report_folder, filename + ".csv")
        self.csv_dump(filepath)
    else:
        filepath = os.path.join(self.report_folder, filename + ".json")
        self.json_dump(filepath)

    logger.info(f"Wrote predictions for {name} to {os.path.abspath(filepath)}")

    self.report = []
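# Worked example of the naming scheme above (a sketch with illustrative
# values only, not from the source): with dataset name "vqa2", experiment
# name "exp1", dataset type "val", and the timestamp below, the report is
# written as <report_folder>/vqa2_exp1_val_2024-01-01T12:00:00.json (or .csv).
name, experiment_name, dataset_type = "vqa2", "exp1", "val"
time = "2024-01-01T12:00:00"
filename = name + "_"
if len(experiment_name) > 0:
    filename += experiment_name + "_"
filename += dataset_type + "_" + time
assert filename == "vqa2_exp1_val_2024-01-01T12:00:00"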
def __init__(
    self,
    loaders: Dict[str, DataLoader],
    iteration_strategy: iteration_strategies.IterationStrategy = None,
):
    if loaders is None or len(loaders) == 0:
        warnings.warn(
            "Empty loaders passed into MultiDataLoader. This can have "
            "unintended consequences."
        )

    if iteration_strategy is None:
        iteration_strategy = iteration_strategies.RoundRobinIterationStrategy(
            OmegaConf.create(), loaders
        )

    self._iteration_strategy = iteration_strategy
    self._loaders = loaders
    self._is_main = is_main()
    self._num_datasets = len(self.loaders)
    self.dataset_list = list(loaders.keys())
    self._iterators = {}
    self._finished_iterators = {}
    self.current_index = 0

    self.set_lengths()
    self.set_samplers()
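# Illustrative construction (a sketch, not from the source): assuming the
# __init__ above belongs to MMF's MultiDataLoader, and with two placeholder
# PyTorch DataLoaders standing in for real dataset loaders, an explicit
# round-robin strategy could be passed like this. Everything named here
# besides OmegaConf and DataLoader is an assumption about the surrounding
# module. Omitting `iteration_strategy` falls back to the same default.
from omegaconf import OmegaConf
from torch.utils.data import DataLoader

loaders = {
    "dataset_a": DataLoader(list(range(10)), batch_size=2),  # placeholder data
    "dataset_b": DataLoader(list(range(10)), batch_size=2),
}
strategy = iteration_strategies.RoundRobinIterationStrategy(
    OmegaConf.create(), loaders
)
multi_loader = MultiDataLoader(loaders, iteration_strategy=strategy)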
def __init__(self, log_folder="./logs", iteration=0):
    self._summary_writer = None
    self._is_main = is_main()
    self.timer = Timer()
    self.log_folder = log_folder
    self.time_format = "%Y-%m-%dT%H:%M:%S"
    current_time = self.timer.get_time_hhmmss(None, format=self.time_format)
    self.tensorboard_folder = os.path.join(
        self.log_folder, f"tensorboard_{current_time}"
    )
def _threaded_read(self):
    elements = [idx for idx in range(1, len(self.annotation_db))]
    pool = ThreadPool(processes=4)

    with tqdm.tqdm(total=len(elements), disable=not is_main()) as pbar:
        for i, _ in enumerate(pool.imap_unordered(self._fill_cache, elements)):
            if i % 100 == 0:
                pbar.update(100)

    pool.close()
def __init__(self, embedding_name, *args, **kwargs):
    """Use this if you want to use a pretrained embedding. See the description
    of IntersectedVocab for the list of embeddings available from torchtext.

    Parameters
    ----------
    embedding_name : str
        Name of the pretrained alias for the embedding to be used
    """
    self.type = "pretrained"

    if embedding_name not in vocab.pretrained_aliases:
        raise RuntimeError(f"Unknown embedding type: {embedding_name}")

    vector_cache = get_mmf_cache_dir()

    # First test loading the vectors in the main process so that everybody
    # doesn't download them in case they don't exist
    if is_main():
        vocab.pretrained_aliases[embedding_name](cache=vector_cache)
    synchronize()

    embedding = vocab.pretrained_aliases[embedding_name](cache=vector_cache)

    self.UNK_INDEX = 3
    self.stoi = defaultdict(lambda: self.UNK_INDEX)
    self.itos = {}

    self.itos[self.PAD_INDEX] = self.PAD_TOKEN
    self.itos[self.SOS_INDEX] = self.SOS_TOKEN
    self.itos[self.EOS_INDEX] = self.EOS_TOKEN
    self.itos[self.UNK_INDEX] = self.UNK_TOKEN

    self.stoi[self.SOS_TOKEN] = self.SOS_INDEX
    self.stoi[self.EOS_TOKEN] = self.EOS_INDEX
    self.stoi[self.PAD_TOKEN] = self.PAD_INDEX
    self.stoi[self.UNK_TOKEN] = self.UNK_INDEX

    self.vectors = torch.FloatTensor(
        len(self.itos.keys()) + len(embedding.itos), len(embedding.vectors[0])
    )

    for i in range(4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    index = 4
    for word in embedding.stoi:
        self.itos[index] = word
        self.stoi[word] = index
        actual_index = embedding.stoi[word]
        self.vectors[index] = embedding.vectors[actual_index]
        index += 1
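# The download-once pattern used above, shown in isolation. A minimal sketch,
# not MMF code: `fetch_resource` is a hypothetical callable standing in for
# torchtext's alias loading, while `is_main` and `synchronize` are the same
# MMF distributed utilities used throughout these snippets.
def load_shared_resource(fetch_resource, cache_dir):
    # Only the main process triggers the (possibly slow) download ...
    if is_main():
        fetch_resource(cache=cache_dir)
    # ... every other rank blocks at this barrier until main is done ...
    synchronize()
    # ... after which all ranks can safely load from the warm cache.
    return fetch_resource(cache=cache_dir)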
def setup(self):
    """Setup `Weights and Biases` for logging."""
    if is_main():
        if self._wandb.run is None:
            self._wandb.init(**self._wandb_init)

        # Define default x-axis (for latest wandb versions)
        if getattr(self._wandb, "define_metric", None):
            self._wandb.define_metric("trainer/global_step")
            self._wandb.define_metric(
                "*", step_metric="trainer/global_step", step_sync=True
            )
def try_fast_read(self):
    # Don't fast read in case of the test set.
    if self._dataset_type == "test":
        return

    if hasattr(self, "_should_fast_read") and self._should_fast_read is True:
        logger.info(
            f"Starting to fast read {self.dataset_name} {self.dataset_type} "
            + "dataset"
        )
        self.cache = {}
        for idx in tqdm.tqdm(
            range(len(self.annotation_db)), miniters=100, disable=not is_main()
        ):
            self.cache[idx] = self.load_item(idx)
def __call__(self, update, iteration, meter):
    """Method to be called every time you need to check whether to
    early stop or not.

    Arguments:
        update {number}: Current update number
        iteration {number}: Current iteration number

    Returns:
        bool -- Tells whether early stopping occurred or not
    """
    # There are operations involving synchronization downstream.
    # For XLA those calls must be executed from all cores,
    # therefore we do not return early here in case of XLA.
    if not is_main() and not is_xla():
        return False

    value = meter.meters.get(self.early_stop_criteria, None)
    if value is None:
        raise ValueError(
            "Criteria used for early stopping ({}) is not "
            "present in meter.".format(self.early_stop_criteria)
        )

    value = value.global_avg

    if isinstance(value, torch.Tensor):
        value = value.item()

    if (self.minimize and value < self.best_monitored_value) or (
        not self.minimize and value > self.best_monitored_value
    ):
        self.best_monitored_value = value
        self.best_monitored_iteration = iteration
        self.best_monitored_update = update
        self.checkpoint.save(update, iteration, update_best=True)
    elif self.best_monitored_update + self.patience < update:
        self.activated = True
        if self.should_stop is True:
            self.checkpoint.restore()
            self.checkpoint.finalize()
            return True
        else:
            return False
    else:
        self.checkpoint.save(update, iteration, update_best=False)

    return False
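# Illustrative call site (a sketch, not from the source): assuming the
# __call__ above belongs to an early-stopping object wired to a checkpoint,
# a training loop would consult it once per update and stop when it returns
# True. `early_stopping`, `run_one_update`, `meter`, and `max_updates` are
# all hypothetical stand-ins for the trainer's real members.
def train_with_early_stopping(early_stopping, run_one_update, meter, max_updates):
    for update in range(1, max_updates + 1):
        iteration = run_one_update()  # one optimization step, returns iteration
        # On non-main, non-XLA ranks this is a cheap no-op returning False
        if early_stopping(update, iteration, meter):
            logger.info("Early stopping activated; best checkpoint restored.")
            break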
def summarize_report(
    current_iteration,
    num_updates,
    max_updates,
    meter,
    should_print=True,
    extra=None,
    tb_writer=None,
    wandb_logger=None,
):
    if extra is None:
        extra = {}

    if not is_main() and not is_xla():
        return

    # Log the learning rate if available
    if wandb_logger and "lr" in extra:
        wandb_logger.log_metrics(
            {"train/learning_rate": float(extra["lr"])}, commit=False
        )

    if tb_writer:
        scalar_dict = meter.get_scalar_dict()
        tb_writer.add_scalars(scalar_dict, current_iteration)

    if wandb_logger:
        metrics = meter.get_scalar_dict()
        wandb_logger.log_metrics(
            {**metrics, "trainer/global_step": current_iteration}
        )

    if not should_print:
        return

    log_dict = {}
    if num_updates is not None and max_updates is not None:
        log_dict.update({"progress": f"{num_updates}/{max_updates}"})

    log_dict.update(meter.get_log_dict())
    log_dict.update(extra)

    log_progress(log_dict)
def build_lightning_model(
    config: Union[DictConfig, "mmf.models.base_model.BaseModel.Config"],
    checkpoint_path: str = None,
) -> "mmf.models.base_model.BaseModel":
    from mmf.models.base_model import BaseModel

    if not checkpoint_path:
        model = build_model(config)
        model.is_pl_enabled = True
        return model

    # If it is not an OmegaConf object, create the object
    if not isinstance(config, DictConfig) and isinstance(config, BaseModel.Config):
        config = OmegaConf.structured(config)

    model_name = config.model
    model_class = registry.get_model_class(model_name)
    if model_class is None:
        raise RuntimeError(f"No model registered for name: {model_name}")

    # model.build is called inside on_load_checkpoint as suggested here:
    # https://github.com/PyTorchLightning/pytorch-lightning/issues/5410
    if is_main():
        model_class.load_requirements(model_class, config=config)
        model = model_class.load_from_checkpoint(
            checkpoint_path, config=config, strict=False
        )
        synchronize()
    else:
        synchronize()
        model = model_class.load_from_checkpoint(
            checkpoint_path, config=config, strict=False
        )

    model.init_losses()
    model.is_pl_enabled = True
    return model
def save_xla_ckpt(ckpt, file_or_path):
    """
    Similar to xm.save, but only try to convert "model" and "optimizer" in an
    MMF checkpoint to CPU, since they hold PyTorch tensors. Other items like
    lr_scheduler often cannot be saved with xm.save due to its errors in
    handling mappingproxy.

    Only save on the global main process (which is different from the default
    behavior of xm.save that saves a checkpoint on each node).
    """
    should_write_data = is_main()

    is_full_ckpt = isinstance(ckpt, dict) and "model" in ckpt and "optimizer" in ckpt
    if is_full_ckpt:
        ckpt["model"] = xm._maybe_convert_to_cpu(
            ckpt["model"], convert=should_write_data
        )
        ckpt["optimizer"] = xm._maybe_convert_to_cpu(
            ckpt["optimizer"], convert=should_write_data
        )
    else:
        ckpt = xm._maybe_convert_to_cpu(ckpt, convert=should_write_data)

    if should_write_data:
        torch.save(ckpt, file_or_path)
    xm.rendezvous("mmf.utils.checkpoint.save_xla_ckpt")
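# Usage sketch (not from the source): a full MMF checkpoint dict triggers the
# selective CPU conversion of "model" and "optimizer" above, while everything
# else passes through and is written by the main process only. `model` and
# `optimizer` are hypothetical torch module/optimizer instances. Note that
# every rank must make this call, since non-main ranks still have to reach
# the xm.rendezvous barrier at the end.
def save_example(model, optimizer, path):
    ckpt = {
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "num_updates": 1000,  # plain Python values are saved as-is
    }
    save_xla_ckpt(ckpt, path)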
def build_multiple_datamodules(
    dataset_list: List[str], all_dataset_config: DictConfig
) -> Dict[str, pl.LightningDataModule]:
    datamodules: Dict[str, pl.LightningDataModule] = {}
    for dataset in dataset_list:
        datamodule_instance = build_datamodule(dataset)
        if dataset in all_dataset_config:
            dataset_config = all_dataset_config[dataset]
        else:
            warnings.warn(
                f"Dataset {dataset} is missing from dataset_config"
                + " in config. Proceeding with empty config."
            )
            dataset_config = OmegaConf.create()

        if is_main():
            datamodule_instance.prepare_data(dataset_config)

        synchronize()
        datamodule_instance.setup(config=dataset_config)
        if hasattr(datamodule_instance, "update_registry_for_model"):
            datamodule_instance.update_registry_for_model(dataset_config)
        datamodules[dataset] = datamodule_instance
    return datamodules
def build_model(
    config: Union[DictConfig, "mmf.models.base_model.BaseModel.Config"],
) -> "mmf.models.base_model.BaseModel":
    from mmf.models.base_model import BaseModel

    # If it is not an OmegaConf object, create the object
    if not isinstance(config, DictConfig) and isinstance(config, BaseModel.Config):
        config = OmegaConf.structured(config)

    model_name = config.model
    model_class = registry.get_model_class(model_name)

    if model_class is None:
        raise RuntimeError(f"No model registered for name: {model_name}")
    model = model_class(config)

    if hasattr(model, "build"):
        """Model build involves checkpoint loading. If the checkpoint is not
        available, the underlying methods try to download it. Let the main
        process build the model (download the checkpoints) while the other
        ranks wait for the sync message. Once the main process has downloaded
        the checkpoint and built the model, it sends the sync message,
        completing the synchronization; the other ranks can then proceed to
        build the model using the already-downloaded checkpoint.
        """
        if is_main():
            model_class.load_requirements(model_class, config=config)
            model.build()
            synchronize()
        else:
            synchronize()
            model.build()
        model.init_losses()

    return model
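# Usage sketch (not from the source): build_model accepts either a DictConfig
# or a structured BaseModel.Config. The minimal config below is hypothetical;
# a real registered model would typically require more fields. On distributed
# runs every rank must make this call, since both branches above hit
# synchronize().
from omegaconf import OmegaConf

model_config = OmegaConf.create({"model": "my_registered_model"})  # hypothetical
model = build_model(model_config)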
def _should_log_wandb(self):
    if self._wandb is None or not is_main():
        return False
    else:
        return True
def evaluation_loop(
    self, dataset_type: str, use_tqdm: bool = False, single_batch: bool = False
) -> Tuple[Dict[str, Any], Type[Meter]]:
    meter = Meter()
    reporter = self.dataset_loader.get_test_reporter(dataset_type)
    use_cpu = self.config.evaluation.get("use_cpu", False)
    loaded_batches = 0
    skipped_batches = 0

    with torch.no_grad():
        self.model.eval()
        disable_tqdm = not use_tqdm or not is_main()
        while reporter.next_dataset(flush_report=False):
            dataloader = reporter.get_dataloader()
            combined_report = None

            if self._can_use_tqdm(dataloader):
                dataloader = tqdm.tqdm(dataloader, disable=disable_tqdm)
            for batch in dataloader:
                # Do not timeout quickly on the first batch, as workers might
                # start at very different times.
                with CompleteInTimeOrDie(600 if loaded_batches else 3600 * 24):
                    loaded_batches += 1
                    prepared_batch = reporter.prepare_batch(batch)
                    prepared_batch = to_device(prepared_batch, self.device)

                    if not validate_batch_sizes(prepared_batch.get_batch_size()):
                        logger.info("Skip batch due to uneven batch sizes.")
                        skipped_batches += 1
                        continue

                    model_output = self.model(prepared_batch)
                    report = Report(prepared_batch, model_output)
                    report = report.detach()

                    meter.update_from_report(report)

                    moved_report = report
                    # Move to CPU for metrics calculation later if needed.
                    # Explicitly use `non_blocking=False` as this can cause
                    # race conditions in the next accumulate
                    if use_cpu:
                        moved_report = report.copy().to("cpu", non_blocking=False)

                    # Accumulate necessary params for metric calculation
                    if combined_report is None:
                        # Make a copy of the report since `reporter.add_to_report`
                        # will change some of the report keys later
                        combined_report = moved_report.copy()
                    else:
                        combined_report.accumulate_tensor_fields_and_loss(
                            moved_report, self.metrics.required_params
                        )
                        combined_report.batch_size += moved_report.batch_size

                    # Each node generates a separate copy of the predict JSON
                    # from the report, which will be used to evaluate
                    # dataset-level metrics (such as mAP in object detection
                    # or CIDEr in image captioning). Since
                    # `reporter.add_to_report` changes report keys (e.g.
                    # scores), do this after
                    # `combined_report.accumulate_tensor_fields_and_loss`
                    if "__prediction_report__" in self.metrics.required_params:
                        # Still need to use the original report here on
                        # GPU/TPU since it will be gathered
                        reporter.add_to_report(report, self.model)

                    if single_batch is True:
                        break

            logger.info(f"Finished evaluation. Loaded {loaded_batches} batches")
            logger.info(f" -- skipped {skipped_batches} batches.")

            reporter.postprocess_dataset_report()

        assert (
            combined_report is not None
        ), "Please check if your validation set is empty!"
        # prediction_report is used for dataset-level metrics
        combined_report.prediction_report = reporter.report

        combined_report.metrics = self.metrics(combined_report, combined_report)

        # Since update_meter will reduce the metrics over GPUs, we need to
        # move them back to the GPU, but we only move the metrics and losses
        # needed by update_meter to avoid OOM. Furthermore, do it in a
        # blocking way (`non_blocking=False`) to avoid any issues in
        # device-to-host or host-to-device transfer
        if use_cpu:
            combined_report = combined_report.to(
                self.device, fields=["metrics", "losses"], non_blocking=False
            )

        meter.update_from_report(combined_report, should_update_loss=False)

    # Enable train mode again
    self.model.train()

    return combined_report, meter
def save(self, update, iteration=None, update_best=False):
    # Only save in the main process. For XLA we use the xm.save method,
    # which ensures that the actual checkpoint saving happens only on the
    # master node. That method also takes care of all the necessary
    # synchronization.
    if not is_main() and not is_xla():
        return

    logger.info("Checkpoint save operation started!")
    if not iteration:
        iteration = update

    ckpt_filepath = os.path.join(self.models_foldername, "model_%d.ckpt" % update)
    best_ckpt_filepath = os.path.join(
        self.ckpt_foldername, self.ckpt_prefix + "best.ckpt"
    )
    current_ckpt_filepath = os.path.join(
        self.ckpt_foldername, self.ckpt_prefix + "current.ckpt"
    )

    best_iteration = (
        self.trainer.early_stop_callback.early_stopping.best_monitored_iteration
    )
    best_update = (
        self.trainer.early_stop_callback.early_stopping.best_monitored_update
    )
    best_metric = (
        self.trainer.early_stop_callback.early_stopping.best_monitored_value
    )

    model = self.trainer.model
    data_parallel = registry.get("data_parallel") or registry.get("distributed")
    fp16_scaler = getattr(self.trainer, "scaler", None)
    fp16_scaler_dict = None
    if fp16_scaler is not None:
        fp16_scaler_dict = fp16_scaler.state_dict()

    if data_parallel is True:
        model = model.module

    ckpt = {
        "model": model.state_dict(),
        "optimizer": self.trainer.optimizer.state_dict(),
        "best_iteration": best_iteration,
        "current_iteration": iteration,
        "current_epoch": self.trainer.current_epoch,
        "num_updates": update,
        "best_update": best_update,
        "best_metric_value": best_metric,
        "fp16_scaler": fp16_scaler_dict,
        # Convert to container to avoid any dependencies
        "config": OmegaConf.to_container(self.config, resolve=True),
    }

    lr_scheduler = self.trainer.lr_scheduler_callback
    if (
        lr_scheduler is not None
        and getattr(lr_scheduler, "_scheduler", None) is not None
    ):
        lr_scheduler = lr_scheduler._scheduler
        ckpt["lr_scheduler"] = lr_scheduler.state_dict()

    if self.git_repo:
        git_metadata_dict = self._get_vcs_fields()
        ckpt.update(git_metadata_dict)

    with open_if_main(ckpt_filepath, "wb") as f:
        self.save_func(ckpt, f)

    if update_best:
        logger.info("Saving best checkpoint")
        with open_if_main(best_ckpt_filepath, "wb") as f:
            self.save_func(ckpt, f)

    # Save current always
    logger.info("Saving current checkpoint")
    with open_if_main(current_ckpt_filepath, "wb") as f:
        self.save_func(ckpt, f)

    # Save the current checkpoint as W&B artifacts for model versioning
    if self.config.training.wandb.log_checkpoint:
        logger.info(
            "Saving current checkpoint as W&B Artifacts for model versioning"
        )
        self.trainer.logistics_callback.wandb_logger.log_model_checkpoint(
            current_ckpt_filepath
        )

    # Remove old checkpoints if max_to_keep is set.
    # In XLA, only delete checkpoint files in the main process.
    if self.max_to_keep > 0 and is_main():
        if len(self.saved_iterations) == self.max_to_keep:
            self.remove(self.saved_iterations.pop(0))
        self.saved_iterations.append(update)

    logger.info("Checkpoint save operation finished!")
def __init__(self, vocab_file, embedding_name, *args, **kwargs):
    """Use this vocab class when you have a custom vocabulary class but you
    want to use pretrained embedding vectors for it. This will only load the
    vectors which intersect with your vocabulary. Use the embedding_name
    specified in torchtext's pretrained aliases:

    ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d',
     'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d',
     'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
     'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d',
     'glove.6B.200d', 'glove.6B.300d']

    Parameters
    ----------
    vocab_file : str
        Vocabulary file containing a list of words, one word per line,
        which will be used to collect vectors
    embedding_name : str
        Embedding name picked from the list of pretrained aliases
        mentioned above
    """
    super().__init__(vocab_file, *args, **kwargs)

    self.type = "intersected"

    name = embedding_name.split(".")[0]
    dim = embedding_name.split(".")[2][:-1]
    middle = embedding_name.split(".")[1]

    class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

    if not hasattr(vocab, class_name):
        raise RuntimeError(f"Unknown embedding type: {name}")

    params = [middle]

    if name == "glove":
        params.append(int(dim))

    vector_cache = get_mmf_cache_dir()

    # First test loading the vectors in the main process so that everybody
    # doesn't download them in case they don't exist
    if is_main():
        vocab.pretrained_aliases[embedding_name](cache=vector_cache)
    synchronize()

    embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

    self.vectors = torch.empty(
        (self.get_size(), len(embedding.vectors[0])), dtype=torch.float
    )

    self.embedding_dim = len(embedding.vectors[0])

    for i in range(0, 4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    for i in range(4, self.get_size()):
        word = self.itos[i]
        embedding_index = embedding.stoi.get(word, None)

        if embedding_index is None:
            self.vectors[i] = self.vectors[self.UNK_INDEX]
        else:
            self.vectors[i] = embedding.vectors[embedding_index]
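# Worked example of the alias parsing above (a sketch, not from the source):
# for embedding_name = "glove.6B.300d" the three split fields become
# name="glove", middle="6B", and dim="300" (the trailing "d" is stripped),
# so torchtext's GloVe-style class would be instantiated as GloVe("6B", 300).
parts = "glove.6B.300d".split(".")
assert parts[0] == "glove"     # name -> EMBEDDING_NAME_CLASS_MAPPING key
assert parts[1] == "6B"        # middle -> first positional param
assert parts[2][:-1] == "300"  # dim, with the trailing "d" removed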
def finalize(self):
    if is_main() or is_xla():
        with open_if_main(self.pth_filepath, "wb") as f:
            self.save_func(self.trainer.model.state_dict(), f)