コード例 #1
0
    def save_config(self):
        """Dump the configuration as YAML into the checkpoint folder.

        The file is named ``config.yaml`` and is placed under
        ``self.ckpt_foldername``. Only the main process writes it; every
        other process returns without touching the file system.
        """
        if is_main():
            destination = os.path.join(self.ckpt_foldername, "config.yaml")
            with PathManager.open(destination, "w") as handle:
                # Serialized with resolve=True, as requested from OmegaConf
                serialized = OmegaConf.to_yaml(self.config, resolve=True)
                handle.write(serialized)
コード例 #2
0
ファイル: test_reporter.py プロジェクト: facebookresearch/mmf
    def flush_report(self):
        """Write the accumulated report to disk and reset it.

        The output file lives in ``self.report_folder`` and is named
        ``<dataset name>_[<experiment name>_]<dataset type>_<timestamp>``
        with a ``.csv`` or ``.json`` extension. CSV is chosen when either
        ``config.evaluation.predict_file_format`` or
        ``test_reporter_config.predict_file_format`` equals ``"csv"``.

        Only the main process writes the file; all other processes just
        clear ``self.report``.
        """
        if not is_main():
            # Empty report in all processes to avoid any leaks
            self.report = []
            return

        name = self.current_datamodule.dataset_name
        timestamp = self.timer.get_time_hhmmss(None, format="%Y-%m-%dT%H:%M:%S")

        # Assemble "<name>_[<experiment>_]<dataset type>_<timestamp>"
        stem = name + "_"
        if len(self.experiment_name) > 0:
            stem = stem + self.experiment_name + "_"
        stem = stem + self.dataset_type + "_" + timestamp

        wants_csv = (
            self.config.evaluation.predict_file_format == "csv"
            or self.test_reporter_config.predict_file_format == "csv"
        )

        extension = ".csv" if wants_csv else ".json"
        filepath = os.path.join(self.report_folder, stem + extension)
        if wants_csv:
            self.csv_dump(filepath)
        else:
            self.json_dump(filepath)

        logger.info(
            f"Wrote predictions for {name} to {os.path.abspath(filepath)}")
        self.report = []
コード例 #3
0
    def __init__(
        self,
        loaders: Dict[str, DataLoader],
        iteration_strategy: iteration_strategies.IterationStrategy = None,
    ):
        """Wrap several named data loaders behind a single iteration strategy.

        Parameters
        ----------
        loaders : Dict[str, DataLoader]
            Mapping from dataset name to its loader. ``None`` or an empty
            mapping triggers a warning; ``None`` is then treated as an
            empty mapping.
        iteration_strategy : iteration_strategies.IterationStrategy, optional
            Strategy used to pick the next loader. When omitted, a
            ``RoundRobinIterationStrategy`` is built from an empty config
            and ``loaders``.
        """
        if loaders is None or len(loaders) == 0:
            warnings.warn(
                "Empty loaders passed into MultiDataLoader. This can have "
                "unintended consequences.")

        if loaders is None:
            # The warning above signals that empty input is tolerated, but
            # ``None`` used to crash on ``len()`` / ``.keys()`` further down.
            # Normalise it to an empty mapping instead.
            loaders = {}

        if iteration_strategy is None:
            iteration_strategy = iteration_strategies.RoundRobinIterationStrategy(
                OmegaConf.create(), loaders)

        self._iteration_strategy = iteration_strategy
        self._loaders = loaders
        # Cached once at construction time
        self._is_main = is_main()
        self._num_datasets = len(self.loaders)
        self.dataset_list = list(loaders.keys())
        self._iterators = {}
        self._finished_iterators = {}

        self.current_index = 0
        self.set_lengths()
        self.set_samplers()
コード例 #4
0
ファイル: logger.py プロジェクト: facebookresearch/mmf
 def __init__(self, log_folder="./logs", iteration=0):
     """Prepare the logger state and derive the tensorboard output folder.

     Args:
         log_folder: Base folder under which the tensorboard folder is
             placed.
         iteration: Accepted for callers; not used in the lines visible
             here.
     """
     self._summary_writer = None
     self._is_main = is_main()
     self.timer = Timer()
     self.log_folder = log_folder
     self.time_format = "%Y-%m-%dT%H:%M:%S"

     # The folder name carries the creation timestamp
     stamp = self.timer.get_time_hhmmss(None, format=self.time_format)
     folder_name = f"tensorboard_{stamp}"
     self.tensorboard_folder = os.path.join(self.log_folder, folder_name)
コード例 #5
0
    def _threaded_read(self):
        """Fill the cache for the annotation entries using a thread pool.

        ``self._fill_cache`` is applied to every index from 1 up to
        ``len(self.annotation_db) - 1`` by four worker threads. A progress
        bar is shown on the main process only.
        """
        # NOTE(review): index 0 is skipped, as in the original code;
        # presumably the first entry is not a regular sample - confirm.
        elements = list(range(1, len(self.annotation_db)))
        pool = ThreadPool(processes=4)

        try:
            with tqdm.tqdm(total=len(elements), disable=not is_main()) as pbar:
                for _ in pool.imap_unordered(self._fill_cache, elements):
                    # One tick per finished element keeps the bar within
                    # ``total``. Adding 100 at i == 0 and on every 100th
                    # item overshot it.
                    pbar.update(1)
        finally:
            # Close the pool even when a worker raised
            pool.close()
コード例 #6
0
ファイル: vocab.py プロジェクト: facebookresearch/mmf
    def __init__(self, embedding_name, *args, **kwargs):
        """Build the vocabulary straight from a pretrained embedding.

        See the description of IntersectedVocab for the list of embeddings
        available from torchtext.

        Parameters
        ----------
        embedding_name : str
            Name of the pretrained alias of the embedding to use

        Raises
        ------
        RuntimeError
            If ``embedding_name`` is not in ``vocab.pretrained_aliases``
        """
        self.type = "pretrained"

        if embedding_name not in vocab.pretrained_aliases:
            raise RuntimeError(f"Unknown embedding type: {embedding_name}")

        load_embedding = vocab.pretrained_aliases[embedding_name]
        vector_cache = get_mmf_cache_dir()

        # Load once on the main process first so that the other processes
        # do not all download the vectors when they are missing
        if is_main():
            load_embedding(cache=vector_cache)
        synchronize()

        embedding = load_embedding(cache=vector_cache)

        self.UNK_INDEX = 3
        self.stoi = defaultdict(lambda: self.UNK_INDEX)
        self.itos = {}

        # Register the four special tokens in both directions
        for position, token in (
            (self.PAD_INDEX, self.PAD_TOKEN),
            (self.SOS_INDEX, self.SOS_TOKEN),
            (self.EOS_INDEX, self.EOS_TOKEN),
            (self.UNK_INDEX, self.UNK_TOKEN),
        ):
            self.itos[position] = token

        for token, position in (
            (self.SOS_TOKEN, self.SOS_INDEX),
            (self.EOS_TOKEN, self.EOS_INDEX),
            (self.PAD_TOKEN, self.PAD_INDEX),
            (self.UNK_TOKEN, self.UNK_INDEX),
        ):
            self.stoi[token] = position

        total_rows = len(self.itos) + len(embedding.itos)
        embedding_width = len(embedding.vectors[0])
        self.vectors = torch.FloatTensor(total_rows, embedding_width)

        # Rows 0-3 are filled with the constant 0.1 * row index
        for row in range(4):
            self.vectors[row] = torch.ones_like(self.vectors[row]) * 0.1 * row

        # The embedding's own words follow from row 4 onwards
        for row, (word, source_row) in enumerate(embedding.stoi.items(), start=4):
            self.itos[row] = word
            self.stoi[word] = row
            self.vectors[row] = embedding.vectors[source_row]
コード例 #7
0
ファイル: logger.py プロジェクト: facebookresearch/mmf
    def setup(self):
        """
        Setup `Weights and Biases` for logging.

        Does nothing outside the main process. Starts a run with
        ``self._wandb_init`` when none is active, and registers
        ``trainer/global_step`` as the step metric for all metrics when the
        wandb module exposes ``define_metric``.
        """
        if not is_main():
            return

        wandb = self._wandb
        if wandb.run is None:
            wandb.init(**self._wandb_init)

        # define default x-axis (for latest wandb versions)
        if getattr(wandb, "define_metric", None):
            wandb.define_metric("trainer/global_step")
            wandb.define_metric(
                "*", step_metric="trainer/global_step", step_sync=True
            )
コード例 #8
0
ファイル: dataset.py プロジェクト: facebookresearch/mmf
    def try_fast_read(self):
        """Preload every item into ``self.cache`` when fast read is enabled.

        Nothing happens for the test set, or when ``_should_fast_read`` is
        missing or not exactly ``True``. Otherwise ``self.cache`` is reset
        and filled with ``self.load_item(idx)`` for every index of
        ``self.annotation_db``; the progress bar is shown on the main
        process only.
        """
        # Don't fast read in case of test set.
        if self._dataset_type == "test":
            return

        if getattr(self, "_should_fast_read", None) is not True:
            return

        logger.info(
            f"Starting to fast read {self.dataset_name} {self.dataset_type} dataset")
        self.cache = {}
        progress = tqdm.tqdm(
            range(len(self.annotation_db)),
            miniters=100,
            disable=not is_main(),
        )
        for item_index in progress:
            self.cache[item_index] = self.load_item(item_index)
コード例 #9
0
    def __call__(self, update, iteration, meter):
        """
        Check whether training should be stopped early; call it every time
        that decision is needed.
        Arguments:
            update {number}: Current update number
            iteration {number}: Current iteration number
            meter: Object whose ``meters`` mapping holds the monitored
                criteria
        Returns:
            bool -- Tells whether early stopping occurred or not
        Raises:
            ValueError: If the early stop criteria is missing from the meter
        """
        # Synchronizing operations happen downstream and, for XLA, they
        # must be executed from all cores, so XLA processes never bail out
        # here.
        if not (is_main() or is_xla()):
            return False

        tracked = meter.meters.get(self.early_stop_criteria, None)
        if tracked is None:
            raise ValueError(
                f"Criteria used for early stopping ({self.early_stop_criteria}) "
                "is not present in meter.")

        value = tracked.global_avg
        if isinstance(value, torch.Tensor):
            value = value.item()

        if self.minimize:
            improved = value < self.best_monitored_value
        else:
            improved = value > self.best_monitored_value

        if improved:
            # New best value: remember where it happened and checkpoint it
            self.best_monitored_value = value
            self.best_monitored_iteration = iteration
            self.best_monitored_update = update
            self.checkpoint.save(update, iteration, update_best=True)
            return False

        if self.best_monitored_update + self.patience < update:
            # Patience exhausted since the last improvement
            self.activated = True
            if self.should_stop is True:
                self.checkpoint.restore()
                self.checkpoint.finalize()
                return True
            return False

        self.checkpoint.save(update, iteration, update_best=False)
        return False
コード例 #10
0
ファイル: logger.py プロジェクト: facebookresearch/mmf
def summarize_report(
    current_iteration,
    num_updates,
    max_updates,
    meter,
    should_print=True,
    extra=None,
    tb_writer=None,
    wandb_logger=None,
):
    """Send the meter's current values to the configured sinks.

    Returns immediately on processes that are neither main nor XLA.

    Args:
        current_iteration: Step passed to the tensorboard writer and logged
            to wandb as ``trainer/global_step``.
        num_updates: Current update count; shown as progress together with
            ``max_updates`` when both are not None.
        max_updates: Total update count, see ``num_updates``.
        meter: Provides ``get_scalar_dict()`` and ``get_log_dict()``.
        should_print: When falsy, the final ``log_progress`` call is skipped.
        extra: Additional entries merged into the printed dict; its ``"lr"``
            entry, if present, is also logged to wandb.
        tb_writer: Optional tensorboard writer.
        wandb_logger: Optional Weights and Biases logger.
    """
    extra = {} if extra is None else extra

    if not (is_main() or is_xla()):
        return

    # Log the learning rate if available
    if wandb_logger and "lr" in extra:
        learning_rate = {"train/learning_rate": float(extra["lr"])}
        wandb_logger.log_metrics(learning_rate, commit=False)

    if tb_writer:
        tb_writer.add_scalars(meter.get_scalar_dict(), current_iteration)

    if wandb_logger:
        wandb_payload = dict(meter.get_scalar_dict())
        wandb_payload["trainer/global_step"] = current_iteration
        wandb_logger.log_metrics(wandb_payload)

    if should_print:
        log_dict = {}
        if num_updates is not None and max_updates is not None:
            log_dict["progress"] = f"{num_updates}/{max_updates}"

        log_dict.update(meter.get_log_dict())
        log_dict.update(extra)

        log_progress(log_dict)
コード例 #11
0
ファイル: build.py プロジェクト: facebookresearch/mmf
def build_lightning_model(
    config: Union[DictConfig, "mmf.models.base_model.BaseModel.Config"],
    checkpoint_path: str = None,
) -> "mmf.models.base_model.BaseModel":
    """Build a model flagged for PyTorch Lightning, optionally from a checkpoint.

    Without ``checkpoint_path`` the model comes from ``build_model``.
    Otherwise the class registered under ``config.model`` is loaded through
    ``load_from_checkpoint`` and its losses are initialised. The main
    process loads first and the other processes load after it.

    Raises:
        RuntimeError: If no model class is registered for ``config.model``.
    """
    from mmf.models.base_model import BaseModel

    if not checkpoint_path:
        fresh_model = build_model(config)
        fresh_model.is_pl_enabled = True
        return fresh_model

    # If it is not an OmegaConf object, create the object
    needs_structuring = not isinstance(config, DictConfig) and isinstance(
        config, BaseModel.Config
    )
    if needs_structuring:
        config = OmegaConf.structured(config)

    model_name = config.model
    model_class = registry.get_model_class(model_name)
    if model_class is None:
        raise RuntimeError(f"No model registered for name: {model_name}")

    # model.build is called inside on_load_checkpoint as suggested here:
    # https://github.com/PyTorchLightning/pytorch-lightning/issues/5410

    # The main process loads before the barrier, the others after it
    on_main = is_main()
    if on_main:
        model_class.load_requirements(model_class, config=config)
    else:
        synchronize()

    model = model_class.load_from_checkpoint(
        checkpoint_path, config=config, strict=False
    )

    if on_main:
        synchronize()

    model.init_losses()
    model.is_pl_enabled = True
    return model
コード例 #12
0
def save_xla_ckpt(ckpt, file_or_path):
    """
    Similar to xm.save, but only try to convert "model" and "optimizer" in an MMF
    checkpoint to CPU, since they hold PyTorch tensors. Other items like lr_scheduler
    often cannot be saved with xm.save due to its errors in handling mappingproxy.

    Only save on the global main process (which is different from the default behavior
    of xm.save that saves a checkpoint on each node).

    A checkpoint that is not a dict holding both keys is converted as a whole.
    Every process reaches the final rendezvous, whether or not it wrote the file.
    """
    is_writer = is_main()
    tensor_keys = ("model", "optimizer")

    if isinstance(ckpt, dict) and all(key in ckpt for key in tensor_keys):
        for key in tensor_keys:
            ckpt[key] = xm._maybe_convert_to_cpu(ckpt[key], convert=is_writer)
    else:
        ckpt = xm._maybe_convert_to_cpu(ckpt, convert=is_writer)

    if is_writer:
        torch.save(ckpt, file_or_path)
    xm.rendezvous("mmf.utils.checkpoint.save_xla_ckpt")
コード例 #13
0
ファイル: build.py プロジェクト: facebookresearch/mmf
def build_multiple_datamodules(
    dataset_list: List[str], all_dataset_config: DictConfig
) -> Dict[str, pl.LightningDataModule]:
    """Build, prepare and set up one datamodule per requested dataset.

    A dataset without an entry in ``all_dataset_config`` gets an empty
    config and a warning. ``prepare_data`` runs on the main process only;
    every process then synchronizes before calling ``setup``.

    Returns:
        Mapping from dataset name to its datamodule instance.
    """
    built: Dict[str, pl.LightningDataModule] = {}

    for dataset_name in dataset_list:
        instance = build_datamodule(dataset_name)

        if dataset_name not in all_dataset_config:
            warnings.warn(
                f"Dataset {dataset_name} is missing from dataset_config"
                " in config. Proceeding with empty config."
            )
            dataset_config = OmegaConf.create()
        else:
            dataset_config = all_dataset_config[dataset_name]

        if is_main():
            instance.prepare_data(dataset_config)

        synchronize()
        instance.setup(config=dataset_config)

        # Optional hook, called only on datamodules that define it
        if hasattr(instance, "update_registry_for_model"):
            instance.update_registry_for_model(dataset_config)

        built[dataset_name] = instance

    return built
コード例 #14
0
ファイル: build.py プロジェクト: facebookresearch/mmf
def build_model(
    config: Union[DictConfig, "mmf.models.base_model.BaseModel.Config"],
) -> "mmf.models.base_model.BaseModel":
    """Instantiate the model registered under ``config.model`` and build it.

    When the model defines ``build``, the main process builds first and the
    other processes build after it; losses are initialised afterwards.

    Raises:
        RuntimeError: If no model class is registered for ``config.model``.
    """
    from mmf.models.base_model import BaseModel

    # If it is not an OmegaConf object, create the object
    needs_structuring = not isinstance(config, DictConfig) and isinstance(
        config, BaseModel.Config
    )
    if needs_structuring:
        config = OmegaConf.structured(config)

    model_name = config.model
    model_class = registry.get_model_class(model_name)
    if model_class is None:
        raise RuntimeError(f"No model registered for name: {model_name}")

    model = model_class(config)
    if not hasattr(model, "build"):
        return model

    # Model build involves checkpoint loading. If the checkpoint is not
    # available the underlying methods try to download it.
    # Let master build the model (download the checkpoints) while other
    # ranks wait for the sync message. Once the master has downloaded the
    # checkpoint and built the model it sends the sync message, completing
    # the synchronization; the other cores can then build the model using
    # the already downloaded checkpoint.
    on_main = is_main()
    if on_main:
        model_class.load_requirements(model_class, config=config)
    else:
        synchronize()

    model.build()

    if on_main:
        synchronize()

    model.init_losses()
    return model
コード例 #15
0
ファイル: logger.py プロジェクト: facebookresearch/mmf
 def _should_log_wandb(self):
     """Return True only when a wandb handle exists and this is the main process."""
     # ``is_main()`` is not consulted when no wandb handle is set
     return not (self._wandb is None or not is_main())
コード例 #16
0
    def evaluation_loop(
            self,
            dataset_type: str,
            use_tqdm: bool = False,
            single_batch: bool = False) -> Tuple[Dict[str, Any], Type[Meter]]:
        """Evaluate the model on every dataset the test reporter provides.

        The model is put in eval mode under ``torch.no_grad()`` and switched
        back to train mode once all datasets are processed. For each dataset
        the per-batch reports are accumulated into one combined report, on
        which ``self.metrics`` is computed before the meter is updated.

        Args:
            dataset_type: Passed to ``dataset_loader.get_test_reporter``.
            use_tqdm: Show a progress bar (main process only, and only when
                ``self._can_use_tqdm`` allows it for the dataloader).
            single_batch: Stop after the first batch of each dataset that
                is not skipped.

        Returns:
            ``(combined_report, meter)``, where ``combined_report`` is the
            report of the last dataset processed, or ``None`` when the
            reporter yielded no dataset.

        Raises:
            AssertionError: If a dataset produced no usable batch.
        """
        meter = Meter()
        reporter = self.dataset_loader.get_test_reporter(dataset_type)
        use_cpu = self.config.evaluation.get("use_cpu", False)
        loaded_batches = 0
        skipped_batches = 0
        # Bound up front so the final ``return`` cannot raise
        # UnboundLocalError when the reporter yields no dataset at all
        combined_report = None

        with torch.no_grad():
            self.model.eval()
            disable_tqdm = not use_tqdm or not is_main()
            while reporter.next_dataset(flush_report=False):
                dataloader = reporter.get_dataloader()
                combined_report = None

                if self._can_use_tqdm(dataloader):
                    dataloader = tqdm.tqdm(dataloader, disable=disable_tqdm)
                for batch in dataloader:
                    # Do not timeout quickly on first batch, as workers might start at
                    # very different times.
                    with CompleteInTimeOrDie(600 if loaded_batches else 3600 *
                                             24):
                        loaded_batches += 1
                        prepared_batch = reporter.prepare_batch(batch)
                        prepared_batch = to_device(prepared_batch, self.device)
                        if not validate_batch_sizes(
                                prepared_batch.get_batch_size()):
                            logger.info(
                                "Skip batch due to uneven batch sizes.")
                            skipped_batches += 1
                            continue
                        model_output = self.model(prepared_batch)
                        report = Report(prepared_batch, model_output)
                        report = report.detach()

                        meter.update_from_report(report)

                        moved_report = report
                        # Move to CPU for metrics calculation later if needed
                        # Explicitly use `non_blocking=False` as this can cause
                        # race conditions in next accumulate
                        if use_cpu:
                            moved_report = report.copy().to("cpu",
                                                            non_blocking=False)

                        # accumulate necessary params for metric calculation
                        if combined_report is None:
                            # make a copy of report since `reporter.add_to_report` will
                            # change some of the report keys later
                            combined_report = moved_report.copy()
                        else:
                            combined_report.accumulate_tensor_fields_and_loss(
                                moved_report, self.metrics.required_params)
                            combined_report.batch_size += moved_report.batch_size

                        # Each node generates a separate copy of predict JSON from the
                        # report, which will be used to evaluate dataset-level metrics
                        # (such as mAP in object detection or CIDEr in image captioning)
                        # Since `reporter.add_to_report` changes report keys,
                        # (e.g scores) do this after
                        # `combined_report.accumulate_tensor_fields_and_loss`
                        if "__prediction_report__" in self.metrics.required_params:
                            # Still need to use original report here on GPU/TPU since
                            # it will be gathered
                            reporter.add_to_report(report, self.model)

                        if single_batch is True:
                            break

                # This is the evaluation loop, so report it as evaluation
                # rather than training
                logger.info(f"Finished evaluation. Loaded {loaded_batches}")
                logger.info(f" -- skipped {skipped_batches} batches.")

                reporter.postprocess_dataset_report()
                assert (combined_report is not None
                        ), "Please check if your validation set is empty!"
                # add prediction_report is used for set-level metrics
                combined_report.prediction_report = reporter.report

                combined_report.metrics = self.metrics(combined_report,
                                                       combined_report)

                # Since update_meter will reduce the metrics over GPUs, we need to
                # move them back to GPU but we will only move metrics and losses
                # which are needed by update_meter to avoid OOM
                # The transfer is requested with `non_blocking=False`
                if use_cpu:
                    combined_report = combined_report.to(
                        self.device,
                        fields=["metrics", "losses"],
                        non_blocking=False)

                meter.update_from_report(combined_report,
                                         should_update_loss=False)

            # enable train mode again
            self.model.train()

        return combined_report, meter
コード例 #17
0
    def save(self, update, iteration=None, update_best=False):
        """Write the current training state as checkpoint files.

        Three files can be written: ``model_<update>.ckpt`` in the models
        folder, the ``current`` checkpoint, and the ``best`` checkpoint when
        ``update_best`` is set. The current checkpoint is also logged to W&B
        when the config asks for it, and the oldest numbered checkpoint is
        removed once ``max_to_keep`` is reached.

        Args:
            update: Current update number; names the numbered checkpoint.
            iteration: Current iteration; falls back to ``update`` when falsy.
            update_best: Also overwrite the best checkpoint.
        """
        # Only save in main process
        # For xla we use xm.save method
        # Which ensures that actual checkpoint saving happens
        # only for the master node.
        # The method also takes care of all the necessary synchronization
        if not (is_main() or is_xla()):
            return

        logger.info("Checkpoint save operation started!")
        if not iteration:
            iteration = update

        ckpt_filepath = os.path.join(self.models_foldername, "model_%d.ckpt" % update)
        best_ckpt_filepath = os.path.join(
            self.ckpt_foldername, self.ckpt_prefix + "best.ckpt"
        )
        current_ckpt_filepath = os.path.join(
            self.ckpt_foldername, self.ckpt_prefix + "current.ckpt"
        )

        early_stopping = self.trainer.early_stop_callback.early_stopping
        best_iteration = early_stopping.best_monitored_iteration
        best_update = early_stopping.best_monitored_update
        best_metric = early_stopping.best_monitored_value

        model = self.trainer.model
        data_parallel = registry.get("data_parallel") or registry.get("distributed")

        fp16_scaler = getattr(self.trainer, "scaler", None)
        fp16_scaler_dict = (
            fp16_scaler.state_dict() if fp16_scaler is not None else None
        )

        # Use the wrapped module when the parallel flag is set
        if data_parallel is True:
            model = model.module

        ckpt = {
            "model": model.state_dict(),
            "optimizer": self.trainer.optimizer.state_dict(),
            "best_iteration": best_iteration,
            "current_iteration": iteration,
            "current_epoch": self.trainer.current_epoch,
            "num_updates": update,
            "best_update": best_update,
            "best_metric_value": best_metric,
            "fp16_scaler": fp16_scaler_dict,
            # Convert to container to avoid any dependencies
            "config": OmegaConf.to_container(self.config, resolve=True),
        }

        scheduler_callback = self.trainer.lr_scheduler_callback
        if scheduler_callback is not None:
            scheduler = getattr(scheduler_callback, "_scheduler", None)
            if scheduler is not None:
                ckpt["lr_scheduler"] = scheduler.state_dict()

        if self.git_repo:
            ckpt.update(self._get_vcs_fields())

        def _write(path):
            # Same write path for the numbered, best and current files
            with open_if_main(path, "wb") as f:
                self.save_func(ckpt, f)

        _write(ckpt_filepath)

        if update_best:
            logger.info("Saving best checkpoint")
            _write(best_ckpt_filepath)

        # Save current always
        logger.info("Saving current checkpoint")
        _write(current_ckpt_filepath)

        # Save the current checkpoint as W&B artifacts for model versioning.
        if self.config.training.wandb.log_checkpoint:
            logger.info(
                "Saving current checkpoint as W&B Artifacts for model versioning"
            )
            wandb_logger = self.trainer.logistics_callback.wandb_logger
            wandb_logger.log_model_checkpoint(current_ckpt_filepath)

        # Remove old checkpoints if max_to_keep is set
        # In XLA, only delete checkpoint files in main process
        if self.max_to_keep > 0 and is_main():
            if len(self.saved_iterations) == self.max_to_keep:
                oldest = self.saved_iterations.pop(0)
                self.remove(oldest)
            self.saved_iterations.append(update)

        logger.info("Checkpoint save operation finished!")
コード例 #18
0
ファイル: vocab.py プロジェクト: facebookresearch/mmf
    def __init__(self, vocab_file, embedding_name, *args, **kwargs):
        """Use this vocab class when you have a custom vocabulary class but you
        want to use pretrained embedding vectors for it. This will only load
        the vectors which intersect with your vocabulary. Use the
        embedding_name specified in torchtext's pretrained aliases:
        ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d',
         'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d',
         'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
         'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d',
         'glove.6B.200d', 'glove.6B.300d']

        Parameters
        ----------
        vocab_file : str
            Vocabulary file containing list of words with one word per line
            which will be used to collect vectors
        embedding_name : str
            Embedding name picked up from the list of the pretrained aliases
            mentioned above

        Raises
        ------
        RuntimeError
            If the embedding class mapped to the name's first component does
            not exist in ``vocab``
        """
        super().__init__(vocab_file, *args, **kwargs)

        self.type = "intersected"

        # Aliases look like "<family>[.<variant>].<dim>d". The variant may
        # itself contain dots ("twitter.27B") or be missing ("charngram.100d"),
        # so take the first and last components and join what lies between.
        # Fixed indices turned "glove.twitter.27B.25d" into variant "twitter"
        # with dim 27.
        parts = embedding_name.split(".")
        name = parts[0]
        dim = parts[-1][:-1]
        middle = ".".join(parts[1:-1])

        class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

        if not hasattr(vocab, class_name):
            raise RuntimeError(f"Unknown embedding type: {name}")

        # NOTE(review): a name without a variant passes no positional
        # argument; this assumes such embedding classes take none - confirm.
        params = [middle] if middle else []

        if name == "glove":
            params.append(int(dim))

        vector_cache = get_mmf_cache_dir()

        # First test loading the vectors in master so that everybody doesn't
        # download it in case it doesn't exist
        if is_main():
            vocab.pretrained_aliases[embedding_name](cache=vector_cache)
        synchronize()

        embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

        self.vectors = torch.empty(
            (self.get_size(), len(embedding.vectors[0])), dtype=torch.float)

        self.embedding_dim = len(embedding.vectors[0])

        # Rows 0-3 are filled with the constant 0.1 * row index
        for i in range(0, 4):
            self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

        for i in range(4, self.get_size()):
            word = self.itos[i]
            embedding_index = embedding.stoi.get(word, None)

            if embedding_index is None:
                # Words missing from the embedding reuse the UNK row
                self.vectors[i] = self.vectors[self.UNK_INDEX]
            else:
                self.vectors[i] = embedding.vectors[embedding_index]
コード例 #19
0
 def finalize(self):
     """Save the trainer model's state dict to ``self.pth_filepath``.

     Processes that are neither main nor XLA do nothing.
     """
     if not (is_main() or is_xla()):
         return

     with open_if_main(self.pth_filepath, "wb") as handle:
         state = self.trainer.model.state_dict()
         self.save_func(state, handle)