Example #1
    def __init__(
        self,
        serialization_dir: str,
        cuda_device: Union[int, torch.device] = -1,
        distributed: bool = False,
        local_rank: int = 0,
        world_size: int = 1,
    ) -> None:

        check_for_gpu(cuda_device)
        self._serialization_dir = serialization_dir

        if isinstance(cuda_device, list):
            raise ConfigurationError(
                "In allennlp 1.0, the Trainer can only be assigned a single `cuda_device`. "
                "Instead, we use torch's DistributedDataParallel at the command level, meaning "
                "our Trainer always uses a single GPU per process.")

        if distributed and world_size <= 1:
            raise ConfigurationError(
                "Distributed training can be performed only with more than 1 device. Check "
                "`cuda_device` key in the experiment configuration.")

        self.cuda_device = int_to_device(cuda_device)

        self._distributed = distributed
        self._rank = local_rank
        self._master = self._rank == 0
        self._world_size = world_size
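
All of these examples lean on int_to_device from allennlp.common.util to normalize the cuda_device argument. As a rough sketch (not necessarily the library's exact source), the helper maps a negative id to the CPU and a non-negative integer to the matching CUDA device:

import torch
from typing import Union

def int_to_device(device: Union[int, torch.device]) -> torch.device:
    # torch.device objects pass through unchanged.
    if isinstance(device, torch.device):
        return device
    # By AllenNLP convention, a negative id (typically -1) means "run on CPU".
    if device < 0:
        return torch.device("cpu")
    return torch.device(device)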
Example #2
    def __init__(
        self,
        name: str,
        model: Model,
        optimizer: Optimizer,
        cuda_device: Optional[int],
        grad_norm: Optional[float] = None,
        scaler: Optional[amp.GradScaler] = None,
        grad_clipping: Optional[float] = None,
        learning_rate_scheduler: Optional[LearningRateScheduler] = None,
        momentum_scheduler: Optional[MomentumScheduler] = None
    ) -> None:

        self.name = name
        self.model = model
        self._optimizer = optimizer

        if cuda_device is None:
            from torch import cuda

            if cuda.device_count() > 0:
                cuda_device = 0
            else:
                cuda_device = -1

        check_for_gpu(cuda_device)
        self._cuda_device = int_to_device(cuda_device)
        self._grad_norm = grad_norm
        self._scaler = scaler
        self._grad_clipping = grad_clipping

        self._learning_rate_scheduler = learning_rate_scheduler
        self._momentum_scheduler = momentum_scheduler
        self._loss = {'train': ComponentLoss(), 'validation': ComponentLoss()}
Example #3
 def __init__(
     self,
     local_rank: Optional[int] = None,
     world_size: Optional[int] = None,
     cuda_device: Union[torch.device, int] = -1,
 ) -> None:
     self.local_rank: int = local_rank if local_rank is not None else dist.get_rank()
     self.world_size: int = world_size if world_size is not None else dist.get_world_size()
     self.is_primary: bool = self.local_rank == 0
     self.cuda_device = int_to_device(cuda_device)
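
When local_rank and world_size are omitted here, the defaults come from torch.distributed, which must already be initialized. A minimal sketch of that precondition for a single-process run (the backend and init_method below are placeholder choices, not values taken from the examples):

import torch.distributed as dist

# Typically done by the launcher (e.g. torchrun) before this __init__ runs.
dist.init_process_group(
    backend="gloo",
    init_method="tcp://127.0.0.1:29500",
    rank=0,
    world_size=1,
)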
Example #4
    def __init__(
        self,
        serialization_dir: Optional[str] = None,
        cuda_device: Optional[Union[int, torch.device]] = None,
        distributed: bool = False,
        local_rank: int = 0,
        world_size: int = 1,
    ) -> None:
        if cuda_device is None:
            from torch import cuda

            if cuda.device_count() > 0:
                cuda_device = 0
            else:
                cuda_device = -1

        check_for_gpu(cuda_device)

        if serialization_dir is None:
            import tempfile

            self._serialization_dir = tempfile.mkdtemp()
        else:
            self._serialization_dir = serialization_dir
        # Ensure serialization directory exists.
        os.makedirs(self._serialization_dir, exist_ok=True)

        if isinstance(cuda_device, list):
            raise ConfigurationError(
                "In allennlp 1.0, the Trainer can only be assigned a single `cuda_device`. "
                "Instead, we use torch's DistributedDataParallel at the command level, meaning "
                "our Trainer always uses a single GPU per process."
            )

        if distributed and world_size <= 1:
            raise ConfigurationError(
                "Distributed training can be performed only with more than 1 device. Check "
                "`cuda_device` key in the experiment configuration."
            )

        self.cuda_device = int_to_device(cuda_device)

        self._distributed = distributed
        self._rank = local_rank
        self._primary = self._rank == 0
        self._world_size = world_size
Example #5
def move_to_device(obj, cuda_device: Union[torch.device, int]):
    from allennlp.common.util import int_to_device

    cuda_device = int_to_device(cuda_device)

    if cuda_device == torch.device("cpu") or not has_tensor(obj):
        return obj
    elif isinstance(obj, torch.Tensor):
        return obj.cuda(cuda_device)
    elif isinstance(obj, dict):
        return {key: move_to_device(value, cuda_device) for key, value in obj.items()}
    elif isinstance(obj, list):
        return [move_to_device(item, cuda_device) for item in obj]
    elif isinstance(obj, tuple) and hasattr(obj, "_fields"):
        return obj.__class__(*(move_to_device(item, cuda_device) for item in obj))
    elif isinstance(obj, tuple):
        return tuple(move_to_device(item, cuda_device) for item in obj)
    else:
        return obj
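
A quick usage sketch for move_to_device (the keys and shapes below are made up for illustration): it recurses through dicts, lists, tuples, and NamedTuples, and leaves non-tensor values untouched:

import torch

batch = {
    "tokens": {"token_ids": torch.zeros(2, 5, dtype=torch.long)},
    "labels": torch.tensor([0, 1]),
    "metadata": ["first example", "second example"],  # no tensors, returned as-is
}
# -1 maps to the CPU, so this is a no-op here; pass 0 (or torch.device("cuda:0"))
# to copy every tensor in the nested structure onto the first GPU.
batch_on_device = move_to_device(batch, -1)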
Example #6
    def __init__(
        self,
        model: Model,
        train_data_path: DatasetReaderInput,
        train_dataset_reader: DatasetReader,
        *,
        test_dataset_reader: Optional[DatasetReader] = None,
        train_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        test_data_loader: Lazy[DataLoader] = Lazy(
            SimpleDataLoader.from_dataset_reader),
        params_to_freeze: Optional[List[str]] = None,
        cuda_device: int = -1,
    ) -> None:
        self.model = model
        self.vocab = model.vocab
        self.device = int_to_device(cuda_device)

        self._train_data_path = train_data_path
        self._train_loader = train_data_loader.construct(
            reader=train_dataset_reader,
            data_path=train_data_path,
            batch_size=1,
        )
        self._train_loader.set_target_device(self.device)
        self._train_loader.index_with(self.vocab)

        self._test_dataset_reader = test_dataset_reader or train_dataset_reader
        self._lazy_test_data_loader = test_data_loader

        self.model.to(self.device)
        if params_to_freeze is not None:
            for name, param in self.model.named_parameters():
                if any(re.match(pattern, name) for pattern in params_to_freeze):
                    param.requires_grad = False

        # These variables are set when the corresponding public properties are accessed.
        # This is not set until we actually run the calculation since some parameters might not be used.
        self._used_params: Optional[List[torch.nn.Parameter]] = None
        self._used_param_names: Optional[List[str]] = None
        self._train_instances: Optional[List[InstanceWithGrads]] = None
Example #7
def check_for_gpu(device: Union[int, torch.device, List[Union[int, torch.device]]]):
    if isinstance(device, list):
        for did in device:
            check_for_gpu(did)
    elif device is None:
        return
    else:
        from allennlp.common.util import int_to_device

        device = int_to_device(device)
        if device != torch.device("cpu"):
            num_devices_available = cuda.device_count()
            if num_devices_available == 0:
                # Torch will give a more informative exception than ours, so we want to include
                # that context as well if it's available.  For example, if you try to run torch 1.5
                # on a machine with CUDA10.1 you'll get the following:
                #
                #     The NVIDIA driver on your system is too old (found version 10010).
                #
                torch_gpu_error = ""
                try:
                    cuda._check_driver()
                except Exception as e:
                    torch_gpu_error = "\n{0}".format(e)

                raise ConfigurationError(
                    "Experiment specified a GPU but none is available;"
                    " if you want to run on CPU use the override"
                    " 'trainer.cuda_device=-1' in the json config file." +
                    torch_gpu_error)
            elif device.index >= num_devices_available:
                raise ConfigurationError(
                    f"Experiment specified GPU device {device.index}"
                    f" but there are only {num_devices_available} devices"
                    f" available.")
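
A short illustration of the contract above (assuming a CPU-only machine for the failing case): negative ids, None, and CPU devices are always accepted, lists are checked element by element, and a CUDA device id fails when torch reports no GPUs:

check_for_gpu(-1)          # CPU: always fine
check_for_gpu(None)        # no device requested, returns immediately
check_for_gpu([-1, -1])    # lists are checked recursively

try:
    check_for_gpu(0)       # raises ConfigurationError if no CUDA device is visible
except ConfigurationError as err:
    print(err)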
Example #8
def evaluate(
    model: Model,
    data_loader: DataLoader,
    cuda_device: int = -1,
    batch_weight_key: Optional[str] = None,
    output_file: Optional[str] = None,
    predictions_output_file: Optional[str] = None,
) -> Dict[str, Any]:
    """
    # Parameters

    model : `Model`
        The model to evaluate
    data_loader : `DataLoader`
        The `DataLoader` that will iterate over the evaluation data (data loaders already contain
        their data).
    cuda_device : `int`, optional (default=`-1`)
        The cuda device to use for this evaluation.  The model is assumed to already be using this
        device; this parameter is only used for moving the input data to the correct device.
    batch_weight_key : `str`, optional (default=`None`)
        If given, this is a key in the output dictionary for each batch that specifies how to weight
        the loss for that batch.  If this is not given, we use a weight of 1 for every batch.
    output_file : `str`, optional (default=`None`)
        Optional path to write the final metrics to.
    predictions_output_file : `str`, optional (default=`None`)
        Optional path to write the predictions to.

    # Returns

    `Dict[str, Any]`
        The final metrics.
    """
    check_for_gpu(cuda_device)
    data_loader.set_target_device(int_to_device(cuda_device))
    predictions_file = (None if predictions_output_file is None else open(
        predictions_output_file, "w"))

    with torch.no_grad():
        model.eval()

        iterator = iter(data_loader)
        logger.info("Iterating over dataset")
        generator_tqdm = Tqdm.tqdm(iterator)

        # Number of batches in instances.
        batch_count = 0
        # Number of batches where the model produces a loss.
        loss_count = 0
        # Cumulative weighted loss
        total_loss = 0.0
        # Cumulative weight across all batches.
        total_weight = 0.0

        for batch in generator_tqdm:
            batch_count += 1
            batch = nn_util.move_to_device(batch, cuda_device)
            output_dict = model(**batch)
            loss = output_dict.get("loss")

            metrics = model.get_metrics()

            if loss is not None:
                loss_count += 1
                if batch_weight_key:
                    weight = output_dict[batch_weight_key].item()
                else:
                    weight = 1.0

                total_weight += weight
                total_loss += loss.item() * weight
                # Report the average loss so far.
                metrics["loss"] = total_loss / total_weight

            if not HasBeenWarned.tqdm_ignores_underscores and any(
                    metric_name.startswith("_") for metric_name in metrics):
                logger.warning('Metrics with names beginning with "_" will '
                               "not be logged to the tqdm progress bar.")
                HasBeenWarned.tqdm_ignores_underscores = True
            description = (", ".join([
                "%s: %.2f" % (name, value)
                for name, value in metrics.items() if not name.startswith("_")
            ]) + " ||")
            generator_tqdm.set_description(description, refresh=False)

            if predictions_file is not None:
                predictions = json.dumps(
                    sanitize(model.make_output_human_readable(output_dict)))
                predictions_file.write(predictions + "\n")

        if predictions_file is not None:
            predictions_file.close()

        final_metrics = model.get_metrics(reset=True)
        if loss_count > 0:
            # Sanity check
            if loss_count != batch_count:
                raise RuntimeError(
                    "The model you are trying to evaluate only sometimes produced a loss!"
                )
            final_metrics["loss"] = total_loss / total_weight

        if output_file is not None:
            dump_metrics(output_file, final_metrics, log=True)

        return final_metrics
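
A hedged end-to-end sketch of calling evaluate; my_model, my_reader, and validation.jsonl are placeholders, and SimpleDataLoader is used here only as one convenient DataLoader implementation:

from allennlp.data.data_loaders import SimpleDataLoader

# Hypothetical objects: my_model is a trained Model, my_reader its DatasetReader.
instances = list(my_reader.read("validation.jsonl"))
data_loader = SimpleDataLoader(instances, batch_size=32)
data_loader.index_with(my_model.vocab)

metrics = evaluate(
    my_model,
    data_loader,
    cuda_device=-1,                              # evaluate on CPU
    output_file="metrics.json",                  # where the final metrics are dumped
    predictions_output_file="predictions.jsonl",
)
print(metrics.get("loss"))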
Example #9
    def __init__(
        self,
        image_dir: Optional[Union[str, PathLike]],
        *,
        image_loader: Optional[ImageLoader] = None,
        image_featurizer: Optional[Lazy[GridEmbedder]] = None,
        region_detector: Optional[Lazy[RegionDetector]] = None,
        feature_cache_dir: Optional[Union[str, PathLike]] = None,
        tokenizer: Optional[Tokenizer] = None,
        token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        cuda_device: Optional[Union[int, torch.device]] = None,
        max_instances: Optional[int] = None,
        image_processing_batch_size: int = 8,
        write_to_cache: bool = True,
        manual_distributed_sharding: bool = True,
        manual_multiprocess_sharding: bool = True,
    ) -> None:
        super().__init__(
            max_instances=max_instances,
            manual_distributed_sharding=manual_distributed_sharding,
            manual_multiprocess_sharding=manual_multiprocess_sharding,
        )

        # tokenizers and indexers
        if tokenizer is None:
            tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
        self._tokenizer = tokenizer
        if token_indexers is None:
            token_indexers = {"tokens": PretrainedTransformerIndexer("bert-base-uncased")}
        self._token_indexers = token_indexers

        if not ((image_loader is None) == (image_featurizer is None) == (region_detector is None)):
            raise ConfigurationError(
                "Please either specify all of image_loader, image_featurizer, and region_detector, "
                "or specify none of them if you don't want to featurize images."
            )

        # feature cache
        self.feature_cache_dir = feature_cache_dir
        self.coordinates_cache_dir = feature_cache_dir
        self.class_probs_cache_dir = feature_cache_dir
        self.class_labels_cache_dir = feature_cache_dir

        if feature_cache_dir:
            self.write_to_cache = write_to_cache
        else:
            # If we don't have a cache dir, we use a dict in memory as a cache, so we
            # always write.
            self.write_to_cache = True
        self._feature_cache_instance: Optional[MutableMapping[str, Tensor]] = None
        self._coordinates_cache_instance: Optional[MutableMapping[str, Tensor]] = None
        self._class_probs_cache_instance: Optional[MutableMapping[str, Tensor]] = None
        self._class_labels_cache_instance: Optional[MutableMapping[str, Tensor]] = None

        # image processors
        self.image_loader = None
        if image_loader and image_featurizer and region_detector:
            if cuda_device is None:
                if torch.cuda.device_count() > 0:
                    if util.is_distributed():
                        cuda_device = dist.get_rank() % torch.cuda.device_count()
                    else:
                        cuda_device = 0
                else:
                    cuda_device = -1
            check_for_gpu(cuda_device)
            self.cuda_device = int_to_device(cuda_device)
            logger.info(f"Processing images on device {cuda_device}")

            # image loading and featurizing
            self.image_loader = image_loader
            self.image_loader.device = self.cuda_device
            self._lazy_image_featurizer = image_featurizer
            self._image_featurizer = None
            self._lazy_region_detector = region_detector
            self._region_detector = None
            self.image_processing_batch_size = image_processing_batch_size

        self.produce_featurized_images = False
        if self.feature_cache_dir and self.coordinates_cache_dir:
            logger.info(f"Featurizing images with a cache at {self.feature_cache_dir}")
            self.produce_featurized_images = True
        if image_loader and image_featurizer and region_detector:
            if self.produce_featurized_images:
                logger.info("Falling back to a full image featurization pipeline")
            else:
                logger.info("Featurizing images with a full image featurization pipeline")
                self.produce_featurized_images = True

        if self.produce_featurized_images:
            if image_dir is None:
                if image_loader and image_featurizer and region_detector:
                    raise ConfigurationError("We need an image_dir to featurize images.")
                else:
                    raise ConfigurationError(
                        "We need an image_dir to use a cache of featurized images. Images won't be "
                        "read if they are cached, but we need the image_dir to determine the right "
                        "cache keys from the file names."
                    )

            logger.info("Discovering images ...")
            self.images = {
                os.path.basename(filename): filename
                for extension in {"png", "jpg"}
                for filename in tqdm(
                    glob.iglob(os.path.join(image_dir, "**", f"*.{extension}"), recursive=True),
                    desc=f"Discovering {extension} images",
                )
            }
            logger.info("Done discovering images")
Example #10
    def __init__(
        self,
        image_dir: Union[str, PathLike],
        image_loader: ImageLoader,
        image_featurizer: GridEmbedder,
        region_detector: RegionDetector,
        *,
        feature_cache_dir: Optional[Union[str, PathLike]] = None,
        data_dir: Optional[Union[str, PathLike]] = None,
        tokenizer: Optional[Tokenizer] = None,
        token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        cuda_device: Optional[Union[int, torch.device]] = None,
        max_instances: Optional[int] = None,
    ) -> None:
        super().__init__(
            max_instances=max_instances,
            manual_distributed_sharding=True,
            manual_multi_process_sharding=True,
        )

        if cuda_device is None:
            from torch import cuda

            if cuda.device_count() > 0:
                cuda_device = 0
            else:
                cuda_device = -1
        from allennlp.common.checks import check_for_gpu

        check_for_gpu(cuda_device)
        from allennlp.common.util import int_to_device

        self.cuda_device = int_to_device(cuda_device)

        # Paths to data
        if not data_dir:
            github_url = "https://raw.githubusercontent.com/lil-lab/nlvr/"
            nlvr_commit = "68a11a766624a5b665ec7594982b8ecbedc728c7"
            data_dir = f"{github_url}{nlvr_commit}/nlvr2/data"
        self.splits = {
            "dev": f"{data_dir}/dev.json",
            "test": f"{data_dir}/test1.json",
            "train": f"{data_dir}/train.json",
            "balanced_dev": f"{data_dir}/balanced/balanced_dev.json",
            "balanced_test": f"{data_dir}/balanced/balanced_test1.json",
            "unbalanced_dev": f"{data_dir}/balanced/unbalanced_dev.json",
            "unbalanced_test": f"{data_dir}/balanced/unbalanced_test1.json",
        }
        from tqdm import tqdm

        self.images = {
            os.path.basename(filename): filename
            for filename in tqdm(
                glob.iglob(os.path.join(image_dir, "**", "*.png"),
                           recursive=True),
                desc="Discovering images",
            )
        }

        # tokenizers and indexers
        if not tokenizer:
            tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
        self._tokenizer = tokenizer
        if token_indexers is None:
            token_indexers = {
                "tokens": PretrainedTransformerIndexer("bert-base-uncased")
            }
        self._token_indexers = token_indexers

        # image loading
        self.image_loader = image_loader
        self.image_featurizer = image_featurizer.to(self.cuda_device)
        self.region_detector = region_detector.to(self.cuda_device)

        # feature cache
        if feature_cache_dir is None:
            self._features_cache: MutableMapping[str, Tensor] = {}
            self._coordinates_cache: MutableMapping[str, Tensor] = {}
        else:
            os.makedirs(feature_cache_dir, exist_ok=True)
            self._features_cache = TensorCache(
                os.path.join(feature_cache_dir, "features"))
            self._coordinates_cache = TensorCache(
                os.path.join(feature_cache_dir, "coordinates"))