Example #1
    def get_callbacks(self, stage: str) -> "OrderedDict[str, Callback]":
        """Returns the callbacks for a given stage."""
        callbacks_params = get_by_keys(self._stage_config,
                                       stage,
                                       "callbacks",
                                       default={})
        callbacks = OrderedDict(REGISTRY.get_from_params(**callbacks_params))

        is_callback_exists = lambda callback_fn: any(
            callback_isinstance(x, callback_fn) for x in callbacks.values())
        if self._verbose and not is_callback_exists(TqdmCallback):
            callbacks["_verbose"] = TqdmCallback()
        if self._timeit and not is_callback_exists(TimerCallback):
            callbacks["_timer"] = TimerCallback()
        if self._check and not is_callback_exists(CheckRunCallback):
            callbacks["_check"] = CheckRunCallback()
        if self._overfit and not is_callback_exists(BatchOverfitCallback):
            callbacks["_overfit"] = BatchOverfitCallback()

        if self._logdir is not None and not is_callback_exists(
                ICheckpointCallback):
            callbacks["_checkpoint"] = CheckpointCallback(logdir=os.path.join(
                self._logdir, "checkpoints"), )

        return callbacks
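For context, a minimal sketch of the config fragment this method reads. It assumes the hydra-slayer-style "_target_" factory key used by recent Catalyst versions; the callback name and its arguments are illustrative.

    # Hypothetical "callbacks" section of a stage config (sketch).
    stage_config = {
        "train": {
            "callbacks": {
                "early_stopping": {
                    "_target_": "EarlyStoppingCallback",  # assumed factory key
                    "patience": 3,
                    "loader_key": "valid",
                    "metric_key": "loss",
                },
            },
        },
    }
    # get_callbacks("train") would return this callback in an OrderedDict,
    # plus any auto-added "_verbose"/"_timer"/"_check"/"_overfit" entries.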
Example #2
    def _get_optimizer_from_params(self, model: RunnerModel, stage: str,
                                   **params) -> RunnerOptimizer:
        # @TODO 1: refactor; this method is too long
        params = deepcopy(params)
        # learning rate linear scaling
        lr_scaling_params = params.pop("lr_linear_scaling", None)
        if lr_scaling_params:
            loaders_params = dict(self._stage_config[stage]["loaders"])
            lr, lr_scaling = do_lr_linear_scaling(
                lr_scaling_params=lr_scaling_params,
                batch_size=loaders_params.get("batch_size", 1),
                per_gpu_scaling=loaders_params.get("per_gpu_scaling", False),
            )
            params["lr"] = lr
        else:
            lr_scaling = 1.0
        # getting layer-wise parameters
        layerwise_params = params.pop("layerwise_params", OrderedDict())
        no_bias_weight_decay = params.pop("no_bias_weight_decay", True)
        # getting model parameters
        model_key = params.pop("_model", None)
        model_params = get_model_parameters(
            models=model,
            models_keys=model_key,
            layerwise_params=layerwise_params,
            no_bias_weight_decay=no_bias_weight_decay,
            lr_scaling=lr_scaling,
        )
        # instantiate the optimizer with the prepared parameter groups
        optimizer = REGISTRY.get_from_params(**params, params=model_params)
        return optimizer
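A sketch of an optimizer section that exercises the scaling branch above; the "_target_" key and the lr_linear_scaling field names are assumptions inferred from the call to do_lr_linear_scaling.

    # Hypothetical optimizer params (sketch).
    optimizer_params = {
        "_target_": "Adam",                    # assumed factory key
        "lr": 0.001,
        "weight_decay": 0.0001,
        "lr_linear_scaling": {"lr": 0.001, "base_batch_size": 64},
    }
    # Presumably, with a loader batch_size of 256 the learning rate would be
    # scaled by 256 / 64 = 4 before the optimizer is instantiated.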
Example #3
    def get_criterion(self, stage: str) -> RunnerCriterion:
        """Returns the criterion for a given stage."""
        criterion_params = get_by_keys(self._stage_config,
                                       stage,
                                       "criterion",
                                       default={})
        criterion = REGISTRY.get_from_params(**criterion_params)
        return criterion or None
Example #4
    @staticmethod
    def _get_callback_from_params(**params):
        params = deepcopy(params)
        wrapper_params = params.pop("_wrapper", None)
        callback = REGISTRY.get_from_params(**params)
        if wrapper_params is not None:
            wrapper_params["base_callback"] = callback
            callback = ConfigRunner._get_callback_from_params(
                **wrapper_params)  # noqa: WPS437
        return callback
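A sketch of the "_wrapper" convention this recursion supports. ControlFlowCallback is a Catalyst wrapper that takes the wrapped callback as base_callback; the remaining keys here are illustrative, and "_target_" is an assumed factory key.

    # Hypothetical params: run a CriterionCallback on the "train" loader only.
    params = {
        "_target_": "CriterionCallback",
        "input_key": "logits",
        "target_key": "targets",
        "metric_key": "loss",
        "_wrapper": {
            "_target_": "ControlFlowCallback",
            "loaders": "train",
        },
    }
    callback = ConfigRunner._get_callback_from_params(**params)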
Example #5
    def __init__(self,
                 vocab,
                 cleaners=[],
                 g2p=None,
                 words_separator="\t",
                 batch_size=1):
        """Processor initialization

        Parameters
        ----------
        vocab : List[str]
            List of all tokens that will be used after text processing.
            Use a phoneme list if you want to use g2p, or graphemes
            (alphabet characters) otherwise.
        cleaners : Union[List[Callable], List[dict]], optional
            List of callable cleaner objects, or their config dicts.
        g2p : Union[Callable, dict], optional
            g2p callable object, or its config dict.
        words_separator : str, optional
            Token that separates words, by default "\t"
        batch_size : int, optional
            Batch size for data processing, by default 1
        """
        self.vocab = vocab
        self.words_separator = words_separator
        self.batch_size = batch_size

        self.token2id = {}

        # token ids start at 1; id 0 is reserved for padding
        for i, token in enumerate(self.vocab, 1):
            self.token2id[token] = i

        self.cleaners = []

        for cleaner in cleaners:
            if isinstance(cleaner, dict):
                cleaner = REGISTRY.get_from_params(**cleaner)

            self.cleaners.append(cleaner)

        if isinstance(g2p, dict):
            g2p = REGISTRY.get_from_params(**g2p)

        self.g2p = g2p
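A hypothetical construction (sketch); the owning class name and the cleaner config dict are placeholders, not part of the original source.

    # Grapheme-only setup without g2p. Since ids start at 1, with this vocab
    # "a" -> 1, "b" -> 2, ... and id 0 stays free for padding.
    processor = TextProcessor(                        # hypothetical class name
        vocab=list("abcdefghijklmnopqrstuvwxyz"),
        cleaners=[{"_target_": "LowercaseCleaner"}],  # illustrative config dict
    )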
Example #6
    def get_engine(self) -> IEngine:
        """Returns the engine for the run."""
        engine_params = self._config.get("engine", None)
        if engine_params is not None:
            engine = REGISTRY.get_from_params(**engine_params)
        else:
            engine = get_available_engine(fp16=self._fp16,
                                          ddp=self._ddp,
                                          amp=self._amp,
                                          apex=self._apex)
        return engine
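For reference, a sketch of an explicit "engine" section that would take the first branch; DeviceEngine is a Catalyst engine, while the "_target_" factory key is an assumption.

    # Hypothetical config: pin the run to one GPU instead of auto-detection.
    config = {"engine": {"_target_": "DeviceEngine", "device": "cuda:0"}}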
Example #7
    def get_from_params(cls,
                        params: Dict,
                        base_optimizer_params: Dict = None,
                        **kwargs) -> "Lookahead":
        """@TODO: Docs. Contribution is welcome."""
        from catalyst.registry import REGISTRY

        base_optimizer = REGISTRY.get_from_params(params=params,
                                                  **base_optimizer_params)
        optimizer = cls(optimizer=base_optimizer, **kwargs)
        return optimizer
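A hypothetical call (sketch) showing how the base optimizer is resolved through the registry while **kwargs feed Lookahead itself; the "_target_" key is an assumption.

    optimizer = Lookahead.get_from_params(
        params=model.parameters(),        # forwarded to the base optimizer
        base_optimizer_params={"_target_": "Adam", "lr": 1e-3},
        k=5, alpha=0.5,                   # Lookahead's own kwargs
    )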
Example #8
    def __init__(self,
                 batch_size=1,
                 mel_extractor=None,
                 speaker_embedding_extractor=None,
                 prossody_extractor=None,
                 wav_max_value=32768):
        self.batch_size = batch_size
        self.wav_max_value = wav_max_value

        if isinstance(mel_extractor, dict):
            mel_extractor = REGISTRY.get_from_params(**mel_extractor)
        self.mel_extractor = mel_extractor

        if isinstance(speaker_embedding_extractor, dict):
            speaker_embedding_extractor = REGISTRY.get_from_params(
                **speaker_embedding_extractor)
        self.speaker_embedding_extractor = speaker_embedding_extractor

        if isinstance(prossody_extractor, dict):
            prossody_extractor = REGISTRY.get_from_params(**prossody_extractor)
        self.prossody_extractor = prossody_extractor
Example #9
    def get_datasets(self, stage: str) -> "OrderedDict[str, Dataset]":
        """
        Returns datasets for a given stage.

        Args:
            stage: stage name

        Returns:
            Dict: datasets objects
        """
        datasets_params = self._stage_config[stage]["loaders"]["datasets"]
        datasets = REGISTRY.get_from_params(**datasets_params)
        return OrderedDict(datasets)
Example #10
    @staticmethod
    def _get_model_from_params(**params) -> RunnerModel:
        params = deepcopy(params)
        is_key_value = params.pop("_key_value", False)

        if is_key_value:
            model = {
                model_key: ConfigRunner._get_model_from_params(**model_params)
                for model_key, model_params in params.items()
            }
            model = nn.ModuleDict(model)
        else:
            model = REGISTRY.get_from_params(**params)
        return model
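A sketch of a "_key_value" model section yielding an nn.ModuleDict; the model names and the "_target_" factory key are illustrative assumptions.

    # Hypothetical params for a two-model (e.g. GAN-style) setup.
    params = {
        "_key_value": True,
        "generator": {"_target_": "UNet"},
        "discriminator": {"_target_": "SimpleNet"},
    }
    model = ConfigRunner._get_model_from_params(**params)  # -> nn.ModuleDict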
Example #11
    @staticmethod
    def _get_criterion_from_params(**params) -> RunnerCriterion:
        params = deepcopy(params)
        key_value_flag = params.pop("_key_value", False)

        if key_value_flag:
            criterion = {
                key: ConfigRunner._get_criterion_from_params(
                    **key_params)  # noqa: WPS437
                for key, key_params in params.items()
            }
        else:
            criterion = REGISTRY.get_from_params(**params)
        return criterion
Example #12
    @staticmethod
    def _get_scheduler_from_params(*, optimizer: RunnerOptimizer,
                                   **params) -> RunnerScheduler:
        params = deepcopy(params)

        is_key_value = params.pop("_key_value", False)
        optimizer_key = params.pop("_optimizer", None)
        optimizer = optimizer[optimizer_key] if optimizer_key else optimizer

        if is_key_value:
            scheduler: Dict[str, Scheduler] = {}
            for key, scheduler_params in params.items():
                scheduler[key] = ConfigRunner._get_scheduler_from_params(
                    **scheduler_params, optimizer=optimizer)  # noqa: WPS437
        else:
            scheduler = REGISTRY.get_from_params(**params, optimizer=optimizer)
        return scheduler
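A sketch of per-optimizer schedulers: in each recursive call, "_optimizer" selects an entry from an optimizer dict before the scheduler is instantiated. The names are illustrative and "_target_" is an assumed factory key.

    # Hypothetical params: one StepLR per optimizer in a two-optimizer setup.
    params = {
        "_key_value": True,
        "g": {"_target_": "StepLR", "_optimizer": "generator", "step_size": 10},
        "d": {"_target_": "StepLR", "_optimizer": "discriminator", "step_size": 10},
    }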
Example #13
    def get_samplers(self, stage: str) -> "OrderedDict[str, Sampler]":
        """
        Returns samplers for a given stage.

        Args:
            stage: stage name

        Returns:
            Dict of samplers
        """
        samplers_params = get_by_keys(self._stage_config,
                                      stage,
                                      "loaders",
                                      "samplers",
                                      default={})
        samplers = REGISTRY.get_from_params(**samplers_params)
        return OrderedDict(samplers)
Example #14
def main(args):
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)

    verbose = config["dataset_preprocessing_params"]["verbose"]
    if args.ignore_processors is not None:
        ignore_processors = set(args.ignore_processors.split(","))
    else:
        ignore_processors = set()

    processors = config["dataset_preprocessing_params"]["processors"]
    for processor_name, processing_params in processors.items():
        if processor_name in ignore_processors:
            print(f"Ignore {processor_name}")
            continue

        processor = REGISTRY.get_from_params(**config[processor_name])
        processor.process_files(processing_params["inputs"],
                                processing_params["outputs"],
                                verbose=verbose)
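A sketch of the config layout this script expects, shown as the loaded Python dict; the keys are inferred from the code above and the "_target_" factory key is an assumption.

    config = {
        "dataset_preprocessing_params": {
            "verbose": True,
            "processors": {
                "audio_processor": {           # illustrative processor name
                    "inputs": ["data/raw"],
                    "outputs": ["data/processed"],
                },
            },
        },
        "audio_processor": {"_target_": "AudioProcessor"},
    }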
Example #15
    def get_loggers(self) -> Dict[str, ILogger]:
        """Returns the loggers for the run."""
        loggers_params = self._config.get("loggers", {})
        loggers = REGISTRY.get_from_params(**loggers_params)

        is_logger_exists = lambda logger_fn: any(
            isinstance(x, logger_fn) for x in loggers.values())
        if not is_logger_exists(ConsoleLogger):
            loggers["_console"] = ConsoleLogger()
        if self._logdir is not None and not is_logger_exists(CSVLogger):
            loggers["_csv"] = CSVLogger(logdir=self._logdir,
                                        use_logdir_postfix=True)
        if self._logdir is not None and not is_logger_exists(
                TensorboardLogger):
            loggers["_tensorboard"] = TensorboardLogger(
                logdir=self._logdir, use_logdir_postfix=True)

        return loggers
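A sketch of a "loggers" config section; WandbLogger is a real Catalyst logger, while the "_target_" factory key and the project name are assumptions.

    # Hypothetical config fragment.
    config = {
        "loggers": {
            "wandb": {"_target_": "WandbLogger", "project": "my_project"},
        },
    }
    # ConsoleLogger / CSVLogger / TensorboardLogger are still auto-added by
    # the checks above whenever they are missing.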
Example #16
    def __init__(self, transforms, batch_size=1):
        """AudioProcessor handles audios processing.
        This processor applies transforms in their order to the batch of audios.

        Parameters
        ----------
        transforms : Union[List[Callable], List[Dict]]
            List of callable transforms objects, or their config dicts.
        batch_size : int, optional
            Batch size for data processing, by default 1
        """

        self.transforms = []
        self.batch_size = batch_size

        for transform in transforms:
            if isinstance(transform, dict):
                transform = REGISTRY.get_from_params(**transform)

            self.transforms.append(transform)
Example #17
    def get_loggers(self) -> Dict[str, ILogger]:
        """Returns the loggers for the run."""
        loggers_params = self._config.get("loggers", {})
        loggers = {
            key: REGISTRY.get_from_params(**params)
            for key, params in loggers_params.items()
        }

        is_logger_exists = lambda logger_fn: any(
            isinstance(x, logger_fn) for x in loggers.values())
        if not is_logger_exists(ConsoleLogger):
            loggers["_console"] = ConsoleLogger()
        if self._logdir is not None and not is_logger_exists(CSVLogger):
            loggers["_csv"] = CSVLogger(logdir=self._logdir)
        if self._logdir is not None and not is_logger_exists(
                TensorboardLogger):
            loggers["_tensorboard"] = TensorboardLogger(
                logdir=os.path.join(self._logdir, "tensorboard"))

        return loggers
Example #18
    def __init__(
        self,
        transform: Sequence[Union[dict, nn.Module]],
        input_key: Union[str, int] = "image",
        output_key: Optional[Union[str, int]] = None,
    ) -> None:
        """Init."""
        super().__init__(order=CallbackOrder.Internal, node=CallbackNode.all)

        self.input_key = input_key
        self.output_key = output_key or self.input_key

        transforms: Sequence[nn.Module] = [
            item if isinstance(item, nn.Module)
            else REGISTRY.get_from_params(**item)
            for item in transform
        ]
        assert all(isinstance(t, nn.Module) for t in transforms), (
            "`nn.Module` should be a base class for transforms")

        self.transform = nn.Sequential(*transforms)
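A hypothetical instantiation (sketch); the callback class name and the config-dict transform are placeholders, assuming the registry resolves the dict to an nn.Module.

    callback = BatchTransformCallback(      # hypothetical class name
        transform=[nn.Flatten(), {"_target_": "SomeTransformModule"}],
        input_key="image",
    )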
Example #19
def get_loaders_from_params(
    batch_size: int = 1,
    num_workers: int = 0,
    drop_last: bool = False,
    per_gpu_scaling: bool = False,
    loaders_params: Dict[str, Any] = None,
    samplers_params: Dict[str, Any] = None,
    initial_seed: int = 42,
    datasets_fn: Callable = None,
    **data_params,
) -> "OrderedDict[str, DataLoader]":
    """
    Creates pytorch dataloaders from datasets and additional parameters.

    Args:
        batch_size: ``batch_size`` parameter
            from ``torch.utils.data.DataLoader``
        num_workers: ``num_workers`` parameter
            from ``torch.utils.data.DataLoader``
        drop_last: ``drop_last`` parameter
            from ``torch.utils.data.DataLoader``
        per_gpu_scaling: boolean flag,
            if ``True``, scales batch_size in proportion to the number of GPUs
        loaders_params (Dict[str, Any]): additional loaders parameters
        samplers_params (Dict[str, Any]): additional sampler parameters
        initial_seed: initial seed for ``torch.utils.data.DataLoader``
            workers
        datasets_fn(Callable): callable function to get dictionary with
            ``torch.utils.data.Datasets``
        **data_params: additional data parameters
            or dictionary with ``torch.utils.data.Datasets`` to use for
            pytorch dataloaders creation

    Returns:
        OrderedDict[str, DataLoader]: dictionary with
            ``torch.utils.data.DataLoader``

    Raises:
        NotImplementedError: if datasource is out of `Dataset` or dict
        ValueError: if batch_sampler option is mutually
            exclusive with distributed
    """
    from catalyst.data.sampler import DistributedSamplerWrapper

    default_batch_size = batch_size
    default_num_workers = num_workers
    loaders_params = loaders_params or {}
    assert isinstance(loaders_params,
                      dict), (f"`loaders_params` should be a Dict. "
                              f"Got: {loaders_params}")
    samplers_params = samplers_params or {}
    assert isinstance(
        samplers_params,
        dict), f"`samplers_params` should be a Dict. Got: {samplers_params}"

    distributed_rank = get_rank()
    distributed = distributed_rank > -1

    if datasets_fn is not None:
        datasets = datasets_fn(**data_params)
    else:
        datasets = dict(**data_params)

    loaders = OrderedDict()
    for name, datasource in datasets.items():  # noqa: WPS426
        assert isinstance(datasource, (Dataset, dict)), (
            f"Datasource should be a Dataset or a dict. Got: {datasource}")

        loader_params = loaders_params.pop(name, {})
        assert isinstance(loader_params,
                          dict), f"{loader_params} should be Dict"

        sampler_params = samplers_params.pop(name, None)
        if sampler_params is None:
            if isinstance(datasource, dict) and "sampler" in datasource:
                sampler = datasource.pop("sampler", None)
            else:
                sampler = None
        else:
            sampler = REGISTRY.get_from_params(**sampler_params)
            if isinstance(datasource, dict) and "sampler" in datasource:
                datasource.pop("sampler", None)

        batch_size = loader_params.pop("batch_size", default_batch_size)
        num_workers = loader_params.pop("num_workers", default_num_workers)

        if per_gpu_scaling and not distributed:
            num_gpus = max(1, torch.cuda.device_count())
            batch_size *= num_gpus
            num_workers *= num_gpus
        elif not per_gpu_scaling and distributed:
            world_size = get_distributed_params().pop("world_size", 1)
            if batch_size % world_size == 0:
                batch_size = int(batch_size / world_size)
            else:
                raise ValueError(
                    "For this distributed mode with per_gpu_scaling = False "
                    "you need to have batch_size divisible by number of GPUs")

        loader_params = {
            "batch_size": batch_size,
            "num_workers": num_workers,
            "pin_memory": torch.cuda.is_available(),
            "drop_last": drop_last,
            **loader_params,
        }

        if isinstance(datasource, Dataset):
            loader_params["dataset"] = datasource
        elif isinstance(datasource, dict):
            assert "dataset" in datasource, "You need to specify dataset for dataloader"
            loader_params = merge_dicts(datasource, loader_params)
        else:
            raise NotImplementedError

        if distributed:
            if sampler is not None:
                if not isinstance(sampler, DistributedSampler):
                    sampler = DistributedSamplerWrapper(sampler=sampler)
            else:
                sampler = DistributedSampler(dataset=loader_params["dataset"])

        loader_params["shuffle"] = name.startswith("train") and sampler is None

        loader_params["sampler"] = sampler

        if "batch_sampler" in loader_params:
            if distributed:
                raise ValueError("batch_sampler option is mutually "
                                 "exclusive with distributed")

            for k in ("batch_size", "shuffle", "sampler", "drop_last"):
                loader_params.pop(k, None)

        if "worker_init_fn" not in loader_params:
            loader_params["worker_init_fn"] = partial(
                _worker_init_fn, initial_seed=initial_seed)

        loaders[name] = DataLoader(**loader_params)

    return loaders
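A hypothetical usage (sketch), given two torch Datasets named train_dataset and valid_dataset:

    loaders = get_loaders_from_params(
        batch_size=32,
        num_workers=2,
        loaders_params={"train": {"drop_last": True}},
        train=train_dataset,  # shuffled: name starts with "train", no sampler
        valid=valid_dataset,
    )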
Example #20
    def get_engine(self) -> IEngine:
        """Returns the engine for the run."""
        # here the config is expected to define an "engine" section
        engine_params = self._config.get("engine")
        engine = REGISTRY.get_from_params(**engine_params)
        return engine
Example #21
    def _get_loaders_from_params(
            self, **params) -> "Optional[OrderedDict[str, DataLoader]]":
        """Creates dataloaders from ``**params`` parameters."""
        loaders = dict(REGISTRY.get_from_params(**params))
        return loaders if all(
            isinstance(dl, DataLoader) for dl in loaders.values()) else None