def build_dataset(
    dataset_key: str, config=None, dataset_type="train"
) -> torch.utils.data.Dataset:
    """Build and return a dataset instance for ``dataset_key``.

    When ``config`` is falsy, the dataset's default configuration is loaded
    from its registered config path (the ``dataset_config.<dataset_key>``
    section); in that case any passed config is effectively ignored. When a
    config is passed and it is a global config keyed by dataset name, the
    matching sub-config is selected.

    Args:
        dataset_key (str): Key of the dataset to build.
        config (DictConfig, optional): Configuration used to create the
            dataset. If not passed, the dataset's default config is used.
            Defaults to {}.
        dataset_type (str, optional): Type of dataset to build,
            "train"|"val"|"test". Defaults to "train".

    Returns:
        (torch.utils.data.Dataset): A dataset instance of type torch Dataset.
    """
    # NOTE(review): BaseDatasetBuilder is not referenced below; the import is
    # kept for its import-time side effects — confirm before removing.
    from mmf.datasets.base_dataset_builder import BaseDatasetBuilder
    from mmf.utils.configuration import load_yaml_with_defaults

    datamodule = build_datamodule(dataset_key)

    if not config:
        # No (or empty) config given: fall back to the dataset's default one.
        default_path = datamodule.config_path()
        if default_path is None:
            # No config path registered — warn and proceed with an empty
            # config rather than forcing every dataset to define one.
            warnings.warn(
                f"Config path not defined for {dataset_key}, "
                "continuing with empty config"
            )
            config = OmegaConf.create()
        else:
            loaded = load_yaml_with_defaults(default_path)
            config = OmegaConf.select(loaded, f"dataset_config.{dataset_key}")
            if config is None:
                config = OmegaConf.create()
            # Freeze the structure so unknown keys fail loudly later.
            OmegaConf.set_struct(config, True)
    elif dataset_key in config:
        # A global config was passed; narrow it to this dataset's section.
        config = config[dataset_key]

    datamodule.build_dataset(config)
    dataset = datamodule.load_dataset(config, dataset_type)

    # Some datamodules expose extra registry updates needed by models.
    if hasattr(datamodule, "update_registry_for_model"):
        datamodule.update_registry_for_model(config)

    return dataset
def build_dataset(
    dataset_key: str, config=None, dataset_type="train"
) -> mmf_typings.DatasetType:
    """Build and return a dataset instance for ``dataset_key``.

    Looks up the dataset's registered builder class. When ``config`` is
    falsy, the dataset's default configuration is loaded from the builder's
    config path (the ``dataset_config.<dataset_key>`` section); in that case
    any passed config is effectively ignored.

    Args:
        dataset_key (str): Key of the dataset to build.
        config (DictConfig, optional): Configuration used to create the
            dataset. If not passed, the dataset's default config is used.
            Defaults to {}.
        dataset_type (str, optional): Type of dataset to build,
            "train"|"val"|"test". Defaults to "train".

    Returns:
        (DatasetType): A dataset instance of type BaseDataset.
    """
    from mmf.utils.configuration import load_yaml_with_defaults

    builder_cls = registry.get_builder_class(dataset_key)
    assert builder_cls, (
        f"Key {dataset_key} doesn't have a registered " "dataset builder"
    )

    if not config:
        # No (or empty) config given: fall back to the dataset's default one.
        default_path = builder_cls.config_path()
        if default_path is None:
            # No config path registered — warn and proceed with an empty
            # config rather than forcing every dataset to define one.
            warnings.warn(
                f"Config path not defined for {dataset_key}, "
                "continuing with empty config"
            )
            config = OmegaConf.create()
        else:
            loaded = load_yaml_with_defaults(default_path)
            config = OmegaConf.select(loaded, f"dataset_config.{dataset_key}")
            if config is None:
                config = OmegaConf.create()
            # Freeze the structure so unknown keys fail loudly later.
            OmegaConf.set_struct(config, True)

    builder: mmf_typings.DatasetBuilderType = builder_cls()
    builder.build_dataset(config, dataset_type)
    dataset = builder.load_dataset(config, dataset_type)

    # Some builders expose extra registry updates needed by models.
    if hasattr(builder, "update_registry_for_model"):
        builder.update_registry_for_model(config)

    return dataset