def make_multisource_episode_pipeline(dataset_spec_list,
                                      use_dag_ontology_list,
                                      use_bilevel_ontology_list,
                                      split,
                                      episode_descr_config,
                                      pool=None,
                                      shuffle_buffer_size=None,
                                      read_buffer_size_bytes=None,
                                      num_prefetch=0,
                                      image_size=None,
                                      num_to_take=None):
  """Returns a pipeline emitting data from multiple sources as Episodes.

  Each episode only contains data from one single source. For each episode,
  its source is sampled uniformly across all sources.

  Args:
    dataset_spec_list: A list of DatasetSpecification, one for each source.
    use_dag_ontology_list: A list of Booleans, one for each source: whether
      to use that source's DAG-structured ontology to sample episode classes.
    use_bilevel_ontology_list: A list of Booleans, one for each source:
      whether to use that source's bi-level ontology to sample episode
      classes.
    split: A learning_spec.Split object identifying the sources split. It is
      the same for all datasets.
    episode_descr_config: An instance of EpisodeDescriptionConfig containing
      parameters relating to sampling shots and ways for episodes.
    pool: String (optional), for example-split datasets, which example split
      to use ('train', 'valid', or 'test'), used at meta-test time only.
    shuffle_buffer_size: int or None, shuffle buffer size for each Dataset.
    read_buffer_size_bytes: int or None, buffer size for each
      TFRecordDataset.
    num_prefetch: int, the number of examples to prefetch for each class of
      each dataset. Prefetching occurs just after the class-specific Dataset
      object is constructed. If < 1, no prefetching occurs.
    image_size: int, desired image size used during decoding.
    num_to_take: Optional, a list specifying for each dataset the number of
      examples per class to restrict to (for this given split). If provided,
      its length must be the same as len(dataset_spec). If None, no
      restrictions are applied to any dataset and all data per class is used.

  Returns:
    A Dataset instance that outputs fully-assembled and decoded episodes.

  Raises:
    NotImplementedError: If `pool` is given but example-level splits are not
      supported.
    ValueError: If `dataset_spec_list` is empty, or if any of the per-source
      argument lists does not match its length.
  """
  if pool is not None:
    if not data.POOL_SUPPORTED:
      raise NotImplementedError('Example-level splits or pools not supported.')
  # Validate per-source list lengths up front: `zip` below would otherwise
  # silently truncate to the shortest list, and an empty `dataset_spec_list`
  # would surface only as a confusing NameError on `sampler` after the loop.
  if not dataset_spec_list:
    raise ValueError('dataset_spec_list must contain at least one source.')
  if len(use_dag_ontology_list) != len(dataset_spec_list):
    raise ValueError('use_dag_ontology_list does not have the same length as '
                     'dataset_spec_list.')
  if len(use_bilevel_ontology_list) != len(dataset_spec_list):
    raise ValueError('use_bilevel_ontology_list does not have the same length '
                     'as dataset_spec_list.')
  if num_to_take is not None and len(num_to_take) != len(dataset_spec_list):
    raise ValueError('num_to_take does not have the same length as '
                     'dataset_spec_list.')
  if num_to_take is None:
    # -1 means "no restriction" for EpisodeReader.
    num_to_take = [-1] * len(dataset_spec_list)
  sources = []
  for (dataset_spec, use_dag_ontology, use_bilevel_ontology,
       num_to_take_for_dataset) in zip(dataset_spec_list,
                                       use_dag_ontology_list,
                                       use_bilevel_ontology_list,
                                       num_to_take):
    episode_reader = reader.EpisodeReader(dataset_spec, split,
                                          shuffle_buffer_size,
                                          read_buffer_size_bytes,
                                          num_prefetch,
                                          num_to_take_for_dataset)
    sampler = sampling.EpisodeDescriptionSampler(
        episode_reader.dataset_spec,
        split,
        episode_descr_config,
        pool=pool,
        use_dag_hierarchy=use_dag_ontology,
        use_bilevel_hierarchy=use_bilevel_ontology)
    dataset = episode_reader.create_dataset_input_pipeline(sampler, pool=pool)
    sources.append(dataset)
  # Sample uniformly among sources.
  dataset = tf.data.experimental.sample_from_datasets(sources)
  # Episodes coming out of `dataset` contain flushed examples and are
  # internally padded with dummy examples. `process_episode` discards flushed
  # examples, splits the episode into support and query sets, removes the
  # dummy examples and decodes the example strings.
  # NOTE(review): this reuses the *last* loop iteration's sampler; presumably
  # all samplers share `episode_descr_config` and therefore identical chunk
  # sizes — confirm if per-source specs can change chunk sizes.
  chunk_sizes = sampler.compute_chunk_sizes()
  map_fn = functools.partial(
      process_episode, chunk_sizes=chunk_sizes, image_size=image_size)
  dataset = dataset.map(map_fn)
  # Overlap episode processing and training.
  dataset = dataset.prefetch(1)
  return dataset
def make_one_source_episode_pipeline(dataset_spec,
                                     use_dag_ontology,
                                     use_bilevel_ontology,
                                     split,
                                     episode_descr_config,
                                     pool=None,
                                     shuffle_buffer_size=None,
                                     read_buffer_size_bytes=None,
                                     num_prefetch=0,
                                     image_size=None,
                                     num_to_take=None):
  """Builds an episode Dataset reading from one single data source.

  Args:
    dataset_spec: A DatasetSpecification object defining what to read from.
    use_dag_ontology: Whether to use source's ontology in the form of a DAG
      to sample episodes classes.
    use_bilevel_ontology: Whether to use source's bilevel ontology (consisting
      of superclasses and subclasses) to sample episode classes.
    split: A learning_spec.Split object identifying the source (meta-)split.
    episode_descr_config: An instance of EpisodeDescriptionConfig containing
      parameters relating to sampling shots and ways for episodes.
    pool: String (optional), for example-split datasets, which example split
      to use ('train', 'valid', or 'test'), used at meta-test time only.
    shuffle_buffer_size: int or None, shuffle buffer size for each Dataset.
    read_buffer_size_bytes: int or None, buffer size for each
      TFRecordDataset.
    num_prefetch: int, the number of examples to prefetch for each class of
      each dataset. Prefetching occurs just after the class-specific Dataset
      object is constructed. If < 1, no prefetching occurs.
    image_size: int, desired image size used during decoding.
    num_to_take: Optional, an int specifying a number of elements to pick
      from each class' tfrecord. If specified, the available images of each
      class will be restricted to that int. By default no restriction is
      applied and all data is used.

  Returns:
    A Dataset instance that outputs fully-assembled and decoded episodes.
  """
  use_all_classes = False
  # Pools (example-level splits) are only usable when the backing data
  # supports them.
  if pool is not None and not data.POOL_SUPPORTED:
    raise NotImplementedError('Example-level splits or pools not supported.')
  # -1 signals "take everything" downstream.
  num_to_take = -1 if num_to_take is None else num_to_take
  source_reader = reader.EpisodeReader(dataset_spec, split,
                                       shuffle_buffer_size,
                                       read_buffer_size_bytes, num_prefetch,
                                       num_to_take)
  description_sampler = sampling.EpisodeDescriptionSampler(
      source_reader.dataset_spec,
      split,
      episode_descr_config,
      pool=pool,
      use_dag_hierarchy=use_dag_ontology,
      use_bilevel_hierarchy=use_bilevel_ontology,
      use_all_classes=use_all_classes)
  episode_dataset = source_reader.create_dataset_input_pipeline(
      description_sampler, pool=pool)
  # Raw episodes still carry flushed examples and internal dummy padding;
  # `process_episode` strips both, splits each episode into support and query
  # sets, and decodes the serialized example strings.
  decode_fn = functools.partial(
      process_episode,
      chunk_sizes=description_sampler.compute_chunk_sizes(),
      image_size=image_size)
  episode_dataset = episode_dataset.map(decode_fn)
  # Prefetch one episode so processing overlaps with training.
  return episode_dataset.prefetch(1)
def make_multisource_episode_pipeline(dataset_spec_list,
                                      use_dag_ontology_list,
                                      use_bilevel_ontology_list,
                                      split,
                                      pool=None,
                                      num_ways=None,
                                      num_support=None,
                                      num_query=None,
                                      shuffle_buffer_size=None,
                                      read_buffer_size_bytes=None,
                                      image_size=None):
  """Builds an episode Dataset mixing several data sources.

  Every emitted episode draws its data from exactly one source; the source
  for each episode is chosen uniformly at random among all sources.

  Args:
    dataset_spec_list: A list of DatasetSpecification, one for each source.
    use_dag_ontology_list: A list of Booleans, one for each source: whether
      to use that source's DAG-structured ontology to sample episode classes.
    use_bilevel_ontology_list: A list of Booleans, one for each source:
      whether to use that source's bi-level ontology to sample episode
      classes.
    split: A learning_spec.Split object identifying the sources split. It is
      the same for all datasets.
    pool: String (optional), for example-split datasets, which example split
      to use ('train', 'valid', or 'test'), used at meta-test time only.
    num_ways: Integer (optional), fixes the number of classes ("ways") to be
      used in each episode if provided.
    num_support: Integer (optional), fixes the number of examples for each
      class in the support set if provided.
    num_query: Integer (optional), fixes the number of examples for each
      class in the query set if provided.
    shuffle_buffer_size: int or None, shuffle buffer size for each Dataset.
    read_buffer_size_bytes: int or None, buffer size for each
      TFRecordDataset.
    image_size: int, desired image size used during decoding.

  Returns:
    A Dataset instance that outputs fully-assembled and decoded episodes.
  """
  if pool is not None and not data.POOL_SUPPORTED:
    raise NotImplementedError('Example-level splits or pools not supported.')
  per_source_datasets = []
  source_specs = zip(dataset_spec_list, use_dag_ontology_list,
                     use_bilevel_ontology_list)
  for spec, dag_flag, bilevel_flag in source_specs:
    source_reader = reader.EpisodeReader(spec, split, shuffle_buffer_size,
                                         read_buffer_size_bytes)
    sampler = sampling.EpisodeDescriptionSampler(
        source_reader.dataset_spec,
        split,
        pool=pool,
        use_dag_hierarchy=dag_flag,
        use_bilevel_hierarchy=bilevel_flag,
        num_ways=num_ways,
        num_support=num_support,
        num_query=num_query)
    per_source_datasets.append(
        source_reader.create_dataset_input_pipeline(sampler, pool=pool))
  # Pick a source uniformly at random for each episode.
  mixed_dataset = tf.data.experimental.sample_from_datasets(
      per_source_datasets)
  # Raw episodes still carry flushed examples and internal dummy padding;
  # `process_episode` strips both, splits each episode into support and query
  # sets, and decodes the serialized example strings. The last sampler built
  # in the loop supplies the chunk sizes.
  decode_fn = functools.partial(
      process_episode,
      chunk_sizes=sampler.compute_chunk_sizes(),
      image_size=image_size)
  mixed_dataset = mixed_dataset.map(decode_fn)
  # Prefetch one episode so processing overlaps with training.
  return mixed_dataset.prefetch(1)
def make_one_source_episode_pipeline(dataset_spec,
                                     use_dag_ontology,
                                     use_bilevel_ontology,
                                     split,
                                     episode_descr_config,
                                     pool=None,
                                     shuffle_buffer_size=None,
                                     read_buffer_size_bytes=None,
                                     num_prefetch=0,
                                     image_size=None,
                                     num_to_take=None,
                                     ignore_hierarchy_probability=0.0,
                                     simclr_episode_fraction=0.0):
  """Builds an episode Dataset reading from one single data source.

  Args:
    dataset_spec: A DatasetSpecification object defining what to read from.
    use_dag_ontology: Whether to use source's ontology in the form of a DAG
      to sample episodes classes.
    use_bilevel_ontology: Whether to use source's bilevel ontology (consisting
      of superclasses and subclasses) to sample episode classes.
    split: A learning_spec.Split object identifying the source (meta-)split.
    episode_descr_config: An instance of EpisodeDescriptionConfig containing
      parameters relating to sampling shots and ways for episodes.
    pool: String (optional), for example-split datasets, which example split
      to use ('train', 'valid', or 'test'), used at meta-test time only.
    shuffle_buffer_size: int or None, shuffle buffer size for each Dataset.
    read_buffer_size_bytes: int or None, buffer size for each
      TFRecordDataset.
    num_prefetch: int, the number of examples to prefetch for each class of
      each dataset. Prefetching occurs just after the class-specific Dataset
      object is constructed. If < 1, no prefetching occurs.
    image_size: int, desired image size used during decoding.
    num_to_take: Optional, an int specifying a number of elements to pick
      from each class' tfrecord. If specified, the available images of each
      class will be restricted to that int. By default no restriction is
      applied and all data is used.
    ignore_hierarchy_probability: Float, if using a hierarchy, this flag makes
      the sampler ignore the hierarchy for this proportion of episodes and
      instead sample categories uniformly.
    simclr_episode_fraction: Float, fraction of episodes that will be
      converted to SimCLR Episodes as described in the CrossTransformers
      paper.

  Returns:
    A Dataset instance that outputs tuples of fully-assembled and decoded
      episodes zipped with the ID of their data source of origin.
  """
  use_all_classes = False
  # Pools (example-level splits) are only usable when the backing data
  # supports them.
  if pool is not None and not data.POOL_SUPPORTED:
    raise NotImplementedError('Example-level splits or pools not supported.')
  # -1 signals "take everything" downstream.
  num_to_take = -1 if num_to_take is None else num_to_take
  source_reader = reader.EpisodeReader(
      dataset_spec, split, shuffle_buffer_size, read_buffer_size_bytes,
      num_prefetch, num_to_take, episode_descr_config.num_unique_episodes)
  description_sampler = sampling.EpisodeDescriptionSampler(
      source_reader.dataset_spec,
      split,
      episode_descr_config,
      pool=pool,
      use_dag_hierarchy=use_dag_ontology,
      use_bilevel_hierarchy=use_bilevel_ontology,
      use_all_classes=use_all_classes,
      ignore_hierarchy_probability=ignore_hierarchy_probability)
  episode_dataset = source_reader.create_dataset_input_pipeline(
      description_sampler, pool=pool)
  # Raw episodes still carry flushed examples and internal dummy padding;
  # `process_episode` strips both, splits each episode into support and query
  # sets, and decodes the serialized example strings.
  decode_fn = functools.partial(
      process_episode,
      chunk_sizes=description_sampler.compute_chunk_sizes(),
      image_size=image_size,
      simclr_episode_fraction=simclr_episode_fraction)
  episode_dataset = episode_dataset.map(decode_fn)
  # With a single source every episode's origin is trivially source 0, but
  # zip with a constant ID stream to keep the multi-source interface.
  constant_source_ids = tf.data.Dataset.from_tensors(0).repeat()
  episode_dataset = tf.data.Dataset.zip(
      (episode_dataset, constant_source_ids))
  # Prefetch one episode so processing overlaps with training.
  return episode_dataset.prefetch(1)
def make_one_source_episode_pipeline(dataset_spec,
                                     use_dag_ontology,
                                     use_bilevel_ontology,
                                     split,
                                     pool=None,
                                     num_ways=None,
                                     num_support=None,
                                     num_query=None,
                                     shuffle_buffer_size=None,
                                     read_buffer_size_bytes=None,
                                     image_size=None):
  """Returns a pipeline emitting data from one single source as Episodes.

  Args:
    dataset_spec: A DatasetSpecification object defining what to read from.
    use_dag_ontology: Whether to use source's ontology in the form of a DAG
      to sample episodes classes.
    use_bilevel_ontology: Whether to use source's bilevel ontology (consisting
      of superclasses and subclasses) to sample episode classes.
    split: A learning_spec.Split object identifying the source (meta-)split.
    pool: String (optional), for example-split datasets, which example split
      to use ('train', 'valid', or 'test'), used at meta-test time only.
    num_ways: Integer (optional), fixes the number of classes ("ways") to be
      used in each episode if provided.
    num_support: Integer (optional), fixes the number of examples for each
      class in the support set if provided.
    num_query: Integer (optional), fixes the number of examples for each
      class in the query set if provided.
    shuffle_buffer_size: int or None, shuffle buffer size for each Dataset.
    read_buffer_size_bytes: int or None, buffer size for each
      TFRecordDataset.
    image_size: int, desired image size used during decoding.

  Returns:
    A Dataset instance that outputs fully-assembled and decoded episodes.

  Raises:
    NotImplementedError: If `pool` is given but example-level splits are not
      supported.
  """
  # BUG FIX: `use_all_classes = False` used to live in an `else` branch of
  # the pool check, so on the branch that skipped it the name was never bound
  # and the sampler construction below raised NameError. Bind it
  # unconditionally, matching the other pipeline constructors in this file.
  use_all_classes = False
  if pool is not None:
    if not data.POOL_SUPPORTED:
      raise NotImplementedError('Example-level splits or pools not supported.')
  episode_reader = reader.EpisodeReader(dataset_spec, split,
                                        shuffle_buffer_size,
                                        read_buffer_size_bytes)
  sampler = sampling.EpisodeDescriptionSampler(
      episode_reader.dataset_spec,
      split,
      pool=pool,
      use_dag_hierarchy=use_dag_ontology,
      use_bilevel_hierarchy=use_bilevel_ontology,
      use_all_classes=use_all_classes,
      num_ways=num_ways,
      num_support=num_support,
      num_query=num_query)
  dataset = episode_reader.create_dataset_input_pipeline(sampler, pool=pool)
  # Episodes coming out of `dataset` contain flushed examples and are
  # internally padded with dummy examples. `process_episode` discards flushed
  # examples, splits the episode into support and query sets, removes the
  # dummy examples and decodes the example strings.
  chunk_sizes = sampler.compute_chunk_sizes()
  map_fn = functools.partial(
      process_episode, chunk_sizes=chunk_sizes, image_size=image_size)
  dataset = dataset.map(map_fn)
  # Overlap episode processing and training.
  dataset = dataset.prefetch(1)
  return dataset