Example #1
0
 def setUp(self):
   """Creates the large DatasetSpecification shared by the tests."""
   super(ChunkSizesTest, self).setUp()
   num_classes = 1000
   examples_per_class = 1000
   # Every class goes to the first member of Split; the other two splits
   # stay empty. (Presumably the first member is TRAIN -- confirm against
   # the Split definition.)
   split_sizes = {
       split: count for split, count in zip(Split, (num_classes, 0, 0))
   }
   self.dataset_spec = DatasetSpecification(
       name=None,
       classes_per_split=split_sizes,
       images_per_class=dict.fromkeys(range(num_classes),
                                      examples_per_class),
       class_names=None,
       path=None,
       file_pattern='{}.tfrecords')
Example #2
0
def get_dataset_spec(path):
    """Returns a 30-class DatasetSpecification whose records live at `path`.

    The classes are split 15/5/10 across TRAIN/VALID/TEST, and the
    per-class image counts cycle through 10, 20, 30.

    Args:
      path: Value forwarded unchanged as the specification's `path`.

    Returns:
      A DatasetSpecification using the '{}.h5' file pattern.
    """
    split_sizes = {Split.TRAIN: 15, Split.VALID: 5, Split.TEST: 10}
    image_counts = {
        class_id: num_images
        for class_id, num_images in enumerate([10, 20, 30] * 10)
    }
    return DatasetSpecification(name=None,
                                classes_per_split=split_sizes,
                                images_per_class=image_counts,
                                class_names=None,
                                path=path,
                                file_pattern='{}.h5')
Example #3
0
    def test_flush_logic(self):
        """Checks the "flush" chunk that prevents duplicates in an episode."""
        # Two episodes are drawn back to back from un-shuffled sources. A
        # class that still has enough unseen examples serves fresh ones to
        # the second episode; a class that does not has its leftovers routed
        # to the flush chunk and starts again from its first examples.
        # Class i holds 10 + i examples, i.e. between 10 and 39.
        total_classes = 30
        spec = DatasetSpecification(
            name=None,
            classes_per_split={
                Split.TRAIN: total_classes,
                Split.VALID: 0,
                Split.TEST: 0
            },
            images_per_class={
                class_id: 10 + class_id
                for class_id in range(total_classes)
            },
            class_names=None,
            path=None,
            file_pattern='{}.tfrecords')

        # Every train class appears in each episode with 5 support and 5
        # query examples.
        episode_config = config.EpisodeDescriptionConfig(
            num_ways=total_classes, num_support=5, num_query=5)
        sampler = sampling.EpisodeDescriptionSampler(
            spec, Split.TRAIN, episode_descr_config=episode_config)
        episodes = self.generate_episodes(sampler,
                                          num_episodes=2,
                                          shuffle=False)
        chunk_sizes = sampler.compute_chunk_sizes()

        # Only the second episode is inspected.
        _, (examples2, targets2) = episodes

        # Flush chunk of episode 2: class i contributes i entries for
        # i in 0..9, and nothing from class 10 onwards.
        flushed_targets, _, _ = split_into_chunks(targets2, chunk_sizes)
        for class_id in range(total_classes):
            expected_flushed = class_id if class_id < 10 else 0
            flushed_count = sum(
                target == class_id for target in flushed_targets)
            self.assertEqual(flushed_count, expected_flushed)

        def _group_example_ids_by_class(records):
            # Records look like b'<class_id>.<example_id>'; collect the
            # example ids seen for each class id.
            grouped = collections.defaultdict(list)
            for record in records:
                if not record:
                    # Padding only occurs at the end, so stop at the first
                    # empty record.
                    break
                class_part, example_part = record.decode().split('.')
                grouped[int(class_part)].append(int(example_part))
            return grouped

        _, support_records, query_records = split_into_chunks(
            examples2, chunk_sizes)
        support_ids = _group_example_ids_by_class(support_records)
        query_ids = _group_example_ids_by_class(query_records)

        # Classes 0..9 restart at example 0 in episode 2, whereas classes 10
        # and above continue at example 10. Support takes the first five
        # ids of that window and query takes the next five.
        for class_id in range(total_classes):
            first_id = 0 if class_id < 10 else 10
            self.assertCountEqual(support_ids[class_id],
                                  list(range(first_id, first_id + 5)))
            self.assertCountEqual(query_ids[class_id],
                                  list(range(first_id + 5, first_id + 10)))
Example #4
0
from meta_dataset.data import reader
from meta_dataset.data import sampling
from meta_dataset.data.dataset_spec import DatasetSpecification
from meta_dataset.data.learning_spec import Split
import numpy as np
from six.moves import range
from six.moves import zip
import tensorflow.compat.v1 as tf

# Shared DatasetSpecification fixture for the tests: 30 classes split
# 15/5/10 across TRAIN/VALID/TEST, with per-class image counts cycling
# through 10, 20, 30.
DATASET_SPEC = DatasetSpecification(
    name=None,
    classes_per_split={Split.TRAIN: 15, Split.VALID: 5, Split.TEST: 10},
    images_per_class={
        class_id: num_images
        for class_id, num_images in enumerate([10, 20, 30] * 10)
    },
    class_names=None,
    path=None,
    file_pattern='{}.tfrecords')

# Define defaults and set Gin configuration for EpisodeDescriptionConfig
# NOTE(review): only the default values are defined in this span; the Gin
# binding itself is not visible here. The names presumably map onto the
# like-named EpisodeDescriptionConfig parameters -- confirm at the use site.
MIN_WAYS = 5
MAX_WAYS_UPPER_BOUND = 50
MAX_NUM_QUERY = 10
MAX_SUPPORT_SET_SIZE = 500
MAX_SUPPORT_SIZE_CONTRIB_PER_CLASS = 100
# Natural-log bounds: log(0.5) and log(2) are symmetric around zero.
MIN_LOG_WEIGHT = np.log(0.5)
MAX_LOG_WEIGHT = np.log(2)
Example #5
0
    def test_make_multisource_episode_pipeline_feature(self, decoder_type,
                                                       config_file_path):
        """Round-trips feature records through the multi-source pipeline.

        Writes random per-class feature records to a temp directory, reads
        one episode back with `pipeline.make_multisource_episode_pipeline`,
        and checks that every support/query item maps back to the class it
        was written under.

        Args:
          decoder_type: 'feature' compares the episode contents against the
            written feature arrays; 'none' compares them against the values
            returned by `test_utils.write_feature_records`. Any other value
            performs no checks.
          config_file_path: Path of the Gin config file parsed before the
            pipeline is built.
        """
        # Create some feature records and write them to a temp directory.
        feat_size = 64
        num_examples = 100
        num_classes = 10
        output_path = self.get_temp_dir()
        gin.parse_config_file(config_file_path)

        # 1-Write feature records to temp directory.
        self.rng = np.random.RandomState(0)
        class_features = []
        class_examples = []
        for class_id in range(num_classes):
            features = self.rng.randn(num_examples,
                                      feat_size).astype(np.float32)
            label = np.array(class_id).astype(np.int64)
            output_file = os.path.join(output_path,
                                       str(class_id) + '.tfrecords')
            examples = test_utils.write_feature_records(
                features, label, output_file)
            class_examples.append(examples)
            class_features.append(features)
        # Stacking puts the class on axis 0, so the first index returned by
        # np.where below is the class id.
        class_examples = np.stack(class_examples)
        class_features = np.stack(class_features)

        # 2-Read records back using multi-source pipeline.
        # DatasetSpecification to use in tests
        dataset_spec = DatasetSpecification(
            name=None,
            classes_per_split={
                learning_spec.Split.TRAIN: 5,
                learning_spec.Split.VALID: 2,
                learning_spec.Split.TEST: 3
            },
            images_per_class={i: num_examples
                              for i in range(num_classes)},
            class_names=None,
            path=output_path,
            file_pattern='{}.tfrecords')

        # Duplicate the dataset to simulate reading from multiple datasets.
        use_bilevel_ontology_list = [False] * 2
        use_dag_ontology_list = [False] * 2
        all_dataset_specs = [dataset_spec] * 2

        fixed_ways_shots = config.EpisodeDescriptionConfig(num_query=5,
                                                           num_support=5,
                                                           num_ways=5)

        dataset_episodic = pipeline.make_multisource_episode_pipeline(
            dataset_spec_list=all_dataset_specs,
            use_dag_ontology_list=use_dag_ontology_list,
            use_bilevel_ontology_list=use_bilevel_ontology_list,
            episode_descr_config=fixed_ways_shots,
            split=learning_spec.Split.TRAIN,
            image_size=None)

        episode, _ = self.evaluate(
            dataset_episodic.make_one_shot_iterator().get_next())

        def _check_features(features, class_ids):
            """Asserts each feature was written under its paired class id."""
            for feat, class_id in zip(features, class_ids):
                # Sum the absolute differences. Taking the absolute value
                # of the signed sum instead would let positive and negative
                # differences cancel and report a false exact match.
                abs_err = np.sum(np.abs(class_features - feat[None, None]),
                                 axis=-1)
                # Make sure the feature is present in the original data.
                self.assertEqual(abs_err.min(), 0.0)
                found_class_id = np.where(abs_err == 0.0)[0][0]
                self.assertEqual(found_class_id, class_id)

        def _check_examples(examples, class_ids):
            """Asserts each raw example was written under its class id."""
            for example, class_id in zip(examples, class_ids):
                matching_classes = np.where(class_examples == example)[0]
                # Fail with a readable message rather than an IndexError
                # when the example is missing from the written data.
                self.assertGreater(
                    matching_classes.size, 0,
                    'Episode example not found in the written records.')
                self.assertEqual(matching_classes[0], class_id)

        # 3-Check that support and query items are in the written data and
        # have the correct corresponding label. Episode positions 0/2 hold
        # the support items/class ids and positions 3/5 the query ones.
        support_items, support_class_ids = episode[0], episode[2]
        query_items, query_class_ids = episode[3], episode[5]

        if decoder_type == 'feature':
            _check_features(support_items, support_class_ids)
            _check_features(query_items, query_class_ids)
        elif decoder_type == 'none':
            _check_examples(support_items, support_class_ids)
            _check_examples(query_items, query_class_ids)
Example #6
0
    def test_make_multisource_episode_pipeline_feature(self):
        """Round-trips feature records through the multi-source pipeline.

        Writes random per-class feature records to a temp directory, reads
        one episode back with `pipeline.make_multisource_episode_pipeline`
        using the feature decoder bound via Gin below, and checks that every
        support/query feature maps back to the class it was written under.
        """

        def iterate_dataset(dataset, n):
            """Yields `(index, element)` pairs for `n` dataset elements."""
            if not tf.executing_eagerly():
                iterator = dataset.make_one_shot_iterator()
                next_element = iterator.get_next()
                with tf.Session() as sess:
                    for idx in range(n):
                        yield idx, sess.run(next_element)
            else:
                for idx, episode in enumerate(dataset):
                    if idx == n:
                        break
                    yield idx, episode

        def write_feature_records(features, label, output_path):
            """Create a record file from features and labels.

            Args:
              features: An [n, m] numpy array of features.
              label: A numpy array containing the label.
              output_path: A string specifying the location of the record.
            """
            # The context manager closes the writer even if serializing or
            # writing an example raises part-way through.
            with tf.python_io.TFRecordWriter(output_path) as writer:
                with self.session(use_gpu=False) as sess:
                    for feat in features:
                        feat_serial = sess.run(tf.io.serialize_tensor(feat))
                        # Write the example.
                        dataset_to_records.write_example(
                            feat_serial,
                            label,
                            writer,
                            input_key='image/embedding',
                            label_key='image/class/label')

        def check_features_match_labels(features, class_ids):
            """Asserts each feature was written under its paired class id."""
            for feat, class_id in zip(features, class_ids):
                # Sum the absolute differences. Taking the absolute value
                # of the signed sum instead would let positive and negative
                # differences cancel and report a false exact match.
                abs_err = np.sum(np.abs(class_features - feat[None, None]),
                                 axis=-1)
                # Make sure the feature is present in the original data.
                self.assertEqual(abs_err.min(), 0.0)
                found_class_id = np.where(abs_err == 0.0)[0][0]
                self.assertEqual(found_class_id, class_id)

        # Create some feature records and write them to a temp directory.
        feat_size = 64
        num_examples = 100
        num_classes = 10
        output_path = self.get_temp_dir()
        gin.parse_config("""
        import meta_dataset.data.decoder
        EpisodeDescriptionConfig.min_ways = 5
        EpisodeDescriptionConfig.max_ways_upper_bound = 50
        EpisodeDescriptionConfig.max_num_query = 10
        EpisodeDescriptionConfig.max_support_set_size = 500
        EpisodeDescriptionConfig.max_support_size_contrib_per_class = 100
        EpisodeDescriptionConfig.min_log_weight = -0.69314718055994529  # np.log(0.5)
        EpisodeDescriptionConfig.max_log_weight = 0.69314718055994529  # np.log(2)
        EpisodeDescriptionConfig.ignore_dag_ontology = False
        EpisodeDescriptionConfig.ignore_bilevel_ontology = False
        process_episode.support_decoder = @FeatureDecoder()
        process_episode.query_decoder = @FeatureDecoder()
        """)

        # 1-Write feature records to temp directory.
        self.rng = np.random.RandomState(0)
        class_features = []
        for class_id in range(num_classes):
            features = self.rng.randn(num_examples,
                                      feat_size).astype(np.float32)
            label = np.array(class_id).astype(np.int64)
            output_file = os.path.join(output_path,
                                       str(class_id) + '.tfrecords')
            write_feature_records(features, label, output_file)
            class_features.append(features)
        # Stacking puts the class on axis 0, so the first index returned by
        # np.where in the check is the class id.
        class_features = np.stack(class_features)

        # 2-Read records back using multi-source pipeline.
        # DatasetSpecification to use in tests
        dataset_spec = DatasetSpecification(
            name=None,
            classes_per_split={
                learning_spec.Split.TRAIN: 5,
                learning_spec.Split.VALID: 2,
                learning_spec.Split.TEST: 3
            },
            images_per_class={i: num_examples
                              for i in range(num_classes)},
            class_names=None,
            path=output_path,
            file_pattern='{}.tfrecords')

        # Duplicate the dataset to simulate reading from multiple datasets.
        use_bilevel_ontology_list = [False] * 2
        use_dag_ontology_list = [False] * 2
        all_dataset_specs = [dataset_spec] * 2

        fixed_ways_shots = config.EpisodeDescriptionConfig(num_query=5,
                                                           num_support=5,
                                                           num_ways=5)

        dataset_episodic = pipeline.make_multisource_episode_pipeline(
            dataset_spec_list=all_dataset_specs,
            use_dag_ontology_list=use_dag_ontology_list,
            use_bilevel_ontology_list=use_bilevel_ontology_list,
            episode_descr_config=fixed_ways_shots,
            split=learning_spec.Split.TRAIN,
            image_size=None)

        episode_iter = iterate_dataset(dataset_episodic, 1)
        try:
            _, episode = next(episode_iter)
        finally:
            # In graph mode the generator is suspended inside its
            # `with tf.Session()` block; closing it unwinds that block so
            # the session is released now rather than at garbage collection.
            episode_iter.close()

        # 3-Check that support and query features are in class_features and
        # have the correct corresponding label. Episode positions 0/2 hold
        # the support features/class ids and positions 3/5 the query ones.
        support_features, support_class_ids = episode[0], episode[2]
        query_features, query_class_ids = episode[3], episode[5]

        check_features_match_labels(support_features, support_class_ids)
        check_features_match_labels(query_features, query_class_ids)