Example #1
0
    def filter_sources(self, source_files, source_image_folders,
                       source_categories):
        """
        Filters the source files/image folders/categories, keeping only the
        entries belonging to the categories enabled in the configuration.

        :param source_files: List of source files.

        :param source_image_folders: List of folders containing image files.

        :param source_categories: List of categories associated with each of those files.

        :return: Tuple consisting of: filtered source_files, filtered source_image_folders and filtered source_categories.

        :raises ValueError: When the configured categories filter out every source.
        """
        # Flags indicating which of the four categories (C1..C4) to use.
        use_files = [False] * 4
        categs = {'C1': 0, 'C2': 1, 'C3': 2, 'C4': 3}
        # Parse categories from configuration list.
        loaded_categs = get_value_list_from_dictionary(
            "categories", self.config, ['C1', 'C2', 'C3', 'C4', 'all'])
        for cat in loaded_categs:
            # "Special" case: enable all categories - no need to parse further.
            if cat == "all":
                use_files = [True] * 4
                break
            else:
                use_files[categs[cat]] = True
        # Filter the three lists in lockstep, keeping only flagged entries.
        filtered = [
            (sfile, sfolder, scat) for use, sfile, sfolder, scat in zip(
                use_files, source_files, source_image_folders,
                source_categories) if use
        ]
        if not filtered:
            # Fail with a clear message instead of the cryptic unpacking
            # error that zip(*()) previously raised for an empty selection.
            raise ValueError(
                "No sources left after filtering with categories: {}".format(
                    loaded_categs))
        source_files, source_image_folders, source_categories = zip(*filtered)
        return source_files, source_image_folders, source_categories
    def __init__(self, name, config):
        """
        Initializes the component.

        :param name: Component name (read from configuration file).
        :type name: str

        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
        :type config: :py:class:`ptp.configuration.ConfigInterface`

        """
        # Call constructors of parent classes.
        Component.__init__(self, name, SentenceTokenizer, config)

        # Operation mode: True -> detokenize, False -> tokenize.
        self.mode_detokenize = config['detokenize']

        # Retrieve the list of preprocessing operations to apply.
        self.preprocessing = get_value_list_from_dictionary(
            "preprocessing", self.config,
            ['none', 'lowercase', 'remove_punctuation', 'all'])
        # 'none' disables preprocessing; 'all' expands to every operation.
        if 'none' in self.preprocessing:
            self.preprocessing = []
        if 'all' in self.preprocessing:
            self.preprocessing = ['lowercase', 'remove_punctuation']
        self.logger.info("Applied preprocessing: {}".format(self.preprocessing))

        # Extra characters to strip on top of the selected preprocessing.
        self.remove_characters = get_value_list_from_dictionary("remove_characters", self.config)
        self.logger.info("Additional characters that will be removed during preprocessing: {}".format(self.remove_characters))

        # Build the punctuation translation table once, if needed.
        if 'remove_punctuation' in self.preprocessing:
            self.translator = str.maketrans('', '', string.punctuation)

        # Tokenizer splitting on whitespace.
        self.tokenizer = nltk.tokenize.WhitespaceTokenizer()

        # Set key mappings.
        self.key_inputs = self.stream_keys["inputs"]
        self.key_outputs = self.stream_keys["outputs"]

        # Pick the processing direction:
        # detokenize: list of strings -> sentence; tokenize: sentence -> list of strings.
        self.processor = self.detokenize_sample if self.mode_detokenize else self.tokenize_sample
    def __init__(self, name, config):
        """
        Initializes the component, retrieving the names of the input streams to view.

        :param name: Name of the component.
        :type name: str

        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
        :type config: :py:class:`ptp.configuration.ConfigInterface`

        """
        # Call constructors of parent classes.
        Component.__init__(self, name, StreamViewer, config)

        # Key used for accessing sample indices.
        self.key_indices = self.stream_keys["indices"]

        # Number of samples to show.
        self.sample_number = self.config["sample_number"]

        # Names (keys) of the input streams that will be displayed.
        self.input_stream_keys = get_value_list_from_dictionary("input_streams", self.config)
    def __init__(self, name, config):
        """
        Initializes the object, retrieves names of input streams and creates the output file in experiment directory.

        :param name: Name of the component.
        :type name: str

        :param config: Dictionary of parameters (read from the configuration ``.yaml`` file).
        :type config: :py:class:`ptp.configuration.ConfigInterface`

        """
        # Call constructors of parent classes.
        Component.__init__(self, name, StreamFileExporter, config)

        # Key used for accessing sample indices.
        self.key_indices = self.stream_keys["indices"]

        # Names (keys) of the streams whose values will be exported.
        self.input_stream_keys = get_value_list_from_dictionary(
            "input_streams", self.config)

        # Separator placed between exported values.
        self.separator = self.config["separator"]

        # Open the target file in the experiment (log) directory.
        abs_filename = path.join(self.app_state.log_dir, self.config["filename"])
        self.file = open(abs_filename, 'w')

        # Optionally write an additional line declaring the separator.
        if self.config["export_separator_line_to_csv"]:
            self.file.write("sep={}\n".format(self.separator))

        # Header is exported lazily - when the first batch is processed.
        self.export_header = self.config["export_header_to_csv"]

        self.logger.info("Writing values from {} streams to {}".format(
            self.input_stream_keys, abs_filename))
Example #5
0
    def __init__(self, name, config):
        """
        Initializes task object. Calls base constructor and loads the adequate files depending on the mode (split).

        :param name: Name of the component.

        :param config: Dictionary of parameters (read from configuration ``.yaml`` file).
        """
        # Call constructors of parent classes.
        Task.__init__(self, name, VQAMED2019, config)

        # (Eventually) download required packages.
        nltk.download('punkt')
        nltk.download('stopwords')

        # Get key mappings of all output streams.
        self.key_images = self.stream_keys["images"]
        self.key_image_ids = self.stream_keys["image_ids"]
        self.key_questions = self.stream_keys["questions"]
        self.key_answers = self.stream_keys["answers"]
        self.key_category_ids = self.stream_keys["category_ids"]
        self.key_category_names = self.stream_keys["category_names"]
        self.key_image_sizes = self.stream_keys["image_sizes"]

        # Get flag informing whether we want to stream images or not.
        self.stream_images = self.config['stream_images']

        # Get flag indicating whether we want to (pre)load all images at the start.
        self.preload_images = self.config['preload_images']

        # Check the desired image size.
        if len(self.config['resize_image']) != 2:
            self.logger.error(
                "'resize_image' field must contain 2 values: the desired height and width"
            )
            exit(-1)

        # Output image dimensions.
        self.height = self.config['resize_image'][0]
        self.width = self.config['resize_image'][1]
        self.depth = 3

        # Set global variables - all dimensions ASIDE OF BATCH.
        self.globals["image_height"] = self.height
        self.globals["image_width"] = self.width
        self.globals["image_depth"] = self.depth

        # Those values will be used to rescale the image_sizes to range (0, 1).
        self.scale_image_height = self.config['scale_image_size'][0]
        self.scale_image_width = self.config['scale_image_size'][1]

        # Set parameters and globals related to categories.
        self.globals["num_categories"] = 6
        self.globals["category_word_mappings"] = {
            'C1': 0,
            'C2': 1,
            'C3': 2,
            'C4': 3,
            'BINARY': 4,
            '<UNK>': 5
        }
        self.category_idx_to_word = {
            0: 'C1',
            1: 'C2',
            2: 'C3',
            3: 'C4',
            4: 'BINARY',
            5: '<UNK>'
        }

        # Get image preprocessing.
        self.image_preprocessing = get_value_list_from_dictionary(
            "image_preprocessing", self.config,
            'none | random_affine | random_horizontal_flip | normalize | all'.
            split(" | "))
        if 'none' in self.image_preprocessing:
            self.image_preprocessing = []
        if 'all' in self.image_preprocessing:
            self.image_preprocessing = 'random_affine | random_horizontal_flip | normalize'.split(
                " | ")
        self.logger.info("Applied image preprocessing: {}".format(
            self.image_preprocessing))

        # Get question preprocessing.
        self.question_preprocessing = get_value_list_from_dictionary(
            "question_preprocessing", self.config,
            'none | lowercase | remove_punctuation | tokenize | random_remove_stop_words | random_shuffle_words | all'
            .split(" | "))
        if 'none' in self.question_preprocessing:
            self.question_preprocessing = []
        if 'all' in self.question_preprocessing:
            # NOTE(review): 'all' expands to 'remove_stop_words'/'shuffle_words',
            # whereas the accepted option names above are prefixed with 'random_' -
            # confirm which names the downstream preprocessing checks actually use.
            self.question_preprocessing = 'lowercase | remove_punctuation | tokenize | remove_stop_words | shuffle_words'.split(
                " | ")
        self.logger.info("Applied question preprocessing: {}".format(
            self.question_preprocessing))

        # Get answer preprocessing.
        self.answer_preprocessing = get_value_list_from_dictionary(
            "answer_preprocessing", self.config,
            'none | lowercase | remove_punctuation | tokenize | all'.split(
                " | "))
        if 'none' in self.answer_preprocessing:
            self.answer_preprocessing = []
        if 'all' in self.answer_preprocessing:
            # Fixed: the expansion previously contained 'tokenize ' (trailing
            # space), which would never match a 'tokenize' membership test.
            self.answer_preprocessing = 'lowercase | remove_punctuation | tokenize'.split(
                " | ")
        self.logger.info("Applied answer preprocessing: {}".format(
            self.answer_preprocessing))

        # Get the absolute path.
        self.data_folder = os.path.expanduser(self.config['data_folder'])

        # Get split.
        split = get_value_from_dictionary(
            'split', self.config,
            "training,validation,training_validation,test_answers,test".split(
                ","))

        # Set split-dependent data.
        if split == 'training':
            # Training split folder.
            split_folder = os.path.join(self.data_folder,
                                        "ImageClef-2019-VQA-Med-Training")
            # Set source files.
            source_files = [
                os.path.join(split_folder,
                             "QAPairsByCategory/C1_Modality_train.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C2_Plane_train.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C3_Organ_train.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C4_Abnormality_train.txt")
            ]
            # Set image folders.
            source_image_folders = [
                os.path.join(split_folder, 'Train_images')
            ] * 4

            # Set the categories associated with each of those files.
            source_categories = [0, 1, 2, 3]

            # Filter lists taking into account configuration.
            source_files, source_image_folders, source_categories = self.filter_sources(
                source_files, source_image_folders, source_categories)
            # Load dataset.
            self.dataset = self.load_dataset(source_files,
                                             source_image_folders,
                                             source_categories)

        elif split == 'validation':
            # Validation split folder.
            split_folder = os.path.join(self.data_folder,
                                        "ImageClef-2019-VQA-Med-Validation")

            # Set source files.
            source_files = [
                os.path.join(split_folder,
                             "QAPairsByCategory/C1_Modality_val.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C2_Plane_val.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C3_Organ_val.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C4_Abnormality_val.txt")
            ]

            # Set image folders.
            source_image_folders = [os.path.join(split_folder, 'Val_images')
                                    ] * 4

            # Set the categories associated with each of those files.
            source_categories = [0, 1, 2, 3]

            # Filter lists taking into account configuration.
            source_files, source_image_folders, source_categories = self.filter_sources(
                source_files, source_image_folders, source_categories)
            # Load dataset.
            self.dataset = self.load_dataset(source_files,
                                             source_image_folders,
                                             source_categories)

        elif split == 'training_validation':
            # This split takes both training and validation and assumes utilization of kFoldWeightedRandomSampler.

            # 1. Training split folder.
            split_folder = os.path.join(self.data_folder,
                                        "ImageClef-2019-VQA-Med-Training")
            # Set source files.
            source_files = [
                os.path.join(split_folder,
                             "QAPairsByCategory/C1_Modality_train.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C2_Plane_train.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C3_Organ_train.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C4_Abnormality_train.txt")
            ]
            # Set image folders.
            source_image_folders = [
                os.path.join(split_folder, 'Train_images')
            ] * 4

            # Set the categories associated with each of those files.
            source_categories = [0, 1, 2, 3]

            # Filter lists taking into account configuration.
            training_source_files, training_source_image_folders, training_source_categories = self.filter_sources(
                source_files, source_image_folders, source_categories)

            # 2. Validation split folder.
            split_folder = os.path.join(self.data_folder,
                                        "ImageClef-2019-VQA-Med-Validation")

            # Set source files.
            source_files = [
                os.path.join(split_folder,
                             "QAPairsByCategory/C1_Modality_val.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C2_Plane_val.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C3_Organ_val.txt"),
                os.path.join(split_folder,
                             "QAPairsByCategory/C4_Abnormality_val.txt")
            ]

            # Set image folders.
            source_image_folders = [os.path.join(split_folder, 'Val_images')
                                    ] * 4

            # Set the categories associated with each of those files.
            source_categories = [0, 1, 2, 3]

            # Filter lists taking into account configuration.
            valid_source_files, valid_source_image_folders, valid_source_categories = self.filter_sources(
                source_files, source_image_folders, source_categories)

            # 3. Merge lists.
            source_files = [*training_source_files, *valid_source_files]
            source_image_folders = [
                *training_source_image_folders, *valid_source_image_folders
            ]
            source_categories = [
                *training_source_categories, *valid_source_categories
            ]
            # Load dataset.
            self.dataset = self.load_dataset(source_files,
                                             source_image_folders,
                                             source_categories)

        elif split == 'test_answers':
            # Test set WITH ANSWERS.
            split_folder = os.path.join(self.data_folder,
                                        "ImageClef-2019-VQA-Med-Test")
            # Set source file.
            source_file = os.path.join(
                split_folder, "VQAMed2019_Test_Questions_w_Ref_Answers.txt")
            # Set image folder.
            source_image_folder = os.path.join(split_folder,
                                               'VQAMed2019_Test_Images')
            self.dataset = self.load_testset_with_answers(
                source_file, source_image_folder)

        else:  # "test"
            # Test set WITHOUT ANSWERS.
            split_folder = os.path.join(self.data_folder,
                                        "ImageClef-2019-VQA-Med-Test")
            # Set source file.
            source_file = os.path.join(split_folder,
                                       "VQAMed2019_Test_Questions.txt")
            # Set image folder.
            source_image_folder = os.path.join(split_folder,
                                               'VQAMed2019_Test_Images')
            self.dataset = self.load_testset_without_answers(
                source_file, source_image_folder)

        # Ok, now we got the whole dataset (for given "split").
        self.ix = np.arange(len(self.dataset))
        if self.config["import_indices"] != '':
            # Try to load indices from the file.
            # Fixed: the log message previously formatted the 'export_indices'
            # config entry instead of the file the indices were imported from.
            index_file = os.path.join(self.app_state.log_dir,
                                      self.config["import_indices"])
            self.ix = np.load(index_file)
            self.logger.info("Imported indices from '{}'".format(index_file))
        else:
            # Ok, check whether we want to shuffle.
            if self.config["shuffle_indices"]:
                np.random.shuffle(self.ix)
            # Export if required.
            if self.config["export_indices"] != '':
                # Export indices to file.
                index_file = os.path.join(self.app_state.log_dir,
                                          self.config["export_indices"])
                np.save(index_file, self.ix)
                self.logger.info(
                    "Exported indices to '{}'".format(index_file))

        # Display exemplary sample.
        self.logger.info(
            "Exemplary sample 0 ({}):\n  category: {}\n  image_ids: {}\n  question: {}\n  answer: {}"
            .format(
                self.ix[0], self.category_idx_to_word[self.dataset[self.ix[0]][
                    self.key_category_ids]],
                self.dataset[self.ix[0]][self.key_image_ids],
                self.dataset[self.ix[0]][self.key_questions],
                self.dataset[self.ix[0]][self.key_answers]))

        # Check if we want the task to calculate and export the weights.
        self.export_sample_weights = self.config["export_sample_weights"]
        if self.export_sample_weights != '':
            self.calculate_and_export_sample_weights(
                self.export_sample_weights)
Example #6
0
    def __init__(self, name, config):
        """
        Initializes task object. Calls base constructor and selects the question files adequate for the chosen split.

        :param name: Name of the component.

        :param config: Dictionary of parameters (read from configuration ``.yaml`` file).
        """
        # Call constructors of parent classes.
        Task.__init__(self, name, GQA, config)

        # Get key mappings of all output streams.
        self.key_sample_ids = self.stream_keys["sample_ids"]
        self.key_images = self.stream_keys["images"]
        self.key_image_ids = self.stream_keys["image_ids"]
        self.key_questions = self.stream_keys["questions"]
        self.key_answers = self.stream_keys["answers"]
        self.key_full_answers = self.stream_keys["full_answers"]

        # Get flag informing whether we want to stream images or not.
        self.stream_images = self.config['stream_images']

        # Check the resize image option: exactly two values (height, width) required.
        if len(self.config['resize_image']) != 2:
            self.logger.error(
                "'resize_image' field must contain 2 values: the desired height and width"
            )
            exit(-1)

        # Output image dimensions.
        self.height = self.config['resize_image'][0]
        self.width = self.config['resize_image'][1]
        self.depth = 3
        self.logger.info(
            "Setting image size to [D  x H x W]: {} x {} x {}".format(
                self.depth, self.height, self.width))

        # Set global variables - all dimensions ASIDE OF BATCH.
        self.globals["image_height"] = self.height
        self.globals["image_width"] = self.width
        self.globals["image_depth"] = self.depth

        # Get image preprocessing ('none' disables it, 'all' expands to every option).
        self.image_preprocessing = get_value_list_from_dictionary(
            "image_preprocessing", self.config,
            'none | normalize | all'.split(" | "))
        if 'none' in self.image_preprocessing:
            self.image_preprocessing = []
        if 'all' in self.image_preprocessing:
            self.image_preprocessing = ['normalize']
        # Resize is always applied (the size check above is unconditional).
        self.image_preprocessing = ["resize"] + self.image_preprocessing

        self.logger.info("Applied image preprocessing: {}".format(
            self.image_preprocessing))

        # Get the absolute path.
        self.data_folder = os.path.expanduser(self.config['data_folder'])

        # Get split.
        split = get_value_from_dictionary(
            'split', self.config,
            "training_0 | training | validation | test_dev | test".split(
                " | "))
        self.split_image_folder = os.path.join(self.data_folder, "images")

        # Set split-dependent data.
        if split == 'training':
            # Training split: all ten question files.
            data_files = []
            for i in range(10):
                data_files.append(
                    os.path.join(self.data_folder, "questions1.2",
                                 "train_all_questions",
                                 "train_all_questions_{}.json".format(i)))

        elif split == 'training_0':
            # Only the first (0th) file of the training split.
            data_files = [
                os.path.join(self.data_folder, "questions1.2",
                             "train_all_questions",
                             "train_all_questions_0.json")
            ]
            self.logger.warning(
                "Please remember that this split constitutes only 10 percent of the whole training set!"
            )

        elif split == 'validation':
            # Validation split folder and file with data question.
            data_files = [
                os.path.join(self.data_folder, "questions1.2",
                             "val_all_questions.json")
            ]
            self.logger.warning("Please use 'test_dev' split for validation!")

        elif split == 'test_dev':
            # Test-dev split folder and file with data question.
            data_files = [
                os.path.join(self.data_folder, "questions1.2",
                             "testdev_all_questions.json")
            ]

        elif split == 'test':
            # Test split folder and file with data question.
            data_files = [
                os.path.join(self.data_folder, "questions1.2",
                             "test_all_questions.json")
            ]

        else:
            # Unreachable while the accepted split list above stays in sync.
            raise ConfigurationError(
                "Split {} not supported yet".format(split))

        # Load dataset.
        self.dataset = self.load_dataset(data_files)

        # Display exemplary sample.
        i = 0
        sample = self.dataset[i]
        # Check if this is a test set.
        self.logger.info(
            "Exemplary sample {} ({}):\n  image_ids: {}\n  question: {}\n  answer: {} ({})"
            .format(i, sample[self.key_sample_ids], sample[self.key_image_ids],
                    sample[self.key_questions], sample[self.key_answers],
                    sample[self.key_full_answers]))
Example #7
0
    def __init__(self, name, config):
        """
        Initializes task object. Calls base constructor and loads the question file adequate for the chosen split.

        :param name: Name of the component.

        :param config: Dictionary of parameters (read from configuration ``.yaml`` file).
        """
        # Call constructors of parent classes.
        Task.__init__(self, name, CLEVR, config)

        # Get key mappings of all output streams.
        self.key_images = self.stream_keys["images"]
        self.key_image_ids = self.stream_keys["image_ids"]
        self.key_questions = self.stream_keys["questions"]
        self.key_answers = self.stream_keys["answers"]
        self.key_question_type_ids = self.stream_keys["question_type_ids"]
        self.key_question_type_names = self.stream_keys["question_type_names"]

        # Get flag informing whether we want to stream images or not.
        self.stream_images = self.config['stream_images']

        # Check the resize image option (optional for this task).
        if "resize_image" in self.config:
            if len(self.config['resize_image']) != 2:
                self.logger.error(
                    "'resize_image' field must contain 2 values: the desired height and width"
                )
                exit(-1)

            # Output image dimensions.
            self.height = self.config['resize_image'][0]
            self.width = self.config['resize_image'][1]
            self.depth = 3
            resize = True
        else:
            # Use original image dimensions.
            # NOTE(review): CLEVR renders are commonly described as 480 wide
            # by 320 high - confirm the height/width assignment order here.
            self.height = 480
            self.width = 320
            self.depth = 3
            resize = False
        self.logger.info(
            "Setting image size to [D  x H x W]: {} x {} x {}".format(
                self.depth, self.height, self.width))

        # Set global variables - all dimensions ASIDE OF BATCH.
        self.globals["image_height"] = self.height
        self.globals["image_width"] = self.width
        self.globals["image_depth"] = self.depth

        # Get image preprocessing ('none' disables it, 'all' expands to every option).
        self.image_preprocessing = get_value_list_from_dictionary(
            "image_preprocessing", self.config,
            'none | normalize | all'.split(" | "))
        if 'none' in self.image_preprocessing:
            self.image_preprocessing = []
        if 'all' in self.image_preprocessing:
            self.image_preprocessing = ['normalize']

        if resize:
            # Add resize as transformation - only when 'resize_image' was configured.
            self.image_preprocessing = ["resize"] + self.image_preprocessing
        self.logger.info("Applied image preprocessing: {}".format(
            self.image_preprocessing))

        # Mapping of question subtypes to types (not used, but keeping it just in case).
        #self.question_subtype_to_type_mapping = {
        #    'query_size': 'query_attribute',
        #    'equal_size': 'compare_attribute',
        #    'query_shape': 'query_attribute',
        #    'query_color': 'query_attribute',
        #    'greater_than': 'compare_integer',
        #    'equal_material': 'compare_attribute',
        #    'equal_color': 'compare_attribute',
        #    'equal_shape': 'compare_attribute',
        #    'less_than': 'compare_integer',
        #    'count': 'count',
        #    'exist': 'exist',
        #    'equal_integer': 'compare_integer',
        #    'query_material': 'query_attribute'}

        # Mapping of question subtypes to their numeric ids.
        self.question_subtype_to_id_mapping = {
            'query_size': 0,
            'equal_size': 1,
            'query_shape': 2,
            'query_color': 3,
            'greater_than': 4,
            'equal_material': 5,
            'equal_color': 6,
            'equal_shape': 7,
            'less_than': 8,
            'count': 9,
            'exist': 10,
            'equal_integer': 11,
            'query_material': 12
        }

        # Mapping of question families to subtypes.
        self.question_family_id_to_subtype_mapping = {
            0: "equal_integer",
            1: "less_than",
            2: "greater_than",
            3: "equal_integer",
            4: "less_than",
            5: "greater_than",
            6: "equal_integer",
            7: "less_than",
            8: "greater_than",
            9: "equal_size",
            10: "equal_color",
            11: "equal_material",
            12: "equal_shape",
            13: "equal_size",
            14: "equal_size",
            15: "equal_size",
            16: "equal_color",
            17: "equal_color",
            18: "equal_color",
            19: "equal_material",
            20: "equal_material",
            21: "equal_material",
            22: "equal_shape",
            23: "equal_shape",
            24: "equal_shape",
            25: "count",
            26: "exist",
            27: "query_size",
            28: "query_shape",
            29: "query_color",
            30: "query_material",
            31: "count",
            32: "query_size",
            33: "query_color",
            34: "query_material",
            35: "query_shape",
            36: "exist",
            37: "exist",
            38: "exist",
            39: "exist",
            40: "count",
            41: "count",
            42: "count",
            43: "count",
            44: "exist",
            45: "exist",
            46: "exist",
            47: "exist",
            48: "count",
            49: "count",
            50: "count",
            51: "count",
            52: "query_color",
            53: "query_material",
            54: "query_shape",
            55: "query_size",
            56: "query_material",
            57: "query_shape",
            58: "query_size",
            59: "query_color",
            60: "query_shape",
            61: "query_size",
            62: "query_color",
            63: "query_material",
            64: "count",
            65: "count",
            66: "count",
            67: "count",
            68: "count",
            69: "count",
            70: "count",
            71: "count",
            72: "count",
            73: "exist",
            74: "query_size",
            75: "query_color",
            76: "query_material",
            77: "query_shape",
            78: "count",
            79: "exist",
            80: "query_size",
            81: "query_color",
            82: "query_material",
            83: "query_shape",
            84: "count",
            85: "exist",
            86: "query_shape",
            87: "query_material",
            88: "query_color",
            89: "query_size"
        }

        # Finally, "merge" those two: family id -> subtype id.
        self.question_family_id_to_subtype_id_mapping = {
            key: self.question_subtype_to_id_mapping[value]
            for key, value in
            self.question_family_id_to_subtype_mapping.items()
        }

        # Get the absolute path.
        self.data_folder = os.path.expanduser(self.config['data_folder'])

        # Get split.
        split = get_value_from_dictionary(
            'split', self.config,
            "training | validation | test | cogent_a_training | cogent_a_validation | cogent_b_validation"
            .split(" | "))

        # Set split-dependent data.
        if split == 'training':
            # Training split folder and file with data question.
            data_file = os.path.join(self.data_folder, "questions",
                                     'CLEVR_train_questions.json')
            self.split_image_folder = os.path.join(self.data_folder, "images",
                                                   "train")

        elif split == 'validation':
            # Validation split folder and file with data question.
            data_file = os.path.join(self.data_folder, "questions",
                                     'CLEVR_val_questions.json')
            self.split_image_folder = os.path.join(self.data_folder, "images",
                                                   "val")

        elif split == 'test':
            # Test split folder and file with data question.
            data_file = os.path.join(self.data_folder, "questions",
                                     'CLEVR_test_questions.json')
            self.split_image_folder = os.path.join(self.data_folder, "images",
                                                   "test")

        else:  # cogent splits are accepted by the config check but not implemented yet.
            raise ConfigurationError(
                "Split {} not supported yet".format(split))

        # Load dataset.
        self.dataset = self.load_dataset(data_file)

        # Display exemplary sample.
        i = 0
        sample = self.dataset[i]
        # Check if this is a test set (no answers available) and fill placeholders.
        if "answer" not in sample.keys():
            sample["answer"] = "<UNK>"
            sample[self.key_question_type_ids] = -1
            sample[self.key_question_type_names] = "<UNK>"
        else:
            sample[
                self.
                key_question_type_ids] = self.question_family_id_to_subtype_id_mapping[
                    sample["question_family_index"]]
            sample[
                self.
                key_question_type_names] = self.question_family_id_to_subtype_mapping[
                    sample["question_family_index"]]

        self.logger.info(
            "Exemplary sample {} ({}):\n  question_type: {} ({})\n  image_ids: {}\n  question: {}\n  answer: {}"
            .format(i, sample["question_index"],
                    sample[self.key_question_type_ids],
                    sample[self.key_question_type_names],
                    sample["image_filename"], sample["question"],
                    sample["answer"]))