def _get_data(self, file_url, dl_folder):
        """Download the dataset archive, extract it and tidy up `dl_folder`.

        The archive is downloaded only if a copy with the expected MD5
        digest is not already stored in `dl_folder`. After extraction the
        archive is deleted, the macOS metadata (`__MACOSX` folder and
        `.DS_Store` files) found under `ICUBWORLD28_PATH` is removed and,
        if the train/test folders are not yet in place, the content of the
        extracted subfolder is copied one level up.

        Parameters
        ----------
        file_url : str
            URL of the file to download.
        dl_folder : str
            Path to the folder where to store the downloaded file.

        Raises
        ------
        RuntimeError
            If the train or test folder is still missing after the
            extracted data has been moved.

        """
        f_dl = fm.join(dl_folder, 'iCubWorld28_128x128.zip?dl=1')
        if not fm.file_exist(f_dl) or md5(f_dl) != ICUBWORLD28_MD5:
            # Generate the full path to the downloaded file
            f_dl = dl_file(file_url, dl_folder, md5_digest=ICUBWORLD28_MD5)

        self.logger.info("Extracting files...")

        # Extract the content of downloaded file. The context manager
        # guarantees the archive handle is closed before removing the file
        with zipfile.ZipFile(f_dl, 'r') as zip_f:
            zip_f.extractall(dl_folder)
        # Remove downloaded file
        fm.remove_file(f_dl)

        # iCubWorld28 zip file contains a macosx private folder, clean it up
        macosx_path = fm.join(ICUBWORLD28_PATH, '__MACOSX')
        if fm.folder_exist(macosx_path):
            fm.remove_folder(macosx_path, force=True)

        # iCubWorld28 zip file contains macosx private files, clean them up
        for dirpath, _, filenames in os.walk(ICUBWORLD28_PATH):
            for fname in filenames:
                if fnmatch(fname, '.DS_Store'):
                    fm.remove_file(fm.join(dirpath, fname))

        # Now move all data to an upper folder if needed
        if not fm.folder_exist(self._train_path) \
                or not fm.folder_exist(self._test_path):
            # NOTE(review): assumes the first entry listed in `dl_folder`
            # is the folder extracted from the archive - confirm
            sub_d = fm.join(dl_folder, fm.listdir(dl_folder)[0])
            for e in fm.listdir(sub_d):
                e_full = fm.join(sub_d, e)  # Full path to current element
                try:  # Call copy_file or copy_folder when applicable
                    if fm.file_exist(e_full) is True:
                        fm.copy_file(e_full, dl_folder)
                    elif fm.folder_exist(e_full) is True:
                        fm.copy_folder(e_full, fm.join(dl_folder, e))
                except Exception as err:
                    # Best effort: a failed copy is not fatal by itself as
                    # the availability of the data is verified right below.
                    # Unlike a bare `except`, this does not swallow
                    # KeyboardInterrupt/SystemExit.
                    self.logger.info(
                        "Unable to copy {:}: {:}".format(e_full, err))

            # Check that the main dataset file is now in the correct folder
            if not fm.folder_exist(self._train_path) \
                    or not fm.folder_exist(self._test_path):
                raise RuntimeError("dataset main file not available!")

            # The subdirectory can now be removed
            fm.remove_folder(sub_d, force=True)
Beispiel #2
0
    def _load_files(self,
                    dir_path,
                    img_w,
                    img_h,
                    img_c,
                    img_ext,
                    label_re=None,
                    load_data=True):
        """Load any file with given extension inside input folder.

        Parameters
        ----------
        dir_path : str
            Path to the folder to scan. The name of its last component is
            used to build the label of every loaded file.
        img_w, img_h, img_c
            Containers collecting width, height and number of bands of
            each loaded image. They must expose an `append` method
            returning the extended container (presumably CArray).
        img_ext : str
            Extension of the files to load. It is compared against the
            lowercased extension of each file.
        label_re : str or None, optional
            If not None, the label is the first match of this regular
            expression within the folder name. Default None, the folder
            name itself is used as label.
        load_data : bool, optional
            If True (default) the pixel data of each image is stored,
            otherwise only the path to the image file.

        Returns
        -------
        patterns, labels
            One row (pattern) and one label for each loaded file,
            both None if no file with the given extension is found.
        img_w, img_h, img_c
            Input containers extended with the properties of the
            loaded images.

        Raises
        ------
        AttributeError
            If `label_re` is given but does not match the folder name.

        """
        # Folders/files will be loaded in alphabetical order
        files_list = sorted(fm.listdir(dir_path))

        # Consider only the directory name to set the label
        dir_name = fm.split(dir_path)[1]

        # Placeholder for patterns/labels CArray
        patterns = None
        labels = None
        for file_name in files_list:

            # Load only files of the specified format
            if fm.splitext(file_name)[1].lower() != img_ext:
                continue

            # Full path to image file
            file_path = fm.join(dir_path, file_name)

            # Opening image in lazy mode (to verify dimensions etc.).
            # The context manager releases the file handle as soon as the
            # needed data has been read, avoiding one leaked descriptor
            # for each image in the folder
            with Image.open(file_path) as img:
                # Storing image dimensions...
                img_w = img_w.append(img.width)
                img_h = img_h.append(img.height)
                img_c = img_c.append(len(img.getbands()))

                # If load_data is True, store features, else store path
                if load_data is True:
                    # Storing image as a 2D CArray
                    array_img = CArray(img.getdata()).ravel().atleast_2d()
                else:
                    array_img = CArray([[file_path]])

            # Creating the 2D array patterns x features
            patterns = patterns.append(
                array_img, axis=0) if patterns is not None else array_img

            # label is the image's containing folder name or the re result
            c_id = dir_name if label_re is None \
                else re.search(label_re, dir_name).group(0)
            labels = labels.append(c_id) if labels is not None \
                else CArray(c_id)

            self.logger.debug("{:} has been loaded...".format(file_path))

        return patterns, labels, img_w, img_h, img_c
Beispiel #3
0
    def _explore_dir(self,
                     dir_path,
                     img_w,
                     img_h,
                     img_c,
                     img_ext,
                     label_re=None,
                     load_data=True):
        """Explore input directory and load files if leaf.

        A folder containing no subfolders is a leaf and its files are
        loaded through `_load_files`. Any other folder is explored
        recursively, subfolder by subfolder in alphabetical order, and
        the results are stacked. Files stored next to subfolders are
        ignored.

        Parameters
        ----------
        dir_path : str
            Path to the folder to explore.
        img_w, img_h, img_c
            Containers collecting the properties of the loaded images,
            forwarded to `_load_files` and threaded through the recursion.
        img_ext : str
            Extension of the files to load, forwarded to `_load_files`.
        label_re : str or None, optional
            Forwarded to `_load_files`. Default None.
        load_data : bool, optional
            Forwarded to `_load_files`. Default True.

        Returns
        -------
        patterns, labels
            Stacked patterns and labels of all the explored leaf folders,
            both None if nothing has been loaded.
        img_w, img_h, img_c
            Containers updated with the properties of the loaded images.

        """
        # Folders/files will be loaded in alphabetical order
        items_list = sorted(fm.listdir(dir_path))

        # A leaf folder is a folder with only files in it
        leaf = not any(
            fm.folder_exist(fm.join(dir_path, item)) for item in items_list)

        if leaf is True:  # Leaf directory, time to load files!
            return self._load_files(dir_path,
                                    img_w,
                                    img_h,
                                    img_c,
                                    img_ext,
                                    label_re=label_re,
                                    load_data=load_data)

        # Placeholder for patterns/labels CArray
        patterns = None
        labels = None
        for subdir in items_list:

            subdir_path = fm.join(dir_path, subdir)

            # Only consider folders (there could be also files)
            if not fm.folder_exist(subdir_path):
                continue

            # Explore next subfolder
            patterns_new, labels_new, img_w, img_h, img_c = self._explore_dir(
                subdir_path,
                img_w,
                img_h,
                img_c,
                img_ext,
                label_re=label_re,
                load_data=load_data)

            # A subfolder without any matching file returns None for both
            # patterns and labels: skip it, as None must not be appended
            # to the data collected so far
            if patterns_new is None:
                continue

            patterns = patterns.append(patterns_new, axis=0) \
                if patterns is not None else patterns_new
            labels = labels.append(labels_new) \
                if labels is not None else labels_new

        return patterns, labels, img_w, img_h, img_c
    def _load_files(self,
                    ds_path,
                    img_w,
                    img_h,
                    img_c,
                    img_ext,
                    load_data=True):
        """Load any file with given extension inside input folder.

        Parameters
        ----------
        ds_path : str
            Path to the folder to scan.
        img_w, img_h, img_c
            Containers collecting width, height and number of bands of
            each loaded image. They must expose an `append` method
            returning the extended container (presumably CArray).
        img_ext : str
            Extension of the files to load. It is compared against the
            lowercased extension of each file.
        load_data : bool, optional
            If True (default) the pixel data of each image is stored,
            otherwise only the path to the image file.

        Returns
        -------
        patterns
            One row (pattern) for each loaded file, None if no file
            with the given extension is found.
        img_w, img_h, img_c
            Input containers extended with the properties of the
            loaded images.

        """
        # Files will be loaded in alphabetical order
        files_list = sorted(fm.listdir(ds_path))

        # Placeholder for patterns CArray
        patterns = None
        for file_name in files_list:

            # Load only files of the specified format
            if fm.splitext(file_name)[1].lower() != img_ext:
                continue

            # Full path to image file
            file_path = fm.join(ds_path, file_name)

            # Opening image in lazy mode (to verify dimensions etc.).
            # The context manager releases the file handle as soon as the
            # needed data has been read, avoiding one leaked descriptor
            # for each image in the folder
            with Image.open(file_path) as img:
                # Storing image dimensions...
                img_w = img_w.append(img.width)
                img_h = img_h.append(img.height)
                img_c = img_c.append(len(img.getbands()))

                # If load_data is True, store features, else store path
                if load_data is True:
                    # Storing image as a 2D CArray
                    array_img = CArray(img.getdata()).ravel().atleast_2d()
                else:
                    array_img = CArray([[file_path]])

            # Creating the 2D array patterns x features
            patterns = patterns.append(
                array_img, axis=0) if patterns is not None else array_img

            self.logger.debug("{:} has been loaded...".format(file_path))

        return patterns, img_w, img_h, img_c
    def load(self,
             ds_type,
             day='day4',
             icub7=False,
             resize_shape=(128, 128),
             crop_shape=None,
             normalize=True):
        """Load the dataset.

        The pre-cropped version of the images is loaded, with size 128 x 128.
        An additional resize/crop shape could be passed as input if needed.

        Extra dataset attributes:
          - 'img_w', 'img_h': size of the images in pixels.
          - 'y_orig': CArray with the original labels of the objects.

        Parameters
        ----------
        ds_type : str
            Identifier of the dataset to download, either 'train' or 'test'.
        day : str, optional
            Acquisition day from which to load the images. Default 'day4'.
            The available options are: 'day1', 'day2', 'day3', 'day4'.
        icub7 : bool or int, optional
            If True, load a reduced dataset with 7 objects by
            taking the 3rd object for each category. Default False.
            If int, the Nth object for each category will be loaded.
        resize_shape : tuple or None, optional
            Images will be resized to (height, width) shape.
            Default (128, 128). If None, images are not resized.
        crop_shape : tuple or None, optional
            If a tuple, a crop of (height, width) shape will be extracted
            from the center of each image. Default None.
        normalize : bool, optional
            If True, images are normalized between 0-1. Default True.

        Returns
        -------
        CDataset
            Output dataset.

        Raises
        ------
        ValueError
            If `ds_type` is not 'train' or 'test', if the folder of the
            requested `day` does not exist or if no image is found for
            the requested configuration.

        """
        if ds_type == 'train':
            data_path = self._train_path
        elif ds_type == 'test':
            data_path = self._test_path
        else:
            raise ValueError("use ds_type = {'train', 'test'}.")

        day_path = fm.join(data_path, day)
        if not fm.folder_exist(day_path):
            raise ValueError("{:} not available.".format(day))

        self.logger.info(
            "Loading iCubWorld{:} {:} {:} dataset from {:}".format(
                '7' if icub7 else '28', day, ds_type, day_path))

        icub7 = 3 if icub7 is True else icub7  # Use the 3rd sub-obj by default

        x = None
        y_orig = []
        # (width, height) of the last image as stored on disk. Only used
        # to report the image size when no resize/crop is requested
        native_size = None
        for obj in sorted(fm.listdir(day_path)):  # Objects (cup, sponge, ..)

            obj_path = fm.join(day_path, obj)

            # Sub-objects (cup1, cup2, ...)
            for sub_obj in sorted(fm.listdir(obj_path)):

                if icub7 and sub_obj[-1] != str(icub7):
                    continue  # Load only the `icub7`th object

                self.logger.debug("Loading images for {:}".format(sub_obj))

                sub_obj_path = fm.join(obj_path, sub_obj)

                for f in sorted(fm.listdir(sub_obj_path)):

                    # The context manager releases the file handle once
                    # the pixels have been read, avoiding one leaked
                    # descriptor for each image of the dataset
                    with Image.open(fm.join(sub_obj_path, f)) as raw_img:

                        native_size = raw_img.size

                        img = raw_img
                        if resize_shape is not None:
                            img = resize_img(img, resize_shape)
                        if crop_shape is not None:
                            img = crop_img(img, crop_shape)

                        # Pixel data must be extracted before the file
                        # is closed, so keep this inside the `with` block
                        img = CArray(img.getdata(), dtype='uint8').ravel()

                    x = x.append(img, axis=0) if x is not None else img

                    y_orig.append(sub_obj)  # Label is given by sub-obj name

        if x is None:
            # Fail with an explicit message instead of an obscure error
            # on the operations below
            raise ValueError("no images found in {:}".format(day_path))

        # Create the int-based array of labels. Keep original labels in y_orig
        y_orig = CArray(y_orig)
        y = CArray(y_orig).unique(return_inverse=True)[1]

        if normalize is True:
            x /= 255.0

        # Size of images is the crop shape (if any) otherwise, the resize
        # shape (if any) otherwise, the size of the images as stored on disk
        if crop_shape is not None:
            img_h, img_w = crop_shape
        elif resize_shape is not None:
            img_h, img_w = resize_shape
        else:
            img_w, img_h = native_size  # PIL reports (width, height)

        header = CDatasetHeader(img_w=img_w, img_h=img_h, y_orig=y_orig)

        return CDataset(x, y, header=header)