Example 1
def save_hdf5(data_fp, data):
    # Convert the dataframe into a dict of numpy arrays, then write one
    # HDF5 dataset per column plus a dataset holding the column names.
    numpy_dataset = to_numpy_dataset(data)
    with upload_h5(data_fp) as h5_file:
        h5_file.create_dataset(HDF5_COLUMNS_KEY,
                               data=np.array(data.columns.values, dtype="S"))
        for column in data.columns:
            h5_file.create_dataset(column, data=numpy_dataset[column])
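
For orientation, here is a minimal, self-contained sketch of what save_hdf5 does, written directly against h5py. to_numpy_dataset, upload_h5, and HDF5_COLUMNS_KEY are helpers from the surrounding codebase, so the h5py.File usage and the "columns" key below are assumptions for illustration, not the actual implementations.

import h5py
import numpy as np
import pandas as pd

def save_hdf5_sketch(data_fp, data):
    # assumption: upload_h5 behaves like h5py.File(path, "w") and
    # HDF5_COLUMNS_KEY is simply the string "columns"
    with h5py.File(data_fp, "w") as h5_file:
        # store the column names so the frame can be reconstructed later
        h5_file.create_dataset("columns",
                               data=np.array(data.columns.values, dtype="S"))
        # one dataset per column
        for column in data.columns:
            h5_file.create_dataset(column, data=data[column].to_numpy())

save_hdf5_sketch("example.h5",
                 pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
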
Example 2
    def add_feature_data(
            feature,
            input_df,
            proc_df,
            metadata,
            preprocessing_parameters,
            backend
    ):
        # Per-feature preprocessing settings override the global defaults.
        in_memory = preprocessing_parameters['in_memory']
        if PREPROCESSING in feature and 'in_memory' in feature[PREPROCESSING]:
            in_memory = feature[PREPROCESSING]['in_memory']

        num_processes = preprocessing_parameters['num_processes']
        if (PREPROCESSING in feature
                and 'num_processes' in feature[PREPROCESSING]):
            num_processes = feature[PREPROCESSING]['num_processes']

        src_path = None
        if hasattr(input_df, 'src'):
            src_path = os.path.dirname(os.path.abspath(input_df.src))

        num_images = len(input_df)
        if num_images == 0:
            raise ValueError('There are no images in the dataset provided.')

        first_path = next(iter(input_df[feature[COLUMN]]))

        if src_path is None and not os.path.isabs(first_path):
            raise ValueError(
                'Image file paths must be absolute when the dataset '
                'has no source path.'
            )

        first_path = get_abs_path(src_path, first_path)

        (
            should_resize,
            width,
            height,
            num_channels,
            user_specified_num_channels,
            first_image
        ) = ImageFeatureMixin._finalize_preprocessing_parameters(
            preprocessing_parameters, first_path
        )

        metadata[feature[NAME]][PREPROCESSING]['height'] = height
        metadata[feature[NAME]][PREPROCESSING]['width'] = width
        metadata[feature[NAME]][PREPROCESSING]['num_channels'] = num_channels

        read_image_and_resize = partial(
            ImageFeatureMixin._read_image_and_resize,
            img_width=width,
            img_height=height,
            should_resize=should_resize,
            num_channels=num_channels,
            resize_method=preprocessing_parameters['resize_method'],
            user_specified_num_channels=user_specified_num_channels
        )

        if in_memory:
            # Number of processes to run in parallel for preprocessing
            metadata[feature[NAME]][PREPROCESSING]['num_processes'] = num_processes
            metadata[feature[NAME]]['reshape'] = (height, width, num_channels)

            # Use a process pool only when multiple processes were explicitly
            # requested or there is more than one image to process; otherwise
            # fall back to the single-process path below.
            if backend.supports_multiprocessing and (
                    num_processes > 1 or num_images > 1):
                all_file_paths = [get_abs_path(src_path, file_path)
                                  for file_path in input_df[feature[COLUMN]]]

                with Pool(num_processes) as pool:
                    logger.debug(
                        'Using {} processes for preprocessing images'.format(
                            num_processes
                        )
                    )
                    proc_df[feature[PROC_COLUMN]] = pool.map(
                        read_image_and_resize, all_file_paths)
            else:
                # If we're not running multiple processes and are only
                # processing one image, use this faster shortcut that
                # bypasses multiprocessing.Pool.map.
                logger.debug(
                    'No process pool initialized. Using internal process for preprocessing images'
                )

                proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
                    input_df[feature[COLUMN]],
                    lambda file_path: read_image_and_resize(get_abs_path(src_path, file_path))
                )
        else:
            backend.check_lazy_load_supported(feature)

            all_file_paths = [get_abs_path(src_path, file_path)
                              for file_path in input_df[feature[COLUMN]]]

            data_fp = backend.cache.get_cache_path(
                input_df.src, metadata.get(CHECKSUM), TRAINING
            )
            with upload_h5(data_fp) as h5_file:
                # TODO: add multiprocessing/multithreading in the future
                image_dataset = h5_file.create_dataset(
                    feature[PROC_COLUMN] + '_data',
                    (num_images, height, width, num_channels),
                    dtype=np.uint8
                )
                for i, filepath in enumerate(all_file_paths):
                    image_dataset[i, :height, :width, :] = (
                        read_image_and_resize(filepath)
                    )
                h5_file.flush()

            # Store only row indices; the image tensors are loaded lazily
            # from the HDF5 cache at training time.
            proc_df[feature[PROC_COLUMN]] = np.arange(num_images)
        return proc_df
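
The in-memory branch above leans on functools.partial to freeze the resize parameters so that multiprocessing.Pool.map only has to ship file paths to the workers. A minimal, self-contained sketch of that pattern; read_and_resize is a hypothetical stand-in for ImageFeatureMixin._read_image_and_resize:

import numpy as np
from functools import partial
from multiprocessing import Pool

def read_and_resize(path, img_width, img_height, num_channels):
    # stand-in: the real helper decodes and resizes the file at `path`;
    # here we just fabricate a blank image of the right shape
    return np.zeros((img_height, img_width, num_channels), dtype=np.uint8)

# bind the shared parameters once so workers only receive a path
read_fn = partial(read_and_resize, img_width=32, img_height=32, num_channels=3)

if __name__ == '__main__':
    paths = ['a.png', 'b.png', 'c.png']
    with Pool(2) as pool:
        images = pool.map(read_fn, paths)
    print(len(images), images[0].shape)  # 3 (32, 32, 3)
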
Example 3
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        # Per-feature preprocessing settings override the global defaults.
        in_memory = preprocessing_parameters['in_memory']
        if PREPROCESSING in feature and 'in_memory' in feature[PREPROCESSING]:
            in_memory = feature[PREPROCESSING]['in_memory']

        num_processes = preprocessing_parameters['num_processes']
        if (PREPROCESSING in feature
                and 'num_processes' in feature[PREPROCESSING]):
            num_processes = feature[PREPROCESSING]['num_processes']

        src_path = None
        if SRC in metadata:
            src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))

        num_images = len(input_df[feature[COLUMN]])
        if num_images == 0:
            raise ValueError('There are no images in the dataset provided.')

        first_img_entry = next(iter(input_df[feature[COLUMN]]))
        logger.debug('Detected image feature type is {}'.format(
            type(first_img_entry)))

        if not isinstance(first_img_entry, (str, np.ndarray)):
            raise ValueError(
                'Invalid image feature data type. Detected type is {}, '
                'expected either a string file path or a numpy array.'.format(
                    type(first_img_entry)))

        first_img_entry = get_image_from_path(src_path, first_img_entry)

        (should_resize, width, height, num_channels,
         user_specified_num_channels,
         first_image) = ImageFeatureMixin._finalize_preprocessing_parameters(
             preprocessing_parameters, first_img_entry, src_path,
             input_df[feature[COLUMN]])

        metadata[feature[NAME]][PREPROCESSING]['height'] = height
        metadata[feature[NAME]][PREPROCESSING]['width'] = width
        metadata[feature[NAME]][PREPROCESSING]['num_channels'] = num_channels

        read_image_and_resize = partial(
            ImageFeatureMixin._read_image_and_resize,
            img_width=width,
            img_height=height,
            should_resize=should_resize,
            num_channels=num_channels,
            resize_method=preprocessing_parameters['resize_method'],
            user_specified_num_channels=user_specified_num_channels)

        # TODO: alternatively use get_average_image() for unreachable images
        default_image = get_gray_default_image(height, width, num_channels)

        # check to see if the active backend can support lazy loading of
        # image features from the hdf5 cache.
        backend.check_lazy_load_supported(feature)

        if in_memory or skip_save_processed_input:
            # Number of processes to run in parallel for preprocessing
            metadata[feature[NAME]][PREPROCESSING]['num_processes'] = num_processes
            metadata[feature[NAME]]['reshape'] = (height, width, num_channels)

            # Use a process pool only when multiple processes were explicitly
            # requested or there is more than one image to process; otherwise
            # fall back to the single-process path below.
            if backend.supports_multiprocessing and (num_processes > 1
                                                     or num_images > 1):
                all_img_entries = [
                    get_abs_path(src_path, img_entry) if isinstance(
                        img_entry, str) else img_entry
                    for img_entry in input_df[feature[COLUMN]]
                ]

                with Pool(num_processes) as pool:
                    logger.debug(
                        'Using {} processes for preprocessing images'.format(
                            num_processes))
                    res = pool.map(read_image_and_resize, all_img_entries)
                    proc_df[feature[PROC_COLUMN]] = [
                        x if x is not None else default_image for x in res
                    ]
            else:
                # If we're not running multiple processes and are only
                # processing one image, use this faster shortcut that
                # bypasses multiprocessing.Pool.map.
                logger.debug(
                    'No process pool initialized. Using internal process for preprocessing images'
                )

                # helper function for handling a single image
                def _get_processed_image(img_store):
                    if isinstance(img_store, str):
                        res_single = read_image_and_resize(
                            get_abs_path(src_path, img_store))
                    else:
                        res_single = read_image_and_resize(img_store)
                    return res_single if res_single is not None else default_image

                proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
                    input_df[feature[COLUMN]], _get_processed_image)
        else:
            all_img_entries = [
                get_abs_path(src_path, img_entry) if isinstance(
                    img_entry, str) else img_entry
                for img_entry in input_df[feature[COLUMN]]
            ]

            data_fp = backend.cache.get_cache_path(metadata.get(SRC),
                                                   metadata.get(CHECKSUM),
                                                   TRAINING)
            with upload_h5(data_fp) as h5_file:
                # TODO: add multiprocessing/multithreading in the future
                image_dataset = h5_file.create_dataset(
                    feature[PROC_COLUMN] + '_data',
                    (num_images, height, width, num_channels),
                    dtype=np.uint8)
                for i, img_entry in enumerate(all_img_entries):
                    res = read_image_and_resize(img_entry)
                    image_dataset[i, :height, :width, :] = (
                        res if res is not None else default_image
                    )
                h5_file.flush()

            proc_df[feature[PROC_COLUMN]] = np.arange(num_images)
        return proc_df
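
Compared to Example 2, this version substitutes a gray default image whenever a read fails instead of raising. A minimal sketch of that fallback; get_gray_default_image is assumed here to return a flat mid-gray array, and read_image_maybe is a hypothetical reader that yields None for unreachable files:

import numpy as np

def get_gray_default_image_sketch(height, width, num_channels):
    # assumption: a flat mid-gray uint8 image in channels-last layout
    return np.full((height, width, num_channels), 128, dtype=np.uint8)

def read_image_maybe(path):
    # hypothetical reader: None signals an unreachable or corrupt image
    return None if path == 'missing.png' else np.zeros((4, 4, 3), np.uint8)

default_image = get_gray_default_image_sketch(4, 4, 3)
results = [read_image_maybe(p) for p in ['ok.png', 'missing.png']]
images = [x if x is not None else default_image for x in results]
print([img.mean() for img in images])  # 0.0 for the read, 128.0 for default
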
Example 4
    def add_feature_data(feature_config, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        set_default_value(feature_config[PREPROCESSING], "in_memory",
                          preprocessing_parameters["in_memory"])

        name = feature_config[NAME]
        column = input_df[feature_config[COLUMN]]

        src_path = None
        if SRC in metadata:
            src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))
        abs_path_column = backend.df_engine.map_objects(
            column,
            lambda row: get_abs_path(src_path, row)
            if isinstance(row, str) and not has_remote_protocol(row) else row,
        )

        (
            should_resize,
            width,
            height,
            num_channels,
            user_specified_num_channels,
        ) = ImageFeatureMixin._finalize_preprocessing_parameters(
            preprocessing_parameters, abs_path_column)

        metadata[name][PREPROCESSING]["height"] = height
        metadata[name][PREPROCESSING]["width"] = width
        metadata[name][PREPROCESSING]["num_channels"] = num_channels

        read_image_if_bytes_obj_and_resize = partial(
            ImageFeatureMixin._read_image_if_bytes_obj_and_resize,
            img_width=width,
            img_height=height,
            should_resize=should_resize,
            num_channels=num_channels,
            resize_method=preprocessing_parameters["resize_method"],
            user_specified_num_channels=user_specified_num_channels,
        )

        # TODO: alternatively use get_average_image() for unreachable images
        default_image = get_gray_default_image(num_channels, height, width)

        # check to see if the active backend can support lazy loading of
        # image features from the hdf5 cache.
        backend.check_lazy_load_supported(feature_config)

        in_memory = feature_config[PREPROCESSING]["in_memory"]
        if in_memory or skip_save_processed_input:
            metadata[name]["reshape"] = (num_channels, height, width)

            proc_col = backend.read_binary_files(
                abs_path_column, map_fn=read_image_if_bytes_obj_and_resize)
            proc_col = backend.df_engine.map_objects(
                proc_col,
                lambda row: row if row is not None else default_image)
            proc_df[feature_config[PROC_COLUMN]] = proc_col
        else:
            num_images = len(abs_path_column)

            data_fp = backend.cache.get_cache_path(wrap(metadata.get(SRC)),
                                                   metadata.get(CHECKSUM),
                                                   TRAINING)
            with upload_h5(data_fp) as h5_file:
                # TODO: add multiprocessing/multithreading in the future
                image_dataset = h5_file.create_dataset(
                    feature_config[PROC_COLUMN] + "_data",
                    (num_images, num_channels, height, width),
                    dtype=np.uint8)
                for i, img_entry in enumerate(abs_path_column):
                    res = read_image_if_bytes_obj_and_resize(img_entry)
                    # the dataset here is channels-first, so index the
                    # channel axis before height and width
                    image_dataset[i, :, :height, :width] = (
                        res if res is not None else default_image
                    )
                h5_file.flush()

            proc_df[feature_config[PROC_COLUMN]] = np.arange(num_images)
        return proc_df
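
When images are not kept in memory, the processed column holds only row indices (np.arange(num_images)); the tensors themselves live in the HDF5 cache and are fetched on demand. A minimal sketch of that read-back path, with hypothetical file and dataset names and the channels-first layout this example uses:

import h5py
import numpy as np

# write a tiny channels-first cache: 3 images, 3 channels, 8x8 pixels
with h5py.File('cache.h5', 'w') as h5_file:
    ds = h5_file.create_dataset('feature_data', (3, 3, 8, 8), dtype=np.uint8)
    for i in range(3):
        ds[i, :, :, :] = np.full((3, 8, 8), i, dtype=np.uint8)

row_indices = np.arange(3)  # what proc_df[feature[PROC_COLUMN]] would hold
with h5py.File('cache.h5', 'r') as h5_file:
    image = h5_file['feature_data'][row_indices[1]]  # loaded on demand
    print(image.shape, image.max())  # (3, 8, 8) 1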