Example #1
    def _finalize_preprocessing_parameters(
            preprocessing_parameters: dict,
            first_img_entry: Union[str, np.ndarray], src_path: str,
            input_feature_col: np.array):
        """
        Helper method to determine the height, width and number of channels for
        preprocessing the image data. This is achieved by looking at the
        parameters provided by the user. When some parameters are missing, we
        fall back on the first image (or a sample of images) in the dataset,
        under the assumption that all images in the data are of the same size
        and have the same number of channels.
        """
        first_image = read_image(first_img_entry)

        explicit_height_width = HEIGHT in preprocessing_parameters or WIDTH in preprocessing_parameters
        explicit_num_channels = NUM_CHANNELS in preprocessing_parameters

        inferred_sample = None
        if preprocessing_parameters[INFER_IMAGE_DIMENSIONS] and not (
                explicit_height_width and explicit_num_channels):
            sample_size = min(
                len(input_feature_col),
                preprocessing_parameters[INFER_IMAGE_SAMPLE_SIZE])
            sample = [
                read_image(get_image_from_path(src_path, img))
                for img in input_feature_col.head(sample_size)
            ]
            inferred_sample = [img for img in sample if img is not None]
            if len(inferred_sample) == 0:
                raise ValueError(
                    "No readable images in sample, image dimensions cannot be inferred"
                )

        should_resize = False
        if explicit_height_width:
            should_resize = True
            try:
                height = int(preprocessing_parameters[HEIGHT])
                width = int(preprocessing_parameters[WIDTH])
            except ValueError as e:
                raise ValueError('Image height and width must be set and have '
                                 'positive integer values: ' + str(e))
            if height <= 0 or width <= 0:
                raise ValueError(
                    'Image height and width must be positive integers')
        else:
            # User hasn't specified height and width.
            # Default to inferring from sample or first image.
            if preprocessing_parameters[INFER_IMAGE_DIMENSIONS]:
                should_resize = True

                height_avg = min(
                    sum(x.shape[0]
                        for x in inferred_sample) / len(inferred_sample),
                    preprocessing_parameters[INFER_IMAGE_MAX_HEIGHT])
                width_avg = min(
                    sum(x.shape[1]
                        for x in inferred_sample) / len(inferred_sample),
                    preprocessing_parameters[INFER_IMAGE_MAX_WIDTH])

                height, width = round(height_avg), round(width_avg)
                logger.debug("Inferring height: {0} and width: {1}".format(
                    height, width))
            elif first_image is not None:
                height, width = first_image.shape[0], first_image.shape[1]
            else:
                raise ValueError(
                    "Explicit image width/height are not set, infer_image_dimensions is false, "
                    "and first image cannot be read, so image dimensions are unknown"
                )

        if explicit_num_channels:
            # User specified num_channels in the model/feature config
            user_specified_num_channels = True
            num_channels = preprocessing_parameters[NUM_CHANNELS]
        else:
            user_specified_num_channels = False
            if preprocessing_parameters[INFER_IMAGE_DIMENSIONS]:
                # Treat the inferred channel count like a user-specified one so
                # downstream resizing converts images to this channel count.
                user_specified_num_channels = True
                num_channels = round(
                    sum(num_channels_in_image(x)
                        for x in inferred_sample) / len(inferred_sample))
            elif first_image is not None:
                num_channels = num_channels_in_image(first_image)
            else:
                raise ValueError(
                    "Explicit image num channels is not set, infer_image_dimensions is false, "
                    "and first image cannot be read, so image num channels is unknown"
                )

        if not isinstance(num_channels, int):
            raise ValueError('Number of image channels needs to be an integer')

        return (should_resize, width, height, num_channels,
                user_specified_num_channels, first_image)
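
The helper above is a static method on ImageFeatureMixin, so it can be exercised on its own. The following is a minimal sketch of such a call, assuming the constants (HEIGHT, WIDTH, NUM_CHANNELS, INFER_IMAGE_*) resolve to the usual preprocessing keys and that get_image_from_path is importable from the same module; the paths, column values, and parameter values are illustrative only.

import pandas as pd

# Illustrative only: paths and values are made up, and the constants are
# assumed to map to the corresponding lower-case preprocessing keys.
preprocessing_parameters = {
    HEIGHT: 128,
    WIDTH: 128,
    NUM_CHANNELS: 3,
    INFER_IMAGE_DIMENSIONS: True,
    INFER_IMAGE_SAMPLE_SIZE: 100,
    INFER_IMAGE_MAX_HEIGHT: 256,
    INFER_IMAGE_MAX_WIDTH: 256,
}
image_col = pd.Series(['cat.png', 'dog.png'])
src_path = '/data/images'

(should_resize, width, height, num_channels,
 user_specified_num_channels,
 first_image) = ImageFeatureMixin._finalize_preprocessing_parameters(
     preprocessing_parameters,
     get_image_from_path(src_path, image_col[0]),
     src_path,
     image_col)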
Example #2
    def add_feature_data(feature, input_df, proc_df, metadata,
                         preprocessing_parameters, backend,
                         skip_save_processed_input):
        in_memory = preprocessing_parameters['in_memory']
        if PREPROCESSING in feature and 'in_memory' in feature[PREPROCESSING]:
            in_memory = feature[PREPROCESSING]['in_memory']

        num_processes = preprocessing_parameters['num_processes']
        if PREPROCESSING in feature and 'num_processes' in feature[
                PREPROCESSING]:
            num_processes = feature[PREPROCESSING]['num_processes']

        src_path = None
        if SRC in metadata:
            src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))

        num_images = len(input_df[feature[COLUMN]])
        if num_images == 0:
            raise ValueError('There are no images in the dataset provided.')

        first_img_entry = next(iter(input_df[feature[COLUMN]]))
        logger.debug('Detected image feature type is {}'.format(
            type(first_img_entry)))

        if not isinstance(first_img_entry, (str, np.ndarray)):
            raise ValueError(
                'Invalid image feature data type. Detected type is {}; '
                'expected either a string file path or a numpy array.'.format(
                    type(first_img_entry)))

        first_img_entry = get_image_from_path(src_path, first_img_entry)

        (should_resize, width, height, num_channels,
         user_specified_num_channels,
         first_image) = ImageFeatureMixin._finalize_preprocessing_parameters(
             preprocessing_parameters, first_img_entry, src_path,
             input_df[feature[COLUMN]])

        metadata[feature[NAME]][PREPROCESSING]['height'] = height
        metadata[feature[NAME]][PREPROCESSING]['width'] = width
        metadata[feature[NAME]][PREPROCESSING]['num_channels'] = num_channels

        read_image_and_resize = partial(
            ImageFeatureMixin._read_image_and_resize,
            img_width=width,
            img_height=height,
            should_resize=should_resize,
            num_channels=num_channels,
            resize_method=preprocessing_parameters['resize_method'],
            user_specified_num_channels=user_specified_num_channels)

        # TODO: alternatively use get_average_image() for unreachable images
        default_image = get_gray_default_image(height, width, num_channels)

        # check to see if the active backend can support lazy loading of
        # image features from the hdf5 cache.
        backend.check_lazy_load_supported(feature)

        if in_memory or skip_save_processed_input:
            # Number of processes to run in parallel for preprocessing
            metadata[
                feature[NAME]][PREPROCESSING]['num_processes'] = num_processes
            metadata[feature[NAME]]['reshape'] = (height, width, num_channels)

            # Use a multiprocessing pool only when the backend supports it and
            # either multiple processes were requested or there is more than
            # one image to process; otherwise fall through to the
            # single-process path below.
            if backend.supports_multiprocessing and (num_processes > 1
                                                     or num_images > 1):
                all_img_entries = [
                    get_abs_path(src_path, img_entry) if isinstance(
                        img_entry, str) else img_entry
                    for img_entry in input_df[feature[COLUMN]]
                ]

                with Pool(num_processes) as pool:
                    logger.debug(
                        'Using {} processes for preprocessing images'.format(
                            num_processes))
                    res = pool.map(read_image_and_resize, all_img_entries)
                    proc_df[feature[PROC_COLUMN]] = [
                        x if x is not None else default_image for x in res
                    ]
            else:
                # If we're not using multiple processes and are only processing
                # one image, use this faster shortcut and bypass
                # multiprocessing.Pool.map
                logger.debug(
                    'No process pool initialized. Using internal process for preprocessing images'
                )

                # helper function for handling single image
                def _get_processed_image(img_store):
                    if isinstance(img_store, str):
                        res_single = read_image_and_resize(
                            get_abs_path(src_path, img_store))
                    else:
                        res_single = read_image_and_resize(img_store)
                    return res_single if res_single is not None else default_image

                proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
                    input_df[feature[COLUMN]], _get_processed_image)
        else:
            all_img_entries = [
                get_abs_path(src_path, img_entry) if isinstance(
                    img_entry, str) else img_entry
                for img_entry in input_df[feature[COLUMN]]
            ]

            data_fp = backend.cache.get_cache_path(metadata.get(SRC),
                                                   metadata.get(CHECKSUM),
                                                   TRAINING)
            with upload_h5(data_fp) as h5_file:
                # TODO: in the future, add multiprocessing/multithreading
                image_dataset = h5_file.create_dataset(
                    feature[PROC_COLUMN] + '_data',
                    (num_images, height, width, num_channels),
                    dtype=np.uint8)
                for i, img_entry in enumerate(all_img_entries):
                    res = read_image_and_resize(img_entry)
                    image_dataset[i, :height, :width, :] = \
                        res if res is not None else default_image
                h5_file.flush()

            proc_df[feature[PROC_COLUMN]] = np.arange(num_images)
        return proc_df
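
The in-memory branch above combines functools.partial with multiprocessing.Pool and substitutes a gray default image for unreadable entries. A stripped-down sketch of that pattern follows; the resize function here is a stand-in for ImageFeatureMixin._read_image_and_resize, not Ludwig's implementation, and the entries are made up.

from functools import partial
from multiprocessing import Pool

import numpy as np

def resize_stub(img_entry, img_width, img_height, num_channels):
    # Stand-in for the real read-and-resize: None simulates an unreadable image.
    if img_entry is None:
        return None
    return np.zeros((img_height, img_width, num_channels), dtype=np.uint8)

read_and_resize = partial(resize_stub, img_width=64, img_height=64, num_channels=3)
default_image = np.full((64, 64, 3), 128, dtype=np.uint8)  # gray fallback

if __name__ == '__main__':
    entries = ['a.png', None, 'b.png']  # illustrative image entries
    with Pool(2) as pool:
        results = pool.map(read_and_resize, entries)
    processed = [r if r is not None else default_image for r in results]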
Example #3
    def _finalize_preprocessing_parameters(
            preprocessing_parameters: dict,
            first_img_entry: Union[str, np.ndarray], src_path: str,
            input_feature_col: np.array):
        """
        Helper method to determine the height, width and number of channels for
        preprocessing the image data. This is achieved by looking at the
        parameters provided by the user. When some parameters are missing, we
        fall back on the first image (or a sample of images) in the dataset,
        under the assumption that all images in the data are of the same size
        and have the same number of channels.
        """
        first_image = read_image(first_img_entry)
        first_img_height = first_image.shape[0]
        first_img_width = first_image.shape[1]
        first_img_num_channels = num_channels_in_image(first_image)

        should_resize = False
        if (HEIGHT in preprocessing_parameters
                or WIDTH in preprocessing_parameters):
            should_resize = True
            try:
                height = int(preprocessing_parameters[HEIGHT])
                width = int(preprocessing_parameters[WIDTH])
            except ValueError as e:
                raise ValueError('Image height and width must be set and have '
                                 'positive integer values: ' + str(e))
            if height <= 0 or width <= 0:
                raise ValueError(
                    'Image height and width must be positive integers')
        else:
            # User hasn't specified height and width.
            # Default to first image, or infer from sample.
            height, width = first_img_height, first_img_width

            if preprocessing_parameters[INFER_IMAGE_DIMENSIONS]:
                should_resize = True
                sample_size = min(
                    len(input_feature_col),
                    preprocessing_parameters[INFER_IMAGE_SAMPLE_SIZE])
                sample_images = [
                    read_image(get_image_from_path(src_path, img))
                    for img in input_feature_col[:sample_size]
                ]
                # Drop images that could not be read so the shape averaging
                # below does not fail on None entries.
                sample_images = [img for img in sample_images if img is not None]

                if sample_images:
                    height_avg = min(
                        sum(x.shape[0]
                            for x in sample_images) / len(sample_images),
                        preprocessing_parameters[INFER_IMAGE_MAX_HEIGHT])
                    width_avg = min(
                        sum(x.shape[1]
                            for x in sample_images) / len(sample_images),
                        preprocessing_parameters[INFER_IMAGE_MAX_WIDTH])

                    height, width = round(height_avg), round(width_avg)

                    logger.debug("Inferring height: {0} and width: {1}".format(
                        height, width))
                else:
                    logger.warning(
                        "Sample set for inference is empty, defaulting to the "
                        "height and width of the first image")

        if NUM_CHANNELS in preprocessing_parameters:
            # User specified num_channels in the model/feature config
            user_specified_num_channels = True
            num_channels = preprocessing_parameters[NUM_CHANNELS]
        else:
            user_specified_num_channels = False
            num_channels = first_img_num_channels

        if not isinstance(num_channels, int):
            raise ValueError('Number of image channels needs to be an integer')

        return (should_resize, width, height, num_channels,
                user_specified_num_channels, first_image)
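
Both variants infer dimensions the same way: average the heights and widths over the sampled images, then cap the averages at the configured maxima. A self-contained sketch of that arithmetic with made-up sample shapes:

# Made-up sample shapes (height, width, channels) and caps, for illustration.
sample_shapes = [(120, 200, 3), (140, 180, 3), (160, 700, 3)]
max_height, max_width = 256, 256

height_avg = min(sum(s[0] for s in sample_shapes) / len(sample_shapes), max_height)
width_avg = min(sum(s[1] for s in sample_shapes) / len(sample_shapes), max_width)
height, width = round(height_avg), round(width_avg)
# heights average to 140, below the cap; widths average to 360, capped at 256,
# so the inferred size is 140 x 256.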