def save_hdf5(data_fp, data):
    numpy_dataset = to_numpy_dataset(data)
    with upload_h5(data_fp) as h5_file:
        h5_file.create_dataset(
            HDF5_COLUMNS_KEY,
            data=np.array(data.columns.values, dtype="S")
        )
        for column in data.columns:
            h5_file.create_dataset(column, data=numpy_dataset[column])
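# A minimal, self-contained sketch of the HDF5 layout save_hdf5 produces,
# written directly against h5py. upload_h5 and to_numpy_dataset are assumed
# to be thin wrappers over h5py.File and a DataFrame -> {column: ndarray}
# conversion; the "columns" key stands in for HDF5_COLUMNS_KEY, whose value
# is not shown here.
import h5py
import numpy as np
import pandas as pd

def _save_hdf5_layout_sketch(data_fp="example.hdf5"):
    df = pd.DataFrame({"price": [1.0, 2.5], "label": [0, 1]})
    with h5py.File(data_fp, "w") as h5_file:
        # column names stored as fixed-width byte strings, mirroring
        # np.array(data.columns.values, dtype="S") above
        h5_file.create_dataset(
            "columns", data=np.array(df.columns.values, dtype="S")
        )
        # one dataset per column, mirroring the loop in save_hdf5
        for column in df.columns:
            h5_file.create_dataset(column, data=df[column].to_numpy())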
def add_feature_data(
        feature,
        input_df,
        proc_df,
        metadata,
        preprocessing_parameters,
        backend
):
    in_memory = preprocessing_parameters['in_memory']
    if PREPROCESSING in feature and 'in_memory' in feature[PREPROCESSING]:
        in_memory = feature[PREPROCESSING]['in_memory']

    num_processes = preprocessing_parameters['num_processes']
    if PREPROCESSING in feature and 'num_processes' in feature[PREPROCESSING]:
        num_processes = feature[PREPROCESSING]['num_processes']

    src_path = None
    if hasattr(input_df, 'src'):
        src_path = os.path.dirname(os.path.abspath(input_df.src))

    num_images = len(input_df)
    if num_images == 0:
        raise ValueError('There are no images in the dataset provided.')

    first_path = next(iter(input_df[feature[COLUMN]]))
    if src_path is None and not os.path.isabs(first_path):
        raise ValueError('Image file paths must be absolute')

    first_path = get_abs_path(src_path, first_path)

    (
        should_resize,
        width,
        height,
        num_channels,
        user_specified_num_channels,
        first_image
    ) = ImageFeatureMixin._finalize_preprocessing_parameters(
        preprocessing_parameters, first_path
    )

    metadata[feature[NAME]][PREPROCESSING]['height'] = height
    metadata[feature[NAME]][PREPROCESSING]['width'] = width
    metadata[feature[NAME]][PREPROCESSING]['num_channels'] = num_channels

    read_image_and_resize = partial(
        ImageFeatureMixin._read_image_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters['resize_method'],
        user_specified_num_channels=user_specified_num_channels
    )

    if in_memory:
        # Number of processes to run in parallel for preprocessing
        metadata[feature[NAME]][PREPROCESSING]['num_processes'] = num_processes
        metadata[feature[NAME]]['reshape'] = (height, width, num_channels)

        # Split the dataset into pools only if multiple processes were
        # explicitly requested or there is more than one input image.
        if backend.supports_multiprocessing and (
                num_processes > 1 or num_images > 1):
            all_file_paths = [
                get_abs_path(src_path, file_path)
                for file_path in input_df[feature[COLUMN]]
            ]

            with Pool(num_processes) as pool:
                logger.debug(
                    'Using {} processes for preprocessing images'.format(
                        num_processes
                    )
                )
                proc_df[feature[PROC_COLUMN]] = pool.map(
                    read_image_and_resize, all_file_paths
                )
        else:
            # If we're not running multiple processes and are only
            # processing one image, use this faster shortcut and bypass
            # multiprocessing.Pool.map.
            logger.debug(
                'No process pool initialized. Using internal process '
                'for preprocessing images'
            )
            proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
                input_df[feature[COLUMN]],
                lambda file_path: read_image_and_resize(
                    get_abs_path(src_path, file_path)
                )
            )
    else:
        backend.check_lazy_load_supported(feature)

        all_file_paths = [
            get_abs_path(src_path, file_path)
            for file_path in input_df[feature[COLUMN]]
        ]

        data_fp = backend.cache.get_cache_path(
            input_df.src, metadata.get(CHECKSUM), TRAINING
        )
        with upload_h5(data_fp) as h5_file:
            # TODO future: add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature[PROC_COLUMN] + '_data',
                (num_images, height, width, num_channels),
                dtype=np.uint8
            )
            for i, filepath in enumerate(all_file_paths):
                image_dataset[i, :height, :width, :] = (
                    read_image_and_resize(filepath)
                )
            h5_file.flush()

        proc_df[feature[PROC_COLUMN]] = np.arange(num_images)

    return proc_df
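# A minimal sketch of the fan-out pattern used above: bind the fixed
# per-image parameters once with functools.partial, then map the resulting
# one-argument callable over all paths with a process pool. resize_to_square
# and the 64-pixel target are hypothetical stand-ins for
# ImageFeatureMixin._read_image_and_resize.
from functools import partial
from multiprocessing import Pool

def resize_to_square(path, img_size):
    # placeholder for real image loading and resizing
    return (path, img_size)

def _pool_map_sketch(paths, num_processes=2):
    read_fn = partial(resize_to_square, img_size=64)
    with Pool(num_processes) as pool:
        # results come back in input order, one image per worker task
        return pool.map(read_fn, paths)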
def add_feature_data(
        feature,
        input_df,
        proc_df,
        metadata,
        preprocessing_parameters,
        backend,
        skip_save_processed_input
):
    in_memory = preprocessing_parameters['in_memory']
    if PREPROCESSING in feature and 'in_memory' in feature[PREPROCESSING]:
        in_memory = feature[PREPROCESSING]['in_memory']

    num_processes = preprocessing_parameters['num_processes']
    if PREPROCESSING in feature and 'num_processes' in feature[PREPROCESSING]:
        num_processes = feature[PREPROCESSING]['num_processes']

    src_path = None
    if SRC in metadata:
        src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))

    num_images = len(input_df[feature[COLUMN]])
    if num_images == 0:
        raise ValueError('There are no images in the dataset provided.')

    first_img_entry = next(iter(input_df[feature[COLUMN]]))
    logger.debug(
        'Detected image feature type is {}'.format(type(first_img_entry))
    )

    if not isinstance(first_img_entry, str) \
            and not isinstance(first_img_entry, np.ndarray):
        raise ValueError(
            'Invalid image feature data type. Detected type is {}, '
            'expected either string for file path or numpy array.'.format(
                type(first_img_entry))
        )

    first_img_entry = get_image_from_path(src_path, first_img_entry)

    (
        should_resize,
        width,
        height,
        num_channels,
        user_specified_num_channels,
        first_image
    ) = ImageFeatureMixin._finalize_preprocessing_parameters(
        preprocessing_parameters,
        first_img_entry,
        src_path,
        input_df[feature[COLUMN]]
    )

    metadata[feature[NAME]][PREPROCESSING]['height'] = height
    metadata[feature[NAME]][PREPROCESSING]['width'] = width
    metadata[feature[NAME]][PREPROCESSING]['num_channels'] = num_channels

    read_image_and_resize = partial(
        ImageFeatureMixin._read_image_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters['resize_method'],
        user_specified_num_channels=user_specified_num_channels
    )

    # TODO: alternatively use get_average_image() for unreachable images
    default_image = get_gray_default_image(height, width, num_channels)

    # Check whether the active backend supports lazy loading of image
    # features from the HDF5 cache.
    backend.check_lazy_load_supported(feature)

    if in_memory or skip_save_processed_input:
        # Number of processes to run in parallel for preprocessing
        metadata[feature[NAME]][PREPROCESSING]['num_processes'] = num_processes
        metadata[feature[NAME]]['reshape'] = (height, width, num_channels)

        # Split the dataset into pools only if multiple processes were
        # explicitly requested or there is more than one input image.
        if backend.supports_multiprocessing and (
                num_processes > 1 or num_images > 1):
            all_img_entries = [
                get_abs_path(src_path, img_entry)
                if isinstance(img_entry, str) else img_entry
                for img_entry in input_df[feature[COLUMN]]
            ]

            with Pool(num_processes) as pool:
                logger.debug(
                    'Using {} processes for preprocessing images'.format(
                        num_processes)
                )
                res = pool.map(read_image_and_resize, all_img_entries)
                proc_df[feature[PROC_COLUMN]] = [
                    x if x is not None else default_image for x in res
                ]
        else:
            # If we're not running multiple processes and are only
            # processing one image, use this faster shortcut and bypass
            # multiprocessing.Pool.map.
            logger.debug(
                'No process pool initialized. Using internal process '
                'for preprocessing images'
            )

            # helper function for handling a single image
            def _get_processed_image(img_store):
                if isinstance(img_store, str):
                    res_single = read_image_and_resize(
                        get_abs_path(src_path, img_store)
                    )
                else:
                    res_single = read_image_and_resize(img_store)
                return res_single if res_single is not None else default_image

            proc_df[feature[PROC_COLUMN]] = backend.df_engine.map_objects(
                input_df[feature[COLUMN]],
                _get_processed_image
            )
    else:
        all_img_entries = [
            get_abs_path(src_path, img_entry)
            if isinstance(img_entry, str) else img_entry
            for img_entry in input_df[feature[COLUMN]]
        ]

        data_fp = backend.cache.get_cache_path(
            metadata.get(SRC), metadata.get(CHECKSUM), TRAINING
        )
        with upload_h5(data_fp) as h5_file:
            # TODO future: add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature[PROC_COLUMN] + '_data',
                (num_images, height, width, num_channels),
                dtype=np.uint8
            )
            for i, img_entry in enumerate(all_img_entries):
                res = read_image_and_resize(img_entry)
                image_dataset[i, :height, :width, :] = (
                    res if res is not None else default_image
                )
            h5_file.flush()

        proc_df[feature[PROC_COLUMN]] = np.arange(num_images)

    return proc_df
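# Sketch of the unreachable-image fallback this revision introduces: any
# entry whose read fails (read_image_and_resize returns None) is replaced
# with a constant gray image so every row keeps the same shape. The helper
# below is an assumed approximation of get_gray_default_image, with 128 as
# a hypothetical fill value.
import numpy as np

def _gray_default_image_sketch(height, width, num_channels):
    # uint8 matches the dtype of the HDF5 image cache used above
    return np.full((height, width, num_channels), 128, dtype=np.uint8)

def _fill_failed_reads_sketch(images, height=4, width=4, num_channels=3):
    default = _gray_default_image_sketch(height, width, num_channels)
    # mirrors: [x if x is not None else default_image for x in res]
    return [img if img is not None else default for img in images]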
def add_feature_data(
        feature_config,
        input_df,
        proc_df,
        metadata,
        preprocessing_parameters,
        backend,
        skip_save_processed_input
):
    set_default_value(
        feature_config[PREPROCESSING],
        "in_memory",
        preprocessing_parameters["in_memory"]
    )

    name = feature_config[NAME]
    column = input_df[feature_config[COLUMN]]

    src_path = None
    if SRC in metadata:
        src_path = os.path.dirname(os.path.abspath(metadata.get(SRC)))

    abs_path_column = backend.df_engine.map_objects(
        column,
        lambda row: get_abs_path(src_path, row)
        if isinstance(row, str) and not has_remote_protocol(row)
        else row,
    )

    (
        should_resize,
        width,
        height,
        num_channels,
        user_specified_num_channels,
    ) = ImageFeatureMixin._finalize_preprocessing_parameters(
        preprocessing_parameters, abs_path_column
    )

    metadata[name][PREPROCESSING]["height"] = height
    metadata[name][PREPROCESSING]["width"] = width
    metadata[name][PREPROCESSING]["num_channels"] = num_channels

    read_image_if_bytes_obj_and_resize = partial(
        ImageFeatureMixin._read_image_if_bytes_obj_and_resize,
        img_width=width,
        img_height=height,
        should_resize=should_resize,
        num_channels=num_channels,
        resize_method=preprocessing_parameters["resize_method"],
        user_specified_num_channels=user_specified_num_channels,
    )

    # TODO: alternatively use get_average_image() for unreachable images
    default_image = get_gray_default_image(num_channels, height, width)

    # Check whether the active backend supports lazy loading of image
    # features from the HDF5 cache.
    backend.check_lazy_load_supported(feature_config)

    in_memory = feature_config[PREPROCESSING]["in_memory"]
    if in_memory or skip_save_processed_input:
        metadata[name]["reshape"] = (num_channels, height, width)

        proc_col = backend.read_binary_files(
            abs_path_column, map_fn=read_image_if_bytes_obj_and_resize
        )
        proc_col = backend.df_engine.map_objects(
            proc_col, lambda row: row if row is not None else default_image
        )
        proc_df[feature_config[PROC_COLUMN]] = proc_col
    else:
        num_images = len(abs_path_column)

        data_fp = backend.cache.get_cache_path(
            wrap(metadata.get(SRC)), metadata.get(CHECKSUM), TRAINING
        )
        with upload_h5(data_fp) as h5_file:
            # TODO future: add multiprocessing/multithreading
            image_dataset = h5_file.create_dataset(
                feature_config[PROC_COLUMN] + "_data",
                (num_images, num_channels, height, width),
                dtype=np.uint8,
            )
            for i, img_entry in enumerate(abs_path_column):
                res = read_image_if_bytes_obj_and_resize(img_entry)
                # The cache is channel-first here, matching the
                # (num_channels, height, width) shape declared above.
                image_dataset[i, :, :height, :width] = (
                    res if res is not None else default_image
                )
            h5_file.flush()

        proc_df[feature_config[PROC_COLUMN]] = np.arange(num_images)

    return proc_df
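# Sketch of how the lazy-load path above is meant to be consumed at training
# time: proc_df holds only row indices (np.arange(num_images)), while the
# pixels live in the HDF5 cache under feature_config[PROC_COLUMN] + "_data".
# The batch-fetch helper below is an assumption about the reader side, not
# the actual data loader of the surrounding codebase.
import h5py
import numpy as np

def _fetch_image_batch_sketch(data_fp, proc_column, row_indices):
    with h5py.File(data_fp, "r") as h5_file:
        # h5py fancy indexing requires sorted, unique indices
        idx = np.unique(np.asarray(row_indices))
        return h5_file[proc_column + "_data"][idx]  # (batch, C, H, W) uint8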