def test_replace_plain():
    """Check that `replace_substring` rewrites literal matches and keeps nulls."""
    source = pa.array(['foo', 'food', None])
    replaced = pc.replace_substring(
        source,
        pattern='foo',
        replacement='bar',
    )
    expected = ['bar', 'bard', None]
    assert replaced.tolist() == expected
def _generate_examples(self, files, metadata_files, split_name): if not self.config.drop_metadata and metadata_files: split_metadata_files = metadata_files.get(split_name, []) image_empty_metadata = { k: None for k in self.info.features if k != "image" } last_checked_dir = None metadata_dir = None metadata_dict = None downloaded_metadata_file = None file_idx = 0 for original_file, downloaded_file_or_dir in files: if original_file is not None: _, original_file_ext = os.path.splitext(original_file) if original_file_ext.lower() in self.IMAGE_EXTENSIONS: # If the file is an image, and we've just entered a new directory, # find the nereast metadata file (by counting path segments) for the directory current_dir = os.path.dirname(original_file) if last_checked_dir is None or last_checked_dir != current_dir: last_checked_dir = current_dir metadata_file_candidates = [ ( os.path.relpath( original_file, os.path.dirname( metadata_file_candidate)), metadata_file_candidate, downloaded_metadata_file, ) for metadata_file_candidate, downloaded_metadata_file in split_metadata_files if metadata_file_candidate is not None # ignore metadata_files that are inside archives and not os.path.relpath( original_file, os.path.dirname(metadata_file_candidate) ).startswith("..") ] if metadata_file_candidates: _, metadata_file, downloaded_metadata_file = min( metadata_file_candidates, key=lambda x: count_path_segments(x[0])) with open(downloaded_metadata_file, "rb") as f: pa_metadata_table = paj.read_json(f) pa_file_name_array = pa_metadata_table[ "file_name"] pa_file_name_array = pc.replace_substring( pa_file_name_array, pattern="\\", replacement="/") pa_metadata_table = pa_metadata_table.drop( ["file_name"]) metadata_dir = os.path.dirname(metadata_file) metadata_dict = { file_name: image_metadata for file_name, image_metadata in zip( pa_file_name_array.to_pylist(), pa_table_to_pylist(pa_metadata_table)) } else: raise ValueError( f"One or several metadata.jsonl were found, but not in the same 
directory or in a parent directory of {downloaded_file_or_dir}." ) if metadata_dir is not None and downloaded_metadata_file is not None: file_relpath = os.path.relpath( original_file, metadata_dir) file_relpath = file_relpath.replace("\\", "/") if file_relpath not in metadata_dict: raise ValueError( f"Image at {file_relpath} doesn't have metadata in {downloaded_metadata_file}." ) image_metadata = metadata_dict[file_relpath] else: raise ValueError( f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_file_or_dir}." ) yield file_idx, { **image_empty_metadata, "image": downloaded_file_or_dir, **image_metadata, } file_idx += 1 else: for downloaded_dir_file in downloaded_file_or_dir: _, downloaded_dir_file_ext = os.path.splitext( downloaded_dir_file) if downloaded_dir_file_ext.lower( ) in self.IMAGE_EXTENSIONS: current_dir = os.path.dirname(downloaded_dir_file) if last_checked_dir is None or last_checked_dir != current_dir: last_checked_dir = current_dir metadata_file_candidates = [ ( os.path.relpath( downloaded_dir_file, os.path.dirname( downloaded_metadata_file)), metadata_file_candidate, downloaded_metadata_file, ) for metadata_file_candidate, downloaded_metadata_file in split_metadata_files if metadata_file_candidate is None # ignore metadata_files that are not inside archives and not os.path.relpath( downloaded_dir_file, os.path.dirname( downloaded_metadata_file) ).startswith("..") ] if metadata_file_candidates: _, metadata_file, downloaded_metadata_file = min( metadata_file_candidates, key=lambda x: count_path_segments(x[0] )) with open(downloaded_metadata_file, "rb") as f: pa_metadata_table = paj.read_json(f) pa_file_name_array = pa_metadata_table[ "file_name"] pa_file_name_array = pc.replace_substring( pa_file_name_array, pattern="\\", replacement="/") pa_metadata_table = pa_metadata_table.drop( ["file_name"]) metadata_dir = os.path.dirname( downloaded_metadata_file) metadata_dict = { file_name: 
image_metadata for file_name, image_metadata in zip( pa_file_name_array.to_pylist(), pa_table_to_pylist( pa_metadata_table)) } else: raise ValueError( f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." ) if metadata_dir is not None and downloaded_metadata_file is not None: downloaded_dir_file_relpath = os.path.relpath( downloaded_dir_file, metadata_dir) downloaded_dir_file_relpath = downloaded_dir_file_relpath.replace( "\\", "/") if downloaded_dir_file_relpath not in metadata_dict: raise ValueError( f"Image at {downloaded_dir_file_relpath} doesn't have metadata in {downloaded_metadata_file}." ) image_metadata = metadata_dict[ downloaded_dir_file_relpath] else: raise ValueError( f"One or several metadata.jsonl were found, but not in the same directory or in a parent directory of {downloaded_dir_file}." ) yield file_idx, { **image_empty_metadata, "image": downloaded_dir_file, **image_metadata, } file_idx += 1 else: file_idx = 0 for original_file, downloaded_file_or_dir in files: if original_file is not None: _, original_file_ext = os.path.splitext(original_file) if original_file_ext.lower() in self.IMAGE_EXTENSIONS: if self.config.drop_labels or metadata_files: yield file_idx, { "image": downloaded_file_or_dir, } else: yield file_idx, { "image": downloaded_file_or_dir, "label": os.path.basename( os.path.dirname(original_file)), } file_idx += 1 else: for downloaded_dir_file in downloaded_file_or_dir: _, downloaded_dir_file_ext = os.path.splitext( downloaded_dir_file) if downloaded_dir_file_ext.lower( ) in self.IMAGE_EXTENSIONS: if self.config.drop_labels or metadata_files: yield file_idx, { "image": downloaded_dir_file, } else: yield file_idx, { "image": downloaded_dir_file, "label": os.path.basename( os.path.dirname(downloaded_dir_file)), } file_idx += 1