def process_zip(gcp_bucket, zipped_stack):

    start_dt = datetime.now()

    assert "gs://" in zipped_stack
    assert "gs://" in gcp_bucket

    # clean up the tmp directory
    try:
        shutil.rmtree(tmp_directory.as_posix())
    except FileNotFoundError:
        pass
    tmp_directory.mkdir()

    is_annotation = 'dmg' in zipped_stack

    stack_id = Path(zipped_stack).name.split('.')[0]
    split_strings = ['_8bit', '-mtrxdmg', '_dmg', '-dmg']
    for s in split_strings:
        stack_id = stack_id.split(s)[0]

    stack_dir = Path(tmp_directory, stack_id)

    if not is_annotation and gcp_utils.remote_folder_exists(
            os.path.join(gcp_bucket, 'processed-data'),
            '/'.join([stack_id] + ["images"])):
        print("{} has already been processed! Skipping...".format(
            os.path.join(stack_id, "images")))
    elif is_annotation and gcp_utils.remote_folder_exists(
            os.path.join(gcp_bucket, 'processed-data'),
            '/'.join([stack_id] + ["annotations"])):
        print("{} has already been processed! Skipping...".format(
            os.path.join(stack_id, "annotations")))
    else:
        os.system("gsutil -m cp -r '{}' '{}'".format(zipped_stack,
                                                     tmp_directory.as_posix()))

        os.system("7za x -y -o'{}' '{}'".format(
            stack_dir.as_posix(),
            Path(tmp_directory, Path(zipped_stack).name).as_posix()))

        os.remove(Path(tmp_directory, Path(zipped_stack).name).as_posix())

        unzipped_dir = next(stack_dir.iterdir())

        original_number_of_files_in_zip = len(list(unzipped_dir.iterdir()))

        temp_file_name = r'./temp.tif'
        for f in Path(unzipped_dir).iterdir():
            if f.name[-4:] != '.tif':
                # remove any non-image files
                os.remove(f.as_posix())
            else:
                # Old code to convert all images to greyscale (some already are and some aren't):
                #     Image.open(f).convert("L").save(f)
                # That raised an error for some images, apparently related to the image metadata
                # or a compression setting:
                #     return encoder(mode, *args + extra)
                #     TypeError: argument 5 should be a str, not PosixPath
                # The workaround is to copy the image to a temporary file, convert the copy to
                # greyscale, delete the original, and rename the copy back to the original name.
                shutil.copyfile(f, temp_file_name)
                os.remove(f)
                Image.open(temp_file_name).convert("L").save(temp_file_name)
                os.rename(temp_file_name, f)

        shutil.move(
            unzipped_dir.as_posix(),
            Path(unzipped_dir.parent,
                 'annotations' if is_annotation else 'images').as_posix())

        # get the metadata file, if it exists
        os.system("gsutil -m cp -r '{}' '{}'".format(
            os.path.join(gcp_bucket, 'processed-data/', stack_id,
                         metadata_file_name),
            Path(tmp_directory, stack_id).as_posix()))
        try:
            with Path(tmp_directory, stack_id, metadata_file_name).open('r') as f:
                metadata = yaml.safe_load(f)
        except FileNotFoundError:
            metadata = {}

        metadata.update({
            'annotations' if is_annotation else 'images': {
                'gcp_bucket': gcp_bucket,
                'zipped_stack_file': zipped_stack,
                'created_datetime': datetime.now(pytz.UTC).strftime('%Y%m%dT%H%M%SZ'),
                'original_number_of_files_in_zip': original_number_of_files_in_zip,
                'number_of_images': len(list(
                    Path(unzipped_dir.parent,
                         'annotations' if is_annotation else 'images').iterdir())),
                'git_hash': git.Repo(search_parent_directories=True).head.object.hexsha
            },
            'elapsed_minutes': round((datetime.now() - start_dt).total_seconds() / 60, 1)
        })

        with Path(tmp_directory, stack_id, metadata_file_name).open('w') as f:
            yaml.safe_dump(metadata, f)

        os.system("gsutil -m cp -n -r '{}' '{}'".format(
            unzipped_dir.parent.as_posix(),
            os.path.join(gcp_bucket, 'processed-data/')))

        print('\n Ingest Raw Data Metadata:')
        print(metadata)
        print('\n')

    shutil.rmtree(tmp_directory.as_posix())
def process_zip(gcp_bucket, zipped_stack):

    start_dt = datetime.now()

    assert "gs://" in zipped_stack
    assert "gs://" in gcp_bucket

    # clean up the tmp directory
    try:
        shutil.rmtree(tmp_directory.as_posix())
    except FileNotFoundError:
        pass
    tmp_directory.mkdir()

    is_annotation = 'dmg' in zipped_stack

    stack_id = Path(zipped_stack).name.split('.')[0]
    split_strings = ['_8bit', '-', '_dmg']
    for s in split_strings:
        stack_id = stack_id.split(s)[0]

    stack_dir = Path(tmp_directory, stack_id)

    if not is_annotation and remote_folder_exists(
            os.path.join(gcp_bucket, 'processed-data', stack_id), "images"):
        print("{} has already been processed! Skipping...".format(
            os.path.join(stack_id, "images")))
    elif is_annotation and remote_folder_exists(
            os.path.join(gcp_bucket, 'processed-data', stack_id), "annotations"):
        print("{} has already been processed! Skipping...".format(
            os.path.join(stack_id, "annotations")))
    else:
        os.system("gsutil -m cp -r '{}' '{}'".format(zipped_stack,
                                                     tmp_directory.as_posix()))

        os.system("7za x -y -o'{}' '{}'".format(
            stack_dir.as_posix(),
            Path(tmp_directory, Path(zipped_stack).name).as_posix()))

        os.remove(Path(tmp_directory, Path(zipped_stack).name).as_posix())

        unzipped_dir = next(stack_dir.iterdir())

        original_number_of_files_in_zip = len(list(unzipped_dir.iterdir()))

        for f in Path(unzipped_dir).iterdir():
            if f.name[-4:] != '.tif':
                # remove any non-image files
                os.remove(f.as_posix())
            else:
                # convert all images to greyscale (some are already and some aren't)
                Image.open(f).convert("L").save(f)

        shutil.move(
            unzipped_dir.as_posix(),
            Path(unzipped_dir.parent,
                 'annotations' if is_annotation else 'images').as_posix())

        # get metadata file, if exists
        os.system("gsutil -m cp -r '{}' '{}'".format(
            os.path.join(gcp_bucket, 'processed-data/', stack_id,
                         metadata_file_name),
            Path(tmp_directory, stack_id).as_posix()))
        try:
            with Path(tmp_directory, stack_id, metadata_file_name).open('r') as f:
                metadata = yaml.safe_load(f)
        except FileNotFoundError:
            metadata = {}

        metadata.update({
            'annotations' if is_annotation else 'images': {
                'gcp_bucket': gcp_bucket,
                'zipped_stack_file': zipped_stack,
                'created_datetime': datetime.now(pytz.UTC).strftime('%Y%m%dT%H%M%SZ'),
                'original_number_of_files_in_zip': original_number_of_files_in_zip,
                'number_of_images': len(list(
                    Path(unzipped_dir.parent,
                         'annotations' if is_annotation else 'images').iterdir())),
                'git_hash': git.Repo(search_parent_directories=True).head.object.hexsha
            },
            'elapsed_minutes': round((datetime.now() - start_dt).total_seconds() / 60, 1)
        })

        with Path(tmp_directory, stack_id, metadata_file_name).open('w') as f:
            yaml.safe_dump(metadata, f)

        os.system("gsutil -m cp -r '{}' '{}'".format(
            unzipped_dir.parent.as_posix(),
            os.path.join(gcp_bucket, 'processed-data/')))

    shutil.rmtree(tmp_directory.as_posix())
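# ---------------------------------------------------------------------------
# Module-level setup assumed by the process_zip variants above (and reused by
# the prepare_dataset variants below). This is a minimal sketch inferred from
# how the names are used in those listings: the import layout, and the values
# of tmp_directory and metadata_file_name, are illustrative assumptions, not
# taken from the original project.
# ---------------------------------------------------------------------------
import os
import random
import shutil
from datetime import datetime
from pathlib import Path

import git
import numpy as np
import pytz
import yaml
from PIL import Image

import gcp_utils                              # assumed project-local helper module
from gcp_utils import remote_folder_exists    # assumed home of this helper function

tmp_directory = Path('./tmp')                 # scratch directory; path is illustrative
metadata_file_name = 'metadata.yaml'          # metadata file name; illustrative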
def prepare_dataset(gcp_bucket, config_file):
    """
    The ordering of the steps is important because it assumes a certain
    directory structure is progressively created!
    """

    start_dt = datetime.now()

    with Path(config_file).open('r') as f:
        dataset_config = yaml.safe_load(f)['dataset_config']

    dataset_id = Path(config_file).name.split('.')[0]

    assert "gs://" in gcp_bucket

    # clean up the tmp directory
    try:
        shutil.rmtree(tmp_directory.as_posix())
    except FileNotFoundError:
        pass
    tmp_directory.mkdir()

    processed_data_remote_source = os.path.join(gcp_bucket, 'processed-data')
    processed_data_local_dir = Path(tmp_directory, 'processed-data')
    processed_data_local_dir.mkdir()

    data_prep_local_dir = Path(tmp_directory, 'preparing')
    data_prep_local_dir.mkdir()

    prepared_dataset_local_dir = Path(tmp_directory, 'datasets')
    prepared_dataset_local_dir.mkdir(parents=True)

    prepared_dataset_remote_dest = os.path.join(gcp_bucket, 'datasets')

    with Path(prepared_dataset_local_dir, 'config.yaml').open('w') as f:
        yaml.safe_dump({'dataset_config': dataset_config}, f)

    all_scans = []
    for _, scans in dataset_config['dataset_split'].items():
        all_scans += scans
    all_scans = sorted(set(all_scans))

    assert not remote_folder_exists(prepared_dataset_remote_dest, dataset_id)

    copy_processed_data_locally_if_missing(all_scans,
                                           processed_data_remote_source,
                                           processed_data_local_dir)

    copy_and_downsample_processed_data_to_preparation_if_missing(
        all_scans, processed_data_local_dir, data_prep_local_dir,
        dataset_config['stack_downsampling'])

    resize_and_crop(data_prep_local_dir, dataset_config['target_size'],
                    dataset_config['image_cropping'],
                    dataset_config['class_annotation_mapping'])

    # create_class_masks(data_prep_local_dir, dataset_config['class_annotation_mapping'])

    split_prepared_data(data_prep_local_dir, prepared_dataset_local_dir,
                        dataset_config['dataset_split'])

    metadata = {
        'gcp_bucket': gcp_bucket,
        'created_datetime': datetime.now(pytz.UTC).strftime('%Y%m%dT%H%M%SZ'),
        'number_of_images': {
            'train': len(list(
                Path(prepared_dataset_local_dir, 'train', 'images').iterdir())),
            'validation': len(list(
                Path(prepared_dataset_local_dir, 'validation', 'images').iterdir())),
        },
        'git_hash': git.Repo(search_parent_directories=True).head.object.hexsha,
        'original_config_filename': config_file,
        'elapsed_minutes': round((datetime.now() - start_dt).total_seconds() / 60, 1)
    }
    try:
        metadata['number_of_images']['test'] = len(list(
            Path(prepared_dataset_local_dir, 'test', 'images').iterdir()))
    except FileNotFoundError:
        pass  # does not necessarily have to be test data

    with Path(prepared_dataset_local_dir, metadata_file_name).open('w') as f:
        yaml.safe_dump(metadata, f)

    copy_dataset_to_remote_dest(prepared_dataset_local_dir,
                                prepared_dataset_remote_dest, dataset_id)

    shutil.rmtree(tmp_directory.as_posix())
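# ---------------------------------------------------------------------------
# Shape of the dataset_config mapping that both prepare_dataset variants load
# from config_file. The key names come from the code; every value shown below
# is a made-up illustration (the Ellipsis placeholders mark settings whose
# types are not visible in these listings).
# ---------------------------------------------------------------------------
example_dataset_config = {
    'dataset_split': {                 # split name -> list of processed stack ids
        'train': ['scan-001', 'scan-002'],
        'validation': ['scan-003'],
        'test': ['scan-004'],          # optional; the metadata code tolerates its absence
    },
    'stack_downsampling': ...,         # passed to copy_and_downsample_..._if_missing
    'target_size': ...,                # passed to resize_and_crop
    'image_cropping': ...,             # passed to resize_and_crop
    'class_annotation_mapping': ...,   # passed to resize_and_crop / create_class_masks
}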
def prepare_dataset(gcp_bucket, config_file, random_module_global_seed,
                    numpy_random_global_seed):
    """
    The ordering of the steps is important because it assumes a certain
    directory structure is progressively created!
    """

    # seed the global random generators if seeds were given; each seed must be
    # an int, or None (the default) for no seeding
    if random_module_global_seed is not None:
        random.seed(random_module_global_seed)
    if numpy_random_global_seed is not None:
        np.random.seed(numpy_random_global_seed)

    start_dt = datetime.now()

    with Path(config_file).open('r') as f:
        dataset_config = yaml.safe_load(f)['dataset_config']

    dataset_id = Path(config_file).name.split('.')[0]

    assert "gs://" in gcp_bucket

    # clean up the tmp directory
    try:
        shutil.rmtree(tmp_directory.as_posix())
    except FileNotFoundError:
        pass
    tmp_directory.mkdir()

    processed_data_remote_source = os.path.join(gcp_bucket, 'processed-data')
    processed_data_local_dir = Path(tmp_directory, 'processed-data')
    processed_data_local_dir.mkdir()

    data_prep_local_dir = Path(tmp_directory, 'preparing')
    data_prep_local_dir.mkdir()

    prepared_dataset_local_dir = Path(tmp_directory, 'datasets')
    prepared_dataset_local_dir.mkdir(parents=True)

    prepared_dataset_remote_dest = os.path.join(gcp_bucket, 'datasets')

    with Path(prepared_dataset_local_dir, 'config.yaml').open('w') as f:
        yaml.safe_dump({'dataset_config': dataset_config}, f)

    all_scans = []
    for _, scans in dataset_config['dataset_split'].items():
        all_scans += scans
    all_scans = sorted(set(all_scans))

    assert not remote_folder_exists(
        prepared_dataset_remote_dest, dataset_id,
        sample_file_name='config.yaml'), \
        ("Dataset already exists in the GCP bucket. Choose a new name in the "
         "dataset config used here. Do NOT modify or delete the existing "
         "dataset in the GCP bucket.")

    copy_processed_data_locally_if_missing(all_scans,
                                           processed_data_remote_source,
                                           processed_data_local_dir)

    copy_and_downsample_processed_data_to_preparation_if_missing(
        all_scans, processed_data_local_dir, data_prep_local_dir,
        dataset_config['stack_downsampling'])

    resize_and_crop(data_prep_local_dir, dataset_config['target_size'],
                    dataset_config['image_cropping'],
                    dataset_config['class_annotation_mapping'])

    create_class_masks(data_prep_local_dir,
                       dataset_config['class_annotation_mapping'])

    split_prepared_data(data_prep_local_dir, prepared_dataset_local_dir,
                        dataset_config['dataset_split'])

    metadata = {
        'gcp_bucket': gcp_bucket,
        'created_datetime': datetime.now(pytz.UTC).strftime('%Y%m%dT%H%M%SZ'),
        'number_of_images': {
            'train': len(list(
                Path(prepared_dataset_local_dir, 'train', 'images').iterdir())),
            'validation': len(list(
                Path(prepared_dataset_local_dir, 'validation', 'images').iterdir())),
        },
        'git_hash': git.Repo(search_parent_directories=True).head.object.hexsha,
        'original_config_filename': config_file,
        'elapsed_minutes': round((datetime.now() - start_dt).total_seconds() / 60, 1),
        'random-module-global-seed': random_module_global_seed,
        'numpy_random_global_seed': numpy_random_global_seed,
    }
    try:
        metadata['number_of_images']['test'] = len(list(
            Path(prepared_dataset_local_dir, 'test', 'images').iterdir()))
    except FileNotFoundError:
        pass  # does not necessarily have to be test data

    with Path(prepared_dataset_local_dir, metadata_file_name).open('w') as f:
        yaml.safe_dump(metadata, f)

    copy_dataset_to_remote_dest(prepared_dataset_local_dir,
                                prepared_dataset_remote_dest, dataset_id)

    print('\n Prepare Dataset Metadata:')
    print(metadata)
    print('\n')

    shutil.rmtree(tmp_directory.as_posix())
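# ---------------------------------------------------------------------------
# Hypothetical command-line wiring for the two entry points above. The
# original scripts' actual CLI is not shown in these listings; the subcommand
# and flag names below are guesses derived from the function parameters (and
# from the 'random-module-global-seed' metadata key).
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='command', required=True)

    zip_parser = subparsers.add_parser('process-zip')
    zip_parser.add_argument('--gcp-bucket', required=True)
    zip_parser.add_argument('--zipped-stack', required=True)

    prep_parser = subparsers.add_parser('prepare-dataset')
    prep_parser.add_argument('--gcp-bucket', required=True)
    prep_parser.add_argument('--config-file', required=True)
    prep_parser.add_argument('--random-module-global-seed', type=int, default=None)
    prep_parser.add_argument('--numpy-random-global-seed', type=int, default=None)

    args = parser.parse_args()
    if args.command == 'process-zip':
        process_zip(args.gcp_bucket, args.zipped_stack)
    else:
        prepare_dataset(args.gcp_bucket, args.config_file,
                        args.random_module_global_seed,
                        args.numpy_random_global_seed)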