Example 1
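
All of the examples below rely on module-level imports, constants, and project helpers that are not shown in this excerpt. A minimal sketch of what they appear to assume (the names are taken from the code; the concrete values are only placeholders):

import os
import random
import shutil
from datetime import datetime
from pathlib import Path

import git
import numpy as np
import pytz
import yaml
from PIL import Image

# Assumed module-level constants (names appear in the functions below;
# the values here are placeholders, not taken from the original project)
tmp_directory = Path('./tmp')          # local scratch directory
metadata_file_name = 'metadata.yaml'   # per-stack / per-dataset metadata file

# Project-specific helpers such as gcp_utils.remote_folder_exists,
# remote_folder_exists, copy_processed_data_locally_if_missing,
# copy_and_downsample_processed_data_to_preparation_if_missing,
# resize_and_crop, create_class_masks, split_prepared_data, and
# copy_dataset_to_remote_dest are assumed to be importable from the
# surrounding project and are not reproduced here.
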
def process_zip(gcp_bucket, zipped_stack):

    start_dt = datetime.now()

    assert "gs://" in zipped_stack
    assert "gs://" in gcp_bucket

    # clean up the tmp directory
    try:
        shutil.rmtree(tmp_directory.as_posix())
    except FileNotFoundError:
        pass
    tmp_directory.mkdir()

    is_annotation = 'dmg' in zipped_stack

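    # derive the stack id from the archive file name by stripping the
    # extension and any known suffixes (e.g. '_8bit', '-dmg')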
    stack_id = Path(zipped_stack).name.split('.')[0]
    split_strings = ['_8bit', '-mtrxdmg', '_dmg', '-dmg']
    for s in split_strings:
        stack_id = stack_id.split(s)[0]

    stack_dir = Path(tmp_directory, stack_id)

    if not is_annotation and gcp_utils.remote_folder_exists(
            os.path.join(gcp_bucket, 'processed-data'),
            '/'.join([stack_id] + ["images"])):

        print("{} has already been processed! Skipping...".format(
            os.path.join(stack_id, "images")))

    elif is_annotation and gcp_utils.remote_folder_exists(
            os.path.join(gcp_bucket, 'processed-data'),
            '/'.join([stack_id] + ["annotations"])):

        print("{} has already been processed! Skipping...".format(
            os.path.join(stack_id, "annotations")))

    else:

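        # copy the zipped stack from the GCS bucket into the local tmp
        # directory, extract it with 7-Zip, then delete the archive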
        os.system("gsutil -m cp -r '{}' '{}'".format(zipped_stack,
                                                     tmp_directory.as_posix()))

        os.system("7za x -y -o'{}' '{}'".format(
            stack_dir.as_posix(),
            Path(tmp_directory,
                 Path(zipped_stack).name).as_posix()))
        os.remove(Path(tmp_directory, Path(zipped_stack).name).as_posix())
        unzipped_dir = next(stack_dir.iterdir())

        original_number_of_files_in_zip = len(list(unzipped_dir.iterdir()))
        temp_file_name = r'./temp.tif'
        for f in Path(unzipped_dir).iterdir():
            if f.name[-4:] != '.tif':
                # remove any non-image files
                os.remove(f.as_posix())
            else:
                # Older code converted each image to greyscale in place
                # (some images are already greyscale and some are not):
                #     Image.open(f).convert("L").save(f)
                # That version failed for some images with:
                #     return encoder(mode, *args + extra)
                #     TypeError: argument 5 should be a str, not PosixPath
                # The error appears to be related to the image metadata or a
                # compression setting. As a workaround, copy the image to a
                # temporary file, convert it there, and rename it back over
                # the original.
                shutil.copyfile(f, temp_file_name)
                os.remove(f)
                Image.open(temp_file_name).convert("L").save(temp_file_name)
                os.rename(temp_file_name, f)

        shutil.move(
            unzipped_dir.as_posix(),
            Path(unzipped_dir.parent,
                 'annotations' if is_annotation else 'images').as_posix())

        # get metadata file, if exists
        os.system("gsutil -m cp -r '{}' '{}'".format(
            os.path.join(gcp_bucket, 'processed-data/', stack_id,
                         metadata_file_name),
            Path(tmp_directory, stack_id).as_posix()))

        try:
            with Path(tmp_directory, stack_id,
                      metadata_file_name).open('r') as f:
                metadata = yaml.safe_load(f)
        except FileNotFoundError:
            metadata = {}

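        # merge the new images/annotations entry into any existing metadata
        # for this stack, and record the total processing time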
        metadata.update({
            'annotations' if is_annotation else 'images': {
                'gcp_bucket':
                gcp_bucket,
                'zipped_stack_file':
                zipped_stack,
                'created_datetime':
                datetime.now(pytz.UTC).strftime('%Y%m%dT%H%M%SZ'),
                'original_number_of_files_in_zip':
                original_number_of_files_in_zip,
                'number_of_images':
                len(
                    list(
                        Path(unzipped_dir.parent, 'annotations'
                             if is_annotation else 'images').iterdir())),
                'git_hash':
                git.Repo(search_parent_directories=True).head.object.hexsha
            },
            'elapsed_minutes':
            round((datetime.now() - start_dt).total_seconds() / 60, 1)
        })

        with Path(tmp_directory, stack_id, metadata_file_name).open('w') as f:
            yaml.safe_dump(metadata, f)

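        # upload the processed stack (images or annotations plus metadata)
        # back to the bucket; -n skips files that already exist remotely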
        os.system("gsutil -m cp -n -r '{}' '{}'".format(
            unzipped_dir.parent.as_posix(),
            os.path.join(gcp_bucket, 'processed-data/')))

        print('\n Ingest Raw Data Metadata:')
        print(metadata)
        print('\n')

        shutil.rmtree(tmp_directory.as_posix())
Example 2
def process_zip(gcp_bucket, zipped_stack):

    start_dt = datetime.now()

    assert "gs://" in zipped_stack
    assert "gs://" in gcp_bucket

    # clean up the tmp directory
    try:
        shutil.rmtree(tmp_directory.as_posix())
    except FileNotFoundError:
        pass
    tmp_directory.mkdir()

    is_annotation = 'dmg' in zipped_stack

    stack_id = Path(zipped_stack).name.split('.')[0]
    split_strings = ['_8bit', '-', '_dmg']
    for s in split_strings:
        stack_id = stack_id.split(s)[0]

    stack_dir = Path(tmp_directory, stack_id)

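    # skip this stack if its images/annotations folder already exists under
    # processed-data in the bucket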
    if not is_annotation and remote_folder_exists(
            os.path.join(gcp_bucket, 'processed-data', stack_id), "images"):

        print("{} has already been processed! Skipping...".format(
            os.path.join(stack_id, "images")))

    elif is_annotation and remote_folder_exists(
            os.path.join(gcp_bucket, 'processed-data', stack_id),
            "annotations"):

        print("{} has already been processed! Skipping...".format(
            os.path.join(stack_id, "annotations")))

    else:

        os.system("gsutil -m cp -r '{}' '{}'".format(zipped_stack,
                                                     tmp_directory.as_posix()))

        os.system("7za x -y -o'{}' '{}'".format(
            stack_dir.as_posix(),
            Path(tmp_directory,
                 Path(zipped_stack).name).as_posix()))
        os.remove(Path(tmp_directory, Path(zipped_stack).name).as_posix())
        unzipped_dir = next(stack_dir.iterdir())

        original_number_of_files_in_zip = len(list(unzipped_dir.iterdir()))

        for f in Path(unzipped_dir).iterdir():
            if f.name[-4:] != '.tif':
                # remove any non-image files
                os.remove(f.as_posix())
            else:
                # convert all images to greyscale (some are already and some aren't)
                Image.open(f).convert("L").save(f)

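        # rename the extracted folder to 'annotations' or 'images' before uploading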
        shutil.move(
            unzipped_dir.as_posix(),
            Path(unzipped_dir.parent,
                 'annotations' if is_annotation else 'images').as_posix())

        # get metadata file, if exists
        os.system("gsutil -m cp -r '{}' '{}'".format(
            os.path.join(gcp_bucket, 'processed-data/', stack_id,
                         metadata_file_name),
            Path(tmp_directory, stack_id).as_posix()))

        try:
            with Path(tmp_directory, stack_id,
                      metadata_file_name).open('r') as f:
                metadata = yaml.safe_load(f)
        except FileNotFoundError:
            metadata = {}

        metadata.update({
            'annotations' if is_annotation else 'images': {
                'gcp_bucket':
                gcp_bucket,
                'zipped_stack_file':
                zipped_stack,
                'created_datetime':
                datetime.now(pytz.UTC).strftime('%Y%m%dT%H%M%SZ'),
                'original_number_of_files_in_zip':
                original_number_of_files_in_zip,
                'number_of_images':
                len(
                    list(
                        Path(unzipped_dir.parent, 'annotations'
                             if is_annotation else 'images').iterdir())),
                'git_hash':
                git.Repo(search_parent_directories=True).head.object.hexsha
            },
            'elapsed_minutes':
            round((datetime.now() - start_dt).total_seconds() / 60, 1)
        })

        with Path(tmp_directory, stack_id, metadata_file_name).open('w') as f:
            yaml.safe_dump(metadata, f)

        os.system("gsutil -m cp -r '{}' '{}'".format(
            unzipped_dir.parent.as_posix(),
            os.path.join(gcp_bucket, 'processed-data/')))

        shutil.rmtree(tmp_directory.as_posix())
Example 3
def prepare_dataset(gcp_bucket, config_file):
    """
    The ordering of the steps matters: each step assumes the directory structure progressively created by the steps before it.
    """
    start_dt = datetime.now()

    with Path(config_file).open('r') as f:
        dataset_config = yaml.safe_load(f)['dataset_config']

    dataset_id = Path(config_file).name.split('.')[0]

    assert "gs://" in gcp_bucket

    # clean up the tmp directory
    try:
        shutil.rmtree(tmp_directory.as_posix())
    except FileNotFoundError:
        pass
    tmp_directory.mkdir()

    processed_data_remote_source = os.path.join(gcp_bucket, 'processed-data')
    processed_data_local_dir = Path(tmp_directory, 'processed-data')
    processed_data_local_dir.mkdir()

    data_prep_local_dir = Path(tmp_directory, 'preparing')
    data_prep_local_dir.mkdir()

    prepared_dataset_local_dir = Path(tmp_directory, 'datasets')
    prepared_dataset_local_dir.mkdir(parents=True)

    prepared_dataset_remote_dest = os.path.join(gcp_bucket, 'datasets')

    with Path(prepared_dataset_local_dir, 'config.yaml').open('w') as f:
        yaml.safe_dump({'dataset_config': dataset_config}, f)

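    # gather the unique, sorted set of scan ids referenced by any dataset split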
    all_scans = []
    for _, scans in dataset_config['dataset_split'].items():
        all_scans += scans
    all_scans = sorted(set(all_scans))

    assert not remote_folder_exists(prepared_dataset_remote_dest, dataset_id)

    copy_processed_data_locally_if_missing(all_scans,
                                           processed_data_remote_source,
                                           processed_data_local_dir)

    copy_and_downsample_processed_data_to_preparation_if_missing(
        all_scans, processed_data_local_dir, data_prep_local_dir,
        dataset_config['stack_downsampling'])

    resize_and_crop(data_prep_local_dir, dataset_config['target_size'],
                    dataset_config['image_cropping'],
                    dataset_config['class_annotation_mapping'])

    # create_class_masks(data_prep_local_dir, dataset_config['class_annotation_mapping'])

    split_prepared_data(data_prep_local_dir, prepared_dataset_local_dir,
                        dataset_config['dataset_split'])

    metadata = {
        'gcp_bucket':
        gcp_bucket,
        'created_datetime':
        datetime.now(pytz.UTC).strftime('%Y%m%dT%H%M%SZ'),
        'number_of_images': {
            'train':
            len(
                list(
                    Path(prepared_dataset_local_dir, 'train',
                         'images').iterdir())),
            'validation':
            len(
                list(
                    Path(prepared_dataset_local_dir, 'validation',
                         'images').iterdir())),
        },
        'git_hash':
        git.Repo(search_parent_directories=True).head.object.hexsha,
        'original_config_filename':
        config_file,
        'elapsed_minutes':
        round((datetime.now() - start_dt).total_seconds() / 60, 1)
    }
    try:
        metadata['number_of_images']['test'] = len(
            list(Path(prepared_dataset_local_dir, 'test', 'images').iterdir()))
    except FileNotFoundError:
        pass  # does not necessarily have to be test data

    with Path(prepared_dataset_local_dir, metadata_file_name).open('w') as f:
        yaml.safe_dump(metadata, f)

    copy_dataset_to_remote_dest(prepared_dataset_local_dir,
                                prepared_dataset_remote_dest, dataset_id)

    shutil.rmtree(tmp_directory.as_posix())
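
For reference, the dataset config consumed above must provide at least the keys used in the code (dataset_split, stack_downsampling, target_size, image_cropping, class_annotation_mapping) nested under a top-level dataset_config key, and the config file's stem becomes the dataset_id. A purely illustrative way to generate such a file; the key names come from the code, but every value is a placeholder since the helpers that consume them are not shown:

import yaml
from pathlib import Path

# Illustrative only: key names come from the code above; value shapes are guesses.
example_config = {
    'dataset_config': {
        # scan ids per split; a 'test' split is optional (the code tolerates its absence)
        'dataset_split': {
            'train': ['scan-001', 'scan-002'],
            'validation': ['scan-003'],
        },
        'stack_downsampling': None,        # placeholder
        'target_size': None,               # placeholder
        'image_cropping': None,            # placeholder
        'class_annotation_mapping': None,  # placeholder
    }
}

# The file stem ('my-new-dataset') becomes the dataset_id.
with Path('my-new-dataset.yaml').open('w') as f:
    yaml.safe_dump(example_config, f)
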
Example 4
def prepare_dataset(gcp_bucket, config_file, random_module_global_seed, numpy_random_global_seed):
    """
    The ordering of the steps matters: each step assumes the directory structure progressively created by the steps before it.
    """

    # seed global random generators if specified; global random seeds here must be int or default None (no seed given)
    if random_module_global_seed is not None:
        random.seed(random_module_global_seed)
    if numpy_random_global_seed is not None:
        np.random.seed(numpy_random_global_seed)

    start_dt = datetime.now()

    with Path(config_file).open('r') as f:
        dataset_config = yaml.safe_load(f)['dataset_config']

    dataset_id = Path(config_file).name.split('.')[0]

    assert "gs://" in gcp_bucket

    # clean up the tmp directory
    try:
        shutil.rmtree(tmp_directory.as_posix())
    except FileNotFoundError:
        pass
    tmp_directory.mkdir()

    processed_data_remote_source = os.path.join(gcp_bucket, 'processed-data')
    processed_data_local_dir = Path(tmp_directory, 'processed-data')
    processed_data_local_dir.mkdir()

    data_prep_local_dir = Path(tmp_directory, 'preparing')
    data_prep_local_dir.mkdir()

    prepared_dataset_local_dir = Path(tmp_directory, 'datasets')
    prepared_dataset_local_dir.mkdir(parents=True)

    prepared_dataset_remote_dest = os.path.join(gcp_bucket, 'datasets')

    with Path(prepared_dataset_local_dir, 'config.yaml').open('w') as f:
        yaml.safe_dump({'dataset_config': dataset_config}, f)

    all_scans = []
    for _, scans in dataset_config['dataset_split'].items():
        all_scans += scans
    all_scans = sorted(set(all_scans))

    assert not remote_folder_exists(prepared_dataset_remote_dest, dataset_id, sample_file_name='config.yaml'), "Dataset already exists in the GCP bucket. Choose new name in dataset config used here. Do NOT modify or delete the existing dataset in the GCP bucket."

    copy_processed_data_locally_if_missing(all_scans, processed_data_remote_source, processed_data_local_dir)

    copy_and_downsample_processed_data_to_preparation_if_missing(
        all_scans, processed_data_local_dir, data_prep_local_dir, dataset_config['stack_downsampling'])

    resize_and_crop(data_prep_local_dir, dataset_config['target_size'], dataset_config['image_cropping'], dataset_config['class_annotation_mapping'])

    create_class_masks(data_prep_local_dir, dataset_config['class_annotation_mapping'])

    split_prepared_data(data_prep_local_dir, prepared_dataset_local_dir, dataset_config['dataset_split'])

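    # record provenance for the prepared dataset: inputs, image counts per
    # split, git hash, elapsed time, and the seeds used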
    metadata = {
        'gcp_bucket': gcp_bucket,
        'created_datetime': datetime.now(pytz.UTC).strftime('%Y%m%dT%H%M%SZ'),
        'number_of_images': {
            'train': len(list(Path(prepared_dataset_local_dir, 'train', 'images').iterdir())),
            'validation': len(list(Path(prepared_dataset_local_dir, 'validation', 'images').iterdir())),
        },
        'git_hash': git.Repo(search_parent_directories=True).head.object.hexsha,
        'original_config_filename': config_file,
        'elapsed_minutes': round((datetime.now() - start_dt).total_seconds() / 60, 1),
        'random_module_global_seed': random_module_global_seed,
        'numpy_random_global_seed': numpy_random_global_seed,
    }
    try:
        metadata['number_of_images']['test'] = len(list(Path(prepared_dataset_local_dir, 'test', 'images').iterdir()))
    except FileNotFoundError:
        pass  # does not necessarily have to be test data

    with Path(prepared_dataset_local_dir, metadata_file_name).open('w') as f:
        yaml.safe_dump(metadata, f)

    copy_dataset_to_remote_dest(prepared_dataset_local_dir, prepared_dataset_remote_dest, dataset_id)

    print('\n Prepare Dataset Metadata:')
    print(metadata)
    print('\n')

    shutil.rmtree(tmp_directory.as_posix())
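
These functions read like the body of a command-line tool. A minimal, hypothetical argparse wrapper for the last variant could look like the following; the flag names simply mirror the function parameters and are not taken from the original source:

import argparse

if __name__ == '__main__':
    # Hypothetical CLI entry point (not part of the original source)
    parser = argparse.ArgumentParser(
        description='Prepare a dataset from processed stacks in a GCP bucket.')
    parser.add_argument('--gcp-bucket', required=True,
                        help="bucket URI, e.g. 'gs://my-bucket'")
    parser.add_argument('--config-file', required=True,
                        help='path to the dataset config YAML')
    parser.add_argument('--random-module-global-seed', type=int, default=None)
    parser.add_argument('--numpy-random-global-seed', type=int, default=None)
    args = parser.parse_args()

    prepare_dataset(args.gcp_bucket, args.config_file,
                    args.random_module_global_seed,
                    args.numpy_random_global_seed)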