def main(params):
    """
    Training and validation datasets preparation.
    :param params: (dict) Parameters found in the yaml config file.

    """
    gpkg_file = []
    bucket_name = params['global']['bucket_name']
    data_path = params['global']['data_path']
    csv_file = params['sample']['prep_csv_file']

    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        bucket.download_file(csv_file, 'samples_prep.csv')
        list_data_prep = read_csv('samples_prep.csv')
        if data_path:
            final_samples_folder = os.path.join(data_path, "samples")
        else:
            final_samples_folder = "samples"
        samples_folder = "samples"
        out_label_folder = "label"

    else:
        list_data_prep = read_csv(csv_file)
        samples_folder = os.path.join(data_path, "samples")
        out_label_folder = os.path.join(data_path, "label")

    create_or_empty_folder(samples_folder)
    create_or_empty_folder(out_label_folder)

    number_samples = {'trn': 0, 'val': 0, 'tst': 0}
    number_classes = 0

    trn_hdf5, val_hdf5, tst_hdf5 = create_files_and_datasets(params, samples_folder)
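    # create_files_and_datasets returns one open HDF5 file per split (trn/val/tst);
    # samples are appended to these files as each raster listed in the csv is processed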

    with tqdm(list_data_prep) as _tqdm:
        for info in _tqdm:

            if bucket_name:
                bucket.download_file(info['tif'], "Images/" + info['tif'].split('/')[-1])
                info['tif'] = "Images/" + info['tif'].split('/')[-1]
                if info['gpkg'] not in gpkg_file:
                    gpkg_file.append(info['gpkg'])
                    bucket.download_file(info['gpkg'], info['gpkg'].split('/')[-1])
                info['gpkg'] = info['gpkg'].split('/')[-1]

            assert_band_number(info['tif'], params['global']['number_of_bands'])

            _tqdm.set_postfix(OrderedDict(file=f'{info["tif"]}', sample_size=params['global']['samples_size']))

            # Read the input raster image
            np_input_image = image_reader_as_array(info['tif'])

            # Validate the number of class in the vector file
            validate_num_classes(info['gpkg'], params['global']['num_classes'], info['attribute_name'])

            # Burn vector file in a raster file
            np_label_raster = vector_to_raster(info['gpkg'], info['tif'], info['attribute_name'])

            # Guidelines for pre-processing: http://cs231n.github.io/neural-networks-2/#datapre
            # Scale arrays to values [0,1]. Default: will scale. Useful if dealing with 8 bit *and* 16 bit images.
            scale = params['global']['scale_data'] if params['global']['scale_data'] else (0, 1)
            if scale:
                sc_min, sc_max = scale
                np_input_image = minmax_scale(np_input_image,
                                              orig_range=(np.min(np_input_image), np.max(np_input_image)),
                                              scale_range=(sc_min, sc_max))
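                # note: assuming minmax_scale is a plain linear rescale, i.e.
                # scaled = (x - orig_min) / (orig_max - orig_min) * (sc_max - sc_min) + sc_min,
                # the full image range is mapped onto [sc_min, sc_max]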

            # Mask the zeros from input image into label raster.
            if params['sample']['mask_reference']:
                np_label_raster = mask_image(np_input_image, np_label_raster)

            if info['dataset'] == 'trn':
                out_file = trn_hdf5
            elif info['dataset'] == 'val':
                out_file = val_hdf5
            elif info['dataset'] == 'tst':
                out_file = tst_hdf5
            else:
                raise ValueError(f"Dataset value must be trn or val or tst. Provided value is {info['dataset']}")

            np_label_raster = np.reshape(np_label_raster, (np_label_raster.shape[0], np_label_raster.shape[1], 1))
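            # (the reshape above adds a trailing channel axis so the label array is (H, W, 1),
            # matching the (H, W, bands) layout of np_input_image)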
            number_samples, number_classes = samples_preparation(np_input_image,
                                                                 np_label_raster,
                                                                 params['global']['samples_size'],
                                                                 params['sample']['samples_dist'],
                                                                 number_samples,
                                                                 number_classes,
                                                                 out_file,
                                                                 info['dataset'],
                                                                 params['sample']['min_annotated_percent'])

            _tqdm.set_postfix(OrderedDict(number_samples=number_samples))
            out_file.flush()

    trn_hdf5.close()
    val_hdf5.close()
    tst_hdf5.close()

    print("Number of samples created: ", number_samples)

    if bucket_name:
        print('Transferring samples to the bucket')
        bucket.upload_file(samples_folder + "/trn_samples.hdf5", final_samples_folder + '/trn_samples.hdf5')
        bucket.upload_file(samples_folder + "/val_samples.hdf5", final_samples_folder + '/val_samples.hdf5')
        bucket.upload_file(samples_folder + "/tst_samples.hdf5", final_samples_folder + '/tst_samples.hdf5')

    print("End of process")
def main(params):
    """
    Training and validation datasets preparation.
    :param params: (dict) Parameters found in the yaml config file.

    """
    bucket_file_cache = []
    bucket_name = params['global']['bucket_name']
    data_path = params['global']['data_path']
    Path.mkdir(Path(data_path), exist_ok=True)
    csv_file = params['sample']['prep_csv_file']

    final_samples_folder = None
    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        bucket.download_file(csv_file, 'samples_prep.csv')
        list_data_prep = read_csv('samples_prep.csv')
        if data_path:
            final_samples_folder = os.path.join(data_path, "samples")
        else:
            final_samples_folder = "samples"
        samples_folder = "samples"
        out_label_folder = "label"

    else:
        list_data_prep = read_csv(csv_file)
        samples_folder = os.path.join(
            data_path, "samples")  #FIXME check that data_path exists!
        out_label_folder = os.path.join(data_path, "label")

    create_or_empty_folder(samples_folder)
    create_or_empty_folder(out_label_folder)

    number_samples = {'trn': 0, 'val': 0, 'tst': 0}
    number_classes = 0

    trn_hdf5, val_hdf5, tst_hdf5 = create_files_and_datasets(
        params, samples_folder)

    with tqdm(list_data_prep) as _tqdm:
        for info in _tqdm:

            if bucket_name:
                bucket.download_file(info['tif'],
                                     "Images/" + info['tif'].split('/')[-1])
                info['tif'] = "Images/" + info['tif'].split('/')[-1]
                if info['gpkg'] not in bucket_file_cache:
                    bucket_file_cache.append(info['gpkg'])
                    bucket.download_file(info['gpkg'],
                                         info['gpkg'].split('/')[-1])
                info['gpkg'] = info['gpkg'].split('/')[-1]
                if info['meta']:
                    if info['meta'] not in bucket_file_cache:
                        bucket_file_cache.append(info['meta'])
                        bucket.download_file(info['meta'],
                                             info['meta'].split('/')[-1])
                    info['meta'] = info['meta'].split('/')[-1]

            _tqdm.set_postfix(
                OrderedDict(file=f'{info["tif"]}',
                            sample_size=params['global']['samples_size']))

            # Validate the number of class in the vector file
            validate_num_classes(info['gpkg'], params['global']['num_classes'],
                                 info['attribute_name'])

            assert os.path.isfile(
                info['tif']), f"could not open raster file at {info['tif']}"
            with rasterio.open(info['tif'], 'r') as raster:

                # Burn vector file in a raster file
                np_label_raster = vector_to_raster(
                    vector_file=info['gpkg'],
                    input_image=raster,
                    attribute_name=info['attribute_name'],
                    fill=get_key_def('ignore_idx',
                                     get_key_def('training', params, {}), 0))
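                # (rasterization: pixels covered by a polygon take that polygon's
                # 'attribute_name' value; all other pixels are set to 'fill')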

                # Read the input raster image
                np_input_image = image_reader_as_array(
                    input_image=raster,
                    scale=get_key_def('scale_data', params['global'], None),
                    aux_vector_file=get_key_def('aux_vector_file',
                                                params['global'], None),
                    aux_vector_attrib=get_key_def('aux_vector_attrib',
                                                  params['global'], None),
                    aux_vector_ids=get_key_def('aux_vector_ids',
                                               params['global'], None),
                    aux_vector_dist_maps=get_key_def('aux_vector_dist_maps',
                                                     params['global'], True),
                    aux_vector_dist_log=get_key_def('aux_vector_dist_log',
                                                    params['global'], True),
                    aux_vector_scale=get_key_def('aux_vector_scale',
                                                 params['global'], None))

            # Mask the zeros from input image into label raster.
            if params['sample']['mask_reference']:
                np_label_raster = mask_image(np_input_image, np_label_raster)

            if info['dataset'] == 'trn':
                out_file = trn_hdf5
            elif info['dataset'] == 'val':
                out_file = val_hdf5
            elif info['dataset'] == 'tst':
                out_file = tst_hdf5
            else:
                raise ValueError(
                    f"Dataset value must be trn or val or tst. Provided value is {info['dataset']}"
                )

            meta_map, metadata = get_key_def("meta_map", params["global"],
                                             {}), None
            if info['meta'] is not None and isinstance(
                    info['meta'], str) and os.path.isfile(info['meta']):
                metadata = read_parameters(info['meta'])

            input_band_count = np_input_image.shape[
                2] + MetaSegmentationDataset.get_meta_layer_count(meta_map)
            assert input_band_count == params['global']['number_of_bands'], \
                f"The number of bands in the input image ({input_band_count}) and the parameter" \
                f"'number_of_bands' in the yaml file ({params['global']['number_of_bands']}) should be identical"

            np_label_raster = np.reshape(
                np_label_raster,
                (np_label_raster.shape[0], np_label_raster.shape[1], 1))
            number_samples, number_classes = samples_preparation(
                np_input_image, np_label_raster,
                params['global']['samples_size'],
                params['sample']['samples_dist'], number_samples,
                number_classes, out_file, info['dataset'],
                params['sample']['min_annotated_percent'], metadata)

            _tqdm.set_postfix(OrderedDict(number_samples=number_samples))
            out_file.flush()

    trn_hdf5.close()
    val_hdf5.close()
    tst_hdf5.close()

    print("Number of samples created: ", number_samples)

    if bucket_name and final_samples_folder:
        print('Transferring samples to the bucket')
        bucket.upload_file(samples_folder + "/trn_samples.hdf5",
                           final_samples_folder + '/trn_samples.hdf5')
        bucket.upload_file(samples_folder + "/val_samples.hdf5",
                           final_samples_folder + '/val_samples.hdf5')
        bucket.upload_file(samples_folder + "/tst_samples.hdf5",
                           final_samples_folder + '/tst_samples.hdf5')

    print("End of process")
def main(params):
    """
    Training and validation datasets preparation.
    :param params: (dict) Parameters found in the yaml config file.

    """
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    bucket_file_cache = []

    assert params['global']['task'] == 'segmentation', \
        "images_to_samples.py isn't necessary when performing classification tasks"

    # SET BASIC VARIABLES AND PATHS. CREATE OUTPUT FOLDERS.
    bucket_name = get_key_def('bucket_name', params['global'])
    data_path = Path(params['global']['data_path'])
    Path.mkdir(data_path, exist_ok=True, parents=True)
    csv_file = params['sample']['prep_csv_file']
    val_percent = params['sample']['val_percent']
    samples_size = params["global"]["samples_size"]
    overlap = params["sample"]["overlap"]
    min_annot_perc = get_key_def('min_annotated_percent',
                                 params['sample']['sampling_method'],
                                 None,
                                 expected_type=int)
    num_bands = params['global']['number_of_bands']
    debug = get_key_def('debug_mode', params['global'], False)
    if debug:
        warnings.warn('Debug mode activated. Execution may take longer...')

    final_samples_folder = None

    sample_path_name = f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands'
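    # the folder name encodes the main sampling parameters so that runs with different
    # settings write their samples to distinct directories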

    # AWS
    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        bucket.download_file(csv_file, 'samples_prep.csv')
        list_data_prep = read_csv('samples_prep.csv')
        if data_path:
            final_samples_folder = data_path.joinpath("samples")
        else:
            final_samples_folder = "samples"
        samples_folder = Path(sample_path_name)  # Path so is_dir()/mkdir below also work in the bucket case

    else:
        list_data_prep = read_csv(csv_file)
        samples_folder = data_path.joinpath(sample_path_name)

    if samples_folder.is_dir():
        warnings.warn(
            f'Data path exists: {samples_folder}. Suffix will be added to directory name.'
        )
        samples_folder = Path(str(samples_folder) + '_' + now)
    else:
        tqdm.write(f'Writing samples to {samples_folder}')
    Path.mkdir(samples_folder, exist_ok=False
               )  # TODO: what if we want to append samples to existing hdf5?
    tqdm.write(f'Samples will be written to {samples_folder}\n\n')

    tqdm.write(f'\nSuccessfully read csv file: {Path(csv_file).stem}\n'
               f'Number of rows: {len(list_data_prep)}\n'
               f'Copying first entry:\n{list_data_prep[0]}\n')
    ignore_index = get_key_def('ignore_index', params['training'], -1)
    meta_map, metadata = get_key_def("meta_map", params["global"], {}), None

    # VALIDATION: (1) Assert num_classes parameters == num actual classes in gpkg and (2) check CRS match (tif and gpkg)
    valid_gpkg_set = set()
    for info in tqdm(list_data_prep, position=0):
        assert_num_bands(info['tif'], num_bands, meta_map)
        if info['gpkg'] not in valid_gpkg_set:
            gpkg_classes = validate_num_classes(
                info['gpkg'], params['global']['num_classes'],
                info['attribute_name'], ignore_index)
            assert_crs_match(info['tif'], info['gpkg'])
            valid_gpkg_set.add(info['gpkg'])

    if debug:
        # VALIDATION (debug only): Checking validity of features in vector files
        for info in tqdm(
                list_data_prep,
                position=0,
                desc=f"Checking validity of features in vector files"):
            invalid_features = validate_features_from_gpkg(
                info['gpkg'], info['attribute_name']
            )  # TODO: test this with invalid features.
            assert not invalid_features, f"{info['gpkg']}: Invalid geometry object(s) '{invalid_features}'"

    number_samples = {'trn': 0, 'val': 0, 'tst': 0}
    number_classes = 0

    class_prop = get_key_def('class_proportion',
                             params['sample']['sampling_method'],
                             None,
                             expected_type=dict)

    trn_hdf5, val_hdf5, tst_hdf5 = create_files_and_datasets(
        params, samples_folder)

    # Set dontcare (aka ignore_index) value
    dontcare = get_key_def(
        "ignore_index", params["training"],
        -1)  # TODO: deduplicate with train_segmentation, l300
    if dontcare == 0:
        warnings.warn(
            "The 'dontcare' value (or 'ignore_index') used in the loss function cannot be zero;"
            " all valid class indices should be consecutive, and start at 0. The 'dontcare' value"
            " will be remapped to -1 while loading the dataset, and inside the config from now on."
        )
        params["training"]["ignore_index"] = -1

    # creates pixel_classes dict and keys
    pixel_classes = {key: 0 for key in gpkg_classes}
    background_val = 0
    pixel_classes[background_val] = 0
    class_prop = validate_class_prop_dict(pixel_classes, class_prop)
    pixel_classes[dontcare] = 0
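    # pixel_classes accumulates, per class index (including background and dontcare),
    # the number of pixels written to the samples; proportions are reported at the end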

    # For each row in csv: (1) burn vector file to raster, (2) read input raster image, (3) prepare samples
    with tqdm(list_data_prep,
              position=0,
              leave=False,
              desc=f'Preparing samples') as _tqdm:
        for info in _tqdm:
            _tqdm.set_postfix(
                OrderedDict(tif=f'{Path(info["tif"]).stem}',
                            sample_size=params['global']['samples_size']))
            try:
                if bucket_name:
                    bucket.download_file(
                        info['tif'], "Images/" + info['tif'].split('/')[-1])
                    info['tif'] = "Images/" + info['tif'].split('/')[-1]
                    if info['gpkg'] not in bucket_file_cache:
                        bucket_file_cache.append(info['gpkg'])
                        bucket.download_file(info['gpkg'],
                                             info['gpkg'].split('/')[-1])
                    info['gpkg'] = info['gpkg'].split('/')[-1]
                    if info['meta']:
                        if info['meta'] not in bucket_file_cache:
                            bucket_file_cache.append(info['meta'])
                            bucket.download_file(info['meta'],
                                                 info['meta'].split('/')[-1])
                        info['meta'] = info['meta'].split('/')[-1]

                with rasterio.open(info['tif'], 'r') as raster:
                    # 1. Read the input raster image
                    np_input_image, raster, dataset_nodata = image_reader_as_array(
                        input_image=raster,
                        clip_gpkg=info['gpkg'],
                        aux_vector_file=get_key_def('aux_vector_file',
                                                    params['global'], None),
                        aux_vector_attrib=get_key_def('aux_vector_attrib',
                                                      params['global'], None),
                        aux_vector_ids=get_key_def('aux_vector_ids',
                                                   params['global'], None),
                        aux_vector_dist_maps=get_key_def(
                            'aux_vector_dist_maps', params['global'], True),
                        aux_vector_dist_log=get_key_def(
                            'aux_vector_dist_log', params['global'], True),
                        aux_vector_scale=get_key_def('aux_vector_scale',
                                                     params['global'], None))

                    # 2. Burn vector file in a raster file
                    np_label_raster = vector_to_raster(
                        vector_file=info['gpkg'],
                        input_image=raster,
                        out_shape=np_input_image.shape[:2],
                        attribute_name=info['attribute_name'],
                        fill=background_val
                    )  # background value in rasterized vector.

                    if dataset_nodata is not None:
                        # 3. Set ignore_index value in label array where nodata in raster (only if nodata across all bands)
                        np_label_raster[dataset_nodata] = dontcare

                if debug:
                    out_meta = raster.meta.copy()
                    np_image_debug = np_input_image.transpose(2, 0, 1).astype(
                        out_meta['dtype'])
                    out_meta.update({
                        "driver": "GTiff",
                        "height": np_image_debug.shape[1],
                        "width": np_image_debug.shape[2]
                    })
                    out_tif = samples_folder / f"np_input_image_{_tqdm.n}.tif"
                    print(f"DEBUG: writing clipped raster to {out_tif}")
                    with rasterio.open(out_tif, "w", **out_meta) as dest:
                        dest.write(np_image_debug)

                    out_meta = raster.meta.copy()
                    np_label_debug = np.expand_dims(
                        np_label_raster,
                        axis=2).transpose(2, 0, 1).astype(out_meta['dtype'])
                    out_meta.update({
                        "driver": "GTiff",
                        "height": np_label_debug.shape[1],
                        "width": np_label_debug.shape[2],
                        'count': 1
                    })
                    out_tif = samples_folder / f"np_label_rasterized_{_tqdm.n}.tif"
                    print(f"DEBUG: writing final rasterized gpkg to {out_tif}")
                    with rasterio.open(out_tif, "w", **out_meta) as dest:
                        dest.write(np_label_debug)

                # Mask the zeros from input image into label raster.
                if params['sample']['mask_reference']:
                    np_label_raster = mask_image(np_input_image,
                                                 np_label_raster)

                if info['dataset'] == 'trn':
                    out_file = trn_hdf5
                elif info['dataset'] == 'tst':
                    out_file = tst_hdf5
                else:
                    raise ValueError(
                        f"Dataset value must be trn or tst. Provided value is {info['dataset']}"
                    )
                val_file = val_hdf5
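                # in this version, validation samples are not listed in the csv: they are
                # split off from 'trn' tiles inside samples_preparation() using val_percent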

                metadata = add_metadata_from_raster_to_sample(
                    sat_img_arr=np_input_image,
                    raster_handle=raster,
                    meta_map=meta_map,
                    raster_info=info)
                # Save label's per class pixel count to image metadata
                metadata['source_label_bincount'] = {
                    class_num: count
                    for class_num, count in enumerate(
                        np.bincount(np_label_raster.clip(min=0).flatten()))
                    if count > 0
                }  # TODO: add this to add_metadata_from[...] function?
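                # note: clip(min=0) folds negative values (e.g. the dontcare index, -1) into
                # class 0 so that np.bincount, which only accepts non-negative ints, can run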

                np_label_raster = np.reshape(
                    np_label_raster,
                    (np_label_raster.shape[0], np_label_raster.shape[1], 1))
                # 3. Prepare samples!
                number_samples, number_classes = samples_preparation(
                    in_img_array=np_input_image,
                    label_array=np_label_raster,
                    sample_size=samples_size,
                    overlap=overlap,
                    samples_count=number_samples,
                    num_classes=number_classes,
                    samples_file=out_file,
                    val_percent=val_percent,
                    val_sample_file=val_file,
                    dataset=info['dataset'],
                    pixel_classes=pixel_classes,
                    image_metadata=metadata,
                    dontcare=dontcare,
                    min_annot_perc=min_annot_perc,
                    class_prop=class_prop)

                _tqdm.set_postfix(OrderedDict(number_samples=number_samples))
                out_file.flush()
            except OSError as e:
                warnings.warn(
                    f'An error occurred while preparing samples with "{Path(info["tif"]).stem}" (tiff) and '
                    f'{Path(info["gpkg"]).stem} (gpkg). Error: "{e}"')
                continue

    trn_hdf5.close()
    val_hdf5.close()
    tst_hdf5.close()

    pixel_total = 0
    # adds up the number of pixels for each class in pixel_classes dict
    for i in pixel_classes:
        pixel_total += pixel_classes[i]

    # prints the proportion of pixels of each class for the samples created
    for i in pixel_classes:
        prop = round((pixel_classes[i] / pixel_total) *
                     100, 1) if pixel_total > 0 else 0
        print('Pixels from class', i, ':', prop, '%')

    print("Number of samples created: ", number_samples)

    if bucket_name and final_samples_folder:
        print('Transferring samples to the bucket')
        bucket.upload_file(str(samples_folder / "trn_samples.hdf5"),
                           f"{final_samples_folder}/trn_samples.hdf5")
        bucket.upload_file(str(samples_folder / "val_samples.hdf5"),
                           f"{final_samples_folder}/val_samples.hdf5")
        bucket.upload_file(str(samples_folder / "tst_samples.hdf5"),
                           f"{final_samples_folder}/tst_samples.hdf5")

    print("End of process")
def main(params):
    """
    Training and validation datasets preparation.
    :param params: (dict) Parameters found in the yaml config file.

    """
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    bucket_file_cache = []

    assert params['global']['task'] == 'segmentation', \
        "images_to_samples.py isn't necessary when performing classification tasks"

    # SET BASIC VARIABLES AND PATHS. CREATE OUTPUT FOLDERS.
    bucket_name = params['global']['bucket_name']
    data_path = Path(params['global']['data_path'])
    Path.mkdir(data_path, exist_ok=True, parents=True)
    csv_file = params['sample']['prep_csv_file']
    val_percent = params['sample']['val_percent']
    samples_size = params["global"]["samples_size"]
    overlap = params["sample"]["overlap"]
    min_annot_perc = params['sample']['sampling']['map']
    num_bands = params['global']['number_of_bands']
    debug = get_key_def('debug_mode', params['global'], False)
    if debug:
        warnings.warn('Debug mode activated. Execution may take longer...')

    final_samples_folder = None
    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        bucket.download_file(csv_file, 'samples_prep.csv')
        list_data_prep = read_csv('samples_prep.csv')
        if data_path:
            final_samples_folder = os.path.join(data_path, "samples")
        else:
            final_samples_folder = "samples"
        samples_folder = Path(f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands')  # TODO: validate this is preferred name structure

    else:
        list_data_prep = read_csv(csv_file)
        samples_folder = data_path.joinpath(
            f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands'
        )

    if samples_folder.is_dir():
        warnings.warn(
            f'Data path exists: {samples_folder}. Suffix will be added to directory name.'
        )
        samples_folder = Path(str(samples_folder) + '_' + now)
    else:
        tqdm.write(f'Writing samples to {samples_folder}')
    Path.mkdir(samples_folder, exist_ok=False
               )  # FIXME: what if we want to append samples to existing hdf5?
    tqdm.write(f'Samples will be written to {samples_folder}\n\n')

    tqdm.write(
        f'\nSuccessfully read csv file: {Path(csv_file).stem}\nNumber of rows: {len(list_data_prep)}\nCopying first entry:\n{list_data_prep[0]}\n'
    )
    ignore_index = get_key_def('ignore_index', params['training'], -1)

    for info in tqdm(list_data_prep,
                     position=0,
                     desc=f'Asserting existence of tif and gpkg files in csv'):
        assert Path(info['tif']).is_file(
        ), f'Could not locate "{info["tif"]}". Make sure file exists in this directory.'
        assert Path(info['gpkg']).is_file(
        ), f'Could not locate "{info["gpkg"]}". Make sure file exists in this directory.'
    if debug:
        for info in tqdm(
                list_data_prep,
                position=0,
                desc=f"Validating presence of {params['global']['num_classes']} "
                f"classes in attribute \"{info['attribute_name']}\" for vector "
                f"file \"{Path(info['gpkg']).stem}\""):
            validate_num_classes(info['gpkg'], params['global']['num_classes'],
                                 info['attribute_name'], ignore_index)
        with tqdm(list_data_prep,
                  position=0,
                  desc=f"Checking validity of features in vector files"
                  ) as _tqdm:
            invalid_features = {}
            for info in _tqdm:
                # Extract vector features to burn in the raster image
                with fiona.open(
                        info['gpkg'],
                        'r') as src:  # TODO: refactor as independent function
                    lst_vector = [vector for vector in src]
                shapes = lst_ids(list_vector=lst_vector,
                                 attr_name=info['attribute_name'])
                for index, item in enumerate(
                        tqdm([v for vecs in shapes.values() for v in vecs],
                             leave=False,
                             position=1)):
                    # geom must be a valid GeoJSON geometry type and non-empty
                    geom, value = item
                    geom = getattr(geom, '__geo_interface__', None) or geom
                    if not is_valid_geom(geom):
                        gpkg_stem = str(Path(info['gpkg']).stem)
                        if gpkg_stem not in invalid_features.keys(
                        ):  # create key with name of gpkg
                            invalid_features[gpkg_stem] = []
                        if lst_vector[index]["id"] not in invalid_features[
                                gpkg_stem]:  # ignore feature is already appended
                            invalid_features[gpkg_stem].append(
                                lst_vector[index]["id"])
            assert len(
                invalid_features.values()
            ) == 0, f'Invalid geometry object(s) for "gpkg:ids": \"{invalid_features}\"'

    number_samples = {'trn': 0, 'val': 0, 'tst': 0}
    number_classes = 0

    # 'sampling' ordereddict validation
    check_sampling_dict()

    pixel_classes = {}
    # creates pixel_classes dict and keys
    for i in range(0, params['global']['num_classes'] + 1):
        pixel_classes.update({i: 0})
    pixel_classes.update(
        {ignore_index: 0}
    )  # FIXME: pixel_classes dict needs to be populated with classes obtained from target

    trn_hdf5, val_hdf5, tst_hdf5 = create_files_and_datasets(
        params, samples_folder)

    # For each row in csv: (1) burn vector file to raster, (2) read input raster image, (3) prepare samples
    with tqdm(list_data_prep,
              position=0,
              leave=False,
              desc=f'Preparing samples') as _tqdm:
        for info in _tqdm:
            _tqdm.set_postfix(
                OrderedDict(tif=f'{Path(info["tif"]).stem}',
                            sample_size=params['global']['samples_size']))
            try:
                if bucket_name:
                    bucket.download_file(
                        info['tif'], "Images/" + info['tif'].split('/')[-1])
                    info['tif'] = "Images/" + info['tif'].split('/')[-1]
                    if info['gpkg'] not in bucket_file_cache:
                        bucket_file_cache.append(info['gpkg'])
                        bucket.download_file(info['gpkg'],
                                             info['gpkg'].split('/')[-1])
                    info['gpkg'] = info['gpkg'].split('/')[-1]
                    if info['meta']:
                        if info['meta'] not in bucket_file_cache:
                            bucket_file_cache.append(info['meta'])
                            bucket.download_file(info['meta'],
                                                 info['meta'].split('/')[-1])
                        info['meta'] = info['meta'].split('/')[-1]

                with rasterio.open(info['tif'], 'r') as raster:
                    # Burn vector file in a raster file
                    np_label_raster = vector_to_raster(
                        vector_file=info['gpkg'],
                        input_image=raster,
                        attribute_name=info['attribute_name'],
                        fill=get_key_def('ignore_idx',
                                         get_key_def('training', params, {}),
                                         0))
                    # Read the input raster image
                    np_input_image = image_reader_as_array(
                        input_image=raster,
                        scale=get_key_def('scale_data', params['global'],
                                          None),
                        aux_vector_file=get_key_def('aux_vector_file',
                                                    params['global'], None),
                        aux_vector_attrib=get_key_def('aux_vector_attrib',
                                                      params['global'], None),
                        aux_vector_ids=get_key_def('aux_vector_ids',
                                                   params['global'], None),
                        aux_vector_dist_maps=get_key_def(
                            'aux_vector_dist_maps', params['global'], True),
                        aux_vector_dist_log=get_key_def(
                            'aux_vector_dist_log', params['global'], True),
                        aux_vector_scale=get_key_def('aux_vector_scale',
                                                     params['global'], None))

                # Mask the zeros from input image into label raster.
                if params['sample']['mask_reference']:
                    np_label_raster = mask_image(np_input_image,
                                                 np_label_raster)

                if info['dataset'] == 'trn':
                    out_file = trn_hdf5
                    val_file = val_hdf5
                elif info['dataset'] == 'tst':
                    out_file = tst_hdf5
                else:
                    raise ValueError(
                        f"Dataset value must be trn or val or tst. Provided value is {info['dataset']}"
                    )

                meta_map, metadata = get_key_def("meta_map", params["global"],
                                                 {}), None
                if info['meta'] is not None and isinstance(
                        info['meta'], str) and Path(info['meta']).is_file():
                    metadata = read_parameters(info['meta'])

                # FIXME: think this through. User will have to calculate the total number of bands including meta layers and
                #  specify it in yaml. Is this the best approach? What if metalayers are added on the fly ?
                input_band_count = np_input_image.shape[
                    2] + MetaSegmentationDataset.get_meta_layer_count(meta_map)
                # FIXME: could this assert be done before getting into this big for loop?
                assert input_band_count == num_bands, \
                    f"The number of bands in the input image ({input_band_count}) and the parameter" \
                    f"'number_of_bands' in the yaml file ({params['global']['number_of_bands']}) should be identical"

                np_label_raster = np.reshape(
                    np_label_raster,
                    (np_label_raster.shape[0], np_label_raster.shape[1], 1))
                number_samples, number_classes = samples_preparation(
                    np_input_image, np_label_raster, samples_size, overlap,
                    number_samples, number_classes, out_file, val_percent,
                    val_file, info['dataset'], pixel_classes, metadata)

                _tqdm.set_postfix(OrderedDict(number_samples=number_samples))
                out_file.flush()
            except Exception as e:
                warnings.warn(
                    f'An error occurred while preparing samples with "{Path(info["tif"]).stem}" (tiff) and '
                    f'{Path(info["gpkg"]).stem} (gpkg). Error: "{e}"')
                continue

    trn_hdf5.close()
    val_hdf5.close()
    tst_hdf5.close()

    pixel_total = 0
    # adds up the number of pixels for each class in pixel_classes dict
    for i in pixel_classes:
        pixel_total += pixel_classes[i]

    # prints the proportion of pixels of each class for the samples created
    for i in pixel_classes:
        prop = round((pixel_classes[i] / pixel_total) * 100, 1) if pixel_total > 0 else 0
        print('Pixels from class', i, ':', prop, '%')

    print("Number of samples created: ", number_samples)

    if bucket_name and final_samples_folder:
        print('Transferring samples to the bucket')
        bucket.upload_file(str(samples_folder / "trn_samples.hdf5"),
                           f"{final_samples_folder}/trn_samples.hdf5")
        bucket.upload_file(str(samples_folder / "val_samples.hdf5"),
                           f"{final_samples_folder}/val_samples.hdf5")
        bucket.upload_file(str(samples_folder / "tst_samples.hdf5"),
                           f"{final_samples_folder}/tst_samples.hdf5")

    print("End of process")