def image_reader_as_array(input_image, scale=None, aux_vector_file=None, aux_vector_attrib=None, aux_vector_ids=None,
                          aux_vector_dist_maps=False, aux_vector_dist_log=True, aux_vector_scale=None):
    """Read an image from a file and return a 3d array (h,w,c)
    Args:
        input_image: Rasterio file handle holding the (already opened) input raster
        scale: optional scaling factor for the raw data
        aux_vector_file: optional vector file from which to extract auxiliary shapes
        aux_vector_attrib: optional vector file attribute name to parse in order to fetch ids
        aux_vector_ids: optional vector ids to target in the vector file above
        aux_vector_dist_maps: flag indicating whether aux vector bands should be distance maps or binary maps
        aux_vector_dist_log: flag indicating whether log distances should be used in distance maps or not
        aux_vector_scale: optional floating point scale factor to multiply to rasterized vector maps

    Return:
        numpy array of the image (possibly concatenated with auxiliary vector channels)
    """
    np_array = np.empty([input_image.height, input_image.width, input_image.count], dtype=np.float32)
    for i in tqdm(range(input_image.count), position=1, leave=False,
                  desc=f'Reading image bands: {Path(input_image.files[0]).stem}'):
        np_array[:, :, i] = input_image.read(i + 1)  # Bands start at 1 in rasterio, not 0
        # TODO: reading a large image (>10 GB) is VERY slow. Is this line the culprit?

    # Guidelines for pre-processing: http://cs231n.github.io/neural-networks-2/#datapre
    # Scale array values from range [0, 255] to values in config (e.g. [0, 1])
    if scale:
        sc_min, sc_max = scale
        assert np.min(np_array) >= 0 and np.max(np_array) <= 255, \
            f'Values in input image of shape {np_array.shape} range from {np.min(np_array)} to {np.max(np_array)}. ' \
            f'They should range from 0 to 255 (8 bit).'
        np_array = minmax_scale(img=np_array, orig_range=(0, 255), scale_range=(sc_min, sc_max))

    # If requested, load vectors from external file, rasterize, and append distance maps to the array
    if aux_vector_file is not None:
        vec_tensor = vector_to_raster(vector_file=aux_vector_file,
                                      input_image=input_image,
                                      attribute_name=aux_vector_attrib,
                                      fill=0,
                                      target_ids=aux_vector_ids,
                                      merge_all=False)
        if aux_vector_dist_maps:
            import cv2 as cv  # opencv becomes a project dependency only if we need to compute distance maps here
            vec_tensor = vec_tensor.astype(np.float32)
            for vec_band_idx in range(vec_tensor.shape[2]):
                mask = vec_tensor[:, :, vec_band_idx]
                mask = cv.dilate(mask, (3, 3))  # make points and linestrings easier to work with
                # display_resize = cv.resize(np.where(mask, np.uint8(0), np.uint8(255)), (1000, 1000))
                # cv.imshow("mask", display_resize)
                dmap = cv.distanceTransform(np.where(mask, np.uint8(0), np.uint8(255)), cv.DIST_L2, cv.DIST_MASK_PRECISE)
                if aux_vector_dist_log:
                    dmap = np.log(dmap + 1)
                # display_resize = cv.resize(cv.normalize(dmap, None, 0, 1, cv.NORM_MINMAX, dtype=cv.CV_32F), (1000, 1000))
                # cv.imshow("dmap1", display_resize)
                dmap_inv = cv.distanceTransform(np.where(mask, np.uint8(255), np.uint8(0)), cv.DIST_L2, cv.DIST_MASK_PRECISE)
                if aux_vector_dist_log:
                    dmap_inv = np.log(dmap_inv + 1)
                # display_resize = cv.resize(cv.normalize(dmap_inv, None, 0, 1, cv.NORM_MINMAX, dtype=cv.CV_32F), (1000, 1000))
                # cv.imshow("dmap2", display_resize)
                vec_tensor[:, :, vec_band_idx] = np.where(mask, -dmap_inv, dmap)
                # display = cv.normalize(vec_tensor[:, :, vec_band_idx], None, 0, 1, cv.NORM_MINMAX, dtype=cv.CV_32F)
                # display_resize = cv.resize(display, (1000, 1000))
                # cv.imshow("distmap", display_resize)
                # cv.waitKey(0)
        if aux_vector_scale:
            for vec_band_idx in range(vec_tensor.shape[2]):
                vec_tensor[:, :, vec_band_idx] *= aux_vector_scale
        np_array = np.concatenate([np_array, vec_tensor], axis=2)
    return np_array
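
# --- Illustrative sketch (not part of the original module) ---
# Minimal, self-contained example of the signed distance-map idea used in
# image_reader_as_array() above: pixels outside a rasterized shape get a positive
# distance to the shape, pixels inside get a negative distance to the background.
# The toy mask and the helper name `_signed_distance_map_demo` are assumptions made
# for illustration only.
def _signed_distance_map_demo():
    import numpy as np
    import cv2 as cv

    # Toy binary mask: a 9x9 image with a 3x3 "shape" in the centre.
    mask = np.zeros((9, 9), dtype=np.uint8)
    mask[3:6, 3:6] = 1

    # Distance from each background pixel to the nearest shape pixel.
    dmap = cv.distanceTransform(np.where(mask, np.uint8(0), np.uint8(255)),
                                cv.DIST_L2, cv.DIST_MASK_PRECISE)
    # Distance from each shape pixel to the nearest background pixel.
    dmap_inv = cv.distanceTransform(np.where(mask, np.uint8(255), np.uint8(0)),
                                    cv.DIST_L2, cv.DIST_MASK_PRECISE)

    # Signed map: negative inside the shape, positive outside, zero at the shape pixels' border side.
    return np.where(mask, -dmap_inv, dmap)
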
def main(params): """ Training and validation datasets preparation. :param params: (dict) Parameters found in the yaml config file. """ now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") bucket_file_cache = [] assert params['global'][ 'task'] == 'segmentation', f"images_to_samples.py isn't necessary when performing classification tasks" # SET BASIC VARIABLES AND PATHS. CREATE OUTPUT FOLDERS. bucket_name = params['global']['bucket_name'] data_path = Path(params['global']['data_path']) Path.mkdir(data_path, exist_ok=True, parents=True) csv_file = params['sample']['prep_csv_file'] val_percent = params['sample']['val_percent'] samples_size = params["global"]["samples_size"] overlap = params["sample"]["overlap"] min_annot_perc = params['sample']['sampling']['map'] num_bands = params['global']['number_of_bands'] debug = get_key_def('debug_mode', params['global'], False) if debug: warnings.warn(f'Debug mode activate. Execution may take longer...') final_samples_folder = None if bucket_name: s3 = boto3.resource('s3') bucket = s3.Bucket(bucket_name) bucket.download_file(csv_file, 'samples_prep.csv') list_data_prep = read_csv('samples_prep.csv') if data_path: final_samples_folder = os.path.join(data_path, "samples") else: final_samples_folder = "samples" samples_folder = f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands' # TODO: validate this is preferred name structure else: list_data_prep = read_csv(csv_file) samples_folder = data_path.joinpath( f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands' ) if samples_folder.is_dir(): warnings.warn( f'Data path exists: {samples_folder}. Suffix will be added to directory name.' ) samples_folder = Path(str(samples_folder) + '_' + now) else: tqdm.write(f'Writing samples to {samples_folder}') Path.mkdir(samples_folder, exist_ok=False ) # FIXME: what if we want to append samples to existing hdf5? tqdm.write(f'Samples will be written to {samples_folder}\n\n') tqdm.write( f'\nSuccessfully read csv file: {Path(csv_file).stem}\nNumber of rows: {len(list_data_prep)}\nCopying first entry:\n{list_data_prep[0]}\n' ) ignore_index = get_key_def('ignore_index', params['training'], -1) for info in tqdm(list_data_prep, position=0, desc=f'Asserting existence of tif and gpkg files in csv'): assert Path(info['tif']).is_file( ), f'Could not locate "{info["tif"]}". Make sure file exists in this directory.' assert Path(info['gpkg']).is_file( ), f'Could not locate "{info["gpkg"]}". Make sure file exists in this directory.' 
    if debug:
        for info in tqdm(list_data_prep, position=0,
                         desc=f"Validating presence of {params['global']['num_classes']} "
                              f"classes in attribute \"{info['attribute_name']}\" for vector "
                              f"file \"{Path(info['gpkg']).stem}\""):
            validate_num_classes(info['gpkg'], params['global']['num_classes'], info['attribute_name'], ignore_index)

        with tqdm(list_data_prep, position=0, desc=f"Checking validity of features in vector files") as _tqdm:
            invalid_features = {}
            for info in _tqdm:
                # Extract vector features to burn in the raster image
                with fiona.open(info['gpkg'], 'r') as src:  # TODO: refactor as independent function
                    lst_vector = [vector for vector in src]
                shapes = lst_ids(list_vector=lst_vector, attr_name=info['attribute_name'])
                for index, item in enumerate(tqdm([v for vecs in shapes.values() for v in vecs], leave=False, position=1)):
                    # geom must be a valid GeoJSON geometry type and non-empty
                    geom, value = item
                    geom = getattr(geom, '__geo_interface__', None) or geom
                    if not is_valid_geom(geom):
                        gpkg_stem = str(Path(info['gpkg']).stem)
                        if gpkg_stem not in invalid_features.keys():  # create key with name of gpkg
                            invalid_features[gpkg_stem] = []
                        if lst_vector[index]["id"] not in invalid_features[gpkg_stem]:  # ignore feature is already appended
                            invalid_features[gpkg_stem].append(lst_vector[index]["id"])
        assert len(invalid_features.values()) == 0, f'Invalid geometry object(s) for "gpkg:ids": \"{invalid_features}\"'

    number_samples = {'trn': 0, 'val': 0, 'tst': 0}
    number_classes = 0

    # 'sampling' ordereddict validation
    check_sampling_dict()

    pixel_classes = {}
    # creates pixel_classes dict and keys
    for i in range(0, params['global']['num_classes'] + 1):
        pixel_classes.update({i: 0})
    pixel_classes.update({ignore_index: 0})  # FIXME: pixel_classes dict needs to be populated with classes obtained from target

    trn_hdf5, val_hdf5, tst_hdf5 = create_files_and_datasets(params, samples_folder)

    # For each row in csv: (1) burn vector file to raster, (2) read input raster image, (3) prepare samples
    with tqdm(list_data_prep, position=0, leave=False, desc=f'Preparing samples') as _tqdm:
        for info in _tqdm:
            _tqdm.set_postfix(OrderedDict(tif=f'{Path(info["tif"]).stem}',
                                          sample_size=params['global']['samples_size']))
            try:
                if bucket_name:
                    bucket.download_file(info['tif'], "Images/" + info['tif'].split('/')[-1])
                    info['tif'] = "Images/" + info['tif'].split('/')[-1]
                    if info['gpkg'] not in bucket_file_cache:
                        bucket_file_cache.append(info['gpkg'])
                        bucket.download_file(info['gpkg'], info['gpkg'].split('/')[-1])
                    info['gpkg'] = info['gpkg'].split('/')[-1]
                    if info['meta']:
                        if info['meta'] not in bucket_file_cache:
                            bucket_file_cache.append(info['meta'])
                            bucket.download_file(info['meta'], info['meta'].split('/')[-1])
                        info['meta'] = info['meta'].split('/')[-1]

                with rasterio.open(info['tif'], 'r') as raster:
                    # Burn vector file in a raster file
                    np_label_raster = vector_to_raster(vector_file=info['gpkg'],
                                                       input_image=raster,
                                                       attribute_name=info['attribute_name'],
                                                       fill=get_key_def('ignore_idx', get_key_def('training', params, {}), 0))
                    # Read the input raster image
                    np_input_image = image_reader_as_array(
                        input_image=raster,
                        scale=get_key_def('scale_data', params['global'], None),
                        aux_vector_file=get_key_def('aux_vector_file', params['global'], None),
                        aux_vector_attrib=get_key_def('aux_vector_attrib', params['global'], None),
                        aux_vector_ids=get_key_def('aux_vector_ids', params['global'], None),
                        aux_vector_dist_maps=get_key_def('aux_vector_dist_maps', params['global'], True),
                        aux_vector_dist_log=get_key_def('aux_vector_dist_log', params['global'], True),
                        aux_vector_scale=get_key_def('aux_vector_scale', params['global'], None))

                    # Mask the zeros from input image into label raster.
                    if params['sample']['mask_reference']:
                        np_label_raster = mask_image(np_input_image, np_label_raster)

                    if info['dataset'] == 'trn':
                        out_file = trn_hdf5
                        val_file = val_hdf5
                    elif info['dataset'] == 'tst':
                        out_file = tst_hdf5
                    else:
                        raise ValueError(f"Dataset value must be trn or val or tst. Provided value is {info['dataset']}")

                    meta_map, metadata = get_key_def("meta_map", params["global"], {}), None
                    if info['meta'] is not None and isinstance(info['meta'], str) and Path(info['meta']).is_file():
                        metadata = read_parameters(info['meta'])

                    # FIXME: think this through. User will have to calculate the total number of bands including meta
                    # layers and specify it in the yaml. Is this the best approach? What if meta layers are added on the fly?
                    input_band_count = np_input_image.shape[2] + MetaSegmentationDataset.get_meta_layer_count(meta_map)
                    # FIXME: could this assert be done before getting into this big for loop?
                    assert input_band_count == num_bands, \
                        f"The number of bands in the input image ({input_band_count}) and the parameter " \
                        f"'number_of_bands' in the yaml file ({params['global']['number_of_bands']}) should be identical"

                    np_label_raster = np.reshape(np_label_raster, (np_label_raster.shape[0], np_label_raster.shape[1], 1))

                    number_samples, number_classes = samples_preparation(np_input_image,
                                                                         np_label_raster,
                                                                         samples_size,
                                                                         overlap,
                                                                         number_samples,
                                                                         number_classes,
                                                                         out_file,
                                                                         val_percent,
                                                                         val_file,
                                                                         info['dataset'],
                                                                         pixel_classes,
                                                                         metadata)

                    _tqdm.set_postfix(OrderedDict(number_samples=number_samples))
                    out_file.flush()
            except Exception as e:
                warnings.warn(f'An error occurred while preparing samples with "{Path(info["tif"]).stem}" (tiff) and '
                              f'{Path(info["gpkg"]).stem} (gpkg). Error: "{e}"')
                continue

    trn_hdf5.close()
    val_hdf5.close()
    tst_hdf5.close()

    # Add up the number of pixels for each class in the pixel_classes dict
    pixel_total = 0
    for i in pixel_classes:
        pixel_total += pixel_classes[i]
    # Print the proportion of pixels of each class for the samples created
    for i in pixel_classes:
        print('Pixels from class', i, ':', round((pixel_classes[i] / pixel_total) * 100, 1), '%')
    print("Number of samples created: ", number_samples)

    if bucket_name and final_samples_folder:
        print('Transferring samples to the bucket')
        bucket.upload_file(os.path.join(samples_folder, "trn_samples.hdf5"), final_samples_folder + '/trn_samples.hdf5')
        bucket.upload_file(os.path.join(samples_folder, "val_samples.hdf5"), final_samples_folder + '/val_samples.hdf5')
        bucket.upload_file(os.path.join(samples_folder, "tst_samples.hdf5"), final_samples_folder + '/tst_samples.hdf5')

    print("End of process")
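
# --- Illustrative sketch (not part of the original module) ---
# Minimal example of the yaml-derived `params` dictionary that main() above expects,
# written out as a Python dict. Only keys actually read in this script are listed;
# every concrete value (paths, sizes, class count, etc.) is a placeholder assumption,
# not a project default. Optional keys fetched with get_key_def() are omitted.
def _example_params():
    return {
        'global': {
            'task': 'segmentation',
            'bucket_name': None,          # set to an S3 bucket name to enable the boto3 branch
            'data_path': './data',
            'samples_size': 256,
            'number_of_bands': 3,
            'num_classes': 4,
            'scale_data': [0, 1],         # passed to image_reader_as_array() as `scale`
        },
        'sample': {
            'prep_csv_file': './data/images_to_samples.csv',
            'val_percent': 10,
            'overlap': 25,
            'sampling': {'map': 0},       # minimum annotated percent, used e.g. in the output folder name
            'mask_reference': False,
        },
        'training': {
            'ignore_index': 255,
        },
    }
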
def main(params): """ Training and validation datasets preparation. :param params: (dict) Parameters found in the yaml config file. """ bucket_file_cache = [] bucket_name = params['global']['bucket_name'] data_path = params['global']['data_path'] Path.mkdir(Path(data_path), exist_ok=True) csv_file = params['sample']['prep_csv_file'] final_samples_folder = None if bucket_name: s3 = boto3.resource('s3') bucket = s3.Bucket(bucket_name) bucket.download_file(csv_file, 'samples_prep.csv') list_data_prep = read_csv('samples_prep.csv') if data_path: final_samples_folder = os.path.join(data_path, "samples") else: final_samples_folder = "samples" samples_folder = "samples" out_label_folder = "label" else: list_data_prep = read_csv(csv_file) samples_folder = os.path.join( data_path, "samples") #FIXME check that data_path exists! out_label_folder = os.path.join(data_path, "label") create_or_empty_folder(samples_folder) create_or_empty_folder(out_label_folder) number_samples = {'trn': 0, 'val': 0, 'tst': 0} number_classes = 0 trn_hdf5, val_hdf5, tst_hdf5 = create_files_and_datasets( params, samples_folder) with tqdm(list_data_prep) as _tqdm: for info in _tqdm: if bucket_name: bucket.download_file(info['tif'], "Images/" + info['tif'].split('/')[-1]) info['tif'] = "Images/" + info['tif'].split('/')[-1] if info['gpkg'] not in bucket_file_cache: bucket_file_cache.append(info['gpkg']) bucket.download_file(info['gpkg'], info['gpkg'].split('/')[-1]) info['gpkg'] = info['gpkg'].split('/')[-1] if info['meta']: if info['meta'] not in bucket_file_cache: bucket_file_cache.append(info['meta']) bucket.download_file(info['meta'], info['meta'].split('/')[-1]) info['meta'] = info['meta'].split('/')[-1] _tqdm.set_postfix( OrderedDict(file=f'{info["tif"]}', sample_size=params['global']['samples_size'])) # Validate the number of class in the vector file validate_num_classes(info['gpkg'], params['global']['num_classes'], info['attribute_name']) assert os.path.isfile( info['tif']), f"could not open raster file at {info['tif']}" with rasterio.open(info['tif'], 'r') as raster: # Burn vector file in a raster file np_label_raster = vector_to_raster( vector_file=info['gpkg'], input_image=raster, attribute_name=info['attribute_name'], fill=get_key_def('ignore_idx', get_key_def('training', params, {}), 0)) # Read the input raster image np_input_image = image_reader_as_array( input_image=raster, scale=get_key_def('scale_data', params['global'], None), aux_vector_file=get_key_def('aux_vector_file', params['global'], None), aux_vector_attrib=get_key_def('aux_vector_attrib', params['global'], None), aux_vector_ids=get_key_def('aux_vector_ids', params['global'], None), aux_vector_dist_maps=get_key_def('aux_vector_dist_maps', params['global'], True), aux_vector_dist_log=get_key_def('aux_vector_dist_log', params['global'], True), aux_vector_scale=get_key_def('aux_vector_scale', params['global'], None)) # Mask the zeros from input image into label raster. if params['sample']['mask_reference']: np_label_raster = mask_image(np_input_image, np_label_raster) if info['dataset'] == 'trn': out_file = trn_hdf5 elif info['dataset'] == 'val': out_file = val_hdf5 elif info['dataset'] == 'tst': out_file = tst_hdf5 else: raise ValueError( f"Dataset value must be trn or val or tst. 
                meta_map, metadata = get_key_def("meta_map", params["global"], {}), None
                if info['meta'] is not None and isinstance(info['meta'], str) and os.path.isfile(info['meta']):
                    metadata = read_parameters(info['meta'])

                input_band_count = np_input_image.shape[2] + MetaSegmentationDataset.get_meta_layer_count(meta_map)
                assert input_band_count == params['global']['number_of_bands'], \
                    f"The number of bands in the input image ({input_band_count}) and the parameter " \
                    f"'number_of_bands' in the yaml file ({params['global']['number_of_bands']}) should be identical"

                np_label_raster = np.reshape(np_label_raster, (np_label_raster.shape[0], np_label_raster.shape[1], 1))

                number_samples, number_classes = samples_preparation(np_input_image,
                                                                     np_label_raster,
                                                                     params['global']['samples_size'],
                                                                     params['sample']['samples_dist'],
                                                                     number_samples,
                                                                     number_classes,
                                                                     out_file,
                                                                     info['dataset'],
                                                                     params['sample']['min_annotated_percent'],
                                                                     metadata)
                _tqdm.set_postfix(OrderedDict(number_samples=number_samples))
                out_file.flush()

    trn_hdf5.close()
    val_hdf5.close()
    tst_hdf5.close()

    print("Number of samples created: ", number_samples)

    if bucket_name and final_samples_folder:
        print('Transferring samples to the bucket')
        bucket.upload_file(samples_folder + "/trn_samples.hdf5", final_samples_folder + '/trn_samples.hdf5')
        bucket.upload_file(samples_folder + "/val_samples.hdf5", final_samples_folder + '/val_samples.hdf5')
        bucket.upload_file(samples_folder + "/tst_samples.hdf5", final_samples_folder + '/tst_samples.hdf5')

    print("End of process")
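
# --- Illustrative sketch (not part of the original module) ---
# Shape of one row returned by read_csv() as consumed by the loops above. Only keys
# accessed in this script are shown; the paths and attribute name are placeholder
# assumptions, and `_example_csv_row` itself is a hypothetical name.
_example_csv_row = {
    'tif': './data/images/image_1.tif',    # input raster to sample from
    'gpkg': './data/gpkg/image_1.gpkg',    # vector ground truth to burn into a label raster
    'attribute_name': 'class',             # attribute holding the class ids in the gpkg
    'meta': None,                          # optional yaml file with extra metadata layers
    'dataset': 'trn',                      # one of 'trn', 'val' or 'tst'
}
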
def create_csv(): """ Creates samples from the input images for the pixel_inventory function """ prep_csv_path = params['sample']['prep_csv_file'] dist_samples = params['sample']['samples_dist'] sample_size = params['global']['samples_size'] data_path = params['global']['data_path'] Path.mkdir(Path(data_path), exist_ok=True) data_prep_csv = read_csv(prep_csv_path) csv_prop_data = params['global']['data_path'] + '/prop_data.csv' if os.path.isfile(csv_prop_data): os.remove(csv_prop_data) with tqdm(data_prep_csv) as _tqdm: for info in _tqdm: _tqdm.set_postfix( OrderedDict(file=f'{info["tif"]}', sample_size=params['global']['samples_size'])) # Validate the number of class in the vector file validate_num_classes(info['gpkg'], params['global']['num_classes'], info['attribute_name']) assert os.path.isfile( info['tif']), f"could not open raster file at {info['tif']}" with rasterio.open(info['tif'], 'r') as raster: # Burn vector file in a raster file np_label_raster = vector_to_raster( vector_file=info['gpkg'], input_image=raster, attribute_name=info['attribute_name'], fill=get_key_def('ignore_idx', get_key_def('training', params, {}), 0)) # Read the input raster image np_input_image = image_reader_as_array( input_image=raster, scale=get_key_def('scale_data', params['global'], None), aux_vector_file=get_key_def('aux_vector_file', params['global'], None), aux_vector_attrib=get_key_def('aux_vector_attrib', params['global'], None), aux_vector_ids=get_key_def('aux_vector_ids', params['global'], None), aux_vector_dist_maps=get_key_def('aux_vector_dist_maps', params['global'], True), aux_vector_dist_log=get_key_def('aux_vector_dist_log', params['global'], True), aux_vector_scale=get_key_def('aux_vector_scale', params['global'], None)) # Mask the zeros from input image into label raster. if params['sample']['mask_reference']: np_label_raster = images_to_samples.mask_image( np_input_image, np_label_raster) np_label_raster = np.reshape( np_label_raster, (np_label_raster.shape[0], np_label_raster.shape[1], 1)) h, w, num_bands = np_input_image.shape # half tile padding half_tile = int(sample_size / 2) pad_label_array = np.pad(np_label_raster, ((half_tile, half_tile), (half_tile, half_tile), (0, 0)), mode='constant') for row in range(0, h, dist_samples): for column in range(0, w, dist_samples): target = np.squeeze( pad_label_array[row:row + sample_size, column:column + sample_size, :], axis=2) pixel_inventory(target, sample_size, params['global']['num_classes'] + 1, params['global']['data_path'], info['dataset'])