def set_hyperparameters(params, num_classes, model, checkpoint, dontcare_val, loss_fn, optimizer, class_weights=None, inference: str = ''):
    """
    Function to set hyperparameters based on values provided in yaml config file.
    If none provided, default values are used.
    :param params: (dict) Parameters found in the yaml config file
    :param num_classes: (int) number of classes for current task
    :param model: initialized model
    :param checkpoint: (dict) state dict as loaded by model_choice.py
    :param dontcare_val: value in label to ignore during loss calculation
    :param loss_fn: loss function
    :param optimizer: optimizer function
    :param class_weights: class weights for loss function
    :param inference: (str) path to inference checkpoint (used in load_from_checkpoint())
    :return: model, criterion, optimizer, lr_scheduler
    """
    # set mandatory hyperparameter values from the config file if they exist
    lr = get_key_def('learning_rate', params['training'], None)
    weight_decay = get_key_def('weight_decay', params['training'], None)
    step_size = get_key_def('step_size', params['training'], None)
    gamma = get_key_def('gamma', params['training'], None)
    class_weights = torch.tensor(class_weights) if class_weights else None

    # Loss function
    criterion = MultiClassCriterion(loss_type=loss_fn, ignore_index=dontcare_val, weight=class_weights)

    # Optimizer
    opt_fn = optimizer
    optimizer = create_optimizer(params=model.parameters(), mode=opt_fn, base_lr=lr, weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=gamma)

    if checkpoint:
        tqdm.write('Loading checkpoint...')
        model, optimizer = load_from_checkpoint(checkpoint, model, optimizer=optimizer, inference=inference)

    return model, criterion, optimizer, lr_scheduler
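# Note: the functions in this file lean heavily on a get_key_def helper from the
# project's utils. A minimal sketch of its assumed behaviour is given below; the
# real implementation may differ, e.g. in how it handles msg and expected_type.
def get_key_def_sketch(key, config, default=None, msg=None, expected_type=None):
    """Return config[key] if present and not None, else default; optionally validate."""
    val = config.get(key, default) if isinstance(config, dict) else default
    if val is None:
        val = default
    if msg is not None and val is None:
        raise KeyError(msg)  # a mandatory key is missing from the config
    if expected_type is not None and val is not None and not isinstance(val, expected_type):
        raise TypeError(f"'{key}': expected {expected_type}, got {type(val)}")
    return val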
def ternausnet(num_classes, state_dict_path):
    """
    Build a TernausNet (UNet11) model and load its weights from state_dict_path.
    Pre-training options for the underlying UNet11 weights:
        False - no pre-trained network is used
        True - encoder is pre-trained with VGG11
        'carvana' - all weights are pre-trained on the Kaggle Carvana dataset
                    https://www.kaggle.com/c/carvana-image-masking-challenge
    """
    model = UNet11(num_classes)
    model = load_from_checkpoint(state_dict_path, model, final_layer_name='final')
    return model
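# Example use of the wrapper above; the checkpoint path is illustrative only:
# model = ternausnet(num_classes=2, state_dict_path='checkpoints/unet11_carvana.pth.tar')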
def set_hyperparameters(params, model, checkpoint):
    """
    Function to set hyperparameters based on values provided in yaml config file.
    Will also set model to GPU, if available.
    If none provided, default values are used.
    :param params: (dict) Parameters found in the yaml config file
    :param model: Model loaded from model_choice.py
    :param checkpoint: (dict) state dict as loaded by model_choice.py
    :return: model, criterion, optimizer, lr_scheduler
    """
    # set mandatory hyperparameter values from the config file if they exist
    lr = params['training']['learning_rate']
    assert lr is not None and lr > 0, "missing mandatory learning rate parameter"
    weight_decay = params['training']['weight_decay']
    assert weight_decay is not None and weight_decay >= 0, "missing mandatory weight decay parameter"
    step_size = params['training']['step_size']
    assert step_size is not None and step_size > 0, "missing mandatory step size parameter"
    gamma = params['training']['gamma']
    assert gamma is not None and gamma >= 0, "missing mandatory gamma parameter"

    # optional hyperparameters. Set to None if not in config file
    class_weights = torch.tensor(params['training']['class_weights']) if params['training']['class_weights'] else None
    if params['training']['class_weights']:
        verify_weights(params['global']['num_classes'], class_weights)

    ignore_index = -100
    if params['training']['ignore_index'] is not None:
        ignore_index = params['training']['ignore_index']

    # Loss function
    criterion = MultiClassCriterion(loss_type=params['training']['loss_fn'], ignore_index=ignore_index, weight=class_weights)

    # Optimizer
    opt_fn = params['training']['optimizer']
    optimizer = create_optimizer(params=model.parameters(), mode=opt_fn, base_lr=lr, weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=gamma)

    if checkpoint:
        model, optimizer = load_from_checkpoint(checkpoint, model, optimizer=optimizer)

    return model, criterion, optimizer, lr_scheduler
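# For reference, a minimal params dict this function expects, as yaml.safe_load
# would produce it. All values below are illustrative, not recommended defaults:
example_params = {
    'global': {'num_classes': 4},
    'training': {
        'learning_rate': 0.0001,
        'weight_decay': 0,
        'step_size': 4,
        'gamma': 0.9,
        'class_weights': None,  # or e.g. [1.0, 2.0, 2.0, 1.0]
        'ignore_index': 255,
        'loss_fn': 'CrossEntropy',
        'optimizer': 'adam',
    },
}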
def set_hyperparameters(params, num_classes, model, checkpoint):
    """
    Function to set hyperparameters based on values provided in yaml config file.
    Will also set model to GPU, if available.
    If none provided, default values are used.
    :param params: (dict) Parameters found in the yaml config file
    :param num_classes: (int) number of classes for current task
    :param model: Model loaded from model_choice.py
    :param checkpoint: (dict) state dict as loaded by model_choice.py
    :return: model, criterion, optimizer, lr_scheduler
    """
    # set mandatory hyperparameter values from the config file if they exist
    lr = get_key_def('learning_rate', params['training'], None, "missing mandatory learning rate parameter")
    weight_decay = get_key_def('weight_decay', params['training'], None, "missing mandatory weight decay parameter")
    step_size = get_key_def('step_size', params['training'], None, "missing mandatory step size parameter")
    gamma = get_key_def('gamma', params['training'], None, "missing mandatory gamma parameter")

    # optional hyperparameters. Set to None if not in config file
    class_weights = torch.tensor(params['training']['class_weights']) if params['training']['class_weights'] else None
    if params['training']['class_weights']:
        verify_weights(num_classes, class_weights)
    ignore_index = get_key_def('ignore_index', params['training'], -1)

    # Loss function
    criterion = MultiClassCriterion(loss_type=params['training']['loss_fn'], ignore_index=ignore_index, weight=class_weights)

    # Optimizer
    opt_fn = params['training']['optimizer']
    optimizer = create_optimizer(params=model.parameters(), mode=opt_fn, base_lr=lr, weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=gamma)

    if checkpoint:
        tqdm.write('Loading checkpoint...')
        model, optimizer = load_from_checkpoint(checkpoint, model, optimizer=optimizer)

    return model, criterion, optimizer, lr_scheduler
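# create_optimizer (from the project's utils) is assumed to map a mode string onto
# a torch optimizer. A minimal sketch under that assumption; the project's version
# likely supports more modes and optimizer-specific options:
import torch.optim as optim

def create_optimizer_sketch(params, mode='adam', base_lr=0.0001, weight_decay=0):
    if mode == 'adam':
        return optim.Adam(params, lr=base_lr, weight_decay=weight_decay)
    if mode == 'sgd':
        return optim.SGD(params, lr=base_lr, weight_decay=weight_decay)
    raise ValueError(f'Optimizer mode not implemented in this sketch: {mode}')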
def main(params):
    """
    Identify the class to which each image belongs.
    :param params: (dict) Parameters found in the yaml config file.
    """
    # SET BASIC VARIABLES AND PATHS
    since = time.time()
    debug = get_key_def('debug_mode', params['global'], False)
    if debug:
        warnings.warn('Debug mode activated. Some debug features may mobilize extra disk space and cause delays in execution.')

    num_classes = params['global']['num_classes']
    if params['global']['task'] == 'segmentation':
        # assume background is implicitly needed (makes no sense to predict with one class, for example.)
        # this will trigger some warnings elsewhere, but should succeed nonetheless
        num_classes_corrected = num_classes + 1  # + 1 for background  # FIXME temporary patch for num_classes problem.
    elif params['global']['task'] == 'classification':
        num_classes_corrected = num_classes

    chunk_size = get_key_def('chunk_size', params['inference'], 512)
    overlap = get_key_def('overlap', params['inference'], 10)
    nbr_pix_overlap = int(math.floor(overlap / 100 * chunk_size))
    num_bands = params['global']['number_of_bands']

    img_dir_or_csv = params['inference']['img_dir_or_csv_file']

    default_working_folder = Path(params['inference']['state_dict_path']).parent.joinpath(f'inference_{num_bands}bands')
    working_folder = Path(get_key_def('working_folder', params['inference'], default_working_folder))  # TODO: remove working_folder parameter in all templates
    Path.mkdir(working_folder, exist_ok=True)
    print(f'Inferences will be saved to: {working_folder}\n\n')

    bucket = None
    bucket_file_cache = []
    bucket_name = params['global']['bucket_name']

    # CONFIGURE MODEL
    model, state_dict_path, model_name = net(params, num_channels=num_classes_corrected, inference=True)

    num_devices = params['global']['num_gpus'] if params['global']['num_gpus'] else 0
    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')

    if lst_device_ids:
        print(f"Number of cuda devices requested: {num_devices}. Cuda devices available: {lst_device_ids}. Using {lst_device_ids[0]}\n\n")
    else:
        warnings.warn("No Cuda device available. This process will only run on CPU")

    try:
        model.to(device)
    except RuntimeError:
        print("Unable to use device. Trying device 0")
        device = torch.device('cuda:0' if torch.cuda.is_available() and lst_device_ids else 'cpu')
        model.to(device)

    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        if img_dir_or_csv.endswith('.csv'):
            bucket.download_file(img_dir_or_csv, 'img_csv_file.csv')
            list_img = read_csv('img_csv_file.csv', inference=True)
        else:
            raise NotImplementedError('Specify a csv file containing images for inference. Directory input not implemented yet')
    else:
        if img_dir_or_csv.endswith('.csv'):
            list_img = read_csv(img_dir_or_csv, inference=True)
        else:
            img_dir = Path(img_dir_or_csv)
            assert img_dir.is_dir(), f'Could not find directory "{img_dir_or_csv}"'
            list_img_paths = sorted(img_dir.glob('*.tif'))  # FIXME: what if .tif is in caps (.TIF) ?
            list_img = []
            for img_path in list_img_paths:
                img = {'tif': img_path}
                list_img.append(img)
            assert len(list_img) > 0, f'No .tif files found in {img_dir_or_csv}'

    if params['global']['task'] == 'classification':
        classifier(params, list_img, model, device, working_folder)  # FIXME: why don't we load from checkpoint in classification?
    elif params['global']['task'] == 'segmentation':
        if bucket:
            bucket.download_file(state_dict_path, "saved_model.pth.tar")
            model, _ = load_from_checkpoint("saved_model.pth.tar", model, inference=True)
        else:
            model, _ = load_from_checkpoint(state_dict_path, model, inference=True)

        with tqdm(list_img, desc='image list', position=0) as _tqdm:
            for img in _tqdm:
                img_name = Path(img['tif']).name
                if bucket:
                    local_img = f"Images/{img_name}"
                    bucket.download_file(img['tif'], local_img)
                    inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
                    if img['meta']:
                        if img['meta'] not in bucket_file_cache:
                            bucket_file_cache.append(img['meta'])
                            bucket.download_file(img['meta'], img['meta'].split('/')[-1])
                        img['meta'] = img['meta'].split('/')[-1]
                else:
                    local_img = Path(img['tif'])
                    inference_image = working_folder.joinpath(f"{img_name.split('.')[0]}_inference.tif")

                assert local_img.is_file(), f"Could not open raster file at {local_img}"

                scale = get_key_def('scale_data', params['global'], None)
                with rasterio.open(local_img, 'r') as raster:
                    np_input_image = image_reader_as_array(input_image=raster,
                                                           scale=scale,
                                                           aux_vector_file=get_key_def('aux_vector_file', params['global'], None),
                                                           aux_vector_attrib=get_key_def('aux_vector_attrib', params['global'], None),
                                                           aux_vector_ids=get_key_def('aux_vector_ids', params['global'], None),
                                                           aux_vector_dist_maps=get_key_def('aux_vector_dist_maps', params['global'], True),
                                                           aux_vector_scale=get_key_def('aux_vector_scale', params['global'], None))

                meta_map, metadata = get_key_def("meta_map", params["global"], {}), None
                if meta_map:
                    assert img['meta'] is not None and isinstance(img['meta'], str) and os.path.isfile(img['meta']), \
                        "global configuration requested metadata mapping onto loaded samples, but raster did not have available metadata"
                    metadata = read_parameters(img['meta'])

                if debug:
                    _tqdm.set_postfix(OrderedDict(img_name=img_name,
                                                  img=np_input_image.shape,
                                                  img_min_val=np.min(np_input_image),
                                                  img_max_val=np.max(np_input_image)))

                input_band_count = np_input_image.shape[2] + MetaSegmentationDataset.get_meta_layer_count(meta_map)
                if input_band_count > params['global']['number_of_bands']:
                    # FIXME: Following statements should be reconsidered to better manage inconsistencies between
                    # the provided number of bands and the image's number of bands.
                    warnings.warn(f"Input image has more bands than the number provided in the yaml file ({params['global']['number_of_bands']}). "
                                  f"Will use the first {params['global']['number_of_bands']} bands of the input image.")
                    np_input_image = np_input_image[:, :, 0:params['global']['number_of_bands']]
                    print(f"Input image's new shape: {np_input_image.shape}")
                elif input_band_count < params['global']['number_of_bands']:
                    warnings.warn(f"Skipping image: The number of bands requested in the yaml file ({params['global']['number_of_bands']}) "
                                  f"cannot be larger than the number of bands in the input image ({input_band_count}).")
                    continue

                # START INFERENCES ON SUB-IMAGES
                sem_seg_results_per_class = sem_seg_inference(model, np_input_image, nbr_pix_overlap, chunk_size,
                                                              num_classes_corrected, device, meta_map, metadata,
                                                              output_path=working_folder, index=_tqdm.n, debug=debug)

                # CREATE GEOTIF FROM METADATA OF ORIGINAL IMAGE
                tqdm.write('Saving inference...\n')
                if get_key_def('heatmaps', params['inference'], False):
                    tqdm.write('Heatmaps will be saved.\n')
                vis(params, np_input_image, sem_seg_results_per_class, working_folder, inference_input_path=local_img, debug=debug)
                tqdm.write(f"\n\nSemantic segmentation of image {img_name} completed\n\n")

                if bucket:
                    bucket.upload_file(inference_image, os.path.join(working_folder, f"{img_name.split('.')[0]}_inference.tif"))
    else:
        raise ValueError(f"The task should be either classification or segmentation. The provided value is {params['global']['task']}")

    time_elapsed = time.time() - since
    print('Inference completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
def main(params: dict): """ Identify the class to which each image belongs. :param params: (dict) Parameters found in the yaml config file. """ # SET BASIC VARIABLES AND PATHS since = time.time() debug = get_key_def('debug_mode', params['global'], False) if debug: warnings.warn(f'Debug mode activated. Some debug features may mobilize extra disk space and cause delays in execution.') num_classes = params['global']['num_classes'] task = params['global']['task'] num_classes_corrected = add_background_to_num_class(task, num_classes) chunk_size = get_key_def('chunk_size', params['inference'], 512) overlap = get_key_def('overlap', params['inference'], 10) nbr_pix_overlap = int(math.floor(overlap / 100 * chunk_size)) num_bands = params['global']['number_of_bands'] img_dir_or_csv = params['inference']['img_dir_or_csv_file'] default_working_folder = Path(params['inference']['state_dict_path']).parent.joinpath(f'inference_{num_bands}bands') working_folder = get_key_def('working_folder', params['inference'], None) if working_folder: # TODO: July 2020: deprecation started. Remove custom working_folder parameter as of Sept 2020? working_folder = Path(working_folder) warnings.warn(f"Deprecated parameter. Remove it in your future yamls as this folder is now created " f"automatically in a logical path, " f"i.e. [state_dict_path from inference section in yaml]/inference_[num_bands]bands") else: working_folder = default_working_folder Path.mkdir(working_folder, exist_ok=True) print(f'Inferences will be saved to: {working_folder}\n\n') bucket = None bucket_file_cache = [] bucket_name = get_key_def('bucket_name', params['global']) # CONFIGURE MODEL model, state_dict_path, model_name = net(params, num_channels=num_classes_corrected, inference=True) num_devices = params['global']['num_gpus'] if params['global']['num_gpus'] else 0 # list of GPU devices that are available and unused. If no GPUs, returns empty list lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else [] device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu') if lst_device_ids: print(f"Number of cuda devices requested: {num_devices}. Cuda devices available: {lst_device_ids}. Using {lst_device_ids[0]}\n\n") else: warnings.warn(f"No Cuda device available. This process will only run on CPU") try: model.to(device) except RuntimeError: print(f"Unable to use device. Trying device 0") device = torch.device(f'cuda:0' if torch.cuda.is_available() and lst_device_ids else 'cpu') model.to(device) # CREATE LIST OF INPUT IMAGES FOR INFERENCE list_img = list_input_images(img_dir_or_csv, bucket_name, glob_patterns=["*.tif", "*.TIF"]) if task == 'classification': classifier(params, list_img, model, device, working_folder) # FIXME: why don't we load from checkpoint in classification? elif task == 'segmentation': if bucket: bucket.download_file(state_dict_path, "saved_model.pth.tar") # TODO: is this still valid? 
model, _ = load_from_checkpoint("saved_model.pth.tar", model) else: model, _ = load_from_checkpoint(state_dict_path, model) ignore_index = get_key_def('ignore_index', params['training'], -1) meta_map, yaml_metadata = get_key_def("meta_map", params["global"], {}), None # LOOP THROUGH LIST OF INPUT IMAGES with tqdm(list_img, desc='image list', position=0) as _tqdm: for info in _tqdm: img_name = Path(info['tif']).name if bucket: local_img = f"Images/{img_name}" bucket.download_file(info['tif'], local_img) inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif" if info['meta']: if info['meta'] not in bucket_file_cache: bucket_file_cache.append(info['meta']) bucket.download_file(info['meta'], info['meta'].split('/')[-1]) info['meta'] = info['meta'].split('/')[-1] else: # FIXME: else statement should support img['meta'] integration as well. local_img = Path(info['tif']) inference_image = working_folder.joinpath(f"{img_name.split('.')[0]}_inference.tif") assert local_img.is_file(), f"Could not open raster file at {local_img}" # Empty sample as dictionary inf_sample = {'sat_img': None, 'metadata': None} with rasterio.open(local_img, 'r') as raster_handle: inf_sample['sat_img'], raster_handle_updated, dataset_nodata = image_reader_as_array( input_image=raster_handle, aux_vector_file=get_key_def('aux_vector_file', params['global'], None), aux_vector_attrib=get_key_def('aux_vector_attrib', params['global'], None), aux_vector_ids=get_key_def('aux_vector_ids', params['global'], None), aux_vector_dist_maps=get_key_def('aux_vector_dist_maps', params['global'], True), aux_vector_scale=get_key_def('aux_vector_scale', params['global'], None)) inf_sample['metadata'] = add_metadata_from_raster_to_sample(sat_img_arr=inf_sample['sat_img'], raster_handle=raster_handle_updated, meta_map=meta_map, raster_info=info) _tqdm.set_postfix(OrderedDict(img_name=img_name, img=inf_sample['sat_img'].shape, img_min_val=np.min(inf_sample['sat_img']), img_max_val=np.max(inf_sample['sat_img']))) input_band_count = inf_sample['sat_img'].shape[2] + MetaSegmentationDataset.get_meta_layer_count(meta_map) if input_band_count > num_bands: # TODO: move as new function in utils.verifications # FIXME: Following statements should be reconsidered to better manage inconsistencies between # provided number of band and image number of band. warnings.warn(f"Input image has more band than the number provided in the yaml file ({num_bands}). 
" f"Will use the first {num_bands} bands of the input image.") inf_sample['sat_img'] = inf_sample['sat_img'][:, :, 0:num_bands] print(f"Input image's new shape: {inf_sample['sat_img'].shape}") elif input_band_count < num_bands: warnings.warn(f"Skipping image: The number of bands requested in the yaml file ({num_bands})" f"can not be larger than the number of band in the input image ({input_band_count}).") continue # START INFERENCES ON SUB-IMAGES sem_seg_results_per_class = sem_seg_inference(model, inf_sample['sat_img'], nbr_pix_overlap, chunk_size, num_classes_corrected, device, meta_map, inf_sample['metadata'], output_path=working_folder, index=_tqdm.n, debug=debug) # CREATE GEOTIF FROM METADATA OF ORIGINAL IMAGE tqdm.write(f'Saving inference...\n') if get_key_def('heatmaps', params['inference'], False): tqdm.write(f'Heatmaps will be saved.\n') vis(params, inf_sample['sat_img'], sem_seg_results_per_class, working_folder, inference_input_path=local_img, debug=debug) tqdm.write(f"\n\nSemantic segmentation of image {img_name} completed\n\n") if bucket: bucket.upload_file(inference_image, os.path.join(working_folder, f"{img_name.split('.')[0]}_inference.tif")) else: raise ValueError( f"The task should be either classification or segmentation. The provided value is {params['global']['task']}") time_elapsed = time.time() - since print('Inference completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
def main(params: dict): """ Identify the class to which each image belongs. :param params: (dict) Parameters found in the yaml config file. """ since = time.time() # MANDATORY PARAMETERS img_dir_or_csv = get_key_def('img_dir_or_csv_file', params['inference'], expected_type=str) state_dict = get_key_def('state_dict_path', params['inference']) task = get_key_def('task', params['global'], expected_type=str) if task not in ['classification', 'segmentation']: raise ValueError( f'Task should be either "classification" or "segmentation". Got {task}' ) model_name = get_key_def('model_name', params['global'], expected_type=str).lower() num_classes = get_key_def('num_classes', params['global'], expected_type=int) num_bands = get_key_def('number_of_bands', params['global'], expected_type=int) chunk_size = get_key_def('chunk_size', params['inference'], default=512, expected_type=int) BGR_to_RGB = get_key_def('BGR_to_RGB', params['global'], expected_type=bool) # OPTIONAL PARAMETERS dontcare_val = get_key_def("ignore_index", params["training"], default=-1, expected_type=int) num_devices = get_key_def('num_gpus', params['global'], default=0, expected_type=int) default_max_used_ram = 25 max_used_ram = get_key_def('max_used_ram', params['global'], default=default_max_used_ram, expected_type=int) max_used_perc = get_key_def('max_used_perc', params['global'], default=25, expected_type=int) scale = get_key_def('scale_data', params['global'], default=[0, 1], expected_type=List) debug = get_key_def('debug_mode', params['global'], default=False, expected_type=bool) raster_to_vec = get_key_def('ras2vec', params['inference'], False) # benchmark (ie when gkpgs are inputted along with imagery) dontcare = get_key_def("ignore_index", params["training"], -1) targ_ids = get_key_def('target_ids', params['sample'], None, expected_type=List) # SETTING OUTPUT DIRECTORY working_folder = Path( params['inference']['state_dict_path']).parent.joinpath( f'inference_{num_bands}bands') Path.mkdir(working_folder, parents=True, exist_ok=True) # mlflow logging mlflow_uri = get_key_def('mlflow_uri', params['global'], default=None, expected_type=str) if mlflow_uri and not Path(mlflow_uri).is_dir(): warnings.warn(f'Mlflow uri path is not valid: {mlflow_uri}') mlflow_uri = None # SETUP LOGGING import logging.config # See: https://docs.python.org/2.4/lib/logging-config-fileformat.html if mlflow_uri: log_config_path = Path('utils/logging.conf').absolute() logfile = f'{working_folder}/info.log' logfile_debug = f'{working_folder}/debug.log' console_level_logging = 'INFO' if not debug else 'DEBUG' logging.config.fileConfig(log_config_path, defaults={ 'logfilename': logfile, 'logfilename_debug': logfile_debug, 'console_level': console_level_logging }) # import only if mlflow uri is set from mlflow import log_params, set_tracking_uri, set_experiment, start_run, log_artifact, log_metrics if not Path(mlflow_uri).is_dir(): logging.warning( f"Couldn't locate mlflow uri directory {mlflow_uri}. Directory will be created." ) Path(mlflow_uri).mkdir() set_tracking_uri(mlflow_uri) exp_name = get_key_def('mlflow_experiment_name', params['global'], default='gdl-inference', expected_type=str) set_experiment(f'{exp_name}/{working_folder.name}') run_name = get_key_def('mlflow_run_name', params['global'], default='gdl', expected_type=str) start_run(run_name=run_name) log_params(params['global']) log_params(params['inference']) else: # set a console logger as default logging.basicConfig(level=logging.DEBUG) logging.info( 'No logging folder set for mlflow. 
Logging will be limited to console' ) if debug: logging.warning( f'Debug mode activated. Some debug features may mobilize extra disk space and ' f'cause delays in execution.') # Assert that all items in target_ids are integers (ex.: to benchmark single-class model with multi-class labels) if targ_ids: for item in targ_ids: if not isinstance(item, int): raise ValueError( f'Target id "{item}" in target_ids is {type(item)}, expected int.' ) logging.info(f'Inferences will be saved to: {working_folder}\n\n') if not (0 <= max_used_ram <= 100): logging.warning( f'Max used ram parameter should be a percentage. Got {max_used_ram}. ' f'Will set default value of {default_max_used_ram} %') max_used_ram = default_max_used_ram # AWS bucket = None bucket_file_cache = [] bucket_name = get_key_def('bucket_name', params['global']) # list of GPU devices that are available and unused. If no GPUs, returns empty dict gpu_devices_dict = get_device_ids(num_devices, max_used_ram_perc=max_used_ram, max_used_perc=max_used_perc) if gpu_devices_dict: logging.info( f"Number of cuda devices requested: {num_devices}. Cuda devices available: {gpu_devices_dict}. " f"Using {list(gpu_devices_dict.keys())[0]}\n\n") device = torch.device( f'cuda:{list(range(len(gpu_devices_dict.keys())))[0]}') else: logging.warning( f"No Cuda device available. This process will only run on CPU") device = torch.device('cpu') # CONFIGURE MODEL num_classes_backgr = add_background_to_num_class(task, num_classes) model, loaded_checkpoint, model_name = net(model_name=model_name, num_bands=num_bands, num_channels=num_classes_backgr, dontcare_val=dontcare_val, num_devices=1, net_params=params, inference_state_dict=state_dict) try: model.to(device) except RuntimeError: logging.info(f"Unable to use device 0") device = torch.device(f'cuda' if gpu_devices_dict else 'cpu') model.to(device) # CREATE LIST OF INPUT IMAGES FOR INFERENCE list_img = list_input_images(img_dir_or_csv, bucket_name, glob_patterns=["*.tif", "*.TIF"]) # VALIDATION: anticipate problems with imagery and label (if provided) before entering main for loop valid_gpkg_set = set() for info in tqdm(list_img, desc='Validating imagery'): # validate_raster(info['tif'], num_bands, meta_map) if 'gpkg' in info.keys( ) and info['gpkg'] and info['gpkg'] not in valid_gpkg_set: validate_num_classes(vector_file=info['gpkg'], num_classes=num_classes, attribute_name=info['attribute_name'], ignore_index=dontcare, target_ids=targ_ids) assert_crs_match(info['tif'], info['gpkg']) valid_gpkg_set.add(info['gpkg']) logging.info('Successfully validated imagery') if valid_gpkg_set: logging.info('Successfully validated label data for benchmarking') if task == 'classification': classifier( params, list_img, model, device, working_folder ) # FIXME: why don't we load from checkpoint in classification? elif task == 'segmentation': gdf_ = [] gpkg_name_ = [] # TODO: Add verifications? if bucket: bucket.download_file( loaded_checkpoint, "saved_model.pth.tar") # TODO: is this still valid? 
model, _ = load_from_checkpoint("saved_model.pth.tar", model) else: model, _ = load_from_checkpoint(loaded_checkpoint, model) # LOOP THROUGH LIST OF INPUT IMAGES for info in tqdm(list_img, desc='Inferring from images', position=0, leave=True): with start_run(run_name=Path(info['tif']).name, nested=True): img_name = Path(info['tif']).name local_gpkg = Path( info['gpkg'] ) if 'gpkg' in info.keys() and info['gpkg'] else None gpkg_name = local_gpkg.stem if local_gpkg else None if bucket: local_img = f"Images/{img_name}" bucket.download_file(info['tif'], local_img) inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif" if info['meta']: if info['meta'] not in bucket_file_cache: bucket_file_cache.append(info['meta']) bucket.download_file(info['meta'], info['meta'].split('/')[-1]) info['meta'] = info['meta'].split('/')[-1] else: # FIXME: else statement should support img['meta'] integration as well. local_img = Path(info['tif']) Path.mkdir(working_folder.joinpath(local_img.parent.name), parents=True, exist_ok=True) inference_image = working_folder.joinpath( local_img.parent.name, f"{img_name.split('.')[0]}_inference.tif") temp_file = working_folder.joinpath( local_img.parent.name, f"{img_name.split('.')[0]}.dat") raster = rasterio.open(local_img, 'r') logging.info(f'Reading original image: {raster.name}') inf_meta = raster.meta label = None if local_gpkg: logging.info(f'Burning label as raster: {local_gpkg}') local_img = clip_raster_with_gpkg(raster, local_gpkg) raster.close() raster = rasterio.open(local_img, 'r') logging.info(f'Reading clipped image: {raster.name}') inf_meta = raster.meta label = vector_to_raster( vector_file=local_gpkg, input_image=raster, out_shape=(inf_meta['height'], inf_meta['width']), attribute_name=info['attribute_name'], fill=0, # background value in rasterized vector. 
target_ids=targ_ids) if debug: logging.debug( f'Unique values in loaded label as raster: {np.unique(label)}\n' f'Shape of label as raster: {label.shape}') pred, gdf = segmentation(param=params, input_image=raster, label_arr=label, num_classes=num_classes_backgr, gpkg_name=gpkg_name, model=model, chunk_size=chunk_size, device=device, scale=scale, BGR_to_RGB=BGR_to_RGB, tp_mem=temp_file, debug=debug) if gdf is not None: gdf_.append(gdf) gpkg_name_.append(gpkg_name) if local_gpkg: pixelMetrics = ComputePixelMetrics(label, pred, num_classes_backgr) log_metrics(pixelMetrics.update(pixelMetrics.iou)) log_metrics(pixelMetrics.update(pixelMetrics.dice)) pred = pred[np.newaxis, :, :].astype(np.uint8) inf_meta.update({ "driver": "GTiff", "height": pred.shape[1], "width": pred.shape[2], "count": pred.shape[0], "dtype": 'uint8', "compress": 'lzw' }) logging.info( f'Successfully inferred on {img_name}\nWriting to file: {inference_image}' ) with rasterio.open(inference_image, 'w+', **inf_meta) as dest: dest.write(pred) del pred try: temp_file.unlink() except OSError as e: logging.warning(f'File Error: {temp_file, e.strerror}') if raster_to_vec: start_vec = time.time() inference_vec = working_folder.joinpath( local_img.parent.name, f"{img_name.split('.')[0]}_inference.gpkg") ras2vec(inference_image, inference_vec) end_vec = time.time() - start_vec logging.info( 'Vectorization completed in {:.0f}m {:.0f}s'.format( end_vec // 60, end_vec % 60)) if len(gdf_) >= 1: if not len(gdf_) == len(gpkg_name_): raise ValueError('benchmarking unable to complete') all_gdf = pd.concat( gdf_) # Concatenate all geo data frame into one geo data frame all_gdf.reset_index(drop=True, inplace=True) gdf_x = gpd.GeoDataFrame(all_gdf) bench_gpkg = working_folder / "benchmark.gpkg" gdf_x.to_file(bench_gpkg, driver="GPKG", index=False) logging.info( f'Successfully wrote benchmark geopackage to: {bench_gpkg}') # log_artifact(working_folder) time_elapsed = time.time() - since logging.info('Inference Script completed in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60))
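# ComputePixelMetrics is assumed to derive per-class scores from the label/pred
# arrays compared above. A minimal sketch of a per-class IoU, not the project's
# exact implementation:
import numpy as np

def iou_per_class_sketch(label, pred, num_classes):
    scores = {}
    for c in range(num_classes):
        inter = np.logical_and(label == c, pred == c).sum()
        union = np.logical_or(label == c, pred == c).sum()
        scores[f'iou_{c}'] = float(inter) / float(union) if union else float('nan')
    return scores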
def main(params, config_path):
    """
    Function to train and validate a model for image classification.
    :param params: (dict) Parameters found in the yaml config file.
    :param config_path: (str) Path to the yaml config file.
    """
    debug = get_key_def('debug_mode', params['global'], False)
    if debug:
        warnings.warn('Debug mode activated. Some debug functions may cause delays in execution.')

    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    num_classes = params['global']['num_classes']
    task = params['global']['task']
    batch_size = params['training']['batch_size']
    assert task == 'classification', f"The task should be classification. The provided value is {task}"

    # INSTANTIATE MODEL AND LOAD CHECKPOINT FROM PATH
    model, checkpoint, model_name = net(params, num_classes)  # pretrained could become a yaml parameter.
    tqdm.write(f'Instantiated {model_name} model with {num_classes} output channels.\n')
    bucket_name = params['global']['bucket_name']
    data_path = params['global']['data_path']

    modelname = config_path.stem
    output_path = Path(data_path).joinpath('model') / modelname
    if output_path.is_dir():
        output_path = Path(str(output_path) + '_' + now)
    output_path.mkdir(parents=True, exist_ok=False)
    shutil.copy(str(config_path), str(output_path))
    tqdm.write(f'Model and log files will be saved to: {output_path}\n\n')

    if bucket_name:
        bucket, bucket_output_path, output_path, data_path = download_s3_files(bucket_name=bucket_name,
                                                                               data_path=data_path,
                                                                               output_path=output_path,
                                                                               num_classes=num_classes)
    else:
        get_local_classes(num_classes, data_path, output_path)

    since = time.time()
    best_loss = 999

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))  # Add header

    trn_log = InformationLogger('trn')
    val_log = InformationLogger('val')
    tst_log = InformationLogger('tst')

    num_devices = params['global']['num_gpus']
    assert num_devices is not None and num_devices >= 0, "missing mandatory num gpus parameter"
    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    num_devices = len(lst_device_ids) if lst_device_ids else 0
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')
    print(f"Number of cuda devices requested: {params['global']['num_gpus']}. Cuda devices available: {lst_device_ids}\n")
    if num_devices == 1:
        print(f"Using Cuda device {lst_device_ids[0]}\n")
    elif num_devices > 1:
        print(f"Using data parallel on devices: {str(lst_device_ids)[1:-1]}. Main device: {lst_device_ids[0]}\n")  # TODO: why are we showing indices [1:-1] for lst_device_ids?
        try:  # TODO: For HPC when device 0 not available. Error: Invalid device id (in torch/cuda/__init__.py).
            model = nn.DataParallel(model, device_ids=lst_device_ids)  # DataParallel adds prefix 'module.' to state_dict keys
        except AssertionError:
            warnings.warn(f"Unable to use devices {lst_device_ids}. Trying devices {list(range(len(lst_device_ids)))}")
            device = torch.device('cuda:0')
            lst_device_ids = range(len(lst_device_ids))
            model = nn.DataParallel(model, device_ids=lst_device_ids)  # DataParallel adds prefix 'module.' to state_dict keys
    else:
        warnings.warn(f"No Cuda device available. 
This process will only run on CPU\n") tqdm.write(f'Creating dataloaders from data in {Path(data_path)}...\n') trn_dataloader, val_dataloader, tst_dataloader = create_classif_dataloader( data_path=data_path, batch_size=batch_size, num_devices=num_devices, ) tqdm.write( f'Setting model, criterion, optimizer and learning rate scheduler...\n' ) model, criterion, optimizer, lr_scheduler = set_hyperparameters( params, num_classes, model, checkpoint) criterion = criterion.to(device) try: # For HPC when device 0 not available. Error: Cuda invalid device ordinal. model.to(device) except RuntimeError: warnings.warn(f"Unable to use device. Trying device 0...\n") device = torch.device(f'cuda:0' if torch.cuda.is_available() and lst_device_ids else 'cpu') model.to(device) filename = os.path.join(output_path, 'checkpoint.pth.tar') for epoch in range(0, params['training']['num_epochs']): print( f'\nEpoch {epoch}/{params["training"]["num_epochs"] - 1}\n{"-" * 20}' ) trn_report = train(train_loader=trn_dataloader, model=model, criterion=criterion, optimizer=optimizer, scheduler=lr_scheduler, num_classes=num_classes, batch_size=batch_size, ep_idx=epoch, progress_log=progress_log, device=device, debug=debug) trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou']) val_report = evaluation( eval_loader=val_dataloader, model=model, criterion=criterion, num_classes=num_classes, batch_size=batch_size, ep_idx=epoch, progress_log=progress_log, batch_metrics=params['training']['batch_metrics'], dataset='val', device=device, debug=debug) val_loss = val_report['loss'].avg if params['training']['batch_metrics'] is not None: val_log.add_values(val_report, epoch, ignore=['iou']) else: val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou']) if val_loss < best_loss: tqdm.write("save checkpoint\n") best_loss = val_loss # More info: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-torch-nn-dataparallel-models state_dict = model.module.state_dict( ) if num_devices > 1 else model.state_dict() torch.save( { 'epoch': epoch, 'arch': model_name, 'model': state_dict, 'best_loss': best_loss, 'optimizer': optimizer.state_dict() }, filename) if bucket_name: bucket_filename = os.path.join(bucket_output_path, 'checkpoint.pth.tar') bucket.upload_file(filename, bucket_filename) if bucket_name: save_logs_to_bucket(bucket, bucket_output_path, output_path, now, params['training']['batch_metrics']) cur_elapsed = time.time() - since print( f'Current elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s' ) # load checkpoint model and evaluate it on test dataset. 
if int( params['training']['num_epochs'] ) > 0: # if num_epochs is set to 0, model is loaded to evaluate on test set checkpoint = load_checkpoint(filename) model, _ = load_from_checkpoint(checkpoint, model) if tst_dataloader: tst_report = evaluation( eval_loader=tst_dataloader, model=model, criterion=criterion, num_classes=num_classes, batch_size=batch_size, ep_idx=params['training']['num_epochs'], progress_log=progress_log, batch_metrics=params['training']['batch_metrics'], dataset='tst', device=device) tst_log.add_values(tst_report, params['training']['num_epochs'], ignore=['iou']) if bucket_name: bucket_filename = os.path.join(bucket_output_path, 'last_epoch.pth.tar') bucket.upload_file( "output.txt", os.path.join(bucket_output_path, f"Logs/{now}_output.txt")) bucket.upload_file(filename, bucket_filename) time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60))
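# Typical entry point for the training script above (hypothetical __main__ block;
# the config path is illustrative and read_parameters is the project's yaml loader):
# if __name__ == '__main__':
#     config_path = Path('conf/config_classification.yaml')
#     params = read_parameters(config_path)
#     main(params, config_path)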
def main(params, config_path):
    """
    Function to train and validate a model for semantic segmentation.

    Process
    -------
    1. Model is instantiated and checkpoint is loaded from path, if provided in `your_config.yaml`.
    2. GPUs are requested according to desired amount of `num_gpus` and available GPUs.
    3. If more than 1 GPU is requested, model is cast to DataParallel model.
    4. Dataloaders are created with `create_dataloader()`.
    5. Loss criterion, optimizer and learning rate are set with `set_hyperparameters()` as requested in `config.yaml`.
    6. Using these hyperparameters, the application will try to minimize the loss on the training data and evaluate every epoch on the validation data.
    7. For every epoch, the application shows and logs the loss on "trn" and "val" datasets.
    8. For every epoch (if `batch_metrics: 1`), the application shows and logs the accuracy, recall and f-score on "val" dataset. Those metrics are also computed on each class.
    9. At the end of the training process, the application shows and logs the accuracy, recall and f-score on "tst" dataset. Those metrics are also computed on each class.
    -------
    :param params: (dict) Parameters found in the yaml config file.
    :param config_path: (str) Path to the yaml config file.
    """
    now = datetime.now().strftime("%Y-%m-%d_%H-%M")

    # MANDATORY PARAMETERS
    num_classes = get_key_def('num_classes', params['global'], expected_type=int)
    num_classes_corrected = num_classes + 1  # + 1 for background  # FIXME temporary patch for num_classes problem.
    num_bands = get_key_def('number_of_bands', params['global'], expected_type=int)
    batch_size = get_key_def('batch_size', params['training'], expected_type=int)
    eval_batch_size = get_key_def('eval_batch_size', params['training'], expected_type=int, default=batch_size)
    num_epochs = get_key_def('num_epochs', params['training'], expected_type=int)
    model_name = get_key_def('model_name', params['global'], expected_type=str).lower()
    BGR_to_RGB = get_key_def('BGR_to_RGB', params['global'], expected_type=bool)

    # OPTIONAL PARAMETERS
    # basics
    debug = get_key_def('debug_mode', params['global'], default=False, expected_type=bool)
    task = get_key_def('task', params['global'], default='segmentation', expected_type=str)
    if not task == 'segmentation':
        raise ValueError(f"The task should be segmentation. 
The provided value is {task}") dontcare_val = get_key_def("ignore_index", params["training"], default=-1, expected_type=int) crop_size = get_key_def('target_size', params['training'], default=None, expected_type=int) batch_metrics = get_key_def('batch_metrics', params['training'], default=None, expected_type=int) meta_map = get_key_def("meta_map", params["global"], default=None) if meta_map and not Path(meta_map).is_file(): raise FileNotFoundError(f'Couldn\'t locate {meta_map}') bucket_name = get_key_def('bucket_name', params['global']) # AWS scale = get_key_def('scale_data', params['global'], default=[0, 1], expected_type=List) # model params loss_fn = get_key_def('loss_fn', params['training'], default='CrossEntropy', expected_type=str) class_weights = get_key_def('class_weights', params['training'], default=None, expected_type=Sequence) if class_weights: verify_weights(num_classes_corrected, class_weights) optimizer = get_key_def('optimizer', params['training'], default='adam', expected_type=str) pretrained = get_key_def('pretrained', params['training'], default=True, expected_type=bool) train_state_dict_path = get_key_def('state_dict_path', params['training'], default=None, expected_type=str) if train_state_dict_path and not Path(train_state_dict_path).is_file(): raise FileNotFoundError(f'Could not locate pretrained checkpoint for training: {train_state_dict_path}') dropout_prob = get_key_def('dropout_prob', params['training'], default=None, expected_type=float) # Read the concatenation point # TODO: find a way to maybe implement it in classification one day conc_point = get_key_def('concatenate_depth', params['global'], None) # gpu parameters num_devices = get_key_def('num_gpus', params['global'], default=0, expected_type=int) if num_devices and not num_devices >= 0: raise ValueError("missing mandatory num gpus parameter") # mlflow logging mlflow_uri = get_key_def('mlflow_uri', params['global'], default="./mlruns") Path(mlflow_uri).mkdir(exist_ok=True) experiment_name = get_key_def('mlflow_experiment_name', params['global'], default='gdl-training', expected_type=str) run_name = get_key_def('mlflow_run_name', params['global'], default='gdl', expected_type=str) # parameters to find hdf5 samples data_path = Path(get_key_def('data_path', params['global'], './data', expected_type=str)) samples_size = get_key_def("samples_size", params["global"], default=1024, expected_type=int) overlap = get_key_def("overlap", params["sample"], default=5, expected_type=int) min_annot_perc = get_key_def('min_annotated_percent', params['sample']['sampling_method'], default=0, expected_type=int) if not data_path.is_dir(): raise FileNotFoundError(f'Could not locate data path {data_path}') samples_folder_name = (f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands' f'_{experiment_name}') samples_folder = data_path.joinpath(samples_folder_name) # visualization parameters vis_at_train = get_key_def('vis_at_train', params['visualization'], default=False) vis_at_eval = get_key_def('vis_at_evaluation', params['visualization'], default=False) vis_batch_range = get_key_def('vis_batch_range', params['visualization'], default=None) vis_at_checkpoint = get_key_def('vis_at_checkpoint', params['visualization'], default=False) ep_vis_min_thresh = get_key_def('vis_at_ckpt_min_ep_diff', params['visualization'], default=1, expected_type=int) vis_at_ckpt_dataset = get_key_def('vis_at_ckpt_dataset', params['visualization'], 'val') colormap_file = get_key_def('colormap_file', params['visualization'], 
None) heatmaps = get_key_def('heatmaps', params['visualization'], False) heatmaps_inf = get_key_def('heatmaps', params['inference'], False) grid = get_key_def('grid', params['visualization'], False) mean = get_key_def('mean', params['training']['normalization']) std = get_key_def('std', params['training']['normalization']) vis_params = {'colormap_file': colormap_file, 'heatmaps': heatmaps, 'heatmaps_inf': heatmaps_inf, 'grid': grid, 'mean': mean, 'std': std, 'vis_batch_range': vis_batch_range, 'vis_at_train': vis_at_train, 'vis_at_eval': vis_at_eval, 'ignore_index': dontcare_val, 'inference_input_path': None} # coordconv parameters coordconv_params = {} for param, val in params['global'].items(): if 'coordconv' in param: coordconv_params[param] = val # add git hash from current commit to parameters if available. Parameters will be saved to model's .pth.tar params['global']['git_hash'] = get_git_hash() # automatic model naming with unique id for each training model_id = config_path.stem output_path = samples_folder.joinpath('model') / model_id if output_path.is_dir(): last_mod_time_suffix = datetime.fromtimestamp(output_path.stat().st_mtime).strftime('%Y%m%d-%H%M%S') archive_output_path = samples_folder.joinpath('model') / f"{model_id}_{last_mod_time_suffix}" shutil.move(output_path, archive_output_path) output_path.mkdir(parents=True, exist_ok=False) shutil.copy(str(config_path), str(output_path)) # copy yaml to output path where model will be saved import logging.config # See: https://docs.python.org/2.4/lib/logging-config-fileformat.html log_config_path = Path('utils/logging.conf').absolute() logfile = f'{output_path}/{model_id}.log' logfile_debug = f'{output_path}/{model_id}_debug.log' console_level_logging = 'INFO' if not debug else 'DEBUG' logging.config.fileConfig(log_config_path, defaults={'logfilename': logfile, 'logfilename_debug': logfile_debug, 'console_level': console_level_logging}) logging.info(f'Model and log files will be saved to: {output_path}\n\n') if debug: logging.warning(f'Debug mode activated. Some debug features may mobilize extra disk space and ' f'cause delays in execution.') if dontcare_val < 0 and vis_batch_range: logging.warning(f'Visualization: expected positive value for ignore_index, got {dontcare_val}.' f'Will be overridden to 255 during visualization only. Problems may occur.') # overwrite dontcare values in label if loss is not lovasz or crossentropy. FIXME: hacky fix. dontcare2backgr = False if loss_fn not in ['Lovasz', 'CrossEntropy', 'OhemCrossEntropy']: dontcare2backgr = True logging.warning(f'Dontcare is not implemented for loss function "{loss_fn}". 
                        ' f'Dontcare values ({dontcare_val}) in label will be replaced with background value (0)')

    # Will check if batch size needs to be a lower value only if cropping samples during training
    calc_eval_bs = True if crop_size else False

    # INSTANTIATE MODEL AND LOAD CHECKPOINT FROM PATH
    model, model_name, criterion, optimizer, lr_scheduler, device, gpu_devices_dict = \
        net(model_name=model_name,
            num_bands=num_bands,
            num_channels=num_classes_corrected,
            dontcare_val=dontcare_val,
            num_devices=num_devices,
            train_state_dict_path=train_state_dict_path,
            pretrained=pretrained,
            dropout_prob=dropout_prob,
            loss_fn=loss_fn,
            class_weights=class_weights,
            optimizer=optimizer,
            net_params=params,
            conc_point=conc_point,
            coordconv_params=coordconv_params)
    logging.info(f'Instantiated {model_name} model with {num_classes_corrected} output channels.\n')

    logging.info(f'Creating dataloaders from data in {samples_folder}...\n')
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(samples_folder=samples_folder,
                                                                       batch_size=batch_size,
                                                                       eval_batch_size=eval_batch_size,
                                                                       gpu_devices_dict=gpu_devices_dict,
                                                                       sample_size=samples_size,
                                                                       dontcare_val=dontcare_val,
                                                                       crop_size=crop_size,
                                                                       meta_map=meta_map,
                                                                       num_bands=num_bands,
                                                                       BGR_to_RGB=BGR_to_RGB,
                                                                       scale=scale,
                                                                       params=params,
                                                                       dontcare2backgr=dontcare2backgr,
                                                                       calc_eval_bs=calc_eval_bs,
                                                                       debug=debug)

    # mlflow tracking path + parameters logging
    set_tracking_uri(mlflow_uri)
    set_experiment(experiment_name)
    start_run(run_name=run_name)
    log_params(params['training'])
    log_params(params['global'])
    log_params(params['sample'])

    if bucket_name:
        from utils.aws import download_s3_files
        bucket, bucket_output_path, output_path, data_path = download_s3_files(bucket_name=bucket_name,
                                                                               data_path=data_path,
                                                                               output_path=output_path)

    since = time.time()
    best_loss = 999
    last_vis_epoch = 0

    progress_log = output_path / 'progress.log'
    if not progress_log.exists():
        progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))  # Add header

    trn_log = InformationLogger('trn')
    val_log = InformationLogger('val')
    tst_log = InformationLogger('tst')
    filename = output_path.joinpath('checkpoint.pth.tar')

    # VISUALIZATION: generate pngs of inputs, labels and outputs
    if vis_batch_range is not None:
        # Make sure user-provided range is a list with 3 integers (start, finish, increment).
        # Check once for all visualization tasks.
        if not (isinstance(vis_batch_range, list) and len(vis_batch_range) == 3 and all(isinstance(x, int) for x in vis_batch_range)):
            raise ValueError(f'Vis_batch_range expects three integers in a list: start batch, end batch, increment. '
                             f'Got {vis_batch_range}')
        vis_at_init_dataset = get_key_def('vis_at_init_dataset', params['visualization'], 'val')

        # Visualization at initialization. Visualize batch range before first epoch. 
        if get_key_def('vis_at_init', params['visualization'], False):
            logging.info(f'Visualizing initialized model on batch range {vis_batch_range} '
                         f'from {vis_at_init_dataset} dataset...\n')
            vis_from_dataloader(vis_params=vis_params,
                                eval_loader=val_dataloader if vis_at_init_dataset == 'val' else tst_dataloader,
                                model=model,
                                ep_num=0,
                                output_path=output_path,
                                dataset=vis_at_init_dataset,
                                scale=scale,
                                device=device,
                                vis_batch_range=vis_batch_range)

    for epoch in range(0, num_epochs):
        logging.info(f'\nEpoch {epoch}/{num_epochs - 1}\n{"-" * 20}')

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes_corrected,
                           batch_size=batch_size,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           device=device,
                           scale=scale,
                           vis_params=vis_params,
                           debug=debug)
        trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(eval_loader=val_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes_corrected,
                                batch_size=batch_size,
                                ep_idx=epoch,
                                progress_log=progress_log,
                                batch_metrics=batch_metrics,
                                dataset='val',
                                device=device,
                                scale=scale,
                                vis_params=vis_params,
                                debug=debug)
        val_loss = val_report['loss'].avg
        if batch_metrics is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            logging.info("save checkpoint\n")
            best_loss = val_loss
            # More info: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-torch-nn-dataparallel-models
            state_dict = model.module.state_dict() if num_devices > 1 else model.state_dict()
            torch.save({'epoch': epoch,
                        'params': params,
                        'model': state_dict,
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict()}, filename)
            if bucket_name:
                bucket_filename = bucket_output_path.joinpath('checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

            # VISUALIZATION: generate pngs of img samples, labels and outputs as alternative to follow training
            if vis_batch_range is not None and vis_at_checkpoint and epoch - last_vis_epoch >= ep_vis_min_thresh:
                if last_vis_epoch == 0:
                    logging.info(f'Visualizing with {vis_at_ckpt_dataset} dataset samples on checkpointed model for '
                                 f'batches in range {vis_batch_range}')
                vis_from_dataloader(vis_params=vis_params,
                                    eval_loader=val_dataloader if vis_at_ckpt_dataset == 'val' else tst_dataloader,
                                    model=model,
                                    ep_num=epoch + 1,
                                    output_path=output_path,
                                    dataset=vis_at_ckpt_dataset,
                                    scale=scale,
                                    device=device,
                                    vis_batch_range=vis_batch_range)
                last_vis_epoch = epoch

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now, params['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        logging.info(f'Current elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s')

    # load checkpoint model and evaluate it on test dataset. 
if num_epochs > 0: # if num_epochs is set to 0, model is loaded to evaluate on test set checkpoint = load_checkpoint(filename) model, _ = load_from_checkpoint(checkpoint, model) if tst_dataloader: tst_report = evaluation(eval_loader=tst_dataloader, model=model, criterion=criterion, num_classes=num_classes_corrected, batch_size=batch_size, ep_idx=num_epochs, progress_log=progress_log, batch_metrics=batch_metrics, dataset='tst', scale=scale, vis_params=vis_params, device=device) tst_log.add_values(tst_report, num_epochs) if bucket_name: bucket_filename = bucket_output_path.joinpath('last_epoch.pth.tar') bucket.upload_file("output.txt", bucket_output_path.joinpath(f"Logs/{now}_output.txt")) bucket.upload_file(filename, bucket_filename) time_elapsed = time.time() - since log_params({'checkpoint path': filename}) logging.info('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
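# load_checkpoint above is assumed to be a thin torch.load wrapper that also strips
# the 'module.' prefix DataParallel adds to state_dict keys. A sketch under that
# assumption (the project's version may handle more legacy key formats):
import torch

def load_checkpoint_sketch(filename):
    checkpoint = torch.load(filename, map_location='cpu')
    if 'model' in checkpoint:
        checkpoint['model'] = {k.replace('module.', '', 1): v
                               for k, v in checkpoint['model'].items()}
    return checkpoint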
def main(params: dict) -> None:
    """
    Function to manage details about the inference on segmentation task.
    1. Read the parameters from the given config.
    2. Read and load the state dict from the previous training or the given one.
    3. Run inference on the data specified in the config.
    -------
    :param params: (dict) Parameters found in the yaml config file.
    """
    # since = time.time()

    # PARAMETERS
    mode = get_key_def('mode', params, expected_type=str)
    task = get_key_def('task_name', params['task'], expected_type=str)
    model_name = get_key_def('model_name', params['model'], expected_type=str).lower()
    num_classes = len(get_key_def('classes_dict', params['dataset']).keys())
    modalities = read_modalities(get_key_def('modalities', params['dataset'], expected_type=str))
    BGR_to_RGB = get_key_def('BGR_to_RGB', params['dataset'], expected_type=bool)
    num_bands = len(modalities)
    debug = get_key_def('debug', params, default=False, expected_type=bool)

    # SETTING OUTPUT DIRECTORY
    try:
        state_dict = Path(params['inference']['state_dict_path']).resolve(strict=True)
    except FileNotFoundError:
        logging.info(
            f"\nThe state dict path '{params['inference']['state_dict_path']}' could not be found. "
            f"Trying to locate a state dict in the '{params['general']['save_weights_dir']}' directory "
            f"specified during the training phase"
        )
        try:
            state_dict = Path(params['general']['save_weights_dir']).resolve(strict=True)
        except FileNotFoundError:
            logging.critical(
                f"\nThe state dict directory '{params['general']['save_weights_dir']}'"
                f" could not be found either; please specify the path to a state dict"
            )
            raise

    # TODO add more detail in the parent folder
    working_folder = state_dict.parent.joinpath(f'inference_{num_bands}bands')
    logging.info("\nThe state dict path directory used '{}'".format(working_folder))
    Path.mkdir(working_folder, parents=True, exist_ok=True)

    # LOGGING PARAMETERS TODO put option not just mlflow
    experiment_name = get_key_def('project_name', params['general'], default='gdl-training')
    try:
        tracker_uri = get_key_def('uri', params['tracker'], default=None, expected_type=str)
        Path(tracker_uri).mkdir(exist_ok=True)
        run_name = get_key_def('run_name', params['tracker'], default='gdl')  # TODO change for something meaningful
        run_name = '{}_{}_{}'.format(run_name, mode, task)
        logging.info(f'\nInference and log files will be saved to: {working_folder}')
        # TODO change to fit whatever import
        from mlflow import log_params, set_tracking_uri, set_experiment, start_run, log_artifact, log_metrics
        # tracking path + parameters logging
        set_tracking_uri(tracker_uri)
        set_experiment(experiment_name)
        start_run(run_name=run_name)
        log_params(dict_path(params, 'general'))
        log_params(dict_path(params, 'dataset'))
        log_params(dict_path(params, 'data'))
        log_params(dict_path(params, 'model'))
        log_params(dict_path(params, 'inference'))
    # meaning no logging tracker has been assigned or it doesn't exist in config/logging
    except ConfigKeyError:
        logging.info(
            "\nNo logging tracker has been assigned or the yaml config doesn't exist in 'config/tracker'."
            "\nNo tracker file will be saved in that case."
        )

    # MANDATORY PARAMETERS
    img_dir_or_csv = get_key_def('img_dir_or_csv_file', params['inference'],
                                 default=params['general']['raw_data_csv'], expected_type=str)
    if not (Path(img_dir_or_csv).is_dir() or Path(img_dir_or_csv).suffix == '.csv'):
        msg = f'\nCouldn\'t locate .csv file or directory "{img_dir_or_csv}" containing imagery for inference'
        logging.critical(msg)
        raise FileNotFoundError(msg)

    # load the checkpoint
    try:
        # Sort by modification time (mtime) descending
        sorted_by_mtime_descending = sorted(
            [os.path.join(state_dict, x) for x in os.listdir(state_dict)],
            key=lambda t: -os.stat(t).st_mtime
        )
        last_checkpoint_save = find_first_file('checkpoint.pth.tar', sorted_by_mtime_descending)
        if last_checkpoint_save is None:
            raise FileNotFoundError
        # change the state_dict
        state_dict = last_checkpoint_save
    except FileNotFoundError as e:
        logging.error(f"\nNo file named 'checkpoint.pth.tar' has been found at '{state_dict}'")
        raise e

    task = get_key_def('task_name', params['task'], expected_type=str)  # TODO change it next version for all tasks
    if task not in ['classification', 'segmentation']:
        msg = f'\nTask should be either "classification" or "segmentation". Got {task}'
        logging.critical(msg)
        raise ValueError(msg)

    # OPTIONAL PARAMETERS
    dontcare_val = get_key_def("ignore_index", params["training"], default=-1, expected_type=int)
    num_devices = get_key_def('num_gpus', params['training'], default=0, expected_type=int)
    default_max_used_ram = 25
    max_used_ram = get_key_def('max_used_ram', params['training'], default=default_max_used_ram, expected_type=int)
    max_used_perc = get_key_def('max_used_perc', params['training'], default=25, expected_type=int)
    scale = get_key_def('scale_data', params['augmentation'], default=[0, 1], expected_type=ListConfig)
    raster_to_vec = get_key_def('ras2vec', params['inference'], False)  # FIXME not implemented with hydra

    # benchmark (i.e. when gpkgs are provided along with imagery)
    dontcare = get_key_def("ignore_index", params["training"], -1)
    targ_ids = None  # TODO get_key_def('target_ids', params['sample'], None, expected_type=List)

    if debug:
        logging.warning('\nDebug mode activated. Some debug features may mobilize extra disk space and '
                        'cause delays in execution.')

    # Assert that all items in target_ids are integers (ex.: to benchmark single-class model with multi-class labels)
    if targ_ids:
        for item in targ_ids:
            if not isinstance(item, int):
                raise ValueError(f'\nTarget id "{item}" in target_ids is {type(item)}, expected int.')

    logging.info(f'\nInferences will be saved to: {working_folder}\n\n')
    if not (0 <= max_used_ram <= 100):
        logging.warning(f'\nMax used ram parameter should be a percentage. Got {max_used_ram}. '
                        f'Will set default value of {default_max_used_ram} %')
        max_used_ram = default_max_used_ram

    # AWS
    bucket = None
    bucket_file_cache = []
    bucket_name = get_key_def('bucket_name', params['AWS'])

    # list of GPU devices that are available and unused. If no GPUs, returns empty dict
    gpu_devices_dict = get_device_ids(num_devices, max_used_ram_perc=max_used_ram, max_used_perc=max_used_perc)
    if gpu_devices_dict:
        chunk_size = calc_inference_chunk_size(gpu_devices_dict=gpu_devices_dict, max_pix_per_mb_gpu=50)
        logging.info(f"\nNumber of cuda devices requested: {num_devices}. "
                     f"\nCuda devices available: {gpu_devices_dict}. "
                     f"\nUsing {list(gpu_devices_dict.keys())[0]}\n\n")
        device = torch.device(f'cuda:{list(range(len(gpu_devices_dict.keys())))[0]}')
    else:
        chunk_size = get_key_def('chunk_size', params['inference'], default=512, expected_type=int)
        logging.warning(f"\nNo Cuda device available. 
This process will only run on CPU") device = torch.device('cpu') # CONFIGURE MODEL num_classes_backgr = add_background_to_num_class(task, num_classes) model, loaded_checkpoint, model_name = net(model_name=model_name, num_bands=num_bands, num_channels=num_classes_backgr, dontcare_val=dontcare_val, num_devices=1, net_params=params, inference_state_dict=state_dict) try: model.to(device) except RuntimeError: logging.info(f"\nUnable to use device. Trying device 0") device = torch.device(f'cuda' if gpu_devices_dict else 'cpu') model.to(device) # CREATE LIST OF INPUT IMAGES FOR INFERENCE try: # check if the data folder exist raw_data_dir = get_key_def('raw_data_dir', params['dataset']) my_data_path = Path(raw_data_dir).resolve(strict=True) logging.info("\nImage directory used '{}'".format(my_data_path)) data_path = Path(my_data_path) except FileNotFoundError: raw_data_dir = get_key_def('raw_data_dir', params['dataset']) raise logging.critical( "\nImage directory '{}' doesn't exist, please change the path".format(raw_data_dir) ) list_img = list_input_images( img_dir_or_csv, bucket_name, glob_patterns=["*.tif", "*.TIF"], in_case_of_path=str(data_path) ) # VALIDATION: anticipate problems with imagery and label (if provided) before entering main for loop valid_gpkg_set = set() for info in tqdm(list_img, desc='Validating imagery'): # validate_raster(info['tif'], num_bands, meta_map) if 'gpkg' in info.keys() and info['gpkg'] and info['gpkg'] not in valid_gpkg_set: validate_num_classes(vector_file=info['gpkg'], num_classes=num_classes, attribute_name=info['attribute_name'], ignore_index=dontcare, target_ids=targ_ids) assert_crs_match(info['tif'], info['gpkg']) valid_gpkg_set.add(info['gpkg']) logging.info('\nSuccessfully validated imagery') if valid_gpkg_set: logging.info('\nSuccessfully validated label data for benchmarking') if task == 'classification': classifier(params, list_img, model, device, working_folder) # FIXME: why don't we load from checkpoint in classification? elif task == 'segmentation': gdf_ = [] gpkg_name_ = [] # TODO: Add verifications? if bucket: bucket.download_file(loaded_checkpoint, "saved_model.pth.tar") # TODO: is this still valid? 
model, _ = load_from_checkpoint("saved_model.pth.tar", model) else: model, _ = load_from_checkpoint(loaded_checkpoint, model) # Save tracking TODO put option not just mlflow if 'tracker_uri' in locals() and 'run_name' in locals(): mode = get_key_def('mode', params, expected_type=str) task = get_key_def('task_name', params['task'], expected_type=str) run_name = '{}_{}_{}'.format(run_name, mode, task) # tracking path + parameters logging set_tracking_uri(tracker_uri) set_experiment(experiment_name) start_run(run_name=run_name) log_params(dict_path(params, 'inference')) log_params(dict_path(params, 'dataset')) log_params(dict_path(params, 'model')) # LOOP THROUGH LIST OF INPUT IMAGES for info in tqdm(list_img, desc='Inferring from images', position=0, leave=True): img_name = Path(info['tif']).name local_gpkg = Path(info['gpkg']) if 'gpkg' in info.keys() and info['gpkg'] else None gpkg_name = local_gpkg.stem if local_gpkg else None if bucket: local_img = f"Images/{img_name}" bucket.download_file(info['tif'], local_img) inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif" if info['meta']: if info['meta'] not in bucket_file_cache: bucket_file_cache.append(info['meta']) bucket.download_file(info['meta'], info['meta'].split('/')[-1]) info['meta'] = info['meta'].split('/')[-1] else: # FIXME: else statement should support img['meta'] integration as well. local_img = Path(info['tif']) Path.mkdir(working_folder.joinpath(local_img.parent.name), parents=True, exist_ok=True) inference_image = working_folder.joinpath(local_img.parent.name, f"{img_name.split('.')[0]}_inference.tif") temp_file = working_folder.joinpath(local_img.parent.name, f"{img_name.split('.')[0]}.dat") raster = rasterio.open(local_img, 'r') logging.info(f'\nReading original image: {raster.name}') inf_meta = raster.meta label = None if local_gpkg: logging.info(f'\nBurning label as raster: {local_gpkg}') local_img = clip_raster_with_gpkg(raster, local_gpkg) raster.close() raster = rasterio.open(local_img, 'r') logging.info(f'\nReading clipped image: {raster.name}') inf_meta = raster.meta label = vector_to_raster(vector_file=local_gpkg, input_image=raster, out_shape=(inf_meta['height'], inf_meta['width']), attribute_name=info['attribute_name'], fill=0, # background value in rasterized vector. 
                                         target_ids=targ_ids)
                if debug:
                    logging.debug(f'\nUnique values in loaded label as raster: {np.unique(label)}\n'
                                  f'Shape of label as raster: {label.shape}')

            pred, gdf = segmentation(param=params,
                                     input_image=raster,
                                     label_arr=label,
                                     num_classes=num_classes_backgr,
                                     gpkg_name=gpkg_name,
                                     model=model,
                                     chunk_size=chunk_size,
                                     device=device,
                                     scale=scale,
                                     BGR_to_RGB=BGR_to_RGB,
                                     tp_mem=temp_file,
                                     debug=debug)
            if gdf is not None:
                gdf_.append(gdf)
                gpkg_name_.append(gpkg_name)

            if local_gpkg and 'tracker_uri' in locals():
                pixelMetrics = ComputePixelMetrics(label, pred, num_classes_backgr)
                log_metrics(pixelMetrics.update(pixelMetrics.iou))
                log_metrics(pixelMetrics.update(pixelMetrics.dice))

            pred = pred[np.newaxis, :, :].astype(np.uint8)
            inf_meta.update({"driver": "GTiff",
                             "height": pred.shape[1],
                             "width": pred.shape[2],
                             "count": pred.shape[0],
                             "dtype": 'uint8',
                             "compress": 'lzw'})
            logging.info(f'\nSuccessfully inferred on {img_name}\nWriting to file: {inference_image}')
            with rasterio.open(inference_image, 'w+', **inf_meta) as dest:
                dest.write(pred)

            del pred
            try:
                temp_file.unlink()
            except OSError as e:
                logging.warning(f'File Error: {temp_file}, {e.strerror}')
            if raster_to_vec:
                start_vec = time.time()
                inference_vec = working_folder.joinpath(local_img.parent.name,
                                                        f"{img_name.split('.')[0]}_inference.gpkg")
                ras2vec(inference_image, inference_vec)
                end_vec = time.time() - start_vec
                logging.info('Vectorization completed in {:.0f}m {:.0f}s'.format(end_vec // 60, end_vec % 60))

        if len(gdf_) >= 1:
            if not len(gdf_) == len(gpkg_name_):
                msg = '\nBenchmarking unable to complete'
                logging.critical(msg)
                raise ValueError(msg)
            all_gdf = pd.concat(gdf_)  # Concatenate all geo data frames into one
            all_gdf.reset_index(drop=True, inplace=True)
            gdf_x = gpd.GeoDataFrame(all_gdf)
            bench_gpkg = working_folder / "benchmark.gpkg"
            gdf_x.to_file(bench_gpkg, driver="GPKG", index=False)
            logging.info(f'\nSuccessfully wrote benchmark geopackage to: {bench_gpkg}')
        # log_artifact(working_folder)
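# A minimal sketch of the chunk-size heuristic used above, assuming
# calc_inference_chunk_size() scales the inference window with the first GPU's
# free RAM at roughly `max_pix_per_mb_gpu` pixels per MB. The helper name and
# the rounding to a multiple of 256 are illustrative assumptions, not the
# confirmed implementation.
def sketch_calc_inference_chunk_size(gpu_devices_dict: dict, max_pix_per_mb_gpu: int = 50) -> int:
    """Estimate a square chunk size (in pixels) from the first GPU's free RAM (assumed in MB)."""
    free_mb = list(gpu_devices_dict.values())[0]
    max_pix = free_mb * max_pix_per_mb_gpu
    # Use the largest multiple of 256 whose square fits in the pixel budget; never go below 256.
    return int(max_pix ** 0.5 // 256 * 256) or 256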
def main(params, config_path):
    """
    Function to train and validate a model for semantic segmentation.

    Process
    -------
    1. Model is instantiated and checkpoint is loaded from path, if provided in `your_config.yaml`.
    2. GPUs are requested according to desired amount of `num_gpus` and available GPUs.
    3. If more than 1 GPU is requested, model is cast to DataParallel model.
    4. Dataloaders are created with `create_dataloader()`.
    5. Loss criterion, optimizer and learning rate are set with `set_hyperparameters()` as requested in `config.yaml`.
    6. Using these hyperparameters, the application will try to minimize the loss on the training data and
       evaluate every epoch on the validation data.
    7. For every epoch, the application shows and logs the loss on "trn" and "val" datasets.
    8. For every epoch (if `batch_metrics: 1`), the application shows and logs the accuracy, recall and f-score on
       the "val" dataset. Those metrics are also computed on each class.
    9. At the end of the training process, the application shows and logs the accuracy, recall and f-score on
       the "tst" dataset. Those metrics are also computed on each class.
    -------
    :param params: (dict) Parameters found in the yaml config file.
    :param config_path: (str) Path to the yaml config file.
    """
    params['global']['git_hash'] = get_git_hash()
    debug = get_key_def('debug_mode', params['global'], False)
    if debug:
        warnings.warn(f'Debug mode activated. Some debug features may mobilize extra disk space and '
                      f'cause delays in execution.')
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    num_classes = params['global']['num_classes']
    task = params['global']['task']
    assert task == 'segmentation', f"The task should be segmentation. The provided value is {task}"
    num_classes_corrected = num_classes + 1  # + 1 for background  # FIXME temporary patch for num_classes problem.
    data_path = Path(params['global']['data_path'])
    assert data_path.is_dir(), f'Could not locate data path {data_path}'
    samples_size = params["global"]["samples_size"]
    overlap = params["sample"]["overlap"]
    min_annot_perc = get_key_def('min_annotated_percent', params['sample']['sampling_method'], 0, expected_type=int)
    num_bands = params['global']['number_of_bands']
    samples_folder_name = f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands'
    # FIXME: won't check if folder has datetime suffix (if multiple folders)
    samples_folder = data_path.joinpath(samples_folder_name)
    batch_size = params['training']['batch_size']
    num_devices = params['global']['num_gpus']
    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    num_devices = len(lst_device_ids) if lst_device_ids else 0
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')

    tqdm.write(f'Creating dataloaders from data in {samples_folder}...\n')
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(samples_folder=samples_folder,
                                                                       batch_size=batch_size,
                                                                       num_devices=num_devices,
                                                                       params=params)

    # INSTANTIATE MODEL AND LOAD CHECKPOINT FROM PATH
    model, model_name, criterion, optimizer, lr_scheduler = net(params, num_classes_corrected)  # pretrained could become a yaml parameter.
    tqdm.write(f'Instantiated {model_name} model with {num_classes_corrected} output channels.\n')
    bucket_name = get_key_def('bucket_name', params['global'])

    # mlflow tracking path + parameters logging
    set_tracking_uri(get_key_def('mlflow_uri', params['global'], default="./mlruns"))
    set_experiment('gdl-training')
    log_params(params['training'])
    log_params(params['global'])
    log_params(params['sample'])

    modelname = config_path.stem
    output_path = samples_folder.joinpath('model') / modelname
    if output_path.is_dir():
        output_path = output_path.joinpath(f"_{now}")
    output_path.mkdir(parents=True, exist_ok=False)
    shutil.copy(str(config_path), str(output_path))
    tqdm.write(f'Model and log files will be saved to: {output_path}\n\n')
    if bucket_name:
        from utils.aws import download_s3_files
        bucket, bucket_output_path, output_path, data_path = download_s3_files(bucket_name=bucket_name,
                                                                               data_path=data_path,
                                                                               output_path=output_path)
    since = time.time()
    best_loss = 999
    last_vis_epoch = 0

    progress_log = output_path / 'progress.log'
    if not progress_log.exists():
        progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))  # Add header

    trn_log = InformationLogger('trn')
    val_log = InformationLogger('val')
    tst_log = InformationLogger('tst')
    filename = output_path.joinpath('checkpoint.pth.tar')

    # VISUALIZATION: generate pngs of inputs, labels and outputs
    vis_batch_range = get_key_def('vis_batch_range', params['visualization'], None)
    if vis_batch_range is not None:
        # Make sure user-provided range is a list of 3 integers (start, finish, increment).
        # Check once for all visualization tasks.
        assert isinstance(vis_batch_range, list) and len(vis_batch_range) == 3 and all(
            isinstance(x, int) for x in vis_batch_range)
        vis_at_init_dataset = get_key_def('vis_at_init_dataset', params['visualization'], 'val')

        # Visualization at initialization. Visualize batch range before first epoch.
        if get_key_def('vis_at_init', params['visualization'], False):
            tqdm.write(f'Visualizing initialized model on batch range {vis_batch_range} '
                       f'from {vis_at_init_dataset} dataset...\n')
            vis_from_dataloader(params=params,
                                eval_loader=val_dataloader if vis_at_init_dataset == 'val' else tst_dataloader,
                                model=model,
                                ep_num=0,
                                output_path=output_path,
                                dataset=vis_at_init_dataset,
                                device=device,
                                vis_batch_range=vis_batch_range)

    for epoch in range(0, params['training']['num_epochs']):
        print(f'\nEpoch {epoch}/{params["training"]["num_epochs"] - 1}\n{"-" * 20}')

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes_corrected,
                           batch_size=batch_size,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           vis_params=params,
                           device=device,
                           debug=debug)
        trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(eval_loader=val_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes_corrected,
                                batch_size=batch_size,
                                ep_idx=epoch,
                                progress_log=progress_log,
                                vis_params=params,
                                batch_metrics=params['training']['batch_metrics'],
                                dataset='val',
                                device=device,
                                debug=debug)
        val_loss = val_report['loss'].avg
        if params['training']['batch_metrics'] is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            tqdm.write("save checkpoint\n")
            best_loss = val_loss
            # More info: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-torch-nn-dataparallel-models
            state_dict = model.module.state_dict() if num_devices > 1 else model.state_dict()
            torch.save({'epoch': epoch,
                        'params': params,
                        'model': state_dict,
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict()}, filename)
            if epoch == 0:
                log_artifact(filename)
            if bucket_name:
                bucket_filename = bucket_output_path.joinpath('checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        # VISUALIZATION: generate pngs of test samples, labels and outputs to follow training performance
        vis_at_checkpoint = get_key_def('vis_at_checkpoint', params['visualization'], False)
        ep_vis_min_thresh = get_key_def('vis_at_ckpt_min_ep_diff', params['visualization'], 4)
        vis_at_ckpt_dataset = get_key_def('vis_at_ckpt_dataset', params['visualization'], 'val')
        if vis_batch_range is not None and vis_at_checkpoint and epoch - last_vis_epoch >= ep_vis_min_thresh:
            if last_vis_epoch == 0:
                tqdm.write(f'Visualizing with {vis_at_ckpt_dataset} dataset samples on checkpointed model for '
                           f'batches in range {vis_batch_range}')
            vis_from_dataloader(params=params,
                                eval_loader=val_dataloader if vis_at_ckpt_dataset == 'val' else tst_dataloader,
                                model=model,
                                ep_num=epoch + 1,
                                output_path=output_path,
                                dataset=vis_at_ckpt_dataset,
                                device=device,
                                vis_batch_range=vis_batch_range)
            last_vis_epoch = epoch

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now, params['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        print(f'Current elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s')

    # load checkpoint model and evaluate it on test dataset.
if int( params['training']['num_epochs'] ) > 0: # if num_epochs is set to 0, model is loaded to evaluate on test set checkpoint = load_checkpoint(filename) model, _ = load_from_checkpoint(checkpoint, model) if tst_dataloader: tst_report = evaluation( eval_loader=tst_dataloader, model=model, criterion=criterion, num_classes=num_classes_corrected, batch_size=batch_size, ep_idx=params['training']['num_epochs'], progress_log=progress_log, vis_params=params, batch_metrics=params['training']['batch_metrics'], dataset='tst', device=device) tst_log.add_values(tst_report, params['training']['num_epochs']) if bucket_name: bucket_filename = bucket_output_path.joinpath('last_epoch.pth.tar') bucket.upload_file( "output.txt", bucket_output_path.joinpath(f"Logs/{now}_output.txt")) bucket.upload_file(filename, bucket_filename) time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60))
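# The checkpointing pattern above, in isolation: save only when validation loss
# improves, and unwrap nn.DataParallel (`model.module`) so the saved keys carry
# no 'module.' prefix. A minimal sketch; the helper name is illustrative.
import torch
from torch import nn

def sketch_save_best(model: nn.Module, optimizer, epoch: int, val_loss: float,
                     best_loss: float, filename: str) -> float:
    if val_loss < best_loss:
        best_loss = val_loss
        # DataParallel wraps the real model in `.module`; unwrap before saving.
        state_dict = model.module.state_dict() if isinstance(model, nn.DataParallel) else model.state_dict()
        torch.save({'epoch': epoch,
                    'model': state_dict,
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict()}, filename)
    return best_loss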
def main(params, config_path):
    """
    Function to train and validate a model for classification.

    :param params: (dict) Parameters found in the yaml config file.
    :param config_path: (str) Path to the yaml config file.
    """
    # MANDATORY PARAMETERS
    num_classes = get_key_def('num_classes', params['global'], expected_type=int)
    num_bands = get_key_def('number_of_bands', params['global'], expected_type=int)
    batch_size = get_key_def('batch_size', params['training'], expected_type=int)
    num_epochs = get_key_def('num_epochs', params['training'], expected_type=int)
    model_name = get_key_def('model_name', params['global'], expected_type=str).lower()
    BGR_to_RGB = get_key_def('BGR_to_RGB', params['global'], expected_type=bool)

    # parameters to find hdf5 samples
    data_path = Path(get_key_def('data_path', params['global'], './data', expected_type=str))
    assert data_path.is_dir(), f'Could not locate data path {data_path}'

    # OPTIONAL PARAMETERS
    # basics
    debug = get_key_def('debug_mode', params['global'], default=False, expected_type=bool)
    task = get_key_def('task', params['global'], default='classification', expected_type=str)
    assert task == 'classification', f"The task should be classification. The provided value is {task}"
    dontcare_val = get_key_def("ignore_index", params["training"], default=-1, expected_type=int)
    batch_metrics = get_key_def('batch_metrics', params['training'], default=1, expected_type=int)
    meta_map = get_key_def("meta_map", params["global"], default={})
    bucket_name = get_key_def('bucket_name', params['global'])  # AWS

    # model params
    loss_fn = get_key_def('loss_fn', params['training'], default='CrossEntropy', expected_type=str)
    optimizer = get_key_def('optimizer', params['training'], default='adam', expected_type=str)
    pretrained = get_key_def('pretrained', params['training'], default=True, expected_type=bool)
    train_state_dict_path = get_key_def('state_dict_path', params['training'], default=None, expected_type=str)
    dropout_prob = get_key_def('dropout_prob', params['training'], default=None, expected_type=float)

    # gpu parameters
    num_devices = get_key_def('num_gpus', params['global'], default=0, expected_type=int)
    max_used_ram = get_key_def('max_used_ram', params['global'], default=2000, expected_type=int)
    max_used_perc = get_key_def('max_used_perc', params['global'], default=15, expected_type=int)

    # automatic model naming with unique id for each training
    model_id = config_path.stem
    output_path = data_path.joinpath('model') / model_id
    if output_path.is_dir():
        last_mod_time_suffix = datetime.fromtimestamp(output_path.stat().st_mtime).strftime('%Y%m%d-%H%M%S')
        archive_output_path = data_path.joinpath('model') / f"{model_id}_{last_mod_time_suffix}"
        shutil.move(output_path, archive_output_path)
    output_path.mkdir(parents=True, exist_ok=False)
    shutil.copy(str(config_path), str(output_path))  # copy yaml to output path where model will be saved
    tqdm.write(f'Model and log files will be saved to: {output_path}\n\n')

    if debug:
        warnings.warn(f'Debug mode activated. 
Some debug functions may cause delays in execution.') if bucket_name: bucket, bucket_output_path, output_path, data_path = download_s3_files(bucket_name=bucket_name, data_path=data_path, output_path=output_path, num_classes=num_classes) elif not bucket_name: get_local_classes(num_classes, data_path, output_path) since = time.time() now = datetime.now().strftime("%Y-%m-%d_%H-%M") best_loss = 999 progress_log = Path(output_path) / 'progress.log' if not progress_log.exists(): progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time')) # Add header trn_log = InformationLogger('trn') val_log = InformationLogger('val') tst_log = InformationLogger('tst') # list of GPU devices that are available and unused. If no GPUs, returns empty list gpu_devices_dict = get_device_ids(num_devices, max_used_ram_perc=max_used_ram, max_used_perc=max_used_perc) num_devices = len(gpu_devices_dict.keys()) device = torch.device(f'cuda:0' if gpu_devices_dict else 'cpu') tqdm.write(f'Creating dataloaders from data in {Path(data_path)}...\n') trn_dataloader, val_dataloader, tst_dataloader = create_classif_dataloader(data_path=data_path, batch_size=batch_size, num_devices=num_devices,) # INSTANTIATE MODEL AND LOAD CHECKPOINT FROM PATH model, model_name, criterion, optimizer, lr_scheduler = net(model_name=model_name, num_bands=num_bands, num_channels=num_classes, dontcare_val=dontcare_val, num_devices=num_devices, train_state_dict_path=train_state_dict_path, pretrained=pretrained, dropout_prob=dropout_prob, loss_fn=loss_fn, optimizer=optimizer, net_params=params) tqdm.write(f'Instantiated {model_name} model with {num_classes} output channels.\n') filename = os.path.join(output_path, 'checkpoint.pth.tar') for epoch in range(0, params['training']['num_epochs']): logging.info(f'\nEpoch {epoch}/{params["training"]["num_epochs"] - 1}\n{"-" * 20}') trn_report = train(train_loader=trn_dataloader, model=model, criterion=criterion, optimizer=optimizer, scheduler=lr_scheduler, num_classes=num_classes, batch_size=batch_size, ep_idx=epoch, progress_log=progress_log, device=device, debug=debug) trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou']) val_report = evaluation(eval_loader=val_dataloader, model=model, criterion=criterion, num_classes=num_classes, batch_size=batch_size, ep_idx=epoch, progress_log=progress_log, batch_metrics=params['training']['batch_metrics'], dataset='val', device=device, debug=debug) val_loss = val_report['loss'].avg if params['training']['batch_metrics'] is not None: val_log.add_values(val_report, epoch, ignore=['iou']) else: val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou']) if val_loss < best_loss: tqdm.write("save checkpoint\n") best_loss = val_loss # More info: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-torch-nn-dataparallel-models state_dict = model.module.state_dict() if num_devices > 1 else model.state_dict() torch.save({'epoch': epoch, 'params': params, 'model': state_dict, 'best_loss': best_loss, 'optimizer': optimizer.state_dict()}, filename) if bucket_name: bucket_filename = os.path.join(bucket_output_path, 'checkpoint.pth.tar') bucket.upload_file(filename, bucket_filename) if bucket_name: save_logs_to_bucket(bucket, bucket_output_path, output_path, batch_metrics) cur_elapsed = time.time() - since logging.info(f'Current elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s') # load checkpoint model and evaluate it on test dataset. 
if int(params['training']['num_epochs']) > 0: # if num_epochs is set to 0, model is loaded to evaluate on test set checkpoint = load_checkpoint(filename) model, _ = load_from_checkpoint(checkpoint, model) if tst_dataloader: tst_report = evaluation(eval_loader=tst_dataloader, model=model, criterion=criterion, num_classes=num_classes, batch_size=batch_size, ep_idx=num_epochs, progress_log=progress_log, batch_metrics=batch_metrics, dataset='tst', device=device) tst_log.add_values(tst_report, num_epochs, ignore=['iou']) if bucket_name: bucket_filename = os.path.join(bucket_output_path, 'last_epoch.pth.tar') bucket.upload_file("output.txt", os.path.join(bucket_output_path, f"Logs/{now}_output.txt")) bucket.upload_file(filename, bucket_filename) time_elapsed = time.time() - since logging.info('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
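# The output-folder handling above, on its own: if a model folder with the same
# config name already exists, rename it with a suffix built from its last
# modification time before creating a fresh one. A sketch using only the
# standard library; the function name is illustrative.
import shutil
from datetime import datetime
from pathlib import Path

def sketch_archive_output_dir(output_path: Path) -> Path:
    if output_path.is_dir():
        # Suffix the old folder with its last-modification timestamp.
        suffix = datetime.fromtimestamp(output_path.stat().st_mtime).strftime('%Y%m%d-%H%M%S')
        shutil.move(str(output_path), f"{output_path}_{suffix}")
    output_path.mkdir(parents=True, exist_ok=False)
    return output_path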
def main(params, config_path):
    """
    Function to train and validate a model for semantic segmentation or classification.

    :param params: (dict) Parameters found in the yaml config file.
    :param config_path: (str) Path to the yaml config file.
    """
    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M")

    model, checkpoint, model_name = net(params)
    bucket_name = params['global']['bucket_name']
    data_path = params['global']['data_path']
    modelname = config_path.stem
    output_path = Path(data_path).joinpath('model') / modelname
    try:
        output_path.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        output_path = Path(str(output_path) + '_' + now)
        output_path.mkdir(exist_ok=True)
    print(f'Model and log files will be saved to: {output_path}')
    task = params['global']['task']
    num_classes = params['global']['num_classes']
    batch_size = params['training']['batch_size']

    if num_classes == 1:
        # assume background is implicitly needed (makes no sense to train with one class otherwise)
        # this will trigger some warnings elsewhere, but should succeed nonetheless
        num_classes = 2

    if bucket_name:
        bucket, bucket_output_path, output_path, data_path = download_s3_files(bucket_name=bucket_name,
                                                                               data_path=data_path,
                                                                               output_path=output_path,
                                                                               num_classes=num_classes,
                                                                               task=task)
    elif not bucket_name and task == 'classification':
        get_local_classes(num_classes, data_path, output_path)

    since = time.time()
    best_loss = 999
    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))  # Add header

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')
    tst_log = InformationLogger(output_path, 'tst')

    num_devices = params['global']['num_gpus']
    assert num_devices is not None and num_devices >= 0, "missing mandatory num gpus parameter"
    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    num_devices = len(lst_device_ids) if lst_device_ids else 0
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')
    if num_devices == 1:
        print(f"Using Cuda device {lst_device_ids[0]}")
    elif num_devices > 1:
        print(f"Using data parallel on devices {str(lst_device_ids)[1:-1]}")
        model = nn.DataParallel(model, device_ids=lst_device_ids)  # adds prefix 'module.' to state_dict keys
    else:
        warnings.warn(f"No Cuda device available. 
This process will only run on CPU")

    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(data_path=data_path,
                                                                       batch_size=batch_size,
                                                                       task=task,
                                                                       num_devices=num_devices,
                                                                       params=params)

    model, criterion, optimizer, lr_scheduler = set_hyperparameters(params, model, checkpoint)

    criterion = criterion.to(device)
    model = model.to(device)

    filename = os.path.join(output_path, 'checkpoint.pth.tar')

    for epoch in range(0, params['training']['num_epochs']):
        print(f'\nEpoch {epoch}/{params["training"]["num_epochs"] - 1}\n{"-" * 20}')

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes,
                           batch_size=batch_size,
                           task=task,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           device=device)
        trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(eval_loader=val_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes,
                                batch_size=batch_size,
                                task=task,
                                ep_idx=epoch,
                                progress_log=progress_log,
                                batch_metrics=params['training']['batch_metrics'],
                                dataset='val',
                                device=device)
        val_loss = val_report['loss'].avg
        if params['training']['batch_metrics'] is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            print("save checkpoint")
            best_loss = val_loss
            # More info: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-torch-nn-dataparallel-models
            state_dict = model.module.state_dict() if num_devices > 1 else model.state_dict()
            torch.save({'epoch': epoch,
                        'arch': model_name,
                        'model': state_dict,
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict()}, filename)
            if bucket_name:
                bucket_filename = os.path.join(bucket_output_path, 'checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now, params['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        print(f'Current elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s')

    # load checkpoint model and evaluate it on test dataset.
    if int(params['training']['num_epochs']) > 0:  # if num_epochs is set to 0, model is loaded to evaluate on test set
        checkpoint = load_checkpoint(filename)
        model, _ = load_from_checkpoint(checkpoint, model)

        tst_report = evaluation(eval_loader=tst_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes,
                                batch_size=batch_size,
                                task=task,
                                ep_idx=params['training']['num_epochs'],
                                progress_log=progress_log,
                                batch_metrics=params['training']['batch_metrics'],
                                dataset='tst',
                                device=device)
        tst_log.add_values(tst_report, params['training']['num_epochs'])

        if bucket_name:
            bucket_filename = os.path.join(bucket_output_path, 'last_epoch.pth.tar')
            bucket.upload_file("output.txt", os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
            bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
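# As the comment above notes, nn.DataParallel prefixes every state_dict key
# with 'module.'. A minimal sketch of loading such a checkpoint into an
# unwrapped model by stripping the prefix; the helper name is illustrative and
# not the project's load_from_checkpoint().
from collections import OrderedDict
import torch

def sketch_strip_module_prefix(checkpoint_path: str, model: torch.nn.Module) -> torch.nn.Module:
    # Checkpoints in these scripts store weights under the 'model' key.
    state_dict = torch.load(checkpoint_path, map_location='cpu')['model']
    cleaned = OrderedDict((k[len('module.'):] if k.startswith('module.') else k, v)
                          for k, v in state_dict.items())
    model.load_state_dict(cleaned)
    return model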
def main(params):
    """
    Identify the class to which each image belongs.

    :param params: (dict) Parameters found in the yaml config file.
    """
    since = time.time()
    csv_file = params['inference']['img_csv_file']

    bucket = None
    bucket_name = params['global']['bucket_name']

    model, state_dict_path, model_name = net(params, inference=True)

    num_devices = params['global']['num_gpus'] if params['global']['num_gpus'] else 0
    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')

    if lst_device_ids:
        print(f"Using Cuda device {lst_device_ids[0]}")
    else:
        warnings.warn(f"No Cuda device available. This process will only run on CPU")

    model.to(device)

    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        bucket.download_file(csv_file, 'img_csv_file.csv')
        list_img = read_csv('img_csv_file.csv', inference=True)
    else:
        list_img = read_csv(csv_file, inference=True)

    if params['global']['task'] == 'classification':
        classifier(params, list_img, model)
    elif params['global']['task'] == 'segmentation':
        if bucket:
            bucket.download_file(state_dict_path, "saved_model.pth.tar")
            model = load_from_checkpoint("saved_model.pth.tar", model)
        else:
            model = load_from_checkpoint(state_dict_path, model)

        chunk_size, nbr_pix_overlap = calc_overlap(params)
        num_classes = params['global']['num_classes']
        for img in tqdm(list_img, desc='image list', position=0):
            img_name = os.path.basename(img['tif'])
            if bucket:
                local_img = f"Images/{img_name}"
                bucket.download_file(img['tif'], local_img)
                inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
            else:
                local_img = img['tif']
                inference_image = os.path.join(params['inference']['working_folder'],
                                               f"{img_name.split('.')[0]}_inference.tif")

            assert_band_number(local_img, params['global']['number_of_bands'])

            nd_array_tif = image_reader_as_array(local_img)
            # See: http://cs231n.github.io/neural-networks-2/#datapre
            # e.g. Scale arrays from [0,255] to [0,1]
            scale = params['global']['scale_data'] if params['global']['scale_data'] else None
            if scale:
                sc_min, sc_max = scale
                nd_array_tif = minmax_scale(nd_array_tif,
                                            orig_range=(np.min(nd_array_tif), np.max(nd_array_tif)),
                                            scale_range=(sc_min, sc_max))
            sem_seg_results = sem_seg_inference(model, nd_array_tif, nbr_pix_overlap,
                                                chunk_size, num_classes, device)
            create_new_raster_from_base(local_img, inference_image, sem_seg_results)
            tqdm.write(f"Semantic segmentation of image {img_name} completed")
            if bucket:
                bucket.upload_file(inference_image,
                                   os.path.join(params['inference']['working_folder'],
                                                f"{img_name.split('.')[0]}_inference.tif"))
    else:
        raise ValueError(
            f"The task should be either classification or segmentation. The provided value is {params['global']['task']}")

    time_elapsed = time.time() - since
    print('Inference completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
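# The pre-inference scaling above (see the cs231n note) maps raster values from
# their original range into [sc_min, sc_max]. A numpy-only sketch of what
# minmax_scale() is assumed to do here; the signature mirrors the call sites.
import numpy as np

def sketch_minmax_scale(img: np.ndarray, orig_range: tuple, scale_range: tuple = (0, 1)) -> np.ndarray:
    lo, hi = orig_range
    sc_min, sc_max = scale_range
    # Linear remap: [lo, hi] -> [sc_min, sc_max].
    return (img.astype(np.float64) - lo) / (hi - lo) * (sc_max - sc_min) + sc_min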
def main(params):
    """
    Identify the class to which each image belongs.

    :param params: (dict) Parameters found in the yaml config file.
    """
    since = time.time()
    debug = get_key_def('debug_mode', params['global'], False)
    img_dir_or_csv = params['inference']['img_dir_or_csv_file']
    working_folder = Path(params['inference']['working_folder'])
    Path.mkdir(working_folder, exist_ok=True)
    print(f'Inferences will be saved to: {working_folder}')

    bucket = None
    bucket_file_cache = []
    bucket_name = params['global']['bucket_name']

    model, state_dict_path, model_name = net(params, inference=True)

    num_devices = params['global']['num_gpus'] if params['global']['num_gpus'] else 0
    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')

    if lst_device_ids:
        print(f"Using Cuda device {lst_device_ids[0]}")
    else:
        warnings.warn(f"No Cuda device available. This process will only run on CPU")

    model.to(device)

    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        if img_dir_or_csv.endswith('.csv'):
            bucket.download_file(img_dir_or_csv, 'img_csv_file.csv')
            list_img = read_csv('img_csv_file.csv', inference=True)
        else:
            raise NotImplementedError(
                'Specify a csv file containing images for inference. Directory input not implemented yet')
    else:
        if img_dir_or_csv.endswith('.csv'):
            list_img = read_csv(img_dir_or_csv, inference=True)
        else:
            img_dir = Path(img_dir_or_csv)
            assert img_dir.exists(), f'Could not find directory "{img_dir_or_csv}"'
            list_img_paths = sorted(img_dir.glob('*.tif'))
            list_img = []
            for img_path in list_img_paths:
                img = {}
                img['tif'] = img_path
                list_img.append(img)
            assert len(list_img) > 0, f'No .tif files found in {img_dir_or_csv}'

    if params['global']['task'] == 'classification':
        classifier(params, list_img, model, device)
    elif params['global']['task'] == 'segmentation':
        if bucket:
            bucket.download_file(state_dict_path, "saved_model.pth.tar")
            model, _ = load_from_checkpoint("saved_model.pth.tar", model)
        else:
            model, _ = load_from_checkpoint(state_dict_path, model)

        chunk_size, nbr_pix_overlap = calc_overlap(params)
        num_classes = params['global']['num_classes']
        if num_classes == 1:
            # assume background is implicitly needed (makes no sense to predict with one class otherwise)
            # this will trigger some warnings elsewhere, but should succeed nonetheless
            num_classes = 2
        with tqdm(list_img, desc='image list', position=0) as _tqdm:
            for img in _tqdm:
                img_name = os.path.basename(img['tif'])
                if bucket:
                    local_img = f"Images/{img_name}"
                    bucket.download_file(img['tif'], local_img)
                    inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
                    if img['meta']:
                        if img['meta'] not in bucket_file_cache:
                            bucket_file_cache.append(img['meta'])
                            bucket.download_file(img['meta'], img['meta'].split('/')[-1])
                        img['meta'] = img['meta'].split('/')[-1]
                else:
                    local_img = img['tif']
                    inference_image = os.path.join(params['inference']['working_folder'],
                                                   f"{img_name.split('.')[0]}_inference.tif")

                assert os.path.isfile(local_img), f"could not open raster file at {local_img}"
                with rasterio.open(local_img, 'r') as raster:
                    np_input_image = image_reader_as_array(
                        input_image=raster,
                        scale=get_key_def('scale_data', params['global'], None),
                        aux_vector_file=get_key_def('aux_vector_file', params['global'], None),
                        aux_vector_attrib=get_key_def('aux_vector_attrib', params['global'], None),
                        aux_vector_ids=get_key_def('aux_vector_ids', params['global'], None),
                        aux_vector_dist_maps=get_key_def('aux_vector_dist_maps', params['global'], True),
                        aux_vector_scale=get_key_def('aux_vector_scale', params['global'], None))

                meta_map, metadata = get_key_def("meta_map", params["global"], {}), None
                if meta_map:
                    assert img['meta'] is not None and isinstance(img['meta'], str) and os.path.isfile(img['meta']), \
                        "global configuration requested metadata mapping onto loaded samples, " \
                        "but raster did not have available metadata"
                    metadata = read_parameters(img['meta'])

                if debug:
                    _tqdm.set_postfix(OrderedDict(image_name=img_name, image_shape=np_input_image.shape))

                input_band_count = np_input_image.shape[2] + MetaSegmentationDataset.get_meta_layer_count(meta_map)
                assert input_band_count == params['global']['number_of_bands'], \
                    f"The number of bands in the input image ({input_band_count}) and the parameter " \
                    f"'number_of_bands' in the yaml file ({params['global']['number_of_bands']}) should be identical"

                sem_seg_results = sem_seg_inference(model, np_input_image, nbr_pix_overlap,
                                                    chunk_size, num_classes, device, meta_map, metadata)
                if debug and len(np.unique(sem_seg_results)) == 1:
                    print(f'Something is wrong. Inference contains only one value. '
                          f'Make sure data scale is coherent with training domain values.')
                create_new_raster_from_base(local_img, inference_image, sem_seg_results)
                tqdm.write(f"Semantic segmentation of image {img_name} completed")
                if bucket:
                    bucket.upload_file(inference_image,
                                       os.path.join(params['inference']['working_folder'],
                                                    f"{img_name.split('.')[0]}_inference.tif"))
    else:
        raise ValueError(
            f"The task should be either classification or segmentation. The provided value is {params['global']['task']}")

    time_elapsed = time.time() - since
    print('Inference completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
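# A sketch of the overlapping sliding-window traversal that sem_seg_inference()
# is assumed to perform: step through the image in chunk_size strides minus the
# pixel overlap, so adjacent windows share a margin and tile seams are reduced.
# Aggregation on the overlaps (e.g. averaging logits) is omitted for brevity;
# the generator name is illustrative.
def sketch_window_origins(height: int, width: int, chunk_size: int, nbr_pix_overlap: int):
    step = chunk_size - nbr_pix_overlap
    for row in range(0, max(height - nbr_pix_overlap, 1), step):
        for col in range(0, max(width - nbr_pix_overlap, 1), step):
            # Clamp the last window so it never reads past the raster edge.
            yield (min(row, max(height - chunk_size, 0)),
                   min(col, max(width - chunk_size, 0)))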
def train(cfg: DictConfig) -> None:
    """
    Function to train and validate a model for semantic segmentation.
    -------
    1. Model is instantiated and checkpoint is loaded from path, if provided in `your_config.yaml`.
    2. GPUs are requested according to desired amount of `num_gpus` and available GPUs.
    3. If more than 1 GPU is requested, model is cast to DataParallel model.
    4. Dataloaders are created with `create_dataloader()`.
    5. Loss criterion, optimizer and learning rate are set with `set_hyperparameters()` as requested in `config.yaml`.
    6. Using these hyperparameters, the application will try to minimize the loss on the training data and
       evaluate every epoch on the validation data.
    7. For every epoch, the application shows and logs the loss on "trn" and "val" datasets.
    8. For every epoch (if `batch_metrics: 1`), the application shows and logs the accuracy, recall and f-score on
       the "val" dataset. Those metrics are also computed on each class.
    9. At the end of the training process, the application shows and logs the accuracy, recall and f-score on
       the "tst" dataset. Those metrics are also computed on each class.
    -------
    :param cfg: (dict) Parameters found in the yaml config file.
    """
    now = datetime.now().strftime("%Y-%m-%d_%H-%M")

    # MANDATORY PARAMETERS
    num_classes = len(get_key_def('classes_dict', cfg['dataset']).keys())
    num_classes_corrected = num_classes + 1  # + 1 for background  # FIXME temporary patch for num_classes problem.
    num_bands = len(read_modalities(cfg.dataset.modalities))
    batch_size = get_key_def('batch_size', cfg['training'], expected_type=int)
    eval_batch_size = get_key_def('eval_batch_size', cfg['training'], expected_type=int, default=batch_size)
    num_epochs = get_key_def('max_epochs', cfg['training'], expected_type=int)
    model_name = get_key_def('model_name', cfg['model'], expected_type=str).lower()
    # TODO need to keep in parameters? see victor stuff
    # BGR_to_RGB = get_key_def('BGR_to_RGB', params['global'], expected_type=bool)
    BGR_to_RGB = False

    # OPTIONAL PARAMETERS
    debug = get_key_def('debug', cfg)
    task = get_key_def('task_name', cfg['task'], default='segmentation')
    dontcare_val = get_key_def("ignore_index", cfg['dataset'], default=-1)
    bucket_name = get_key_def('bucket_name', cfg['AWS'])
    scale = get_key_def('scale_data', cfg['augmentation'], default=[0, 1])
    batch_metrics = get_key_def('batch_metrics', cfg['training'], default=None)
    meta_map = get_key_def("meta_map", cfg['training'], default=None)  # TODO what is that?
    crop_size = get_key_def('target_size', cfg['training'], default=None)
    # if error
    if meta_map and not Path(meta_map).is_file():
        msg = f'\nCouldn\'t locate {meta_map}'
        logging.critical(msg)
        raise FileNotFoundError(msg)
    if task != 'segmentation':
        msg = f"\nThe task should be segmentation. The provided value is {task}"
        logging.critical(msg)
        raise ValueError(msg)

    # MODEL PARAMETERS
    class_weights = get_key_def('class_weights', cfg['dataset'], default=None)
    loss_fn = get_key_def('loss_fn', cfg['training'], default='CrossEntropy')
    optimizer = get_key_def('optimizer_name', cfg['optimizer'], default='adam', expected_type=str)  # TODO change something to call the function
    pretrained = get_key_def('pretrained', cfg['model'], default=True, expected_type=bool)
    train_state_dict_path = get_key_def('state_dict_path', cfg['general'], default=None, expected_type=str)
    dropout_prob = get_key_def('factor', cfg['scheduler']['params'], default=None, expected_type=float)
    # if error
    if train_state_dict_path and not Path(train_state_dict_path).is_file():
        msg = f'\nCould not locate pretrained checkpoint for training: {train_state_dict_path}'
        logging.critical(msg)
        raise FileNotFoundError(msg)
    if class_weights:
        verify_weights(num_classes_corrected, class_weights)
    # Read the concatenation point
    # TODO: find a way to maybe implement it in classification one day
    conc_point = None
    # conc_point = get_key_def('concatenate_depth', params['global'], None)

    # GPU PARAMETERS
    num_devices = get_key_def('num_gpus', cfg['training'], default=0)
    if num_devices and not num_devices >= 0:
        msg = "\nmissing mandatory num gpus parameter"
        logging.critical(msg)
        raise ValueError(msg)
    default_max_used_ram = 15
    max_used_ram = get_key_def('max_used_ram', cfg['training'], default=default_max_used_ram)
    max_used_perc = get_key_def('max_used_perc', cfg['training'], default=15)

    # LOGGING PARAMETERS  TODO put option not just mlflow
    experiment_name = get_key_def('project_name', cfg['general'], default='gdl-training')
    try:
        tracker_uri = get_key_def('uri', cfg['tracker'])
        Path(tracker_uri).mkdir(exist_ok=True)
        run_name = get_key_def('run_name', cfg['tracker'], default='gdl')  # TODO change for something meaningful
    # meaning no logging tracker has been assigned or it doesn't exist in config/logging
    except ConfigKeyError:
        logging.info("\nNo logging tracker has been assigned or the yaml config doesn't exist in 'config/tracker'."
                     "\nNo tracker file will be saved in this case.")

    # PARAMETERS FOR hdf5 SAMPLES
    # info on the hdf5 name
    samples_size = get_key_def("input_dim", cfg['dataset'], expected_type=int, default=256)
    overlap = get_key_def("overlap", cfg['dataset'], expected_type=int, default=0)
    min_annot_perc = get_key_def('min_annotated_percent', cfg['dataset'], expected_type=int, default=0)
    samples_folder_name = (
        f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands_{experiment_name}')
    try:
        my_hdf5_path = Path(str(cfg.dataset.sample_data_dir)).resolve(strict=True)
        samples_folder = Path(my_hdf5_path.joinpath(samples_folder_name)).resolve(strict=True)
        logging.info("\nThe HDF5 directory used '{}'".format(samples_folder))
    except FileNotFoundError:
        samples_folder = Path(str(cfg.dataset.sample_data_dir)).joinpath(samples_folder_name)
        logging.info(f"\nThe HDF5 directory '{samples_folder}' doesn't exist, please change the path."
                     f"\nWe will try to find '{samples_folder_name}' in '{cfg.dataset.raw_data_dir}'.")
        try:
            my_data_path = Path(cfg.dataset.raw_data_dir).resolve(strict=True)
            samples_folder = Path(my_data_path.joinpath(samples_folder_name)).resolve(strict=True)
            logging.info("\nThe HDF5 directory used '{}'".format(samples_folder))
            cfg.general.sample_data_dir = str(my_data_path)  # needs to be done for when the config will be saved
        except FileNotFoundError:
            msg = (f"\nThe HDF5 directory '{samples_folder_name}' doesn't exist in '{cfg.dataset.raw_data_dir}'"
                   f"\nor in '{cfg.dataset.sample_data_dir}', please verify the location of your HDF5.")
            logging.critical(msg)
            raise FileNotFoundError(msg)

    # visualization parameters
    vis_at_train = get_key_def('vis_at_train', cfg['visualization'], default=False)
    vis_at_eval = get_key_def('vis_at_evaluation', cfg['visualization'], default=False)
    vis_batch_range = get_key_def('vis_batch_range', cfg['visualization'], default=None)
    vis_at_checkpoint = get_key_def('vis_at_checkpoint', cfg['visualization'], default=False)
    ep_vis_min_thresh = get_key_def('vis_at_ckpt_min_ep_diff', cfg['visualization'], default=1)
    vis_at_ckpt_dataset = get_key_def('vis_at_ckpt_dataset', cfg['visualization'], 'val')
    colormap_file = get_key_def('colormap_file', cfg['visualization'], None)
    heatmaps = get_key_def('heatmaps', cfg['visualization'], False)
    heatmaps_inf = get_key_def('heatmaps', cfg['inference'], False)
    grid = get_key_def('grid', cfg['visualization'], False)
    mean = get_key_def('mean', cfg['augmentation']['normalization'])
    std = get_key_def('std', cfg['augmentation']['normalization'])
    vis_params = {'colormap_file': colormap_file, 'heatmaps': heatmaps, 'heatmaps_inf': heatmaps_inf, 'grid': grid,
                  'mean': mean, 'std': std, 'vis_batch_range': vis_batch_range, 'vis_at_train': vis_at_train,
                  'vis_at_eval': vis_at_eval, 'ignore_index': dontcare_val, 'inference_input_path': None}

    # coordconv parameters  TODO
    # coordconv_params = {}
    # for param, val in params['global'].items():
    #     if 'coordconv' in param:
    #         coordconv_params[param] = val
    coordconv_params = get_key_def('coordconv', cfg['model'])

    # automatic model naming with unique id for each training
    config_path = None
    for list_path in cfg.general.config_path:
        if list_path['provider'] == 'main':
            config_path = list_path['path']
    config_name = str(cfg.general.config_name)
    model_id = config_name
    output_path = Path(f'model/{model_id}')
    output_path.mkdir(parents=True, exist_ok=False)
    logging.info(f'\nModel and log files will be saved to: {os.getcwd()}/{output_path}')
    if debug:
        logging.warning(f'\nDebug mode activated. Some debug features may mobilize extra disk space and '
                        f'cause delays in execution.')
    if dontcare_val < 0 and vis_batch_range:
        logging.warning(f'\nVisualization: expected positive value for ignore_index, got {dontcare_val}. '
                        f'Will be overridden to 255 during visualization only. Problems may occur.')

    # overwrite dontcare values in label if loss is not lovasz or crossentropy. FIXME: hacky fix.
    dontcare2backgr = False
    if loss_fn not in ['Lovasz', 'CrossEntropy', 'OhemCrossEntropy']:
        dontcare2backgr = True
        logging.warning(f'\nDontcare is not implemented for loss function "{loss_fn}". '
                        f'\nDontcare values ({dontcare_val}) in label will be replaced with background value (0)')

    # Will check if batch size needs to be a lower value only if cropping samples during training
    calc_eval_bs = True if crop_size else False

    # INSTANTIATE MODEL AND LOAD CHECKPOINT FROM PATH
    model, model_name, criterion, optimizer, lr_scheduler, device, gpu_devices_dict = \
        net(model_name=model_name,
            num_bands=num_bands,
            num_channels=num_classes_corrected,
            dontcare_val=dontcare_val,
            num_devices=num_devices,
            train_state_dict_path=train_state_dict_path,
            pretrained=pretrained,
            dropout_prob=dropout_prob,
            loss_fn=loss_fn,
            class_weights=class_weights,
            optimizer=optimizer,
            net_params=cfg,
            conc_point=conc_point,
            coordconv_params=coordconv_params)
    logging.info(f'Instantiated {model_name} model with {num_classes_corrected} output channels.\n')

    logging.info(f'Creating dataloaders from data in {samples_folder}...\n')
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(samples_folder=samples_folder,
                                                                       batch_size=batch_size,
                                                                       eval_batch_size=eval_batch_size,
                                                                       gpu_devices_dict=gpu_devices_dict,
                                                                       sample_size=samples_size,
                                                                       dontcare_val=dontcare_val,
                                                                       crop_size=crop_size,
                                                                       meta_map=meta_map,
                                                                       num_bands=num_bands,
                                                                       BGR_to_RGB=BGR_to_RGB,
                                                                       scale=scale,
                                                                       cfg=cfg,
                                                                       dontcare2backgr=dontcare2backgr,
                                                                       calc_eval_bs=calc_eval_bs,
                                                                       debug=debug)

    # Save tracking  TODO put option not just mlflow
    if 'tracker_uri' in locals() and 'run_name' in locals():
        mode = get_key_def('mode', cfg, expected_type=str)
        task = get_key_def('task_name', cfg['task'], expected_type=str)
        run_name = '{}_{}_{}'.format(run_name, mode, task)
        # tracking path + parameters logging
        set_tracking_uri(tracker_uri)
        set_experiment(experiment_name)
        start_run(run_name=run_name)
        log_params(dict_path(cfg, 'training'))
        log_params(dict_path(cfg, 'dataset'))
        log_params(dict_path(cfg, 'model'))
        log_params(dict_path(cfg, 'optimizer'))
        log_params(dict_path(cfg, 'scheduler'))
        log_params(dict_path(cfg, 'augmentation'))
        # TODO change something to not only have the mlflow option
        trn_log = InformationLogger('trn')
        val_log = InformationLogger('val')
        tst_log = InformationLogger('tst')

    if bucket_name:
        from utils.aws import download_s3_files
        bucket, bucket_output_path, output_path, data_path = download_s3_files(bucket_name=bucket_name,
                                                                               data_path=data_path,  # FIXME
                                                                               output_path=output_path)

    since = time.time()
    best_loss = 999
    last_vis_epoch = 0

    progress_log = output_path / 'progress.log'
    if not progress_log.exists():
        progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))  # Add header

    # create the checkpoint file
    filename = output_path.joinpath('checkpoint.pth.tar')

    # VISUALIZATION: generate pngs of inputs, labels and outputs
    if vis_batch_range is not None:
        # Make sure user-provided range is a list of 3 integers (start, finish, increment).
        # Check once for all visualization tasks.
        if not (isinstance(vis_batch_range, list) and len(vis_batch_range) == 3
                and all(isinstance(x, int) for x in vis_batch_range)):
            msg = (f'\nVis_batch_range expects three integers in a list: start batch, end batch, increment. '
                   f'Got {vis_batch_range}')
            logging.critical(msg)
            raise ValueError(msg)
        vis_at_init_dataset = get_key_def('vis_at_init_dataset', cfg['visualization'], 'val')

        # Visualization at initialization. Visualize batch range before first epoch.
        if get_key_def('vis_at_init', cfg['visualization'], False):
            logging.info(f'\nVisualizing initialized model on batch range {vis_batch_range} '
                         f'from {vis_at_init_dataset} dataset...\n')
            vis_from_dataloader(vis_params=vis_params,
                                eval_loader=val_dataloader if vis_at_init_dataset == 'val' else tst_dataloader,
                                model=model,
                                ep_num=0,
                                output_path=output_path,
                                dataset=vis_at_init_dataset,
                                scale=scale,
                                device=device,
                                vis_batch_range=vis_batch_range)

    for epoch in range(0, num_epochs):
        logging.info(f'\nEpoch {epoch}/{num_epochs - 1}\n' + "-" * len(f'Epoch {epoch}/{num_epochs - 1}'))

        # creating trn_report
        trn_report = training(train_loader=trn_dataloader,
                              model=model,
                              criterion=criterion,
                              optimizer=optimizer,
                              scheduler=lr_scheduler,
                              num_classes=num_classes_corrected,
                              batch_size=batch_size,
                              ep_idx=epoch,
                              progress_log=progress_log,
                              device=device,
                              scale=scale,
                              vis_params=vis_params,
                              debug=debug)
        if 'trn_log' in locals():  # only save the value if a tracker is set up
            trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(eval_loader=val_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes_corrected,
                                batch_size=batch_size,
                                ep_idx=epoch,
                                progress_log=progress_log,
                                batch_metrics=batch_metrics,
                                dataset='val',
                                device=device,
                                scale=scale,
                                vis_params=vis_params,
                                debug=debug)
        val_loss = val_report['loss'].avg
        if 'val_log' in locals():  # only save the value if a tracker is set up
            if batch_metrics is not None:
                val_log.add_values(val_report, epoch)
            else:
                val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            logging.info("\nSave checkpoint with a validation loss of {:.4f}".format(val_loss))  # only allow 4 decimals
            best_loss = val_loss
            # More info:
            # https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-torch-nn-dataparallel-models
            state_dict = model.module.state_dict() if num_devices > 1 else model.state_dict()
            torch.save({'epoch': epoch,
                        'params': cfg,
                        'model': state_dict,
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict()}, filename)
            if bucket_name:
                bucket_filename = bucket_output_path.joinpath('checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        # VISUALIZATION: generate pngs of img samples, labels and outputs as alternative to follow training
        if vis_batch_range is not None and vis_at_checkpoint and epoch - last_vis_epoch >= ep_vis_min_thresh:
            if last_vis_epoch == 0:
                logging.info(f'\nVisualizing with {vis_at_ckpt_dataset} dataset samples on checkpointed model for '
                             f'batches in range {vis_batch_range}')
            vis_from_dataloader(vis_params=vis_params,
                                eval_loader=val_dataloader if vis_at_ckpt_dataset == 'val' else tst_dataloader,
                                model=model,
                                ep_num=epoch + 1,
                                output_path=output_path,
                                dataset=vis_at_ckpt_dataset,
                                scale=scale,
                                device=device,
                                vis_batch_range=vis_batch_range)
            last_vis_epoch = epoch

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now, cfg['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        # logging.info(f'\nCurrent elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s')

    # copy the checkpoint in 'save_weights_dir'
    Path(cfg['general']['save_weights_dir']).mkdir(parents=True, exist_ok=True)
    copy(filename, cfg['general']['save_weights_dir'])

    # load checkpoint model and evaluate it on test dataset.
if int( cfg['general']['max_epochs'] ) > 0: # if num_epochs is set to 0, model is loaded to evaluate on test set checkpoint = load_checkpoint(filename) model, _ = load_from_checkpoint(checkpoint, model) if tst_dataloader: tst_report = evaluation(eval_loader=tst_dataloader, model=model, criterion=criterion, num_classes=num_classes_corrected, batch_size=batch_size, ep_idx=num_epochs, progress_log=progress_log, batch_metrics=batch_metrics, dataset='tst', scale=scale, vis_params=vis_params, device=device) if 'tst_log' in locals(): # only save the value if a tracker is setup tst_log.add_values(tst_report, num_epochs) if bucket_name: bucket_filename = bucket_output_path.joinpath('last_epoch.pth.tar') bucket.upload_file( "output.txt", bucket_output_path.joinpath(f"Logs/{now}_output.txt")) bucket.upload_file(filename, bucket_filename)
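# The dontcare2backgr workaround set up in train() above, in isolation: for
# losses without ignore_index support, pixels equal to dontcare_val are
# remapped to the background class (0) before the loss is computed. A numpy
# sketch under that assumption; the function name is illustrative.
import numpy as np

def sketch_dontcare_to_background(label: np.ndarray, dontcare_val: int = -1) -> np.ndarray:
    label = label.copy()
    label[label == dontcare_val] = 0  # background value
    return label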
def main(params):
    """
    Identify the class to which each image belongs.
    :param params: (dict) Parameters found in the yaml config file.
    """
    since = time.time()
    img_dir_or_csv = params['inference']['img_dir_or_csv_file']
    working_folder = Path(params['inference']['working_folder'])
    Path.mkdir(working_folder, exist_ok=True)
    print(f'Inferences will be saved to: {working_folder}')

    bucket = None
    bucket_name = params['global']['bucket_name']

    model, state_dict_path, model_name = net(params, inference=True)

    num_devices = params['global']['num_gpus'] if params['global']['num_gpus'] else 0
    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')

    if lst_device_ids:
        print(f"Using Cuda device {lst_device_ids[0]}")
    else:
        warnings.warn("No Cuda device available. This process will only run on CPU")

    model.to(device)

    if bucket_name:
        s3 = boto3.resource('s3')
        bucket = s3.Bucket(bucket_name)
        if img_dir_or_csv.endswith('.csv'):
            bucket.download_file(img_dir_or_csv, 'img_csv_file.csv')
            list_img = read_csv('img_csv_file.csv', inference=True)
        else:
            raise NotImplementedError('Specify a csv file containing images for inference. Directory input not implemented yet')
    else:
        if img_dir_or_csv.endswith('.csv'):
            list_img = read_csv(img_dir_or_csv, inference=True)
        else:
            img_dir = Path(img_dir_or_csv)
            assert img_dir.exists(), f'Could not find directory "{img_dir_or_csv}"'
            list_img_paths = sorted(img_dir.glob('*.tif'))
            list_img = []
            for img_path in list_img_paths:
                img = {'tif': img_path}
                list_img.append(img)
            assert len(list_img) > 0, f'No .tif files found in {img_dir_or_csv}'

    if params['global']['task'] == 'classification':
        classifier(params, list_img, model)
    elif params['global']['task'] == 'segmentation':
        if bucket:
            bucket.download_file(state_dict_path, "saved_model.pth.tar")
            model, _ = load_from_checkpoint("saved_model.pth.tar", model)
        else:
            model, _ = load_from_checkpoint(state_dict_path, model)

        chunk_size, nbr_pix_overlap = calc_overlap(params)
        num_classes = params['global']['num_classes']
        with tqdm(list_img, desc='image list', position=0) as _tqdm:
            for img in _tqdm:
                img_name = os.path.basename(img['tif'])
                if bucket:
                    local_img = f"Images/{img_name}"
                    bucket.download_file(img['tif'], local_img)
                    inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
                else:
                    local_img = img['tif']
                    inference_image = os.path.join(params['inference']['working_folder'],
                                                   f"{img_name.split('.')[0]}_inference.tif")

                assert_band_number(local_img, params['global']['number_of_bands'])

                nd_array_tif = image_reader_as_array(local_img)
                assert len(np.unique(nd_array_tif)) > 1, \
                    f'Image "{img_name}" contains only one value: {np.unique(nd_array_tif)}.'

                # See: http://cs231n.github.io/neural-networks-2/#datapre. e.g. Scale arrays from [0,255] to [0,1]
                scale = params['global']['scale_data']
                if scale:
                    sc_min, sc_max = params['global']['scale_data']
                    nd_array_tif = minmax_scale(nd_array_tif,
                                                orig_range=(np.min(nd_array_tif), np.max(nd_array_tif)),
                                                scale_range=(sc_min, sc_max))
                if debug:
                    _tqdm.set_postfix(OrderedDict(image_name=img_name,
                                                  image_shape=nd_array_tif.shape,
                                                  scale=scale))

                sem_seg_results = sem_seg_inference(model, nd_array_tif, nbr_pix_overlap, chunk_size, num_classes, device)
                if debug and len(np.unique(sem_seg_results)) == 1:
                    print(f'Something is wrong. Inference contains only one value: {np.unique(sem_seg_results)}. '
                          f'Make sure the "scale_data" parameter is coherent with the parameters used to train '
                          f'the model used in inference.')
                create_new_raster_from_base(local_img, inference_image, sem_seg_results)
                tqdm.write(f"Semantic segmentation of image {img_name} completed")
                if bucket:
                    bucket.upload_file(inference_image,
                                       os.path.join(params['inference']['working_folder'],
                                                    f"{img_name.split('.')[0]}_inference.tif"))
    else:
        raise ValueError(f"The task should be either classification or segmentation. "
                         f"The provided value is {params['global']['task']}")

    time_elapsed = time.time() - since
    print('Inference completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
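# --- Hedged example (illustrative; not the project's implementation) ---
# sem_seg_inference above predicts a large raster in overlapping chunks sized by
# calc_overlap. A minimal sketch of that sliding-window pattern, assuming an
# (H, W, C) numpy array and a predict_chunk callable returning per-pixel class
# indices with the same spatial shape as its input; all names are hypothetical.
# The real implementation may blend overlapping predictions instead of letting
# later chunks overwrite earlier ones.
def _example_sliding_window_inference(np_array, chunk_size, nbr_pix_overlap, predict_chunk):
    h, w = np_array.shape[:2]
    output = np.zeros((h, w), dtype=np.uint8)
    stride = chunk_size - nbr_pix_overlap  # positive as long as overlap < chunk_size
    for row in range(0, max(h - nbr_pix_overlap, 1), stride):
        for col in range(0, max(w - nbr_pix_overlap, 1), stride):
            chunk = np_array[row:row + chunk_size, col:col + chunk_size]
            output[row:row + chunk.shape[0], col:col + chunk.shape[1]] = predict_chunk(chunk)
    return output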
def main(params):
    """
    Function to train and validate a model for semantic segmentation or classification.
    :param params: (dict) Parameters found in the yaml config file.
    """
    model, state_dict_path, model_name = net(params)
    bucket_name = params['global']['bucket_name']
    output_path = params['training']['output_path']
    data_path = params['global']['data_path']
    task = params['global']['task']
    num_classes = params['global']['num_classes']
    batch_size = params['training']['batch_size']

    if bucket_name:
        bucket, bucket_output_path, output_path, data_path = download_s3_files(bucket_name=bucket_name,
                                                                               data_path=data_path,
                                                                               output_path=output_path,
                                                                               num_classes=num_classes,
                                                                               task=task)
    elif not bucket_name and task == 'classification':
        get_local_classes(num_classes, data_path, output_path)

    since = time.time()
    best_loss = 999

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        # Add header
        progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')
    tst_log = InformationLogger(output_path, 'tst')

    model, criterion, optimizer, lr_scheduler, device, num_devices = set_hyperparameters(params, model, state_dict_path)

    num_samples = get_num_samples(data_path=data_path, params=params)
    print(f"Number of samples : {num_samples}")
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(data_path=data_path,
                                                                       num_samples=num_samples,
                                                                       batch_size=batch_size,
                                                                       task=task)

    now = datetime.datetime.now().strftime("%Y-%m-%d_%I-%M")
    filename = os.path.join(output_path, 'checkpoint.pth.tar')  # TODO: Should output directory hold same name as config file name?

    for epoch in range(0, params['training']['num_epochs']):
        print(f'\nEpoch {epoch}/{params["training"]["num_epochs"] - 1}\n{"-" * 20}')

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes,
                           batch_size=batch_size,
                           task=task,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           device=device)
        trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(eval_loader=val_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes,
                                batch_size=batch_size,
                                task=task,
                                ep_idx=epoch,
                                progress_log=progress_log,
                                batch_metrics=params['training']['batch_metrics'],
                                dataset='val',
                                device=device)
        val_loss = val_report['loss'].avg
        if params['training']['batch_metrics'] is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            print("save checkpoint")
            best_loss = val_loss
            # More info: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-torch-nn-dataparallel-models
            state_dict = model.module.state_dict() if num_devices > 1 else model.state_dict()
            torch.save({'epoch': epoch,
                        'arch': model_name,
                        'model': state_dict,
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict()}, filename)
            if bucket_name:
                bucket_filename = os.path.join(bucket_output_path, 'checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now, params['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        print(f'Current elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s')

    # load checkpoint model and evaluate it on test dataset.
    model = load_from_checkpoint(filename, model)
    tst_report = evaluation(eval_loader=tst_dataloader,
                            model=model,
                            criterion=criterion,
                            num_classes=num_classes,
                            batch_size=batch_size,
                            task=task,
                            ep_idx=params['training']['num_epochs'],
                            progress_log=progress_log,
                            batch_metrics=params['training']['batch_metrics'],
                            dataset='tst',
                            device=device)
    tst_log.add_values(tst_report, params['training']['num_epochs'])

    if bucket_name:
        bucket_filename = os.path.join(bucket_output_path, 'last_epoch.pth.tar')
        bucket.upload_file("output.txt", os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
        bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
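# --- Hedged example (illustrative; not the project's metrics class) ---
# The train/evaluation reports above expose running averages through an `.avg`
# attribute (e.g. val_report['loss'].avg). A minimal meter implementing that
# interface, in the spirit of the common PyTorch AverageMeter pattern:
class _ExampleAverageMeter:
    def __init__(self):
        self.sum = 0.0   # running (weighted) sum of observed values
        self.count = 0   # number of samples seen so far
        self.avg = 0.0   # current running average

    def update(self, value, n=1):
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count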
def set_hyperparameters(params, model, state_dict_path):
    """
    Function to set hyperparameters based on values provided in yaml config file.
    Will also set model to GPU, if available.
    If none provided, default functions values are used.
    :param params: (dict) Parameters found in the yaml config file
    :param model: Model loaded from model_choice.py
    :param state_dict_path: (str) Full file path to the state dict
    :return: model, criterion, optimizer, lr_scheduler, num_gpus
    """
    # assign default values to hyperparameters
    loss_signature = inspect.signature(nn.CrossEntropyLoss).parameters
    optim_signature = inspect.signature(optim.Adam).parameters
    lr_scheduler_signature = inspect.signature(optim.lr_scheduler.StepLR).parameters
    class_weights = loss_signature['weight'].default
    ignore_index = loss_signature['ignore_index'].default
    lr = optim_signature['lr'].default
    weight_decay = optim_signature['weight_decay'].default
    step_size = lr_scheduler_signature['step_size'].default
    if not isinstance(step_size, int):
        step_size = params['training']['num_epochs'] + 1
    gamma = lr_scheduler_signature['gamma'].default
    num_devices = 0

    # replace default values by those in config file if they exist
    if params['training']['class_weights']:
        class_weights = torch.tensor(params['training']['class_weights'])
        verify_weights(params['global']['num_classes'], class_weights)
    if params['training']['ignore_index']:
        ignore_index = params['training']['ignore_index']
    if params['training']['learning_rate']:
        lr = params['training']['learning_rate']
    if params['training']['weight_decay']:
        weight_decay = params['training']['weight_decay']
    if params['training']['step_size']:
        step_size = params['training']['step_size']
    if params['training']['gamma']:
        gamma = params['training']['gamma']
    if params['global']['num_gpus']:
        num_devices = params['global']['num_gpus']

    # Loss function
    criterion = MultiClassCriterion(loss_type=params['training']['loss_fn'],
                                    ignore_index=ignore_index,
                                    weight=class_weights)

    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    num_devices = len(lst_device_ids) if lst_device_ids else 0
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')
    if num_devices == 1:
        print(f"Using Cuda device {lst_device_ids[0]}")
    elif num_devices > 1:
        print(f"Using data parallel on devices {str(lst_device_ids)[1:-1]}")
        model = nn.DataParallel(model, device_ids=lst_device_ids)  # adds prefix 'module.' to state_dict keys
    else:
        warnings.warn("No Cuda device available. This process will only run on CPU")
    criterion = criterion.to(device)
    model = model.to(device)

    # Optimizer
    opt_fn = params['training']['optimizer']
    optimizer = create_optimizer(params=model.parameters(), mode=opt_fn, base_lr=lr, weight_decay=weight_decay)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=step_size, gamma=gamma)

    if state_dict_path != '':
        model, optimizer = load_from_checkpoint(state_dict_path, model, optimizer=optimizer)

    return model, criterion, optimizer, lr_scheduler, device, num_devices
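# --- Hedged example (illustrative) ---
# set_hyperparameters above seeds its defaults from the library signatures via
# inspect.signature(...).parameters[name].default. A self-contained demonstration
# of that mechanism; the returned values depend on the installed PyTorch release.
def _example_signature_defaults():
    import inspect
    import torch.nn as nn
    import torch.optim as optim
    loss_params = inspect.signature(nn.CrossEntropyLoss).parameters
    optim_params = inspect.signature(optim.Adam).parameters
    # e.g. -100 and 0.001 with current PyTorch releases
    return loss_params['ignore_index'].default, optim_params['lr'].default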
def main(params, config_path):
    """
    Function to train and validate a model for semantic segmentation or classification.
    :param params: (dict) Parameters found in the yaml config file.
    :param config_path: (str) Path to the yaml config file.
    """
    debug = get_key_def('debug_mode', params['global'], False)
    if debug:
        warnings.warn('Debug mode activated. Some debug features may use extra disk space and slow down execution.')

    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    num_classes = params['global']['num_classes']
    task = params['global']['task']
    assert task == 'segmentation', f"The task should be segmentation. The provided value is {task}"
    num_classes_corrected = num_classes + 1  # + 1 for background  # FIXME temporary patch for num_classes problem.

    # INSTANTIATE MODEL AND LOAD CHECKPOINT FROM PATH
    model, checkpoint, model_name = net(params, num_classes_corrected)  # pretrained could become a yaml parameter.
    tqdm.write(f'Instantiated {model_name} model with {num_classes_corrected} output channels.\n')

    bucket_name = params['global']['bucket_name']
    data_path = params['global']['data_path']
    assert Path(data_path).is_dir(), f'Could not locate data path {data_path}'
    samples_size = params["global"]["samples_size"]
    overlap = params["sample"]["overlap"]
    min_annot_perc = params['sample']['sampling']['map']
    num_bands = params['global']['number_of_bands']
    samples_folder_name = f'samples{samples_size}_overlap{overlap}_min-annot{min_annot_perc}_{num_bands}bands'  # FIXME: preferred name structure? document!
    samples_folder = Path(data_path).joinpath(samples_folder_name) if task == 'segmentation' else Path(data_path)

    modelname = config_path.stem
    output_path = Path(samples_folder).joinpath('model') / modelname
    if output_path.is_dir():
        output_path = Path(str(output_path) + '_' + now)
    output_path.mkdir(parents=True, exist_ok=False)
    shutil.copy(str(config_path), str(output_path))
    tqdm.write(f'Model and log files will be saved to: {output_path}\n\n')

    batch_size = params['training']['batch_size']

    if bucket_name:
        bucket, bucket_output_path, output_path, data_path = download_s3_files(bucket_name=bucket_name,
                                                                               data_path=data_path,
                                                                               output_path=output_path)

    since = time.time()
    best_loss = 999
    last_vis_epoch = 0

    progress_log = Path(output_path) / 'progress.log'
    if not progress_log.exists():
        progress_log.open('w', buffering=1).write(tsv_line('ep_idx', 'phase', 'iter', 'i_p_ep', 'time'))  # Add header

    trn_log = InformationLogger(output_path, 'trn')
    val_log = InformationLogger(output_path, 'val')
    tst_log = InformationLogger(output_path, 'tst')

    num_devices = params['global']['num_gpus']
    assert num_devices is not None and num_devices >= 0, "missing mandatory num gpus parameter"
    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    num_devices = len(lst_device_ids) if lst_device_ids else 0
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')
    print(f"Number of cuda devices requested: {params['global']['num_gpus']}. Cuda devices available: {lst_device_ids}\n")
    if num_devices == 1:
        print(f"Using Cuda device {lst_device_ids[0]}\n")
    elif num_devices > 1:
        print(f"Using data parallel on devices: {str(lst_device_ids)[1:-1]}. Main device: {lst_device_ids[0]}\n")  # TODO: why are we showing indices [1:-1] for lst_device_ids?
        try:  # FIXME: For HPC when device 0 not available. Error: Invalid device id (in torch/cuda/__init__.py).
            model = nn.DataParallel(model, device_ids=lst_device_ids)  # DataParallel adds prefix 'module.' to state_dict keys
        except AssertionError:
            warnings.warn(f"Unable to use devices {lst_device_ids}. Trying devices {list(range(len(lst_device_ids)))}")
            device = torch.device('cuda:0')
            lst_device_ids = range(len(lst_device_ids))
            model = nn.DataParallel(model, device_ids=lst_device_ids)  # DataParallel adds prefix 'module.' to state_dict keys
    else:
        warnings.warn("No Cuda device available. This process will only run on CPU\n")

    tqdm.write(f'Creating dataloaders from data in {samples_folder}...\n')
    trn_dataloader, val_dataloader, tst_dataloader = create_dataloader(data_path=data_path,
                                                                       batch_size=batch_size,
                                                                       task=task,
                                                                       num_devices=num_devices,
                                                                       params=params,
                                                                       samples_folder=samples_folder)

    tqdm.write('Setting model, criterion, optimizer and learning rate scheduler...\n')
    model, criterion, optimizer, lr_scheduler = set_hyperparameters(params, num_classes_corrected, model, checkpoint)

    criterion = criterion.to(device)
    try:  # For HPC when device 0 not available. Error: Cuda invalid device ordinal.
        model.to(device)
    except RuntimeError:
        warnings.warn("Unable to use device. Trying device 0...\n")
        device = torch.device('cuda:0' if torch.cuda.is_available() and lst_device_ids else 'cpu')
        model.to(device)

    filename = os.path.join(output_path, 'checkpoint.pth.tar')

    # VISUALIZATION: generate pngs of inputs, labels and outputs
    vis_batch_range = get_key_def('vis_batch_range', params['visualization'], None)
    if vis_batch_range is not None:
        # Make sure user-provided range is a list of 3 integers (start, finish, increment). Check once for all visualization tasks.
        assert isinstance(vis_batch_range, list) and len(vis_batch_range) == 3 and all(isinstance(x, int) for x in vis_batch_range)
        vis_at_init = get_key_def('vis_at_init', params['visualization'], False)
        vis_at_init_dataset = get_key_def('vis_at_init_dataset', params['visualization'], 'val')
        if vis_at_init:
            tqdm.write(f'Visualizing initialized model on batch range {vis_batch_range} from {vis_at_init_dataset} dataset...\n')
            vis_from_dataloader(params=params,
                                eval_loader=val_dataloader if vis_at_init_dataset == 'val' else tst_dataloader,
                                model=model,
                                ep_num=0,
                                output_path=output_path,
                                dataset=vis_at_init_dataset,
                                device=device,
                                vis_batch_range=vis_batch_range)

    for epoch in range(0, params['training']['num_epochs']):
        print(f'\nEpoch {epoch}/{params["training"]["num_epochs"] - 1}\n{"-" * 20}')

        trn_report = train(train_loader=trn_dataloader,
                           model=model,
                           criterion=criterion,
                           optimizer=optimizer,
                           scheduler=lr_scheduler,
                           num_classes=num_classes_corrected,
                           batch_size=batch_size,
                           task=task,
                           ep_idx=epoch,
                           progress_log=progress_log,
                           vis_params=params,
                           device=device,
                           debug=debug)
        trn_log.add_values(trn_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        val_report = evaluation(eval_loader=val_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes_corrected,
                                batch_size=batch_size,
                                task=task,
                                ep_idx=epoch,
                                progress_log=progress_log,
                                vis_params=params,
                                batch_metrics=params['training']['batch_metrics'],
                                dataset='val',
                                device=device,
                                debug=debug)
        val_loss = val_report['loss'].avg
        if params['training']['batch_metrics'] is not None:
            val_log.add_values(val_report, epoch)
        else:
            val_log.add_values(val_report, epoch, ignore=['precision', 'recall', 'fscore', 'iou'])

        if val_loss < best_loss:
            tqdm.write("save checkpoint\n")
            best_loss = val_loss
            # More info: https://pytorch.org/tutorials/beginner/saving_loading_models.html#saving-torch-nn-dataparallel-models
            state_dict = model.module.state_dict() if num_devices > 1 else model.state_dict()
            torch.save({'epoch': epoch,
                        'arch': model_name,
                        'model': state_dict,
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict()}, filename)
            if bucket_name:
                bucket_filename = os.path.join(bucket_output_path, 'checkpoint.pth.tar')
                bucket.upload_file(filename, bucket_filename)

        # VISUALIZATION: generate pngs of test samples, labels and outputs to follow training performance
        vis_at_checkpoint = get_key_def('vis_at_checkpoint', params['visualization'], False)
        ep_vis_min_thresh = get_key_def('vis_at_ckpt_min_ep_diff', params['visualization'], 4)
        vis_at_ckpt_dataset = get_key_def('vis_at_ckpt_dataset', params['visualization'], 'val')
        if vis_batch_range is not None and vis_at_checkpoint and epoch - last_vis_epoch >= ep_vis_min_thresh:
            if last_vis_epoch == 0:
                tqdm.write(f'Visualizing with {vis_at_ckpt_dataset} dataset samples on checkpointed model for batches {vis_batch_range}')
            vis_from_dataloader(params=params,
                                eval_loader=val_dataloader if vis_at_ckpt_dataset == 'val' else tst_dataloader,
                                model=model,
                                ep_num=epoch + 1,
                                output_path=output_path,
                                dataset=vis_at_ckpt_dataset,
                                device=device,
                                vis_batch_range=vis_batch_range)
            last_vis_epoch = epoch

        if bucket_name:
            save_logs_to_bucket(bucket, bucket_output_path, output_path, now, params['training']['batch_metrics'])

        cur_elapsed = time.time() - since
        print(f'Current elapsed time {cur_elapsed // 60:.0f}m {cur_elapsed % 60:.0f}s')

    # load checkpoint model and evaluate it on test dataset.
    if int(params['training']['num_epochs']) > 0:  # if num_epochs is set to 0, model is loaded to evaluate on test set
        checkpoint = load_checkpoint(filename)
        model, _ = load_from_checkpoint(checkpoint, model)

    if tst_dataloader:
        tst_report = evaluation(eval_loader=tst_dataloader,
                                model=model,
                                criterion=criterion,
                                num_classes=num_classes_corrected,
                                batch_size=batch_size,
                                task=task,
                                ep_idx=params['training']['num_epochs'],
                                progress_log=progress_log,
                                vis_params=params,
                                batch_metrics=params['training']['batch_metrics'],
                                dataset='tst',
                                device=device)
        tst_log.add_values(tst_report, params['training']['num_epochs'])

        if bucket_name:
            bucket_filename = os.path.join(bucket_output_path, 'last_epoch.pth.tar')
            bucket.upload_file("output.txt", os.path.join(bucket_output_path, f"Logs/{now}_output.txt"))
            bucket.upload_file(filename, bucket_filename)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
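# --- Hedged example (illustrative; the real helper lives in the project's utils) ---
# get_key_def, used throughout this file, reads an optional key from a config dict
# and falls back to a default when the key is absent or None. A minimal sketch of
# that behaviour, under the assumption that it mirrors dict.get plus a None check;
# the actual helper may also validate types or emit messages.
def _example_get_key_def(key, config, default=None):
    if key not in config or config[key] is None:
        return default
    return config[key]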
def main(params: dict):
    """
    Identify the class to which each image belongs.
    :param params: (dict) Parameters found in the yaml config file.
    """
    # SET BASIC VARIABLES AND PATHS
    since = time.time()
    task = params['global']['task']
    img_dir_or_csv = params['inference']['img_dir_or_csv_file']
    chunk_size = get_key_def('chunk_size', params['inference'], 512)
    prediction_with_smoothing = get_key_def('smooth_prediction', params['inference'], False)
    overlap = get_key_def('overlap', params['inference'], 2)
    num_classes = params['global']['num_classes']
    num_classes_corrected = add_background_to_num_class(task, num_classes)
    num_bands = params['global']['number_of_bands']
    working_folder = Path(params['inference']['state_dict_path']).parent.joinpath(f'inference_{num_bands}bands')
    num_devices = params['global']['num_gpus'] if params['global']['num_gpus'] else 0
    Path.mkdir(working_folder, parents=True, exist_ok=True)
    print(f'Inferences will be saved to: {working_folder}\n\n')

    bucket = None
    bucket_file_cache = []
    bucket_name = get_key_def('bucket_name', params['global'])

    # list of GPU devices that are available and unused. If no GPUs, returns empty list
    lst_device_ids = get_device_ids(num_devices) if torch.cuda.is_available() else []
    device = torch.device(f'cuda:{lst_device_ids[0]}' if torch.cuda.is_available() and lst_device_ids else 'cpu')
    if lst_device_ids:
        print(f"Number of cuda devices requested: {num_devices}. Cuda devices available: {lst_device_ids}. Using {lst_device_ids[0]}\n\n")
    else:
        warnings.warn("No Cuda device available. This process will only run on CPU")

    # CONFIGURE MODEL
    model, state_dict_path, model_name = net(params, num_channels=num_classes_corrected, inference=True)
    try:
        model.to(device)
    except RuntimeError:
        print("Unable to use device. Trying device 0")
        device = torch.device('cuda:0' if torch.cuda.is_available() and lst_device_ids else 'cpu')
        model.to(device)

    # mlflow tracking path + parameters logging
    set_tracking_uri(get_key_def('mlflow_uri', params['global'], default="./mlruns"))
    set_experiment('gdl-benchmarking/' + working_folder.name)
    log_params(params['global'])
    log_params(params['inference'])

    # CREATE LIST OF INPUT IMAGES FOR INFERENCE
    list_img = list_input_images(img_dir_or_csv, bucket_name, glob_patterns=["*.tif", "*.TIF"])

    if task == 'classification':  # FIXME: why don't we load from checkpoint in classification?
        classifier(params, list_img, model, device, working_folder)
    elif task == 'segmentation':
        # TODO: Add verifications?
        if bucket:
            bucket.download_file(state_dict_path, "saved_model.pth.tar")  # TODO: is this still valid?
            model, _ = load_from_checkpoint("saved_model.pth.tar", model)
        else:
            model, _ = load_from_checkpoint(state_dict_path, model)

        # LOOP THROUGH LIST OF INPUT IMAGES
        with tqdm(list_img, desc='image list', position=0) as _tqdm:
            for info in _tqdm:
                img_name = Path(info['tif']).name
                local_gpkg = info['gpkg']
                if local_gpkg:
                    local_gpkg = Path(local_gpkg)
                if bucket:
                    local_img = f"Images/{img_name}"
                    bucket.download_file(info['tif'], local_img)
                    inference_image = f"Classified_Images/{img_name.split('.')[0]}_inference.tif"
                    if info['meta']:
                        if info['meta'] not in bucket_file_cache:
                            bucket_file_cache.append(info['meta'])
                            bucket.download_file(info['meta'], info['meta'].split('/')[-1])
                        info['meta'] = info['meta'].split('/')[-1]
                else:  # FIXME: else statement should support img['meta'] integration as well.
                    local_img = Path(info['tif'])
                    inference_image = working_folder.joinpath(f"{img_name.split('.')[0]}_inference.tif")
                    print(inference_image)
                assert local_img.is_file(), f"Could not locate raster file at {local_img}"
                with rasterio.open(local_img, 'r') as raster:
                    inf_meta = raster.meta

                    if prediction_with_smoothing:
                        print('Smoothing predictions with 2D interpolation')
                        pred = segmentation_with_smoothing(raster, local_gpkg, model, chunk_size, overlap, num_bands, device)
                    else:
                        pred = segmentation(raster, local_gpkg, model, chunk_size, num_bands, device)

                    if local_gpkg:
                        assert local_gpkg.is_file(), f"Could not locate gpkg file at {local_gpkg}"
                        label = vector_to_raster(vector_file=local_gpkg,
                                                 input_image=raster,
                                                 out_shape=pred.shape[:2],
                                                 attribute_name=info['attribute_name'],
                                                 fill=0)  # background value in rasterized vector.
                        with start_run(run_name=img_name, nested=True):
                            pixelMetrics = ComputePixelMetrics(label, pred, num_classes_corrected)
                            log_metrics(pixelMetrics.update(pixelMetrics.jaccard))
                            log_metrics(pixelMetrics.update(pixelMetrics.dice))
                            log_metrics(pixelMetrics.update(pixelMetrics.accuracy))
                            log_metrics(pixelMetrics.update(pixelMetrics.precision))
                            log_metrics(pixelMetrics.update(pixelMetrics.recall))
                            log_metrics(pixelMetrics.update(pixelMetrics.matthews))
                        label_classes = np.unique(label)
                        assert len(colors) >= len(label_classes), 'Not enough colors and class names for number of classes in output'
                        # FIXME: color mapping scheme is hardcoded for now because of memory constraint; To be fixed.
                        label_rgb = ind2rgb(label, colors)
                        pred_rgb = ind2rgb(pred, colors)
                        Image.fromarray(label_rgb.astype(np.uint8), mode='RGB').save(
                            os.path.join(working_folder, 'label_rgb_' + inference_image.stem + '.png'))
                        Image.fromarray(pred_rgb.astype(np.uint8), mode='RGB').save(
                            os.path.join(working_folder, 'pred_rgb_' + inference_image.stem + '.png'))
                        del label_rgb, pred_rgb

                    pred = pred[np.newaxis, :, :]
                    inf_meta.update({"driver": "GTiff",
                                     "height": pred.shape[1],
                                     "width": pred.shape[2],
                                     "count": pred.shape[0],
                                     "dtype": 'uint8'})
                    with rasterio.open(inference_image, 'w+', **inf_meta) as dest:
                        dest.write(pred)
        log_artifact(working_folder)

    time_elapsed = time.time() - since
    print('Inference and Benchmarking completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
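# --- Hedged example (illustrative; not the project's ind2rgb) ---
# ind2rgb above maps an (H, W) array of class indices to an RGB image using the
# hardcoded `colors` palette. A minimal numpy version of that mapping, assuming
# `palette` is a sequence of (r, g, b) tuples indexed by class; names here are
# hypothetical.
def _example_ind2rgb(index_map, palette):
    rgb = np.zeros((*index_map.shape, 3), dtype=np.uint8)
    for class_idx, color in enumerate(palette):
        rgb[index_map == class_idx] = color  # paint every pixel of this class
    return rgb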