def prepare_input(input_parcel_filepath: str, classtype_to_prepare: str, output_dir: str):
    """
    Prepare an input parcel file so it is compliant with the assumptions used
    by the rest of the classification functionality.

    The result is a csv file with the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the classes that
          will be classified to

    Args:
        input_parcel_filepath: path to the input parcel file (geo file or csv).
        classtype_to_prepare: the classification type to prepare for, e.g.
            'CROPGROUP', 'LANDCOVER_EARLY', 'POPULAR_CROP_GROUNDTRUTH', ...
        output_dir: run directory the refe (classes) file is copied to.

    Raises:
        Exception: if an input file doesn't exist, the id column is missing,
            or classtype_to_prepare has an unknown value.
    """
    # Check if input parameters are OK
    if not os.path.exists(input_parcel_filepath):
        raise Exception(f"Input file doesn't exist: {input_parcel_filepath}")
    else:
        logger.info(f"Process input file {input_parcel_filepath}")

    # Read input file
    logger.info(f"Read parceldata from {input_parcel_filepath}")
    if geofile_util.is_geofile(input_parcel_filepath):
        parceldata_df = geofile_util.read_file(input_parcel_filepath)
    else:
        parceldata_df = pdh.read_file(input_parcel_filepath)
    logger.info(f"Read Parceldata ready, info(): {parceldata_df.info()}")

    # Check if the id column is present...
    if conf.columns['id'] not in parceldata_df.columns:
        message = (
            f"Column {conf.columns['id']} not found in input parcel file: "
            f"{input_parcel_filepath}. Make sure the column is present or "
            "change the column name in global_constants.py")
        logger.critical(message)
        raise Exception(message)

    # Copy the refe file to the run dir, so we always keep knowing which refe was used
    input_classes_filepath = conf.preprocess['classtype_to_prepare_refe_filepath']
    if not os.path.exists(input_classes_filepath):
        raise Exception(f"Input classes file doesn't exist: {input_classes_filepath}")
    shutil.copy(input_classes_filepath, output_dir)

    # Dispatch table: base classtype -> prepare function. The *_GROUNDTRUTH
    # variants use the same functions, but take the verified crop code as input
    # and write to the groundtruth class column instead.
    prepare_funcs = {
        'CROPGROUP': prepare_input_cropgroup,
        'CROPGROUP_EARLY': prepare_input_cropgroup_early,
        'LANDCOVER': prepare_input_landcover,
        'LANDCOVER_EARLY': prepare_input_landcover_early,
        'POPULAR_CROP': prepare_input_most_popular_crop,
    }
    groundtruth_suffix = '_GROUNDTRUTH'
    is_groundtruth = classtype_to_prepare.endswith(groundtruth_suffix)
    base_classtype = (classtype_to_prepare[:-len(groundtruth_suffix)]
                      if is_groundtruth else classtype_to_prepare)

    prepare_func = prepare_funcs.get(base_classtype)
    if prepare_func is None:
        message = f"Unknown value for parameter classtype_to_prepare: {classtype_to_prepare}"
        logger.fatal(message)
        raise Exception(message)

    if is_groundtruth:
        # Groundtruth: a single pass based on the verified crop code
        return prepare_func(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_gt_verified,
            column_output_class=conf.columns['class_groundtruth'])

    # Standard: first prepare the declared classes, then the main classes
    parceldata_df = prepare_func(
        parceldata_df=parceldata_df,
        column_BEFL_cropcode=column_BEFL_crop_declared,
        column_output_class=conf.columns['class_declared'])
    return prepare_func(
        parceldata_df=parceldata_df,
        column_BEFL_cropcode=column_BEFL_crop,
        column_output_class=conf.columns['class'])
def prepare_input(input_parcel_filepath: str,
                  output_imagedata_parcel_input_filepath: str,
                  output_parcel_nogeo_filepath: str = None,
                  force: bool = False):
    """
    Preprocess a parcel file to be a good input file for timeseries extraction
    of sentinel images.

    Args:
        input_parcel_filepath: input file
        output_imagedata_parcel_input_filepath: prepared output file
        output_parcel_nogeo_filepath: optional output file with a copy of the
            non-geo data. If None, no such file is written.
        force: force creation, even if output file(s) exist already

    Raises:
        Exception: if the input file doesn't exist, has no crs specified or
            the id column is missing.
    """
    ##### Check if parameters are OK and init some extra params #####
    if not os.path.exists(input_parcel_filepath):
        raise Exception(f"Input file doesn't exist: {input_parcel_filepath}")

    # Check if the input file has a projection specified
    if geofile_util.get_crs(input_parcel_filepath) is None:
        message = f"The parcel input file doesn't have a projection/crs specified, so STOP: {input_parcel_filepath}"
        logger.critical(message)
        raise Exception(message)

    # If force == False and the output file(s) exist already, stop.
    if (force is False
            and os.path.exists(output_imagedata_parcel_input_filepath)
            and (output_parcel_nogeo_filepath is None
                 or os.path.exists(output_parcel_nogeo_filepath))):
        logger.warning(
            "prepare_input: force == False and output files exist, so stop: "
            + f"{output_imagedata_parcel_input_filepath}, "
            + f"{output_parcel_nogeo_filepath}")
        return

    logger.info(f"Process input file {input_parcel_filepath}")

    # Create temp dir to store temporary data for tracebility
    output_dir, output_filename = os.path.split(output_imagedata_parcel_input_filepath)
    output_filename_noext = os.path.splitext(output_filename)[0]
    temp_output_dir = os.path.join(output_dir, 'temp')
    if not os.path.exists(temp_output_dir):
        os.mkdir(temp_output_dir)

    ##### Read the parcel data and write nogeo version #####
    parceldata_gdf = geofile_util.read_file(input_parcel_filepath)
    logger.info(f'Parceldata read, shape: {parceldata_gdf.shape}')

    # Check if the id column is present and set as index
    if conf.columns['id'] in parceldata_gdf.columns:
        parceldata_gdf.set_index(conf.columns['id'], inplace=True)
    else:
        message = f"STOP: Column {conf.columns['id']} not found in input parcel file: {input_parcel_filepath}. Make sure the column is present or change the column name in global_constants.py"
        logger.critical(message)
        raise Exception(message)

    # BUGFIX: only write the nogeo file when a path is given: with the default
    # None, os.path.exists(None) raised a TypeError here.
    if output_parcel_nogeo_filepath is not None and (
            force is True or not os.path.exists(output_parcel_nogeo_filepath)):
        logger.info(f"Save non-geo data to {output_parcel_nogeo_filepath}")
        parceldata_nogeo_df = parceldata_gdf.drop(['geometry'], axis=1)
        pdh.to_file(parceldata_nogeo_df, output_parcel_nogeo_filepath)

    ##### Do the necessary conversions and write buffered file #####

    # If force == False and the output file exists already, stop.
    if (force is False
            and os.path.exists(output_imagedata_parcel_input_filepath)):
        logger.warning(
            "prepare_input: force == False and output files exist, so stop: "
            + f"{output_imagedata_parcel_input_filepath}")
        return

    logger.info('Apply buffer on parcel')
    parceldata_buf_gdf = parceldata_gdf.copy()

    # Negative buffer shrinks the parcels; resolution = number of segments per circle
    buffer_size = -conf.marker.getint('buffer')
    parceldata_buf_gdf[conf.columns['geom']] = (
        parceldata_buf_gdf[conf.columns['geom']].buffer(buffer_size, resolution=5))

    # Export buffered geometries that result in empty geometries
    logger.info('Export parcels that are empty after buffer')
    # .copy() so the later inplace drop doesn't act on a view of the original
    parceldata_buf_empty_df = parceldata_buf_gdf.loc[
        parceldata_buf_gdf[conf.columns['geom']].is_empty].copy()
    if len(parceldata_buf_empty_df.index) > 0:
        parceldata_buf_empty_df.drop(conf.columns['geom'], axis=1, inplace=True)
        temp_empty_filepath = os.path.join(
            temp_output_dir, f"{output_filename_noext}_empty.sqlite")
        pdh.to_file(parceldata_buf_empty_df, temp_empty_filepath)

    # Export parcels that don't result in a (multi)polygon
    parceldata_buf_notempty_gdf = parceldata_buf_gdf.loc[
        ~parceldata_buf_gdf[conf.columns['geom']].is_empty]
    parceldata_buf_nopoly_gdf = parceldata_buf_notempty_gdf.loc[
        ~parceldata_buf_notempty_gdf[conf.columns['geom']].geom_type.isin(
            ['Polygon', 'MultiPolygon'])].copy()
    if len(parceldata_buf_nopoly_gdf.index) > 0:
        logger.info('Export parcels that are no (multi)polygons after buffer')
        parceldata_buf_nopoly_gdf.drop(conf.columns['geom'], axis=1, inplace=True)
        temp_nopoly_filepath = os.path.join(
            temp_output_dir, f"{output_filename_noext}_nopoly.sqlite")
        geofile_util.to_file(parceldata_buf_nopoly_gdf, temp_nopoly_filepath)

    # Export parcels that are (multi)polygons after buffering
    parceldata_buf_poly_gdf = parceldata_buf_notempty_gdf.loc[
        parceldata_buf_notempty_gdf[conf.columns['geom']].geom_type.isin(
            ['Polygon', 'MultiPolygon'])].copy()
    # Only keep the id (index) and geometry columns in the output
    columns_to_drop = [
        column for column in parceldata_buf_poly_gdf.columns
        if column not in (conf.columns['id'], conf.columns['geom'])]
    if columns_to_drop:
        parceldata_buf_poly_gdf.drop(columns_to_drop, axis=1, inplace=True)
    logger.info(
        f"Export parcels that are (multi)polygons after buffer to {output_imagedata_parcel_input_filepath}"
    )
    geofile_util.to_file(
        parceldata_buf_poly_gdf, output_imagedata_parcel_input_filepath)
    logger.info(parceldata_buf_poly_gdf)
def calc_timeseries_data(input_parcel_filepath: Path,
                         input_country_code: str,
                         start_date_str: str,
                         end_date_str: str,
                         sensordata_to_get: List[str],
                         base_filename: str,
                         dest_data_dir: Path):
    """
    Calculate timeseries data for the input parcels.

    Args:
        input_parcel_filepath: the (geo) parcel input file.
        input_country_code: country code the parcels belong to.
        start_date_str: start date of the timeseries period.
        end_date_str: end date of the timeseries period.
        sensordata_to_get: a list with the data you want to be calculated:
            check out the constants starting with DATA_TO_GET... for the
            options.
        base_filename: base file name for the result files.
        dest_data_dir: directory the timeseries data is downloaded to.

    Raises:
        Exception: if sensordata_to_get is None, if the gee calculation keeps
            failing, or if the results don't become available for download in
            time.
    """
    ##### Check and init some stuff #####
    if sensordata_to_get is None:
        raise Exception("sensordata_to_get cannot be None")
    if not dest_data_dir.exists():
        os.mkdir(dest_data_dir)

    # To have a good precision, the vector input must be uploaded to gee in WGS84!
    input_preprocessed_dir = conf.dirs.getpath('input_preprocessed_dir')
    input_parcel_4326_filepath = (
        input_preprocessed_dir / f"{input_parcel_filepath.stem}_4326.shp")

    # If the WGS84 version doesn't exist yet, create it
    if not input_parcel_4326_filepath.exists():
        input_parcel_gdf = geofile.read_file(input_parcel_filepath)
        target_epsg = 4326
        logger.info(
            f"Reproject features from {input_parcel_gdf.crs} to epsg:{target_epsg}")
        input_parcel_4326_gdf = input_parcel_gdf.to_crs(epsg=target_epsg)
        logger.info(f"Write reprojected features to {input_parcel_4326_filepath}")
        geofile.to_file(input_parcel_4326_gdf, input_parcel_4326_filepath)

    ##### Start calculation of the timeseries on gee #####
    logger.info("Start create_sentinel_timeseries_info")

    # On windows machines there seems to be an issue with gee. The following error is very
    # common, probably because there are too many sockets created in a short time... and the
    # cleanup procedure in windows can't follow:
    #     "OSError: [WinError 10048] Elk socketadres (protocol/netwerkadres/poort) kan
    #      normaal slechts één keer worden gebruikt"
    #     (Dutch for: "Each socket address can normally only be used once")
    # So execute in a loop and retry every 10 seconds... this seems to be a working workaround.
    nb_retries = 0
    done_success = False
    while done_success is False and nb_retries < 10:
        try:
            calculate_sentinel_timeseries(
                input_parcel_filepath=input_parcel_filepath,
                input_country_code=input_country_code,
                start_date_str=start_date_str,
                end_date_str=end_date_str,
                sensordata_to_get=sensordata_to_get,
                base_filename=base_filename,
                dest_data_dir=dest_data_dir)
            done_success = True
        except OSError as ex:
            nb_retries += 1
            # Only retry for the known windows socket-reuse error; anything else
            # is re-raised. getattr: winerror only exists on windows OSErrors.
            if os.name == 'nt' and getattr(ex, 'winerror', None) == 10048:
                logger.warning(
                    f"Exception [WinError {ex.winerror}] while trying "
                    f"calculate_sentinel_timeseries, retry! (Full exception message {ex})")
                time.sleep(10)
            else:
                raise

    # If it wasn't successful, log and stop.
    if done_success is False:
        message = "STOP: calculate_sentinel_timeseries couldn't be completed even after many retries..."
        logger.critical(message)
        raise Exception(message)

    # Download the data from GEE
    return_status = 'UNDEFINED'
    number_retries = 0
    while return_status == 'UNDEFINED' or return_status == 'RETRY_NEEDED':
        # Download the results
        try:
            logger.info('Now download needed timeseries files')
            return_status = download_sentinel_timeseries(
                dest_data_dir=dest_data_dir, base_filename=base_filename)

            # Retry in a few minutes if not all data was available yet
            if return_status == 'RETRY_NEEDED':
                logger.info(
                    'Not all data was available yet on google drive... try again in a few minutes...'
                )

                # Retry only 70 times (70 * 5 minutes = +- 6 hours)
                if number_retries >= 70:
                    return_status = 'STOP'
                    message = "Retried a lot of times, but data still isn't available"
                    logger.error(message)
                    raise Exception(message)

                # Wait 5 minutes (30 * 10 s) before retrying again... sleeping
                # only 10 seconds at a time so it can be cancelled.
                nb_sleeps = 0
                while nb_sleeps < 30:
                    time.sleep(10)
                    nb_sleeps += 1
                number_retries += 1
        except Exception:
            # BUGFIX: was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit. Log and re-raise.
            logger.error('ERROR downloading from google drive!')
            raise