def clean_gee_downloaded_csv(csv_file: str, remove_orig_csv: bool = False):
    """ Cleans a csv downloaded from gee by removing gee specific columns... """
    try:
        # Prepare output filename
        file_noext, _ = os.path.splitext(csv_file)
        output_file = f"{file_noext}{conf.general['data_ext']}"

        # Check if output file exists already even though it is different from input file
        if output_file != csv_file and os.path.exists(output_file):
            logger.warning(f"Output file exists already, so don't create it again: {output_file}")
        elif os.path.getsize(csv_file) == 0:
            # If input file is empty...
            logger.info(f"File is empty, so just create new empty output file: {output_file}")
            open(output_file, 'w').close()
        else:
            # Read the file
            logger.info(f"Read file and remove gee specific columns from {csv_file}")

            # Sample 100 rows of data to determine dtypes, so floats can be read as float32 instead
            # of the default float64. Writing those to eg. parquet is a lot more efficiënt.
            test_df = pd.read_csv(csv_file, nrows=100)
            float_cols = [c for c in test_df if test_df[c].dtype == "float64"]
            float32_cols = {c: np.float32 for c in float_cols}

            # Now read entire file
            data_read_df = pd.read_csv(csv_file, engine='c', dtype=float32_cols)

            # Drop unnecessary gee specific columns...
            for column in data_read_df.columns:
                if column in ['system:index', '.geo']:
                    data_read_df.drop(column, axis=1, inplace=True)
                elif column == 'count':
                    logger.info(f"Rename count column to {conf.columns['pixcount_s1s2']}")
                    data_read_df.rename(columns={'count': conf.columns['pixcount_s1s2']}, inplace=True)

            # Set the id column as index
            data_read_df.set_index(conf.columns['id'], inplace=True)

            # If there are data columns, write to output file
            if len(data_read_df.columns) > 0:
                # Replace the original file by the cleaned one
                logger.info(f"Write the file with the gee specific columns removed to a new file: {output_file}")
                pdh.to_file(data_read_df, output_file, index=True)
            else:
                logger.warning(f"No data columns found in file {csv_file}, so return!")
                return

        # If remove_orig_csv is True and the output filepath is different from the orig filepath,
        # remove the orig file.
        if remove_orig_csv and output_file != csv_file:
            logger.info(f"Remove orig csv file: {csv_file}")
            os.remove(csv_file)

    except Exception as ex:
        raise Exception(f"Error processing file {csv_file}") from ex
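# Hedged usage sketch (not part of the original module): how clean_gee_downloaded_csv could be
# applied to a folder of GEE exports. The folder path and the "*.csv" pattern below are
# hypothetical examples, not values taken from the project configuration.
def _example_clean_gee_downloads():
    import glob

    download_dir = "/tmp/gee_downloads"   # hypothetical folder with downloaded GEE csv's
    for csv_path in sorted(glob.glob(os.path.join(download_dir, "*.csv"))):
        # Clean each file and remove the original csv once the cleaned version is written
        clean_gee_downloaded_csv(csv_path, remove_orig_csv=True)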
def prepare_input(input_parcel_filepath: str,
                  input_parcel_filetype: str,
                  input_parcel_pixcount_filepath: str,
                  classtype_to_prepare: str,
                  output_parcel_filepath: str,
                  force: bool = False):
    """
    Prepare a raw input file by eg. adding the classification classes to use for the
    classification,...
    """
    # If force is False and the output file exists already, stop.
    if force is False and os.path.exists(output_parcel_filepath):
        logger.warning(f"prepare_input: output file already exists and force == False, so stop: {output_parcel_filepath}")
        return

    if input_parcel_filetype == 'BEFL':
        output_dir, _ = os.path.split(output_parcel_filepath)
        df_parceldata = befl.prepare_input(
                input_parcel_filepath=input_parcel_filepath,
                classtype_to_prepare=classtype_to_prepare,
                output_dir=output_dir)
    else:
        message = f"Unknown value for parameter input_parcel_filetype: {input_parcel_filetype}"
        logger.critical(message)
        raise Exception(message)

    # Load pixcount data and join it
    logger.info(f"Read pixcount file {input_parcel_pixcount_filepath}")
    df_pixcount = pdh.read_file(input_parcel_pixcount_filepath)
    logger.debug(f"Read pixcount file ready, shape: {df_pixcount.shape}")
    if df_pixcount.index.name != conf.columns['id']:
        df_pixcount.set_index(conf.columns['id'], inplace=True)

    df_parceldata.set_index(conf.columns['id'], inplace=True)
    df_parceldata = df_parceldata.join(df_pixcount[conf.columns['pixcount_s1s2']], how='left')

    # Export result to file
    output_ext = os.path.splitext(output_parcel_filepath)[1]
    for column in df_parceldata.columns:
        # If the output asked for is a csv... we don't need the geometry...
        if column == conf.columns['geom'] and output_ext == '.csv':
            df_parceldata.drop(column, axis=1, inplace=True)

    logger.info(f"Write output to {output_parcel_filepath}")
    # If extension is not .shp, write using pandas (= a lot faster!)
    if output_ext.lower() != '.shp':
        pdh.to_file(df_parceldata, output_parcel_filepath)
    else:
        df_parceldata.to_file(output_parcel_filepath, index=False)
def detect_multicrop(input_parcel_filepath: Path, input_parcel_timeseries_data_filepath: Path):
    '''
    logger.info(f"Read input file: {input_parcel_filepath}")
    df_input_parcel = pd.read_csv(input_parcel_filepath, low_memory=False)
    logger.debug('Read train file ready')
    '''
    # If the classification data isn't passed as dataframe, read it from the csv
    logger.info(f"Read classification data file: {input_parcel_timeseries_data_filepath}")
    df_timeseries_data = pd.read_csv(input_parcel_timeseries_data_filepath, low_memory=False)
    df_timeseries_data.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read classification data file ready')

    # Add column with the max of all columns (= all stdDev's)
    df_timeseries_data['max_stddev'] = df_timeseries_data.max(axis=1)

    '''
    # Prepare the data to send to prediction logic...
    logger.info("Join train sample with the classification data")
    df_input_parcel_for_detect = (df_input_parcel#[[gs.id_column, gs.class_column]]
                                  .join(df_timeseries_data, how='inner', on=gs.id_column))

    # Only keep the parcels with relevant crops/production types
    productiontype_column = 'GESP_PM'
    if productiontype_column in df_input_parcel_for_detect.columns:
        # Greenhouses, temporary coverings and sheds
        df_input_parcel_for_detect.loc[~df_input_parcel_for_detect[productiontype_column].isin(['SER', 'SGM'])]
        df_input_parcel_for_detect.loc[~df_input_parcel_for_detect[productiontype_column].isin(['PLA', 'PLO', 'NPO'])]
        df_input_parcel_for_detect.loc[df_input_parcel_for_detect[productiontype_column] != 'LOO']   # A shed ("loods") is the same as a stable...
        df_input_parcel_for_detect.loc[df_input_parcel_for_detect[productiontype_column] != 'CON']   # Containers, not in open ground...

    crop_columnname = 'GWSCOD_H'
    df_input_parcel_for_detect.loc[~df_input_parcel_for_detect[crop_columnname].isin(['1', '2', '3'])]

    # Keep the parcels with the 1000 largest stdDev
    df_largest = df_input_parcel_for_detect.nlargest(1000, columns='max_stddev', keep='first')
    '''
    #df_result = df_timeseries_data['max_stddev'].to_frame()
    df_result = df_timeseries_data
    logger.info(df_result)

    # Write to file
    output_filepath = Path(str(input_parcel_timeseries_data_filepath) + '_largestStdDev.csv')
    logger.info(f"Write output file: {output_filepath}")
    pdh.to_file(df_result, output_filepath)
def main():
    in_dir = Path("X:/Monitoring/Markers/playground/pierog/tmp/Run_2019-06-25_007_imported")
    in_filepaths = in_dir.glob("*.parquet")

    # Convert all files found
    for in_filepath in in_filepaths:
        # Read input file
        print(f"Read {in_filepath}")
        df = pdh.read_file(in_filepath)

        # Write to new file
        out_filepath = in_filepath.parent / f"{in_filepath.stem}.sqlite"
        print(f"Write {out_filepath}")
        pdh.to_file(df, out_filepath)
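# Standard script entry point guard. This is an assumption added for illustration; it simply runs
# main() when this conversion script is executed directly.
if __name__ == "__main__":
    main()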
def prepare_input(input_parcel_filepath: str,
                  output_imagedata_parcel_input_filepath: str,
                  output_parcel_nogeo_filepath: str = None,
                  force: bool = False):
    """
    This function creates a file that is preprocessed to be a good input file for timeseries
    extraction of sentinel images.

    Args
        input_parcel_filepath: input file
        output_imagedata_parcel_input_filepath: prepared output file
        output_parcel_nogeo_filepath: output file with a copy of the non-geo data
        force: force creation, even if output file(s) exist already
    """
    ##### Check if parameters are OK and init some extra params #####
    if not os.path.exists(input_parcel_filepath):
        raise Exception(f"Input file doesn't exist: {input_parcel_filepath}")

    # Check if the input file has a projection specified
    if geofile_util.get_crs(input_parcel_filepath) is None:
        message = f"The parcel input file doesn't have a projection/crs specified, so STOP: {input_parcel_filepath}"
        logger.critical(message)
        raise Exception(message)

    # If force is False and the output files exist already, stop.
    if (force is False
            and os.path.exists(output_imagedata_parcel_input_filepath)
            and (output_parcel_nogeo_filepath is None
                 or os.path.exists(output_parcel_nogeo_filepath))):
        logger.warning("prepare_input: force == False and output files exist, so stop: "
                       + f"{output_imagedata_parcel_input_filepath}, {output_parcel_nogeo_filepath}")
        return

    logger.info(f"Process input file {input_parcel_filepath}")

    # Create temp dir to store temporary data for traceability
    output_dir, output_filename = os.path.split(output_imagedata_parcel_input_filepath)
    output_filename_noext = os.path.splitext(output_filename)[0]
    temp_output_dir = os.path.join(output_dir, 'temp')
    if not os.path.exists(temp_output_dir):
        os.mkdir(temp_output_dir)

    ##### Read the parcel data and write nogeo version #####
    parceldata_gdf = geofile_util.read_file(input_parcel_filepath)
    logger.info(f"Parceldata read, shape: {parceldata_gdf.shape}")

    # Check if the id column is present and set as index
    if conf.columns['id'] in parceldata_gdf.columns:
        parceldata_gdf.set_index(conf.columns['id'], inplace=True)
    else:
        message = (f"STOP: Column {conf.columns['id']} not found in input parcel file: {input_parcel_filepath}. "
                   + "Make sure the column is present or change the column name in global_constants.py")
        logger.critical(message)
        raise Exception(message)

    # Only write the non-geo version if a path was provided
    # (remark: os.path.exists(None) would raise a TypeError)
    if output_parcel_nogeo_filepath is not None and (force is True or not os.path.exists(output_parcel_nogeo_filepath)):
        logger.info(f"Save non-geo data to {output_parcel_nogeo_filepath}")
        parceldata_nogeo_df = parceldata_gdf.drop(['geometry'], axis=1)
        pdh.to_file(parceldata_nogeo_df, output_parcel_nogeo_filepath)

    ##### Do the necessary conversions and write buffered file #####
    # If force is False and the output file exists already, stop.
    if force is False and os.path.exists(output_imagedata_parcel_input_filepath):
        logger.warning("prepare_input: force == False and output files exist, so stop: "
                       + f"{output_imagedata_parcel_input_filepath}")
        return

    logger.info('Apply buffer on parcel')
    parceldata_buf_gdf = parceldata_gdf.copy()
    # resolution = number of segments per circle
    buffer_size = -conf.marker.getint('buffer')
    parceldata_buf_gdf[conf.columns['geom']] = (
            parceldata_buf_gdf[conf.columns['geom']].buffer(buffer_size, resolution=5))

    # Export buffered geometries that result in empty geometries
    logger.info('Export parcels that are empty after buffer')
    parceldata_buf_empty_df = parceldata_buf_gdf.loc[
            parceldata_buf_gdf[conf.columns['geom']].is_empty]
    if len(parceldata_buf_empty_df.index) > 0:
        parceldata_buf_empty_df.drop(conf.columns['geom'], axis=1, inplace=True)
        temp_empty_filepath = os.path.join(temp_output_dir, f"{output_filename_noext}_empty.sqlite")
        pdh.to_file(parceldata_buf_empty_df, temp_empty_filepath)

    # Export parcels that don't result in a (multi)polygon
    parceldata_buf_notempty_gdf = parceldata_buf_gdf.loc[
            ~parceldata_buf_gdf[conf.columns['geom']].is_empty]
    parceldata_buf_nopoly_gdf = parceldata_buf_notempty_gdf.loc[
            ~parceldata_buf_notempty_gdf[conf.columns['geom']].geom_type.isin(['Polygon', 'MultiPolygon'])]
    if len(parceldata_buf_nopoly_gdf.index) > 0:
        logger.info('Export parcels that are no (multi)polygons after buffer')
        parceldata_buf_nopoly_gdf.drop(conf.columns['geom'], axis=1, inplace=True)
        temp_nopoly_filepath = os.path.join(temp_output_dir, f"{output_filename_noext}_nopoly.sqlite")
        geofile_util.to_file(parceldata_buf_nopoly_gdf, temp_nopoly_filepath)

    # Export parcels that are (multi)polygons after buffering
    parceldata_buf_poly_gdf = parceldata_buf_notempty_gdf.loc[
            parceldata_buf_notempty_gdf[conf.columns['geom']].geom_type.isin(['Polygon', 'MultiPolygon'])]
    for column in parceldata_buf_poly_gdf.columns:
        if column not in [conf.columns['id'], conf.columns['geom']]:
            parceldata_buf_poly_gdf.drop(column, axis=1, inplace=True)
    logger.info(f"Export parcels that are (multi)polygons after buffer to {output_imagedata_parcel_input_filepath}")
    geofile_util.to_file(parceldata_buf_poly_gdf, output_imagedata_parcel_input_filepath)
    logger.info(parceldata_buf_poly_gdf)
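# Illustrative sketch (an assumption, not original project code): why parcels can become empty
# after the negative buffer applied above. A negative buffer shrinks each polygon inwards, so
# polygons narrower than twice the buffer size collapse to an empty geometry; that is why those
# parcels are exported separately before timeseries extraction.
def _demo_negative_buffer():
    from shapely.geometry import Polygon

    # A 5m x 100m strip: shrinking it by 5m on all sides removes it completely.
    narrow_parcel = Polygon([(0, 0), (100, 0), (100, 5), (0, 5)])
    shrunk = narrow_parcel.buffer(-5)
    print(shrunk.is_empty)   # True: nothing is left of the parcel after the negative buffer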
def calculate_periodic_data(input_parcel_filepath: str,
                            input_base_dir: str,
                            start_date_str: str,
                            end_date_str: str,
                            sensordata_to_get: List[str],
                            dest_data_dir: str,
                            force: bool = False):
    """
    This function creates a file that is a weekly summary of the timeseries images from DIAS.

    TODO: add possibility to choose which values to extract (mean, min, max,...)?

    Args:
        input_parcel_filepath (str): [description]
        input_base_dir (str): [description]
        start_date_str (str): Start date in format %Y-%m-%d. Needs to be aligned already on the periods wanted.
        end_date_str (str): End date in format %Y-%m-%d. Needs to be aligned already on the periods wanted.
        sensordata_to_get (List[str]): the sensor data types to calculate the weekly data for.
        dest_data_dir (str): [description]
        force (bool, optional): [description]. Defaults to False.
    """
    logger.info('calculate_periodic_data')

    # Init
    input_parcels_filename = os.path.basename(input_parcel_filepath)
    input_parcels_filename_noext, _ = os.path.splitext(input_parcels_filename)
    input_dir = os.path.join(input_base_dir, input_parcels_filename_noext)

    # TODO: in config?
    input_ext = ".sqlite"
    output_ext = ".sqlite"

    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    year = start_date_str.split("-")[0]

    # Prepare output dir
    test = False
    if test is True:
        dest_data_dir += "_test"
    if not os.path.exists(dest_data_dir):
        os.mkdir(dest_data_dir)

    # Create Dataframe with all files with their info
    logger.debug('Create Dataframe with all files and their properties')
    file_info_list = []
    for filename in os.listdir(input_dir):
        if filename.endswith(input_ext):
            # Get the separate filename parts
            file_info = get_file_info(os.path.join(input_dir, filename))
            file_info_list.append(file_info)

    all_inputfiles_df = pd.DataFrame(file_info_list)

    # Loop over the data we need to get
    id_column = conf.columns['id']
    for sensordata_type in sensordata_to_get:
        logger.debug('Get files we need based on start- & stopdates, sensordata_to_get,...')
        orbits = [None]
        if sensordata_type == conf.general['SENSORDATA_S1_ASCDESC']:
            # Filter files to the ones we need
            satellitetype = 'S1'
            imagetype = IMAGETYPE_S1_GRD
            bands = ['VV', 'VH']
            orbits = ['ASC', 'DESC']
            needed_inputfiles_df = all_inputfiles_df.loc[
                    (all_inputfiles_df.date >= start_date)
                    & (all_inputfiles_df.date < end_date)
                    & (all_inputfiles_df.imagetype == imagetype)
                    & (all_inputfiles_df.band.isin(bands))
                    & (all_inputfiles_df.orbit.isin(orbits))]
        elif sensordata_type == conf.general['SENSORDATA_S2gt95']:
            satellitetype = 'S2'
            imagetype = IMAGETYPE_S2_L2A
            bands = ['B02-10m', 'B03-10m', 'B04-10m', 'B08-10m']
            needed_inputfiles_df = all_inputfiles_df.loc[
                    (all_inputfiles_df.date >= start_date)
                    & (all_inputfiles_df.date < end_date)
                    & (all_inputfiles_df.imagetype == imagetype)
                    & (all_inputfiles_df.band.isin(bands))]
        elif sensordata_type == conf.general['SENSORDATA_S1_COHERENCE']:
            satellitetype = 'S1'
            imagetype = IMAGETYPE_S1_COHERENCE
            bands = ['VV', 'VH']
            orbits = ['ASC', 'DESC']
            needed_inputfiles_df = all_inputfiles_df.loc[
                    (all_inputfiles_df.date >= start_date)
                    & (all_inputfiles_df.date < end_date)
                    & (all_inputfiles_df.imagetype == imagetype)
                    & (all_inputfiles_df.band.isin(bands))]
        else:
            raise Exception(f"Unsupported sensordata_type: {sensordata_type}")

        # There should also be one pixcount file
        pixcount_filename = f"{input_parcels_filename_noext}_weekly_pixcount{output_ext}"
        pixcount_filepath = os.path.join(dest_data_dir, pixcount_filename)

        # For each week
        start_week = int(datetime.strftime(start_date, '%W'))
        end_week = int(datetime.strftime(end_date, '%W'))
        for period_index in range(start_week, end_week):

            # Get the date of the first day of period period_index (eg. monday for a week)
            period_date = datetime.strptime(f"{year}_{period_index}_1", '%Y_%W_%w')

            # New file name
            period_date_str_long = period_date.strftime('%Y-%m-%d')
            period_data_filename = f"{input_parcels_filename_noext}_weekly_{period_date_str_long}_{sensordata_type}{output_ext}"
            period_data_filepath = os.path.join(dest_data_dir, period_data_filename)

            # Check if output file exists already
            if os.path.exists(period_data_filepath):
                if force is False:
                    logger.info(f"SKIP: force is False and file exists: {period_data_filepath}")
                    continue
                else:
                    os.remove(period_data_filepath)

            # Loop over bands and orbits (all combinations of bands and orbits!)
logger.info(f"Calculate file: {period_data_filename}") period_data_df = None for band, orbit in [(band, orbit) for band in bands for orbit in orbits]: # Get list of files needed for this period, band period_files_df = needed_inputfiles_df.loc[ (needed_inputfiles_df.week == period_index) & (needed_inputfiles_df.band == band)] # If an orbit to be filtered was specified, filter if orbit is not None: period_files_df = period_files_df.loc[( period_files_df.orbit == orbit)] # Loop all period_files period_band_data_df = None statistic_columns_dict = { 'count': [], 'max': [], 'mean': [], 'min': [], 'std': [] } for j, imagedata_filepath in enumerate( period_files_df.filepath.tolist()): # If file has filesize == 0, skip if os.path.getsize(imagedata_filepath) == 0: continue # Read the file (but only the columns we need) columns = [column for column in statistic_columns_dict ].append(id_column) image_data_df = pdh.read_file(imagedata_filepath, columns=columns) image_data_df.set_index(id_column, inplace=True) image_data_df.index.name = id_column # Remove rows with nan values nb_before_dropna = len(image_data_df.index) image_data_df.dropna(inplace=True) nb_after_dropna = len(image_data_df.index) if nb_after_dropna != nb_before_dropna: logger.warning( f"Before dropna: {nb_before_dropna}, after: {nb_after_dropna} for file {imagedata_filepath}" ) if nb_after_dropna == 0: continue # Rename columns so column names stay unique for statistic_column in statistic_columns_dict: new_column_name = statistic_column + str(j + 1) image_data_df.rename( columns={statistic_column: new_column_name}, inplace=True) image_data_df[new_column_name] = image_data_df[ new_column_name].astype(float) statistic_columns_dict[statistic_column].append( new_column_name) # Create 1 dataframe for all weekfiles - one row for each code_obj - using concat (code_obj = index) if period_band_data_df is None: period_band_data_df = image_data_df else: period_band_data_df = pd.concat( [period_band_data_df, image_data_df], axis=1, sort=False) # Apparently concat removes the index name in some situations period_band_data_df.index.name = id_column # Calculate max, mean, min, ... if period_band_data_df is not None: logger.debug('Calculate max, mean, min, ...') period_date_str_short = period_date.strftime('%Y%m%d') # Remark: prefix column names: sqlite doesn't like a numeric start if orbit is None: column_basename = f"TS_{period_date_str_short}_{imagetype}_{band}" else: column_basename = f"TS_{period_date_str_short}_{imagetype}_{orbit}_{band}" # Number of pixels # TODO: onderzoeken hoe aantal pixels best bijgehouden wordt : afwijkingen weglaten ? max nemen ? ... 
period_band_data_df[f"{column_basename}_count"] = np.nanmax( period_band_data_df[statistic_columns_dict['count']], axis=1) # Maximum of all max columns period_band_data_df[f"{column_basename}_max"] = np.nanmax( period_band_data_df[statistic_columns_dict['max']], axis=1) # Mean of all mean columns period_band_data_df[f"{column_basename}_mean"] = np.nanmean( period_band_data_df[statistic_columns_dict['mean']], axis=1) # Minimum of all min columns period_band_data_df[f"{column_basename}_min"] = np.nanmin( period_band_data_df[statistic_columns_dict['min']], axis=1) # Mean of all std columns period_band_data_df[f"{column_basename}_std"] = np.nanmean( period_band_data_df[statistic_columns_dict['std']], axis=1) # Number of Files used period_band_data_df[ f"{column_basename}_used_files"] = period_band_data_df[ statistic_columns_dict['max']].count(axis=1) # Only keep the columns we want to keep columns_to_keep = [ f"{column_basename}_count", f"{column_basename}_max", f"{column_basename}_mean", f"{column_basename}_min", f"{column_basename}_std", f"{column_basename}_used_files" ] period_band_data_df = period_band_data_df[columns_to_keep] # Merge the data with the other bands/orbits for this period if period_data_df is None: period_data_df = period_band_data_df else: period_data_df = pd.concat( [period_band_data_df, period_data_df], axis=1, sort=False) # Apparently concat removes the index name in some situations period_data_df.index.name = id_column if period_data_df is not None: logger.info(f"Write new file: {period_data_filename}") pdh.to_file(period_data_df, period_data_filepath) if not os.path.exists(pixcount_filepath): pixcount_s1s2_column = conf.columns['pixcount_s1s2'] for column in period_data_df.columns: if column.endswith('_count'): period_data_df.rename( columns={column: pixcount_s1s2_column}, inplace=True) break pixcount_df = period_data_df[pixcount_s1s2_column] pixcount_df.fillna(value=0, inplace=True) pdh.to_file(pixcount_df, pixcount_filepath)
def predict_proba(parcel_df: pd.DataFrame,
                  classifier_filepath: str,
                  output_parcel_predictions_filepath: str) -> pd.DataFrame:
    """
    Predict the probabilities for all input data using the classifier provided and write them
    to the output file.

    Args
        parcel_df: pandas DataFrame containing the data to classify. Columns:
            * global_settings.id_column: the id of the parcel.
            * global_settings.class_column: the class of the parcel. Isn't really used.
            * ... all columns that will be used as classification data.
        classifier_filepath: the filepath the (trained) classifier can be loaded from.
        output_parcel_predictions_filepath: file to write the predictions to.
    """
    # Some basic checks that input is ok
    parcel_df.reset_index(inplace=True)
    if (conf.columns['id'] not in parcel_df.columns
            or conf.columns['class'] not in parcel_df.columns):
        message = f"Columns {conf.columns['id']} and {conf.columns['class']} are mandatory for input parameter parcel_df!"
        logger.critical(message)
        raise Exception(message)

    # Now do final preparation for the classification
    parcel_classes_df = parcel_df[conf.columns['class']]
    cols_to_keep = parcel_df.columns.difference([conf.columns['id'], conf.columns['class']])
    parcel_data_df = parcel_df[cols_to_keep]

    logger.info(f"Input file processed and rows with missing data removed, data shape: {parcel_data_df.shape}, labels shape: {parcel_classes_df.shape}")
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        logger.info(f"Resulting columns for prediction data: {parcel_data_df.columns}")

    # Load the classifier
    classifier = joblib.load(classifier_filepath)
    logger.info(f"Classifier has the following classes: {classifier.classes_}")

    logger.info(f"Predict classes with probabilities: {len(parcel_df.index)} rows")
    class_proba = classifier.predict_proba(parcel_data_df)
    logger.info("Predict classes with probabilities ready")

    # Convert probabilities to dataframe, combine with input data and write to file
    id_class_proba = np.concatenate(
            [parcel_df[[conf.columns['id'], conf.columns['class']]].values, class_proba], axis=1)
    cols = [conf.columns['id'], conf.columns['class']]
    cols.extend(classifier.classes_)
    proba_df = pd.DataFrame(id_class_proba, columns=cols)

    # If output path provided, write results
    if output_parcel_predictions_filepath:
        pdh.to_file(proba_df, output_parcel_predictions_filepath)

    return proba_df
def collect_and_prepare_timeseries_data(input_parcel_filepath: Path,
                                        timeseries_dir: Path,
                                        base_filename: str,
                                        output_filepath: Path,
                                        start_date_str: str,
                                        end_date_str: str,
                                        sensordata_to_use: List[str],
                                        parceldata_aggregations_to_use: List[str],
                                        force: bool = False):
    """
    Collect all timeseries data to use for the classification and prepare it by applying
    scaling,... as needed.
    """
    # Some constants to choose which type of data to use in the marker.
    # Remark: the string needs to be the same as the end of the name of the columns in the csv files!
    # TODO: I'm not really happy with both a list in the ini file + here... not sure what the
    #       cleanest solution is though...
    # Mean of the pixel values in a parcel.
    PARCELDATA_AGGRAGATION_MEAN = conf.general['PARCELDATA_AGGRAGATION_MEAN']
    # Std dev of the pixel values in a parcel.
    PARCELDATA_AGGRAGATION_STDDEV = conf.general['PARCELDATA_AGGRAGATION_STDDEV']

    # Constants for types of sensor data
    SENSORDATA_S1 = conf.general['SENSORDATA_S1']                      # Sentinel 1 data
    SENSORDATA_S1DB = conf.general['SENSORDATA_S1DB']                  # Sentinel 1 data, in dB
    SENSORDATA_S1_ASCDESC = conf.general['SENSORDATA_S1_ASCDESC']      # Sentinel 1 data, divided in Ascending and Descending passes
    SENSORDATA_S1DB_ASCDESC = conf.general['SENSORDATA_S1DB_ASCDESC']  # Sentinel 1 data, in dB, divided in Ascending and Descending passes
    SENSORDATA_S2 = conf.general['SENSORDATA_S2']                      # Sentinel 2 data
    SENSORDATA_S2gt95 = conf.general['SENSORDATA_S2gt95']              # Sentinel 2 data (B2,B3,B4,B8), if available for 95% of the area
    SENSORDATA_S1_COHERENCE = conf.general['SENSORDATA_S1_COHERENCE']

    # If force is False and the output file exists already, stop.
    if force is False and output_filepath.exists():
        logger.warning(f"Output file already exists and force == False, so stop: {output_filepath}")
        return

    # Init the result with the id's of the parcels we want to treat
    result_df = pdh.read_file(input_parcel_filepath, columns=[conf.columns['id']])
    if result_df.index.name != conf.columns['id']:
        result_df.set_index(conf.columns['id'], inplace=True)
    nb_input_parcels = len(result_df.index)
    logger.info(f"Parceldata aggregations that need to be used: {parceldata_aggregations_to_use}")
    logger.setLevel(logging.DEBUG)

    # Loop over all input timeseries data to find the data we really need
    data_ext = conf.general['data_ext']
    filepath_start = timeseries_dir / f"{base_filename}_{start_date_str}{data_ext}"
    filepath_end = timeseries_dir / f"{base_filename}_{end_date_str}{data_ext}"
    logger.debug(f"filepath_start_date: {filepath_start}")
    logger.debug(f"filepath_end_date: {filepath_end}")

    ts_data_files = timeseries_dir.glob(f"{base_filename}_*{data_ext}")
    for curr_filepath in sorted(ts_data_files):

        # Only process data that is of the right sensor types
        sensor_type = curr_filepath.stem.split('_')[-1]
        if sensor_type not in sensordata_to_use:
            logger.debug(f"SKIP: file is not in sensor types asked ({sensordata_to_use}): {curr_filepath}")
            continue
        # The only data we want to process is the data in the range of dates
        if (str(curr_filepath) < str(filepath_start)) or (str(curr_filepath) >= str(filepath_end)):
            logger.debug(f"SKIP: file is not in date range asked: {curr_filepath}")
            continue
        # An empty file signifies that there wasn't any valid data for that period/sensor/...
        if os.path.getsize(curr_filepath) == 0:
            logger.info(f"SKIP: file is empty: {curr_filepath}")
            continue

        # Read data, and check if there is enough data in it
        data_read_df = pdh.read_file(curr_filepath)
        nb_data_read = len(data_read_df.index)
        data_available_pct = nb_data_read * 100 / nb_input_parcels
        min_parcels_with_data_pct = conf.timeseries.getfloat('min_parcels_with_data_pct')
        if data_available_pct < min_parcels_with_data_pct:
            logger.info(f"SKIP: only data for {data_available_pct:.2f}% of parcels, should be > {min_parcels_with_data_pct}%: {curr_filepath}")
            continue

        # Start processing the file
        logger.info(f"Process file: {curr_filepath}")
        if data_read_df.index.name != conf.columns['id']:
            data_read_df.set_index(conf.columns['id'], inplace=True)

        # Loop over columns to check if there are columns that need to be dropped.
        for column in data_read_df.columns:

            # If it is the id column, continue
            if column == conf.columns['id']:
                continue

            # Check if the column is "asked"
            column_ok = False
            for parceldata_aggregation in parceldata_aggregations_to_use:
                if column.endswith('_' + parceldata_aggregation):
                    column_ok = True
            if column_ok is False:
                # Drop column if it doesn't end with something in parceldata_aggregations_to_use
                logger.debug(f"Drop column as its aggregation isn't one to be used: {column}")
                data_read_df.drop(column, axis=1, inplace=True)
                continue

            # Check if the column contains data for enough parcels
            valid_input_data_pct = (1 - (data_read_df[column].isnull().sum() / nb_input_parcels)) * 100
            if valid_input_data_pct < min_parcels_with_data_pct:
                # If the percentage of nan values for the column is too high, drop the column
                logger.warning(f"Drop column as it contains only {valid_input_data_pct:.2f}% real data (= not nan) compared to input, which is < {min_parcels_with_data_pct}%!: {column}")
                data_read_df.drop(column, axis=1, inplace=True)

        # If S2, rescale data
        if sensor_type.startswith(SENSORDATA_S2):
            for column in data_read_df.columns:
                logger.info(f"Column contains S2 data, so scale it by dividing by 10000: {column}")
                data_read_df[column] = data_read_df[column] / 10000

        # If S1 coherence, rescale data
        if sensor_type == SENSORDATA_S1_COHERENCE:
            for column in data_read_df.columns:
                logger.info(f"Column contains S1 Coherence data, so scale it by dividing by 300: {column}")
                data_read_df[column] = data_read_df[column] / 300

        # Join the data to the result...
        result_df = result_df.join(data_read_df, how='left')

    # Remove rows with many null values from result
    max_number_null = int(0.6 * len(result_df.columns))
    parcel_many_null_df = result_df[result_df.isnull().sum(axis=1) > max_number_null]
    if len(parcel_many_null_df.index) > 0:
        # Write the rows with empty data to a file
        parcel_many_null_filepath = Path(f"{str(output_filepath)}_rows_many_null.sqlite")
        logger.warning(f"Write {len(parcel_many_null_df.index)} rows with > {max_number_null} of {len(result_df.columns)} columns == null to {parcel_many_null_filepath}")
        pdh.to_file(parcel_many_null_df, parcel_many_null_filepath)

        # Now remove them from result
        result_df = result_df[result_df.isnull().sum(axis=1) <= max_number_null]

    # For rows with some null values, set them to 0
    # TODO: a first rough test of using interpolation didn't give a difference, maybe better if
    #       smarter interpolation is used (= only between the different types of data:
    #       S1_GRD_VV, S1_GRD_VH, S1_COH_VV, S1_COH_VH, ASC?, DESC?, S2)
    #result_df.interpolate(inplace=True)
    result_df.fillna(0, inplace=True)

    # Write output file...
logger.info(f"Write output to file, start: {output_filepath}") pdh.to_file(result_df, output_filepath) logger.info(f"Write output to file, ready (with shape: {result_df.shape})")
def predict_proba(parcel_df: pd.DataFrame,
                  classifier_filepath: str,
                  output_parcel_predictions_filepath: str) -> pd.DataFrame:
    """
    Predict the probabilities for all input data using the classifier provided and write them
    to the output file.

    Args
        parcel_df: pandas DataFrame containing the data to classify. Columns:
            * global_settings.id_column: the id of the parcel.
            * global_settings.class_column: the class of the parcel. Isn't really used.
            * ... all columns that will be used as classification data.
        classifier_filepath: the filepath the (trained) classifier can be loaded from.
        output_parcel_predictions_filepath: file to write the predictions to.
    """
    # Some basic checks that input is ok
    column_class = conf.columns['class']
    column_class_declared = conf.columns['class_declared']
    parcel_df.reset_index(inplace=True)
    if (conf.columns['id'] not in parcel_df.columns
            or column_class not in parcel_df.columns):
        message = f"Columns {conf.columns['id']} and {column_class} are mandatory for input parameter parcel_df!"
        logger.critical(message)
        raise Exception(message)

    # Now do final preparation for the classification
    parcel_classes_df = parcel_df[column_class]
    cols_to_keep = parcel_df.columns.difference([conf.columns['id'], column_class, column_class_declared])
    parcel_data_df = parcel_df[cols_to_keep]
    parcel_data_df.sort_index(axis=1, inplace=True)

    logger.info(f"Input predict file processed and rows with missing data removed, data shape: {parcel_data_df.shape}, labels shape: {parcel_classes_df.shape}")

    # Check if the input data columns match the columns needed for the neural net
    classifier_filepath_noext, _ = os.path.splitext(classifier_filepath)
    classifier_datacolumns_filepath = classifier_filepath_noext + '_datacolumns.txt'
    with open(classifier_datacolumns_filepath, "r") as file:
        classifier_datacolumns = eval(file.readline())
    if classifier_datacolumns != list(parcel_data_df.columns):
        raise Exception(f"Input datacolumns for predict don't match needed columns for neural net: \ninput: {parcel_data_df.columns}, \nneeded: {classifier_datacolumns}")

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        logger.info(f"Resulting columns for predicting data: {parcel_data_df.columns}")

    # Load the classifier and predict
    model = keras.models.load_model(classifier_filepath)
    logger.info(f"Predict classes with probabilities: {len(parcel_df.index)} rows")
    class_proba = model.predict_proba(parcel_data_df)
    logger.info("Predict classes with probabilities ready")

    # Convert probabilities to dataframe, combine with input data and write to file
    # Load the classes from the classes file
    classifier_classes_filepath = classifier_filepath_noext + '_classes.txt'
    with open(classifier_classes_filepath, "r") as file:
        classes_dict = eval(file.readline())

    id_class_proba = np.concatenate(
            [parcel_df[[conf.columns['id'], column_class, column_class_declared]].values, class_proba], axis=1)
    cols = [conf.columns['id'], column_class, column_class_declared]
    cols.extend(classes_dict)
    proba_df = pd.DataFrame(id_class_proba, columns=cols)
    proba_df.set_index(keys=conf.columns['id'], inplace=True)

    # If output path provided, write results
    if output_parcel_predictions_filepath:
        pdh.to_file(proba_df, output_parcel_predictions_filepath)

    return proba_df
def create_train_test_sample(input_parcel_filepath: str,
                             output_parcel_train_filepath: str,
                             output_parcel_test_filepath: str,
                             balancing_strategy: str,
                             force: bool = False):
    """ Create a separate train and test sample from the general input file. """

    # If force is False and the output files exist already, stop.
    if (force is False
            and os.path.exists(output_parcel_train_filepath)
            and os.path.exists(output_parcel_test_filepath)):
        logger.warning(f"create_train_test_sample: output files already exist and force == False, so stop: {output_parcel_train_filepath}, {output_parcel_test_filepath}")
        return

    # Load input data...
    logger.info(f"Start create_train_test_sample with balancing_strategy {balancing_strategy}")
    logger.info(f"Read input file {input_parcel_filepath}")
    df_in = pdh.read_file(input_parcel_filepath)
    logger.debug(f"Read input file ready, shape: {df_in.shape}")

    # Init some often-used variables from config
    class_balancing_column = conf.columns['class_balancing']
    class_column = conf.columns['class']

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        count_per_class = df_in.groupby(class_balancing_column, as_index=False).size()
        logger.info(f"Number of elements per classname in input dataset:\n{count_per_class}")

    # The test dataset should be as representative as possible for the entire dataset, so create
    # this first as a 20% sample of each class without any additional checks...
    # Remark: group_keys=False avoids that apply creates an extra index level for the groups above
    #         the data, and so avoids having to do .reset_index(level=class_balancing_column_NAME, drop=True)
    #         to get rid of the group level
    df_test = df_in.groupby(class_balancing_column, group_keys=False).apply(pd.DataFrame.sample, frac=0.20)
    logger.debug(f"df_test after sampling 20% of data per class, shape: {df_test.shape}")

    # The candidate parcels for training are all non-test parcels
    df_train_base = df_in[~df_in.index.isin(df_test.index)]
    logger.debug(f"df_train_base after isin\n{df_train_base}")

    # Remove parcels with too few pixels from the train sample
    min_pixcount = int(conf.marker['min_nb_pixels_train'])
    df_train_base = df_train_base[df_train_base[conf.columns['pixcount_s1s2']] >= min_pixcount]
    logger.debug(f"Number of parcels in df_train_base after filter on pixcount >= {min_pixcount}: {len(df_train_base)}")

    # Some classes shouldn't be used for training... so remove them!
    logger.info(f"Remove 'classes_to_ignore_for_train' from train sample (= where {class_column} is in: {conf.marker.getlist('classes_to_ignore_for_train')})")
    df_train_base = df_train_base[~df_train_base[class_column].isin(conf.marker.getlist('classes_to_ignore_for_train'))]

    # All classes_to_ignore aren't meant for training either...
    logger.info(f"Remove 'classes_to_ignore' from train sample (= where {class_column} is in: {conf.marker.getlist('classes_to_ignore')})")
    df_train_base = df_train_base[~df_train_base[class_column].isin(conf.marker.getlist('classes_to_ignore'))]

    # Print the train base result before applying any balancing
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        count_per_class = df_train_base.groupby(class_balancing_column, as_index=False).size()
        logger.info(f"Number of elements per classname for train dataset, before balancing:\n{count_per_class}")

    # Depending on the balancing_strategy, use a different way to get a training sample
    if balancing_strategy == 'BALANCING_STRATEGY_NONE':
        # Just use 25% of all non-test data as train data -> 25% of 80% of data -> 20% of all data
        # will be training data
        # Remark: - this is very unbalanced, eg. some classes have 10,000 times more members in the
        #           input than other classes
        #         - this results in a relatively high accuracy in overall numbers, but the small
        #           classes are not detected at all
        df_train = (df_train_base.groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, frac=0.25))

    elif balancing_strategy == 'BALANCING_STRATEGY_MEDIUM':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with an upper limit of 10,000 this still gives OK results overall and the smaller
        #         classes also give some results; with an upper limit of 4,000 the results are
        #         significantly worse.

        # For the larger classes, favor them by leaving the samples larger but cap at upper_limit
        upper_limit = 10000
        lower_limit = 1000
        logger.info(f"Cap classes over {upper_limit}, keep the full number of training samples till {lower_limit}, classes smaller than that are oversampled")
        df_train = (df_train_base.groupby(class_balancing_column)
                    .filter(lambda x: len(x) >= upper_limit)
                    .groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, upper_limit))

        # Middle classes use the number as they are
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < upper_limit)
                             .groupby(class_balancing_column).filter(lambda x: len(x) >= lower_limit))

        # For smaller classes, oversample...
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < lower_limit)
                             .groupby(class_balancing_column, group_keys=False)
                             .apply(pd.DataFrame.sample, lower_limit, replace=True))

    elif balancing_strategy == 'BALANCING_STRATEGY_MEDIUM2':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with an upper limit of 10,000 this still gives OK results overall and the smaller
        #         classes also give some results; with an upper limit of 4,000 the results are
        #         significantly worse.
        # For the larger classes, leave the samples larger but cap
        cap_count_limit1 = 100000
        cap_train_limit1 = 30000
        logger.info(f"Cap balancing classes over {cap_count_limit1} to {cap_train_limit1}")
        df_train = (df_train_base.groupby(class_balancing_column)
                    .filter(lambda x: len(x) >= cap_count_limit1)
                    .groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, cap_train_limit1))

        cap_count_limit2 = 50000
        cap_train_limit2 = 20000
        logger.info(f"Cap balancing classes between {cap_count_limit2} and {cap_count_limit1} to {cap_train_limit2}")
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < cap_count_limit1)
                             .groupby(class_balancing_column).filter(lambda x: len(x) >= cap_count_limit2)
                             .groupby(class_balancing_column, group_keys=False)
                             .apply(pd.DataFrame.sample, cap_train_limit2))

        cap_count_limit3 = 20000
        cap_train_limit3 = 10000
        logger.info(f"Cap balancing classes between {cap_count_limit3} and {cap_count_limit2} to {cap_train_limit3}")
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < cap_count_limit2)
                             .groupby(class_balancing_column).filter(lambda x: len(x) >= cap_count_limit3)
                             .groupby(class_balancing_column, group_keys=False)
                             .apply(pd.DataFrame.sample, cap_train_limit3))

        cap_count_limit4 = 10000
        cap_train_limit4 = 10000
        logger.info(f"Cap balancing classes between {cap_count_limit4} and {cap_count_limit3} to {cap_train_limit4}")
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < cap_count_limit3)
                             .groupby(class_balancing_column).filter(lambda x: len(x) >= cap_count_limit4)
                             .groupby(class_balancing_column, group_keys=False)
                             .apply(pd.DataFrame.sample, cap_train_limit4))

        oversample_count = 1000
        # Middle classes use the number as they are
        logger.info(f"For classes between {cap_count_limit4} and {oversample_count}, just use all samples")
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < cap_count_limit4)
                             .groupby(class_balancing_column).filter(lambda x: len(x) >= oversample_count))

        # For smaller classes, oversample...
        logger.info(f"For classes smaller than {oversample_count}, oversample to {oversample_count}")
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < oversample_count)
                             .groupby(class_balancing_column, group_keys=False)
                             .apply(pd.DataFrame.sample, oversample_count, replace=True))

    elif balancing_strategy == 'BALANCING_STRATEGY_PROPORTIONAL_GROUPS':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with an upper limit of 10,000 this still gives OK results overall and the smaller
        #         classes also give some results; with an upper limit of 4,000 the results are
        #         significantly worse.
        # For the larger classes, leave the samples larger but cap
        upper_count_limit1 = 100000
        upper_train_limit1 = 30000
        logger.info(f"Cap balancing classes over {upper_count_limit1} to {upper_train_limit1}")
        df_train = (df_train_base.groupby(class_balancing_column)
                    .filter(lambda x: len(x) >= upper_count_limit1)
                    .groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, upper_train_limit1))

        upper_count_limit2 = 50000
        upper_train_limit2 = 20000
        logger.info(f"Cap balancing classes between {upper_count_limit2} and {upper_count_limit1} to {upper_train_limit2}")
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < upper_count_limit1)
                             .groupby(class_balancing_column).filter(lambda x: len(x) >= upper_count_limit2)
                             .groupby(class_balancing_column, group_keys=False)
                             .apply(pd.DataFrame.sample, upper_train_limit2))

        upper_count_limit3 = 20000
        upper_train_limit3 = 10000
        logger.info(f"Cap balancing classes between {upper_count_limit3} and {upper_count_limit2} to {upper_train_limit3}")
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < upper_count_limit2)
                             .groupby(class_balancing_column).filter(lambda x: len(x) >= upper_count_limit3)
                             .groupby(class_balancing_column, group_keys=False)
                             .apply(pd.DataFrame.sample, upper_train_limit3))

        upper_count_limit4 = 10000
        upper_train_limit4 = 5000
        logger.info(f"Cap balancing classes between {upper_count_limit4} and {upper_count_limit3} to {upper_train_limit4}")
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < upper_count_limit3)
                             .groupby(class_balancing_column).filter(lambda x: len(x) >= upper_count_limit4)
                             .groupby(class_balancing_column, group_keys=False)
                             .apply(pd.DataFrame.sample, upper_train_limit4))

        # For smaller balancing classes, just use all samples
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < upper_count_limit4))

    elif balancing_strategy == 'BALANCING_STRATEGY_UPPER_LIMIT':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with an upper limit of 10,000 this still gives OK results overall and the smaller
        #         classes also give some results; with an upper limit of 4,000 the results are
        #         significantly worse.

        # For the larger classes, favor them by leaving the samples larger but cap at upper_limit
        upper_limit = 10000
        logger.info(f"Cap over {upper_limit}...")
        df_train = (df_train_base.groupby(class_balancing_column)
                    .filter(lambda x: len(x) >= upper_limit)
                    .groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, upper_limit))

        # For smaller classes, just use all samples
        df_train = df_train.append(
                df_train_base.groupby(class_balancing_column).filter(lambda x: len(x) < upper_limit))

    elif balancing_strategy == 'BALANCING_STRATEGY_EQUAL':
        # In theory the most logical way to balance: make sure all classes have the same amount of
        # training data by undersampling the largest classes and oversampling the small classes.
        df_train = (df_train_base.groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, 2000, replace=True))

    else:
        message = f"Unknown balancing strategy, STOP!: {balancing_strategy}"
        logger.fatal(message)
        # Raise as well: otherwise df_train would be undefined further on
        raise Exception(message)

    # Log the resulting numbers per class in the train sample
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        count_per_class = df_train.groupby(class_balancing_column, as_index=False).size()
        logger.info(f"Number of elements per class_balancing_column in train dataset:\n{count_per_class}")
        if class_balancing_column != class_column:
            count_per_class = df_train.groupby(class_column, as_index=False).size()
            logger.info(f"Number of elements per class_column in train dataset:\n{count_per_class}")

    # Log the resulting numbers per class in the test sample
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        count_per_class = df_test.groupby(class_balancing_column, as_index=False).size()
        logger.info(f"Number of elements per class_balancing_column in test dataset:\n{count_per_class}")
        if class_balancing_column != class_column:
            count_per_class = df_test.groupby(class_column, as_index=False).size()
            logger.info(f"Number of elements per class_column in test dataset:\n{count_per_class}")

    # Write to output files
    logger.info('Write the output files')
    df_train.set_index(conf.columns['id'], inplace=True)
    df_test.set_index(conf.columns['id'], inplace=True)
    pdh.to_file(df_train, output_parcel_train_filepath)    # The ID column is the index...
    pdh.to_file(df_test, output_parcel_test_filepath)      # The ID column is the index...
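# Tiny illustration (an assumption, not original project code) of the group-wise sampling pattern
# used throughout the balancing strategies above: groupby(..., group_keys=False).apply(sample)
# draws a sample per class without adding an extra index level for the group key.
def _demo_groupwise_sample():
    import pandas as pd

    df = pd.DataFrame({"classname": ["A"] * 8 + ["B"] * 2, "value": range(10)})
    sample_df = (df.groupby("classname", group_keys=False)
                   .apply(pd.DataFrame.sample, frac=0.5))
    print(sample_df["classname"].value_counts())   # 4 x A, 1 x B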
def calc_top3_and_consolidation(input_parcel_filepath: str,
                                input_parcel_probabilities_filepath: str,
                                output_predictions_filepath: str,
                                output_predictions_output_filepath: str = None,
                                force: bool = False):
    """
    Calculate the top3 prediction and a consolidation prediction.

    Remark: in this logic the declared crop/class (class_declared) is used, as we want to compare
    with the declaration of the farmer, rather than taking into account corrections already.

    Args:
        input_parcel_filepath (str): [description]
        input_parcel_probabilities_filepath (str): [description]
        output_predictions_filepath (str): [description]
        output_predictions_output_filepath (str, optional): [description]. Defaults to None.
        force (bool, optional): [description]. Defaults to False.
    """
    # If force is False and the output exists already, return
    if force is False and os.path.exists(output_predictions_filepath):
        logger.warning(f"calc_top3_and_consolidation: output file exists and force is False, so stop: {output_predictions_filepath}")
        return

    # Read input files
    logger.info("Read input file")
    proba_df = pdh.read_file(input_parcel_probabilities_filepath)
    top3_df = calc_top3(proba_df)

    # Read input files
    logger.info("Read input file")
    input_parcel_df = pdh.read_file(input_parcel_filepath)

    # All input parcels must stay in the output, so left join input with pred
    top3_df.set_index(conf.columns['id'], inplace=True)
    if input_parcel_df.index.name != conf.columns['id']:
        input_parcel_df.set_index(conf.columns['id'], inplace=True)
    cols_to_join = top3_df.columns.difference(input_parcel_df.columns)
    pred_df = input_parcel_df.join(top3_df[cols_to_join], how='left')

    # The parcels added by the join don't have a prediction yet, so apply it
    # For the ignore classes, set the prediction to the ignore type
    classes_to_ignore = conf.marker.getlist('classes_to_ignore')
    pred_df.loc[pred_df[conf.columns['class_declared']].isin(classes_to_ignore),
                'pred1'] = pred_df[conf.columns['class_declared']]
    # For all other parcels without prediction there must have been no data
    # available for a classification, so set prediction to NODATA
    pred_df['pred1'].fillna('NODATA', inplace=True)

    # Add doubt columns
    add_doubt_column(pred_df=pred_df,
                     new_pred_column=conf.columns['prediction_cons'],
                     apply_doubt_min_nb_pixels=True)
    add_doubt_column(pred_df=pred_df,
                     new_pred_column=conf.columns['prediction_full_alpha'],
                     apply_doubt_min_nb_pixels=True,
                     apply_doubt_marker_specific=True)

    # Calculate the status of the consolidated prediction (OK=usable, NOK=not)
    pred_df.loc[pred_df[conf.columns['prediction_cons']].isin(proba_df.columns.to_list()),
                conf.columns['prediction_cons_status']] = 'OK'
    pred_df[conf.columns['prediction_cons_status']].fillna('NOK', inplace=True)

    logger.info("Write full prediction data to file")
    pdh.to_file(pred_df, output_predictions_filepath)

    # Create final output file with the most important info
    if output_predictions_output_filepath is not None:

        # First add some additional columns specific for the export
        pred_df['markercode'] = conf.marker['markertype']
        pred_df['run_id'] = conf.general['run_id']
        today = datetime.date.today()
        pred_df['cons_date'] = today
        pred_df['modify_date'] = today
        logger.info("Write final output prediction data to file")
        pred_df.reset_index(inplace=True)
        pred_df = pred_df[conf.columns.getlist('output_columns')]
        pdh.to_file(pred_df, output_predictions_output_filepath, index=False)

        # Write oracle sqlldr file
        if conf.marker['markertype'] in ['LANDCOVER', 'LANDCOVER_EARLY']:
            table_name = 'mon_marker_landcover'
            table_columns = ("layer_id, prc_id, versienummer, markercode, run_id, cons_landcover, "
                             + "cons_status, cons_date date 'yyyy-mm-dd', landcover1, probability1, "
                             + "landcover2, probability2, landcover3, probability3, "
                             + "modify_date date 'yyyy-mm-dd'")
        elif conf.marker['markertype'] in ['CROPGROUP', 'CROPGROUP_EARLY']:
            table_name = 'mon_marker_cropgroup'
            table_columns = ("layer_id, prc_id, versienummer, markercode, run_id, cons_cropgroup, "
                             + "cons_status, cons_date date 'yyyy-mm-dd', cropgroup1, probability1, "
                             + "cropgroup2, probability2, cropgroup3, probability3, "
                             + "modify_date date 'yyyy-mm-dd'")
        else:
            table_name = None
            logger.warning(f"Table unknown for marker type {conf.marker['markertype']}, so cannot write .ctl file")

        if table_name is not None:
            with open(output_predictions_output_filepath + '.ctl', 'w') as ctlfile:
                # SKIP=1 to skip the column names line, the other options to avoid
                # more commits than needed
                ctlfile.write("OPTIONS (SKIP=1, ROWS=10000, BINDSIZE=40000000, READSIZE=40000000)\n")
                ctlfile.write("LOAD DATA\n")
                ctlfile.write(f"INFILE '{os.path.basename(output_predictions_output_filepath)}' \"str '\\n'\"\n")
                ctlfile.write(f"INSERT INTO TABLE {table_name} APPEND\n")
                # A tab as separator is apparently X'9'
                ctlfile.write("FIELDS TERMINATED BY X'9'\n")
                ctlfile.write(f"({table_columns})\n")
def write_full_report(parcel_predictions_filepath: str,
                      output_report_txt: str,
                      parcel_ground_truth_filepath: str = None,
                      force: bool = None):
    """Writes a report about the accuracy of the predictions to a file.

    Args:
        parcel_predictions_filepath: File name of csv file with the parcels with their predictions.
        output_report_txt: File name of txt file the report will be written to.
        parcel_ground_truth_filepath: List of parcels with ground truth to calculate eg. alpha and
            beta errors. If None, the part of the report that is based on this data is skipped.

    TODO: refactor function to split logic more...
    """
    # If force is False and the output file exists already, stop.
    if force is False and os.path.exists(output_report_txt):
        logger.warning(f"write_full_report: output file already exists and force == False, so stop: {output_report_txt}")
        return

    logger.info("Start write_full_report")

    pandas_option_context_list = [
            'display.max_rows', None, 'display.max_columns', None,
            'display.max_colwidth', 300, 'display.width', 2000,
            'display.colheader_justify', 'left']
    logger.info(f"Read file with predictions: {parcel_predictions_filepath}")
    df_predict = pdh.read_file(parcel_predictions_filepath)
    df_predict.set_index(conf.columns['id'], inplace=True)

    # Python template engine expects all values to be present, so initialize to empty
    empty_string = "''"
    html_data = {
            'GENERAL_ACCURACIES_TABLE': empty_string,
            'GENERAL_ACCURACIES_TEXT': empty_string,
            'GENERAL_ACCURACIES_DATA': empty_string,
            'CONFUSION_MATRICES_TABLE': empty_string,
            'CONFUSION_MATRICES_DATA': empty_string,
            'CONFUSION_MATRICES_CONSOLIDATED_TABLE': empty_string,
            'CONFUSION_MATRICES_CONSOLIDATED_DATA': empty_string,
            'PREDICTION_QUALITY_CONS_OVERVIEW_TEXT': empty_string,
            'PREDICTION_QUALITY_CONS_OVERVIEW_TABLE': empty_string,
            'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TEXT': empty_string,
            'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TABLE': empty_string,
            'PREDICTION_QUALITY_ALPHA_TEXT': empty_string,
            'PREDICTION_QUALITY_BETA_TEXT': empty_string,
            'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TEXT': empty_string,
            'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TABLE': empty_string}

    # Build and write report...
    with open(output_report_txt, 'w') as outputfile:

        outputfile.write("************************************************************\n")
        outputfile.write("********************* PARAMETERS USED **********************\n")
        outputfile.write("************************************************************\n")
        outputfile.write("\n")
        message = "Main parameters used for the marker"
        outputfile.write(f"\n{message}\n")
        html_data['PARAMETERS_USED_TEXT'] = message

        logger.info(f"{dict(conf.marker)}")
        parameter_list = [['marker', key, value] for key, value in conf.marker.items()]
        parameter_list += [['timeseries', key, value] for key, value in conf.timeseries.items()]
        parameter_list += [['preprocess', key, value] for key, value in conf.preprocess.items()]
        parameter_list += [['classifier', key, value] for key, value in conf.classifier.items()]
        parameter_list += [['postprocess', key, value] for key, value in conf.postprocess.items()]

        parameters_used_df = pd.DataFrame(parameter_list, columns=['parameter_type', 'parameter', 'value'])
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{parameters_used_df}\n")
            logger.info(f"{parameters_used_df}\n")
            html_data['PARAMETERS_USED_TABLE'] = parameters_used_df.to_html(index=False)

        outputfile.write("************************************************************\n")
        outputfile.write("**************** RECAP OF GENERAL RESULTS ******************\n")
        outputfile.write("************************************************************\n")
        outputfile.write("\n")
        outputfile.write("************************************************************\n")
        outputfile.write("*             GENERAL CONSOLIDATED CONCLUSIONS             *\n")
        outputfile.write("************************************************************\n")
        # Calculate + write general conclusions for the consolidated prediction
        _add_prediction_conclusion(in_df=df_predict,
                                   new_columnname=conf.columns['prediction_conclusion_cons'],
                                   prediction_column_to_use=conf.columns['prediction_cons'],
                                   detailed=False)

        # Get the number of 'unimportant' ignore parcels and report them here
        df_predict_unimportant = df_predict[
                df_predict[conf.columns['prediction_conclusion_cons']] == 'IGNORE_UNIMPORTANT']
        # Now they can be removed for the rest of the reporting...
        df_predict = df_predict[
                df_predict[conf.columns['prediction_conclusion_cons']] != 'IGNORE_UNIMPORTANT']

        message = (f"Prediction conclusions cons general overview, for {len(df_predict.index)} predicted cases. "
                   + f"The {len(df_predict_unimportant.index)} IGNORE_UNIMPORTANT parcels are excluded from the reporting!")
) outputfile.write(f"\n{message}\n") html_data['GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_TEXT'] = message count_per_class = (df_predict.groupby( conf.columns['prediction_conclusion_cons'], as_index=False).size().to_frame('count')) values = 100 * count_per_class['count'] / count_per_class['count'].sum( ) count_per_class.insert(loc=1, column='pct', value=values) with pd.option_context(*pandas_option_context_list): outputfile.write(f"\n{count_per_class}\n") logger.info(f"{count_per_class}\n") html_data[ 'GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_TABLE'] = count_per_class.to_html( ) html_data[ 'GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_DATA'] = count_per_class.to_dict( ) # Output general accuracies outputfile.write( "************************************************************\n") outputfile.write( "* OVERALL ACCURACIES *\n") outputfile.write( "************************************************************\n") overall_accuracies_list = [] # Calculate overall accuracies for all parcels try: oa = skmetrics.accuracy_score(df_predict[conf.columns['class']], df_predict['pred1'], normalize=True, sample_weight=None) * 100 overall_accuracies_list.append({ 'parcels': 'All', 'prediction_type': 'standard', 'accuracy': oa }) oa = skmetrics.accuracy_score( df_predict[conf.columns['class']], df_predict[conf.columns['prediction_cons']], normalize=True, sample_weight=None) * 100 overall_accuracies_list.append({ 'parcels': 'All', 'prediction_type': 'consolidated', 'accuracy': oa }) except: logger.exception("Error calculating overall accuracies!") # Calculate while ignoring the classes to be ignored... df_predict_accuracy_no_ignore = df_predict[ ~df_predict[conf.columns['class']]. isin(conf.marker.getlist('classes_to_ignore_for_train'))] df_predict_accuracy_no_ignore = df_predict_accuracy_no_ignore[ ~df_predict_accuracy_no_ignore[conf.columns['class']]. isin(conf.marker.getlist('classes_to_ignore'))] oa = skmetrics.accuracy_score( df_predict_accuracy_no_ignore[conf.columns['class']], df_predict_accuracy_no_ignore['pred1'], normalize=True, sample_weight=None) * 100 overall_accuracies_list.append({ 'parcels': 'Exclude classes_to_ignore(_for_train) classes', 'prediction_type': 'standard', 'accuracy': oa }) oa = skmetrics.accuracy_score( df_predict_accuracy_no_ignore[conf.columns['class']], df_predict_accuracy_no_ignore[conf.columns['prediction_cons']], normalize=True, sample_weight=None) * 100 overall_accuracies_list.append({ 'parcels': 'Exclude classes_to_ignore(_for_train) classes', 'prediction_type': 'consolidated', 'accuracy': oa }) # Calculate ignoring both classes to ignored + parcels not having a valid prediction df_predict_no_ignore_has_prediction = df_predict_accuracy_no_ignore.loc[ (df_predict_accuracy_no_ignore[conf.columns['prediction_cons']] != 'NODATA') & (df_predict_accuracy_no_ignore[conf.columns['prediction_cons']] != 'DOUBT:NOT_ENOUGH_PIXELS')] oa = skmetrics.accuracy_score( df_predict_no_ignore_has_prediction[conf.columns['class']], df_predict_no_ignore_has_prediction['pred1'], normalize=True, sample_weight=None) * 100 overall_accuracies_list.append({ 'parcels': 'Exclude ignored ones + with prediction (= excl. NODATA, NOT_ENOUGH_PIXELS)', 'prediction_type': 'standard', 'accuracy': oa }) oa = skmetrics.accuracy_score( df_predict_no_ignore_has_prediction[conf.columns['class']], df_predict_no_ignore_has_prediction[ conf.columns['prediction_cons']], normalize=True, sample_weight=None) * 100 overall_accuracies_list.append({ 'parcels': 'Exclude ignored ones + with prediction (= excl. 
        # Output the resulting overall accuracies
        message = 'Overall accuracies for different sub-groups of the data'
        outputfile.write(f"\n{message}\n")
        html_data['OVERALL_ACCURACIES_TEXT'] = message
        overall_accuracies_df = pd.DataFrame(
                overall_accuracies_list, columns=['parcels', 'prediction_type', 'accuracy'])
        overall_accuracies_df.set_index(keys=['parcels', 'prediction_type'], inplace=True)
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{overall_accuracies_df}\n")
            logger.info(f"{overall_accuracies_df}\n")
            html_data['OVERALL_ACCURACIES_TABLE'] = overall_accuracies_df.to_html()

        # Write the recall, F1 score,... per class
        #message = skmetrics.classification_report(df_predict[gs.class_column]
        #                                          , df_predict[gs.prediction_column]
        #                                          , labels=classes)
        #outputfile.write(message)

        outputfile.write("************************************************************\n")
        outputfile.write("********************* DETAILED RESULTS *********************\n")
        outputfile.write("************************************************************\n")
        outputfile.write("\n")
        outputfile.write("************************************************************\n")
        outputfile.write("* DETAILED PREDICTION CONCLUSIONS *\n")
        outputfile.write("************************************************************\n")

        # Calculate detailed conclusions for the predictions
        logger.info("Calculate the detailed conclusions for the predictions")

        # Write the conclusions for the consolidated predictions
        _add_prediction_conclusion(
                in_df=df_predict,
                new_columnname=conf.columns['prediction_conclusion_detail_cons'],
                prediction_column_to_use=conf.columns['prediction_cons'],
                detailed=True)

        message = f"Prediction conclusions cons (doubt + not_enough_pixels) overview, for {len(df_predict.index)} predicted cases:"
        outputfile.write(f"\n{message}\n")
        html_data['PREDICTION_CONCLUSION_DETAIL_CONS_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict
                .groupby(conf.columns['prediction_conclusion_detail_cons'], as_index=False)
                .size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum()
        count_per_class.insert(loc=1, column='pct', value=values)
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data['PREDICTION_CONCLUSION_DETAIL_CONS_OVERVIEW_TABLE'] = count_per_class.to_html()

        # Calculate detailed conclusions for the full alpha predictions
        logger.info("Calculate the detailed conclusions for the predictions")

        # Write the conclusions for the full alpha predictions
        _add_prediction_conclusion(
                in_df=df_predict,
                new_columnname=conf.columns['prediction_conclusion_detail_full_alpha'],
                prediction_column_to_use=conf.columns['prediction_full_alpha'],
                detailed=True)
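        # NOTE: the "count + pct per conclusion" overview below repeats the pattern already used
        # above. A minimal sketch of a reusable helper, purely illustrative (the name
        # "_get_overview_per_class" is hypothetical and not part of this module):
        #
        #   def _get_overview_per_class(df, groupby_column):
        #       overview_df = df.groupby(groupby_column, as_index=False).size().to_frame('count')
        #       overview_df.insert(loc=1, column='pct',
        #                          value=100 * overview_df['count'] / overview_df['count'].sum())
        #       return overview_df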
        message = f"Prediction conclusions full alpha (doubt + not_enough_pixels) overview, for {len(df_predict.index)} predicted cases:"
        outputfile.write(f"\n{message}\n")
        html_data['PREDICTION_CONCLUSION_DETAIL_FULL_ALPHA_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict
                .groupby(conf.columns['prediction_conclusion_detail_full_alpha'], as_index=False)
                .size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum()
        count_per_class.insert(loc=1, column='pct', value=values)
        with pd.option_context('display.max_rows', None, 'display.max_columns', None):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data['PREDICTION_CONCLUSION_DETAIL_FULL_ALPHA_OVERVIEW_TABLE'] = count_per_class.to_html()

        outputfile.write("************************************************************\n")
        outputfile.write("* CONFUSION MATRICES FOR PARCELS WITH PREDICTIONS *\n")
        outputfile.write("************************************************************\n")

        # Calculate an extended confusion matrix with the standard prediction column and write
        # it to output...
        df_confmatrix_ext = _get_confusion_matrix_ext(df_predict, 'pred1')
        outputfile.write("\nExtended confusion matrix of the predictions: Rows: true/input classes, columns: predicted classes\n")
        with pd.option_context('display.max_rows', None, 'display.max_columns', None,
                               'display.width', 2000):
            outputfile.write(f"{df_confmatrix_ext}\n")
        html_data['CONFUSION_MATRICES_TABLE'] = df_confmatrix_ext.to_html()
        html_data['CONFUSION_MATRICES_DATA'] = df_confmatrix_ext.to_json()
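        # For reference: a plain (non-extended) confusion matrix with the same orientation could
        # also be built directly with pandas. Illustrative only, this is not the code path used
        # by _get_confusion_matrix_ext:
        #
        #   pd.crosstab(df_predict[conf.columns['class']], df_predict['pred1'],
        #               rownames=['true class'], colnames=['predicted class'])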
        # Calculate an extended confusion matrix with the consolidated prediction column and write
        # it to output...
        df_confmatrix_ext = _get_confusion_matrix_ext(df_predict, conf.columns['prediction_cons'])
        outputfile.write("\nExtended confusion matrix of the consolidated predictions: Rows: true/input classes, columns: predicted classes\n")
        with pd.option_context('display.max_rows', None, 'display.max_columns', None,
                               'display.width', 2000):
            outputfile.write(f"{df_confmatrix_ext}\n\n")
        html_data['CONFUSION_MATRICES_CONSOLIDATED_TABLE'] = df_confmatrix_ext.to_html()
        html_data['CONFUSION_MATRICES_CONSOLIDATED_DATA'] = df_confmatrix_ext.to_json()

        # If the pixcount is available, write the OA per pixcount
        if conf.columns['pixcount_s1s2'] in df_predict.columns:
            pixcount_output_report_txt = output_report_txt + '_OA_per_pixcount.txt'
            _write_OA_per_pixcount(
                    df_parcel_predictions=df_predict,
                    output_report_txt=pixcount_output_report_txt,
                    force=force)

        # If a ground truth file is provided, report on the ground truth
        if parcel_ground_truth_filepath is not None:
            outputfile.write("************************************************************\n")
            outputfile.write("* REPORTING ON PREDICTION QUALITY BASED ON GROUND TRUTH *\n")
            outputfile.write("************************************************************\n")

            # Read ground truth
            logger.info(f"Read csv with ground truth (with their classes): {parcel_ground_truth_filepath}")
            df_parcel_gt = pdh.read_file(parcel_ground_truth_filepath)
            df_parcel_gt.set_index(conf.columns['id'], inplace=True)
            logger.info(f"Read csv with ground truth ready, shape: {df_parcel_gt.shape}")

            # Join the prediction data
            cols_to_join = df_predict.columns.difference(df_parcel_gt.columns)
            df_parcel_gt = df_predict[cols_to_join].join(df_parcel_gt, how='inner')
            logger.info(f"After join of ground truth with predictions, shape: {df_parcel_gt.shape}")

            if len(df_parcel_gt.index) == 0:
                message = ("After join of ground truth with predictions the result was empty, "
                           "so probably a wrong ground truth file was used!")
                logger.critical(message)
                raise Exception(message)
            # General ground truth statistics
            # ******************************************************************
            # Calculate the conclusions based on ground truth

            # Calculate and write the result for the consolidated predictions
            _add_gt_conclusions(df_parcel_gt, conf.columns['prediction_cons'])
            message = f"Prediction quality cons (doubt + not_enough_pixels) overview, for {len(df_parcel_gt.index)} predicted cases in ground truth:"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_CONS_OVERVIEW_TEXT'] = message

            count_per_class = (df_parcel_gt
                    .groupby(f"gt_conclusion_{conf.columns['prediction_cons']}", as_index=False)
                    .size().to_frame('count'))
            values = 100 * count_per_class['count'] / count_per_class['count'].sum()
            count_per_class.insert(loc=1, column='pct', value=values)
            with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                outputfile.write(f"\n{count_per_class}\n")
                logger.info(f"{count_per_class}\n")
                html_data['PREDICTION_QUALITY_CONS_OVERVIEW_TABLE'] = count_per_class.to_html()

            # Calculate and write the result for the full alpha predictions
            _add_gt_conclusions(df_parcel_gt, conf.columns['prediction_full_alpha'])
            message = f"Prediction quality full alpha (doubt + not_enough_pixels) overview, for {len(df_parcel_gt.index)} predicted cases in ground truth:"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TEXT'] = message

            count_per_class = (df_parcel_gt
                    .groupby(f"gt_conclusion_{conf.columns['prediction_full_alpha']}", as_index=False)
                    .size().to_frame('count'))
            values = 100 * count_per_class['count'] / count_per_class['count'].sum()
            count_per_class.insert(loc=1, column='pct', value=values)
            with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                outputfile.write(f"\n{count_per_class}\n")
                logger.info(f"{count_per_class}\n")
                html_data['PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TABLE'] = count_per_class.to_html()

            # Write the ground truth conclusions to file
            pdh.to_file(df_parcel_gt, output_report_txt + "_groundtruth_pred_quality_details.tsv")

            # Alpha and beta error statistics based on the CONS prediction
            # ******************************************************************
            # Pct alpha errors = alpha errors / (alpha errors + real errors)
            columnname = f"gt_conclusion_{conf.columns['prediction_cons']}"
            alpha_error_numerator = len(df_parcel_gt.loc[
                    df_parcel_gt[columnname] == 'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA'].index)
            alpha_error_denominator = (
                    alpha_error_numerator
                    + len(df_parcel_gt.loc[df_parcel_gt[columnname].isin(
                            ['FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG'])].index))
            if alpha_error_denominator > 0:
                message = (f"Alpha error for cons: {alpha_error_numerator}/{alpha_error_denominator} = "
                           + f"{(alpha_error_numerator/alpha_error_denominator):.02f}")
            else:
                message = f"Alpha error for cons: {alpha_error_numerator}/{alpha_error_denominator} = ?"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_ALPHA_TEXT'] = message
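            # Illustrative example of the alpha error above (made-up numbers): if 5 ground truth
            # parcels end up as FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA and 45 as
            # FARMER-WRONG_PRED-CORRECT or FARMER-WRONG_PRED-WRONG, the alpha error is
            # 5 / (5 + 45) = 0.10, i.e. 10% of these cases are parcels flagged as wrong although
            # the farmer's declaration was correct.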
outputfile.write(f"\n{message}\n") html_data['PREDICTION_QUALITY_ALPHA_TEXT'] = message beta_error_numerator = len(df_parcel_gt.loc[ df_parcel_gt[columnname] == 'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA'].index) beta_error_denominator = ( beta_error_numerator + len(df_parcel_gt.loc[df_parcel_gt[columnname].str.startswith( 'FARMER-WRONG_PRED-')].index)) if beta_error_denominator > 0: message = ( f"Beta error for cons: {beta_error_numerator}/{beta_error_denominator} = " + f"{(beta_error_numerator/beta_error_denominator):.02f}") else: message = f"Beta error for cons: {beta_error_numerator}/{beta_error_denominator} = ?" outputfile.write(f"\n{message}\n") html_data['PREDICTION_QUALITY_BETA_TEXT'] = message # Alpha and beta error statistics based on CONS prediction # ****************************************************************** # Pct Alpha errors=alpha errors/(alpha errors + real errors) columnname = f"gt_conclusion_{conf.columns['prediction_full_alpha']}" alpha_error_numerator = len(df_parcel_gt.loc[ df_parcel_gt[columnname] == 'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA'].index) alpha_error_denominator = ( alpha_error_numerator + len(df_parcel_gt.loc[df_parcel_gt[columnname].isin([ 'FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG' ])].index)) if alpha_error_denominator > 0: message = ( f"Alpha error full: {alpha_error_numerator}/{alpha_error_denominator} = " + f"{(alpha_error_numerator/alpha_error_denominator):.02f}") else: message = f"Alpha error full: {alpha_error_numerator}/{alpha_error_denominator} = ?" outputfile.write(f"\n{message}\n") html_data['PREDICTION_QUALITY_ALPHA_TEXT'] += '<br/>' + message beta_error_numerator = len(df_parcel_gt.loc[ df_parcel_gt[columnname] == 'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA'].index) beta_error_denominator = ( beta_error_numerator + len(df_parcel_gt.loc[df_parcel_gt[columnname].str.startswith( 'FARMER-WRONG_PRED-')].index)) if beta_error_denominator > 0: message = ( f"Beta error full: {beta_error_numerator}/{beta_error_denominator} = " + f"{(beta_error_numerator/beta_error_denominator):.02f}") else: message = f"Beta error full: {beta_error_numerator}/{beta_error_denominator} = ?" 
outputfile.write(f"\n{message}\n") html_data['PREDICTION_QUALITY_BETA_TEXT'] += '<br/>' + message # If the pixcount is available, write the number of ALFA errors per pixcount (for the prediction with doubt) if conf.columns['pixcount_s1s2'] in df_parcel_gt.columns: # Get data, drop empty lines and write message = f"Number of ERROR_ALFA parcels for the 'prediction full alpha without NOT_ENOUGH_PIX' per pixcount for the ground truth parcels:" outputfile.write(f"\n{message}\n") html_data[ 'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TEXT'] = message # To get the number of alpha errors per pixcount, we also need alpha errors # also for parcels that had not_enough_pixels, so we need prediction_withdoubt # If they don't exist, calculate class_postpr.add_doubt_column( pred_df=df_parcel_gt, new_pred_column='pred_cons_no_min_pix', apply_doubt_marker_specific=True) _add_gt_conclusions(df_parcel_gt, 'pred_cons_no_min_pix') df_per_pixcount = _get_alfa_errors_per_pixcount( df_predquality_pixcount=df_parcel_gt, pred_quality_column="gt_conclusion_" + "pred_cons_no_min_pix", error_alpha_code='FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA') df_per_pixcount.dropna(inplace=True) with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 2000): outputfile.write(f"\n{df_per_pixcount}\n") logger.info(f"{df_per_pixcount}\n") html_data[ 'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TABLE'] = df_per_pixcount.to_html( ) with open(output_report_txt.replace('.txt', '.html'), 'w') as outputfile: html_template_file = open( './cropclassification/postprocess/html_rapport_template.html' ).read() src = Template(html_template_file) # replace strings and write to file output = src.substitute(html_data) outputfile.write(output)