def show(input_parcel_filepath: Path, filter_id: str):
    """
    Plot the timeseries data of a single parcel.

    Args:
        input_parcel_filepath (Path): file containing timeseries data per parcel.
        filter_id (str): the id (UID column) of the parcel to plot.
    """
    # Load input data...
    df = pdh.read_file(input_parcel_filepath)

    # Just keep one parcel
    id_column = 'UID'
    df = df[df[id_column] == filter_id]

    # Set index for transpose
    df.set_index(id_column, inplace=True)

    # Transpose columns to rows to create time series
    df = df.transpose()
    df.reset_index(inplace=True)

    logger.info(df)

    # Plot
    df.plot()
def prepare_input(input_parcel_filepath: str,
                  input_parcel_filetype: str,
                  input_parcel_pixcount_filepath: str,
                  classtype_to_prepare: str,
                  output_parcel_filepath: str,
                  force: bool = False):
    """
    Prepare a raw input file by eg. adding the classification classes to use
    for the classification,...

    Args:
        input_parcel_filepath (str): path to the raw input parcel file.
        input_parcel_filetype (str): type of the input file, eg. 'BEFL'.
        input_parcel_pixcount_filepath (str): file with pixel counts per parcel.
        classtype_to_prepare (str): the class type to prepare the classes for.
        output_parcel_filepath (str): path to write the prepared result to.
        force (bool, optional): True to overwrite existing output. Defaults to False.

    Raises:
        Exception: if input_parcel_filetype is unknown.
    """
    # If force == False Check and the output file exists already, stop.
    if force is False and os.path.exists(output_parcel_filepath) is True:
        logger.warning(
            f"prepare_input: output file already exists and force == False, so stop: {output_parcel_filepath}"
        )
        return

    if input_parcel_filetype == 'BEFL':
        output_dir, _ = os.path.split(output_parcel_filepath)
        df_parceldata = befl.prepare_input(
            input_parcel_filepath=input_parcel_filepath,
            classtype_to_prepare=classtype_to_prepare,
            output_dir=output_dir)
    else:
        message = f"Unknown value for parameter input_parcel_filetype: {input_parcel_filetype}"
        logger.critical(message)
        raise Exception(message)

    # Load pixcount data and join it
    logger.info(f"Read pixcount file {input_parcel_pixcount_filepath}")
    df_pixcount = pdh.read_file(input_parcel_pixcount_filepath)
    logger.debug(f"Read pixcount file ready, shape: {df_pixcount.shape}")
    if df_pixcount.index.name != conf.columns['id']:
        df_pixcount.set_index(conf.columns['id'], inplace=True)

    df_parceldata.set_index(conf.columns['id'], inplace=True)
    df_parceldata = df_parceldata.join(
        df_pixcount[conf.columns['pixcount_s1s2']], how='left')

    # Export result to file
    output_ext = os.path.splitext(output_parcel_filepath)[1]
    # If the output asked is a csv... we don't need the geometry.
    # Drop it directly instead of looping over (and mutating) the columns.
    if output_ext == '.csv' and conf.columns['geom'] in df_parceldata.columns:
        df_parceldata.drop(conf.columns['geom'], axis=1, inplace=True)

    logger.info(f"Write output to {output_parcel_filepath}")
    # If extension is not .shp, write using pandas (=a lot faster!)
    if output_ext.lower() != '.shp':
        pdh.to_file(df_parceldata, output_parcel_filepath)
    else:
        df_parceldata.to_file(output_parcel_filepath, index=False)
def main():
    """Convert all .parquet files in a hardcoded input dir to .sqlite files."""
    # Renamed from `dir` to avoid shadowing the builtin.
    input_dir = Path(
        "X:/Monitoring/Markers/playground/pierog/tmp/Run_2019-06-25_007_imported"
    )
    in_filepaths = input_dir.glob("*.parquet")

    # Convert all files found
    for in_filepath in in_filepaths:
        # Read input file
        print(f"Read {in_filepath}")
        df = pdh.read_file(in_filepath)

        # Write to new file
        out_filepath = in_filepath.parent / f"{in_filepath.stem}.sqlite"
        print(f"Write {out_filepath}")
        pdh.to_file(df, out_filepath)
def calculate_periodic_data(input_parcel_filepath: str,
                            input_base_dir: str,
                            start_date_str: str,
                            end_date_str: str,
                            sensordata_to_get: list,
                            dest_data_dir: str,
                            force: bool = False):
    """
    This function creates a file that is a weekly summarize of timeseries images from DIAS.

    TODO: add possibility to choose which values to extract (mean, min, max,...)?

    Args:
        input_parcel_filepath (str): [description]
        input_base_dir (str): [description]
        start_date_str (str): Start date in format %Y-%m-%d. Needs to be aligned
            already on the periods wanted.
        end_date_str (str): End date in format %Y-%m-%d. Needs to be aligned
            already on the periods wanted.
        sensordata_to_get (list): list of sensordata types to summarize.
        dest_data_dir (str): [description]
        force (bool, optional): [description]. Defaults to False.

    Raises:
        Exception: if a sensordata_type is unsupported.
    """
    logger.info('calculate_periodic_data')

    # Init
    input_parcels_filename = os.path.basename(input_parcel_filepath)
    input_parcels_filename_noext, _ = os.path.splitext(input_parcels_filename)
    input_dir = os.path.join(input_base_dir, input_parcels_filename_noext)

    # TODO: in config?
    input_ext = ".sqlite"
    output_ext = ".sqlite"

    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    year = start_date_str.split("-")[0]

    # Prepare output dir
    test = False
    if test is True:
        dest_data_dir += "_test"
    if not os.path.exists(dest_data_dir):
        os.mkdir(dest_data_dir)

    # Create Dataframe with all files with their info
    logger.debug('Create Dataframe with all files and their properties')
    file_info_list = []
    for filename in os.listdir(input_dir):
        if filename.endswith(input_ext):
            # Get seperate filename parts
            file_info = get_file_info(os.path.join(input_dir, filename))
            file_info_list.append(file_info)
    all_inputfiles_df = pd.DataFrame(file_info_list)

    # Loop over the data we need to get
    id_column = conf.columns['id']
    for sensordata_type in sensordata_to_get:
        logger.debug(
            'Get files we need based on start- & stopdates, sensordata_to_get,...'
        )
        orbits = [None]
        if sensordata_type == conf.general['SENSORDATA_S1_ASCDESC']:
            # Filter files to the ones we need
            imagetype = IMAGETYPE_S1_GRD
            bands = ['VV', 'VH']
            orbits = ['ASC', 'DESC']
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))
                & (all_inputfiles_df.orbit.isin(orbits))]
        elif sensordata_type == conf.general['SENSORDATA_S2gt95']:
            imagetype = IMAGETYPE_S2_L2A
            bands = ['B02-10m', 'B03-10m', 'B04-10m', 'B08-10m']
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))]
        elif sensordata_type == conf.general['SENSORDATA_S1_COHERENCE']:
            imagetype = IMAGETYPE_S1_COHERENCE
            bands = ['VV', 'VH']
            orbits = ['ASC', 'DESC']
            # NOTE(review): unlike the ASCDESC branch, this filter doesn't
            # restrict on orbit here — orbits are filtered per-combination below.
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))]
        else:
            raise Exception(f"Unsupported sensordata_type: {sensordata_type}")

        # There should also be one pixcount file
        pixcount_filename = f"{input_parcels_filename_noext}_weekly_pixcount{output_ext}"
        pixcount_filepath = os.path.join(dest_data_dir, pixcount_filename)

        # For each week
        start_week = int(datetime.strftime(start_date, '%W'))
        end_week = int(datetime.strftime(end_date, '%W'))
        for period_index in range(start_week, end_week):

            # Get the date of the first day of period period_index (eg. monday for a week)
            period_date = datetime.strptime(
                str(year) + '_' + str(period_index) + '_1', '%Y_%W_%w')

            # New file name
            period_date_str_long = period_date.strftime('%Y-%m-%d')
            period_data_filename = f"{input_parcels_filename_noext}_weekly_{period_date_str_long}_{sensordata_type}{output_ext}"
            period_data_filepath = os.path.join(dest_data_dir, period_data_filename)

            # Check if output file exists already
            if os.path.exists(period_data_filepath):
                if force is False:
                    logger.info(
                        f"SKIP: force is False and file exists: {period_data_filepath}"
                    )
                    continue
                else:
                    os.remove(period_data_filepath)

            # Loop over bands and orbits (all combinations of bands and orbits!)
            logger.info(f"Calculate file: {period_data_filename}")
            period_data_df = None
            for band, orbit in [(band, orbit) for band in bands for orbit in orbits]:

                # Get list of files needed for this period, band
                period_files_df = needed_inputfiles_df.loc[
                    (needed_inputfiles_df.week == period_index)
                    & (needed_inputfiles_df.band == band)]
                # If an orbit to be filtered was specified, filter
                if orbit is not None:
                    period_files_df = period_files_df.loc[(
                        period_files_df.orbit == orbit)]

                # Loop all period_files
                period_band_data_df = None
                statistic_columns_dict = {
                    'count': [], 'max': [], 'mean': [], 'min': [], 'std': []
                }
                for j, imagedata_filepath in enumerate(
                        period_files_df.filepath.tolist()):

                    # If file has filesize == 0, skip
                    if os.path.getsize(imagedata_filepath) == 0:
                        continue

                    # Read the file (but only the columns we need).
                    # BUGFIX: list.append() returns None, so the original
                    # "[...].append(id_column)" passed columns=None and
                    # therefore read ALL columns.
                    columns = list(statistic_columns_dict) + [id_column]
                    image_data_df = pdh.read_file(imagedata_filepath, columns=columns)
                    image_data_df.set_index(id_column, inplace=True)
                    image_data_df.index.name = id_column

                    # Remove rows with nan values
                    nb_before_dropna = len(image_data_df.index)
                    image_data_df.dropna(inplace=True)
                    nb_after_dropna = len(image_data_df.index)
                    if nb_after_dropna != nb_before_dropna:
                        logger.warning(
                            f"Before dropna: {nb_before_dropna}, after: {nb_after_dropna} for file {imagedata_filepath}"
                        )
                    if nb_after_dropna == 0:
                        continue

                    # Rename columns so column names stay unique
                    for statistic_column in statistic_columns_dict:
                        new_column_name = statistic_column + str(j + 1)
                        image_data_df.rename(
                            columns={statistic_column: new_column_name},
                            inplace=True)
                        image_data_df[new_column_name] = image_data_df[
                            new_column_name].astype(float)
                        statistic_columns_dict[statistic_column].append(
                            new_column_name)

                    # Create 1 dataframe for all weekfiles - one row for each
                    # code_obj - using concat (code_obj = index)
                    if period_band_data_df is None:
                        period_band_data_df = image_data_df
                    else:
                        period_band_data_df = pd.concat(
                            [period_band_data_df, image_data_df],
                            axis=1, sort=False)
                        # Apparently concat removes the index name in some situations
                        period_band_data_df.index.name = id_column

                # Calculate max, mean, min, ...
                if period_band_data_df is not None:
                    logger.debug('Calculate max, mean, min, ...')
                    period_date_str_short = period_date.strftime('%Y%m%d')
                    # Remark: prefix column names: sqlite doesn't like a numeric start
                    if orbit is None:
                        column_basename = f"TS_{period_date_str_short}_{imagetype}_{band}"
                    else:
                        column_basename = f"TS_{period_date_str_short}_{imagetype}_{orbit}_{band}"

                    # Number of pixels
                    # TODO: investigate how the pixel count is best kept:
                    # drop outliers? take the max? ...
                    period_band_data_df[f"{column_basename}_count"] = np.nanmax(
                        period_band_data_df[statistic_columns_dict['count']], axis=1)
                    # Maximum of all max columns
                    period_band_data_df[f"{column_basename}_max"] = np.nanmax(
                        period_band_data_df[statistic_columns_dict['max']], axis=1)
                    # Mean of all mean columns
                    period_band_data_df[f"{column_basename}_mean"] = np.nanmean(
                        period_band_data_df[statistic_columns_dict['mean']], axis=1)
                    # Minimum of all min columns
                    period_band_data_df[f"{column_basename}_min"] = np.nanmin(
                        period_band_data_df[statistic_columns_dict['min']], axis=1)
                    # Mean of all std columns
                    period_band_data_df[f"{column_basename}_std"] = np.nanmean(
                        period_band_data_df[statistic_columns_dict['std']], axis=1)
                    # Number of Files used
                    period_band_data_df[
                        f"{column_basename}_used_files"] = period_band_data_df[
                            statistic_columns_dict['max']].count(axis=1)

                    # Only keep the columns we want to keep
                    columns_to_keep = [
                        f"{column_basename}_count", f"{column_basename}_max",
                        f"{column_basename}_mean", f"{column_basename}_min",
                        f"{column_basename}_std",
                        f"{column_basename}_used_files"
                    ]
                    period_band_data_df = period_band_data_df[columns_to_keep]

                    # Merge the data with the other bands/orbits for this period
                    if period_data_df is None:
                        period_data_df = period_band_data_df
                    else:
                        period_data_df = pd.concat(
                            [period_band_data_df, period_data_df],
                            axis=1, sort=False)
                        # Apparently concat removes the index name in some situations
                        period_data_df.index.name = id_column

            if period_data_df is not None:
                logger.info(f"Write new file: {period_data_filename}")
                pdh.to_file(period_data_df, period_data_filepath)

                # Write the pixcount file once, based on the first '_count' column.
                if not os.path.exists(pixcount_filepath):
                    pixcount_s1s2_column = conf.columns['pixcount_s1s2']
                    for column in period_data_df.columns:
                        if column.endswith('_count'):
                            period_data_df.rename(
                                columns={column: pixcount_s1s2_column},
                                inplace=True)
                            break
                    pixcount_df = period_data_df[pixcount_s1s2_column]
                    pixcount_df.fillna(value=0, inplace=True)
                    pdh.to_file(pixcount_df, pixcount_filepath)
def prepare_input(input_parcel_filepath: str,
                  classtype_to_prepare: str,
                  output_dir: str):
    """
    This function creates a file that is compliant with the assumptions used by
    the rest of the classification functionality.

    It should be a csv file with the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the classes that
          will be classified to

    Args:
        input_parcel_filepath (str): path to the input parcel file.
        classtype_to_prepare (str): which class type to prepare for.
        output_dir (str): dir the refe file is copied to.

    Raises:
        Exception: if the input file or refe file doesn't exist, or if
            classtype_to_prepare is unknown.
    """
    # Check if input parameters are OK
    if not os.path.exists(input_parcel_filepath):
        raise Exception(f"Input file doesn't exist: {input_parcel_filepath}")
    else:
        logger.info(f"Process input file {input_parcel_filepath}")

    # Read input file
    logger.info(f"Read parceldata from {input_parcel_filepath}")
    if geofile_util.is_geofile(input_parcel_filepath):
        parceldata_df = geofile_util.read_file(input_parcel_filepath)
    else:
        parceldata_df = pdh.read_file(input_parcel_filepath)
    logger.info(f"Read Parceldata ready, info(): {parceldata_df.info()}")

    # Check if the id column is present...
    if conf.columns['id'] not in parceldata_df.columns:
        message = (
            f"Column {conf.columns['id']} not found in input parcel file: "
            f"{input_parcel_filepath}. Make sure the column is present or "
            "change the column name in global_constants.py")
        logger.critical(message)
        raise Exception(message)

    # Copy the refe file to the run dir, so we always keep knowing which refe was used
    input_classes_filepath = conf.preprocess[
        'classtype_to_prepare_refe_filepath']
    if not os.path.exists(input_classes_filepath):
        raise Exception(
            f"Input classes file doesn't exist: {input_classes_filepath}")
    shutil.copy(input_classes_filepath, output_dir)

    # Dispatch table: maps base class type to the prepare function to use.
    # Replaces the long duplicated if/elif chain.
    prepare_functions = {
        'CROPGROUP': prepare_input_cropgroup,
        'CROPGROUP_EARLY': prepare_input_cropgroup_early,
        'LANDCOVER': prepare_input_landcover,
        'LANDCOVER_EARLY': prepare_input_landcover_early,
        'POPULAR_CROP': prepare_input_most_popular_crop,
    }

    # Non-groundtruth: two passes - first on the declared crop column, then on
    # the actual crop column (result of the second pass is returned).
    if classtype_to_prepare in prepare_functions:
        prepare_func = prepare_functions[classtype_to_prepare]
        parceldata_df = prepare_func(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_declared,
            column_output_class=conf.columns['class_declared'])
        return prepare_func(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop,
            column_output_class=conf.columns['class'])

    # Groundtruth: a single pass on the verified crop column.
    if classtype_to_prepare.endswith('_GROUNDTRUTH'):
        base_classtype = classtype_to_prepare[:-len('_GROUNDTRUTH')]
        if base_classtype in prepare_functions:
            return prepare_functions[base_classtype](
                parceldata_df=parceldata_df,
                column_BEFL_cropcode=column_BEFL_crop_gt_verified,
                column_output_class=conf.columns['class_groundtruth'])

    message = f"Unknown value for parameter classtype_to_prepare: {classtype_to_prepare}"
    logger.fatal(message)
    raise Exception(message)
def prepare_input_landcover(parceldata_df,
                            column_BEFL_cropcode: str = 'GWSCOD_H',
                            column_output_class: str = None):
    """
    This function creates a file that is compliant with the assumptions used by
    the rest of the classification functionality.

    It should be a csv file with the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the classes that
          will be classified to

    This specific implementation converts the typical export format used in
    BE-Flanders to this format.

    Args:
        parceldata_df: dataframe with the parcel data.
        column_BEFL_cropcode (str, optional): column with the BEFL crop code.
        column_output_class (str, optional): column to write the class to.

    Raises:
        Exception: if the classes refe file doesn't exist.
    """
    # Check if parameters are OK and init some extra params
    #--------------------------------------------------------------------------
    input_classes_filepath = conf.preprocess[
        'classtype_to_prepare_refe_filepath']
    if not os.path.exists(input_classes_filepath):
        raise Exception(
            f"Input classes file doesn't exist: {input_classes_filepath}")

    # Convert the crop to unicode, in case the input is int...
    if column_BEFL_cropcode in parceldata_df.columns:
        parceldata_df[column_BEFL_cropcode] = parceldata_df[
            column_BEFL_cropcode].astype('unicode')

    # Read and cleanup the mapping table from crop codes to classes
    #--------------------------------------------------------------------------
    logger.info(f"Read classes conversion table from {input_classes_filepath}")
    classes_df = pdh.read_file(input_classes_filepath)
    logger.info(
        f"Read classes conversion table ready, info(): {classes_df.info()}")

    # Because the file was read as ansi, and gewas is int, so the data needs to
    # be converted to unicode to be able to do comparisons with the other data
    classes_df[column_BEFL_cropcode] = classes_df['CROPCODE'].astype('unicode')

    # Map column MON_group to orig classname
    column_output_class_orig = column_output_class + '_orig'
    classes_df[column_output_class_orig] = classes_df['MON_LC_GROUP']

    # Remove unneeded columns. Iterate over a snapshot of the column list so we
    # don't mutate the dataframe while iterating its columns.
    for column in list(classes_df.columns):
        if (column not in [column_output_class_orig, column_BEFL_cropcode]
                and column not in columns_BEFL_to_keep):
            classes_df.drop(column, axis=1, inplace=True)

    # Set the index
    classes_df.set_index(column_BEFL_cropcode, inplace=True,
                         verify_integrity=True)

    # Get only the columns in the classes_df that don't exist yet in parceldata_df
    cols_to_join = classes_df.columns.difference(parceldata_df.columns)

    # Join/merge the classname
    logger.info('Add the classes to the parceldata')
    parceldata_df = parceldata_df.merge(classes_df[cols_to_join], how='left',
                                        left_on=column_BEFL_cropcode,
                                        right_index=True,
                                        validate='many_to_one')

    # Copy orig classname to classification classname
    parceldata_df.insert(loc=0, column=column_output_class,
                         value=parceldata_df[column_output_class_orig])

    # If a column with extra info exists, use it as well to fine-tune the
    # classification classes.
    if column_BEFL_gesp_pm in parceldata_df.columns:
        # Greenhouses, temporary coverings and warehouses
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['SER', 'PLA', 'PLO']),
            column_output_class] = 'MON_LC_IGNORE_DIFFICULT_PERMANENT_CLASS'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].
            isin(['SGM', 'NPO', 'LOO', 'CON']),
            column_output_class] = 'MON_LC_IGNORE_DIFFICULT_PERMANENT_CLASS_NS'
        # TODO: CIV, containers in full soil, doesn't seem to be that specific...
        #parceldata_df.loc[parceldata_df[column_BEFL_gesp_pm] == 'CIV', class_columnname] = 'MON_CONTAINERS'   # Containers, not in full soil...
    else:
        logger.warning(
            f"The column {column_BEFL_gesp_pm} doesn't exist, so this part of the code was skipped!"
        )

    # Some extra cleanup: classes starting with 'nvt' or empty ones
    logger.info(
        "Set classes that are still empty, not specific enough or that contain to little values to 'UNKNOWN'"
    )
    parceldata_df.loc[
        parceldata_df[column_output_class].str.startswith('nvt', na=True),
        column_output_class] = 'MON_LC_UNKNOWN'

    # Drop the columns that aren't useful at all. Snapshot the columns first so
    # the inplace drops don't interfere with the iteration.
    for column in list(parceldata_df.columns):
        if (column not in [
                column_output_class, conf.columns['id'],
                conf.columns['class_groundtruth'],
                conf.columns['class_declared']
        ] and column not in conf.preprocess.getlist('extra_export_columns')
                and column not in columns_BEFL_to_keep):
            parceldata_df.drop(column, axis=1, inplace=True)
        elif column == column_BEFL_gesp_pm:
            parceldata_df[column_BEFL_gesp_pm] = parceldata_df[
                column_BEFL_gesp_pm].str.replace(',', ';')

    return parceldata_df
def prepare_input_cropgroup(parceldata_df,
                            column_BEFL_cropcode: str,
                            column_output_class: str):
    """
    This function creates a file that is compliant with the assumptions used by
    the rest of the classification functionality.

    It should be a csv file with the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the classes that
          will be classified to

    This specific implementation converts the typical export format used in
    BE-Flanders to this format.

    Args:
        parceldata_df: dataframe with the parcel data.
        column_BEFL_cropcode (str): column with the BEFL crop code.
        column_output_class (str): column to write the class to.

    Raises:
        Exception: if the classes refe file doesn't exist.
    """
    # Check if parameters are OK and init some extra params
    #--------------------------------------------------------------------------
    input_classes_filepath = conf.preprocess[
        'classtype_to_prepare_refe_filepath']
    if not os.path.exists(input_classes_filepath):
        raise Exception(
            f"Input classes file doesn't exist: {input_classes_filepath}")

    # Convert the crop to unicode, in case the input is int...
    if column_BEFL_cropcode in parceldata_df.columns:
        parceldata_df[column_BEFL_cropcode] = parceldata_df[
            column_BEFL_cropcode].astype('unicode')

    # Read and cleanup the mapping table from crop codes to classes
    #--------------------------------------------------------------------------
    logger.info(f"Read classes conversion table from {input_classes_filepath}")
    classes_df = pdh.read_file(input_classes_filepath)
    logger.info(
        f"Read classes conversion table ready, info(): {classes_df.info()}")

    # Because the file was read as ansi, and gewas is int, so the data needs to
    # be converted to unicode to be able to do comparisons with the other data
    classes_df[column_BEFL_cropcode] = classes_df['CROPCODE'].astype('unicode')

    # Map column with the classname to orig classname
    column_output_class_orig = conf.columns['class'] + '_orig'
    classes_df[column_output_class_orig] = classes_df['MON_CROPGROUP']

    # Remove unneeded columns. Iterate over a snapshot of the column list so we
    # don't mutate the dataframe while iterating its columns.
    for column in list(classes_df.columns):
        if (column not in [column_output_class_orig, column_BEFL_cropcode]
                and column not in columns_BEFL_to_keep):
            classes_df.drop(column, axis=1, inplace=True)

    # Set the index
    classes_df.set_index(column_BEFL_cropcode, inplace=True,
                         verify_integrity=True)

    # Get only the columns in the classes_df that don't exist yet in parceldata_df
    cols_to_join = classes_df.columns.difference(parceldata_df.columns)

    # Join/merge the classname
    logger.info('Add the classes to the parceldata')
    parceldata_df = parceldata_df.merge(classes_df[cols_to_join], how='left',
                                        left_on=column_BEFL_cropcode,
                                        right_index=True,
                                        validate='many_to_one')

    # Copy orig classname to classification classname
    parceldata_df.insert(loc=0, column=column_output_class,
                         value=parceldata_df[column_output_class_orig])

    # For rows with no class, set to UNKNOWN
    parceldata_df.fillna(value={column_output_class: 'UNKNOWN'}, inplace=True)

    # If a column with extra info exists, use it as well to fine-tune the
    # classification classes.
    if column_BEFL_gesp_pm in parceldata_df.columns:
        # Greenhouses, temporary coverings and warehouses
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['SER', 'SGM']),
            column_output_class] = 'MON_STAL_SER'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['PLA', 'PLO', 'NPO']),
            column_output_class] = 'MON_STAL_SER'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm] == 'LOO',
            column_output_class] = 'MON_STAL_SER'  # A warehouse is the same as a stable...
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm] == 'CON',
            column_output_class] = 'MON_CONTAINERS'  # Containers, not in full soil...
        # TODO: CIV, containers in full soil, doesn't seem to be that specific...
        #parceldata_df.loc[parceldata_df[column_BEFL_gesp_pm] == 'CIV', class_columnname] = 'MON_CONTAINERS'   # Containers, not in full soil...
    else:
        logger.warning(
            f"The column {column_BEFL_gesp_pm} doesn't exist, so this part of the code was skipped!"
        )

    # NOTE(review): earlier experiments merged/ignored several badly-performing
    # classes here (eg. MON_BONEN + MON_WIKKEN, MON_BOOM/MON_FRUIT to an ignore
    # class,...). That code was commented out and has been removed; see version
    # control history if it needs to be revived.

    # Set classes with very few elements to IGNORE_NOT_ENOUGH_SAMPLES!
    for _, row in parceldata_df.groupby(
            column_output_class).size().reset_index(name='count').iterrows():
        if row['count'] <= 50:
            logger.info(
                f"Class <{row[column_output_class]}> only contains {row['count']} elements, so put them to IGNORE_NOT_ENOUGH_SAMPLES"
            )
            parceldata_df.loc[
                parceldata_df[column_output_class] == row[column_output_class],
                column_output_class] = 'IGNORE_NOT_ENOUGH_SAMPLES'

    # Drop the columns that aren't useful at all. Snapshot the columns first so
    # the inplace drops don't interfere with the iteration.
    for column in list(parceldata_df.columns):
        if (column not in [
                column_output_class, conf.columns['id'],
                conf.columns['class_groundtruth'],
                conf.columns['class_declared']
        ] and column not in conf.preprocess.getlist('extra_export_columns')
                and column not in columns_BEFL_to_keep):
            parceldata_df.drop(column, axis=1, inplace=True)
        elif column == column_BEFL_gesp_pm:
            parceldata_df[column_BEFL_gesp_pm] = parceldata_df[
                column_BEFL_gesp_pm].str.replace(',', ';')

    return parceldata_df
def train_test_predict(input_parcel_train_filepath: str,
                       input_parcel_test_filepath: str,
                       input_parcel_all_filepath: str,
                       input_parcel_classification_data_filepath: str,
                       output_classifier_filepath: str,
                       output_predictions_test_filepath: str,
                       output_predictions_all_filepath: str,
                       force: bool = False):
    """ Train a classifier, test it and do full predictions.

    Args
        input_parcel_classes_train_filepath: the list of parcels with classes to
            train the classifier, without data!
        input_parcel_classes_test_filepath: the list of parcels with classes to
            test the classifier, without data!
        input_parcel_classes_all_filepath: the list of parcels with classes that
            need to be classified, without data!
        input_parcel_classification_data_filepath: the data to be used for the
            classification for all parcels.
        output_classifier_filepath: the file path where to save the classifier.
        output_predictions_test_filepath: the file path where to save the test
            predictions.
        output_predictions_all_filepath: the file path where to save the
            predictions for all parcels.
        force: if True, overwrite all existing output files, if False, don't
            overwrite them.
    """
    logger.info("train_test_predict: Start")

    # If all outputs exist already and force is off, there is nothing to do.
    expected_outputs = (output_classifier_filepath,
                        output_predictions_test_filepath,
                        output_predictions_all_filepath)
    if force is False and all(os.path.exists(filepath)
                              for filepath in expected_outputs):
        logger.warning(
            f"predict: output files exist and force is False, so stop: {output_classifier_filepath}, {output_predictions_test_filepath}, {output_predictions_all_filepath}"
        )
        return

    # Read the classification data once here so it can be passed on to the
    # other functions, to improve performance...
    logger.info(
        f"Read classification data file: {input_parcel_classification_data_filepath}"
    )
    classification_data_df = pdh.read_file(
        input_parcel_classification_data_filepath)
    if classification_data_df.index.name != conf.columns['id']:
        classification_data_df.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read classification data file ready')

    # Train the classification
    train(input_parcel_train_filepath=input_parcel_train_filepath,
          input_parcel_test_filepath=input_parcel_test_filepath,
          input_parcel_classification_data_filepath=(
              input_parcel_classification_data_filepath),
          output_classifier_filepath=output_classifier_filepath,
          force=force,
          input_parcel_classification_data_df=classification_data_df)

    # Predict the test parcels first, then all parcels.
    prediction_runs = (
        (input_parcel_test_filepath, output_predictions_test_filepath),
        (input_parcel_all_filepath, output_predictions_all_filepath),
    )
    for parcel_filepath, predictions_filepath in prediction_runs:
        predict(input_parcel_filepath=parcel_filepath,
                input_parcel_classification_data_filepath=(
                    input_parcel_classification_data_filepath),
                input_classifier_filepath=output_classifier_filepath,
                output_predictions_filepath=predictions_filepath,
                force=force,
                input_parcel_classification_data_df=classification_data_df)
def predict(input_parcel_filepath: str,
            input_parcel_classification_data_filepath: str,
            input_classifier_filepath: str,
            output_predictions_filepath: str,
            force: bool = False,
            input_parcel_classification_data_df: pd.DataFrame = None):
    """
    Predict the classes for the input data.

    Args:
        input_parcel_filepath: file with the parcels to predict, without data.
        input_parcel_classification_data_filepath: file with the classification data,
            only read when input_parcel_classification_data_df is None.
        input_classifier_filepath: the (trained) classifier to use.
        output_predictions_filepath: file to write the predictions to.
        force: if True, overwrite an existing output file.
        input_parcel_classification_data_df: the classification data as a dataframe,
            passed in to avoid re-reading the file.
    """
    # Don't redo work that was done already, unless force is asked
    if force is False and os.path.exists(output_predictions_filepath):
        logger.warning(
            f"predict: predictions output file already exists and force is false, so stop: {output_predictions_filepath}")
        return

    # Read the parcels to predict, only the columns needed, indexed on the id column
    logger.info(f"Read input file: {input_parcel_filepath}")
    needed_columns = [conf.columns['id'],
                      conf.columns['class'],
                      conf.columns['class_declared']]
    input_parcel_df = pdh.read_file(input_parcel_filepath, columns=needed_columns)
    if input_parcel_df.index.name != conf.columns['id']:
        input_parcel_df.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read train file ready')

    # Parcels declared as a class that should be ignored are not predicted
    declared_classes = input_parcel_df[conf.columns['class_declared']]
    ignore_mask = declared_classes.isin(conf.marker.getlist('classes_to_ignore'))
    input_parcel_df = input_parcel_df.loc[~ignore_mask]

    # Read the classification data from file, unless the dataframe was passed in
    if input_parcel_classification_data_df is None:
        logger.info(f"Read classification data file: {input_parcel_classification_data_filepath}")
        input_parcel_classification_data_df = pdh.read_file(input_parcel_classification_data_filepath)
        if input_parcel_classification_data_df.index.name != conf.columns['id']:
            input_parcel_classification_data_df.set_index(conf.columns['id'], inplace=True)
        logger.debug('Read classification data file ready')

    # Combine the parcels with their classification data (inner join: only
    # parcels that actually have data are predicted)
    logger.info("Join input parcels with the classification data")
    input_parcel_for_predict_df = input_parcel_df.join(
            input_parcel_classification_data_df, how='inner')

    # Dispatch to the classifier backend that is configured
    classifier_type = conf.classifier['classifier_type'].lower()
    if classifier_type == 'keras_multilayer_perceptron':
        import cropclassification.predict.classification_keras as class_core_keras
        class_core_keras.predict_proba(
                parcel_df=input_parcel_for_predict_df,
                classifier_filepath=input_classifier_filepath,
                output_parcel_predictions_filepath=output_predictions_filepath)
    else:
        import cropclassification.predict.classification_sklearn as class_core_sklearn
        class_core_sklearn.predict_proba(
                parcel_df=input_parcel_for_predict_df,
                classifier_filepath=input_classifier_filepath,
                output_parcel_predictions_filepath=output_predictions_filepath)
def train(input_parcel_train_filepath: str,
          input_parcel_test_filepath: str,
          input_parcel_classification_data_filepath: str,
          output_classifier_filepath: str,
          force: bool = False,
          input_parcel_classification_data_df: pd.DataFrame = None):
    """
    Train a classifier and test it by predicting the test cases.

    Args:
        input_parcel_train_filepath: file with the parcels to train on, without data.
        input_parcel_test_filepath: file with the parcels to test/validate on, without data.
        input_parcel_classification_data_filepath: file with the classification data,
            only read when input_parcel_classification_data_df is None.
        output_classifier_filepath: file path to save the trained classifier to.
        force: if True, retrain even if the classifier file exists already.
        input_parcel_classification_data_df: the classification data as a dataframe,
            passed in to avoid re-reading the file.
    """
    logger.info("train_and_test: Start")
    if force is False and os.path.exists(output_classifier_filepath):
        logger.warning(
            f"predict: classifier already exist and force == False, so don't retrain: {output_classifier_filepath}")
        return

    # Read the classification data from file, unless the dataframe was passed in
    if input_parcel_classification_data_df is None:
        logger.info(f"Read classification data file: {input_parcel_classification_data_filepath}")
        input_parcel_classification_data_df = pdh.read_file(input_parcel_classification_data_filepath)
        if input_parcel_classification_data_df.index.name != conf.columns['id']:
            input_parcel_classification_data_df.set_index(conf.columns['id'], inplace=True)
        logger.debug('Read classification data file ready')

    def _read_parcels(filepath: str, label: str) -> pd.DataFrame:
        # Read a parcel file with only the id and class columns, indexed on id
        logger.info(f"Read {label} file: {filepath}")
        parcels_df = pdh.read_file(
                filepath, columns=[conf.columns['id'], conf.columns['class']])
        if parcels_df.index.name != conf.columns['id']:
            parcels_df.set_index(conf.columns['id'], inplace=True)
        logger.debug(f'Read {label} file ready')
        return parcels_df

    # Read the train parcels and add the classification data columns (inner join)
    train_df = _read_parcels(input_parcel_train_filepath, 'train')
    logger.info("Join train sample with the classification data")
    train_df = train_df.join(input_parcel_classification_data_df, how='inner')

    # Read the test/validation parcels and add the classification data columns
    test_df = _read_parcels(input_parcel_test_filepath, 'test')
    logger.info("Join test sample with the classification data")
    test_df = test_df.join(input_parcel_classification_data_df, how='inner')

    # Train with the configured classifier backend
    classifier_type = conf.classifier['classifier_type'].lower()
    if classifier_type == 'keras_multilayer_perceptron':
        import cropclassification.predict.classification_keras as class_core_keras
        class_core_keras.train(
                train_df=train_df,
                test_df=test_df,
                output_classifier_filepath=output_classifier_filepath)
    else:
        import cropclassification.predict.classification_sklearn as class_core_sklearn
        class_core_sklearn.train(
                train_df=train_df,
                output_classifier_filepath=output_classifier_filepath)
def collect_and_prepare_timeseries_data(input_parcel_filepath: Path,
                                        timeseries_dir: Path,
                                        base_filename: str,
                                        output_filepath: Path,
                                        start_date_str: str,
                                        end_date_str: str,
                                        sensordata_to_use: List[str],
                                        parceldata_aggregations_to_use: List[str],
                                        force: bool = False):
    """
    Collect all timeseries data to use for the classification and prepare it by
    applying scaling,... as needed.

    Args:
        input_parcel_filepath: file with the parcels (their id's) to collect data for.
        timeseries_dir: directory containing the per-period timeseries files.
        base_filename: common prefix of the timeseries file names.
        output_filepath: file to write the collected timeseries data to.
        start_date_str: start of the date range to use (inclusive).
        end_date_str: end of the date range to use (exclusive).
        sensordata_to_use: sensor types (= last part of the file names) to include.
        parceldata_aggregations_to_use: aggregations (= column name suffixes) to include.
        force: if True, overwrite an existing output file.
    """
    # If force == False Check and the output file exists already, stop.
    if force is False and output_filepath.exists() is True:
        logger.warning(
            f"Output file already exists and force == False, so stop: {output_filepath}")
        return

    # Constants for the types of sensor data that get rescaled below.
    # Remark: the string needs to be the same as the end of the name of the columns in the csv files!
    SENSORDATA_S2 = conf.general['SENSORDATA_S2']  # Sentinel 2 data
    SENSORDATA_S1_COHERENCE = conf.general['SENSORDATA_S1_COHERENCE']

    # Init the result with the id's of the parcels we want to treat
    result_df = pdh.read_file(input_parcel_filepath, columns=[conf.columns['id']])
    if result_df.index.name != conf.columns['id']:
        result_df.set_index(conf.columns['id'], inplace=True)
    nb_input_parcels = len(result_df.index)
    logger.info(
        f"Parceldata aggregations that need to be used: {parceldata_aggregations_to_use}")
    # NOTE(review): this permanently lowers the logger level as a side effect —
    # presumably a debugging leftover; kept to preserve behavior. TODO: confirm and remove.
    logger.setLevel(logging.DEBUG)

    # Hoisted out of the file loop: it is a constant for the whole run
    min_parcels_with_data_pct = conf.timeseries.getfloat('min_parcels_with_data_pct')

    # Loop over all input timeseries data to find the data we really need
    data_ext = conf.general['data_ext']
    filepath_start = timeseries_dir / f"{base_filename}_{start_date_str}{data_ext}"
    filepath_end = timeseries_dir / f"{base_filename}_{end_date_str}{data_ext}"
    logger.debug(f'filepath_start_date: {filepath_start}')
    logger.debug(f'filepath_end_date: {filepath_end}')
    ts_data_files = timeseries_dir.glob(f"{base_filename}_*{data_ext}")
    for curr_filepath in sorted(ts_data_files):

        # Only process data that is of the right sensor types
        sensor_type = curr_filepath.stem.split('_')[-1]
        if sensor_type not in sensordata_to_use:
            logger.debug(
                f"SKIP: file is not in sensor types asked ({sensordata_to_use}): {curr_filepath}")
            continue

        # The only data we want to process is the data in the range of dates
        # (the date is part of the file name, so string comparison works here)
        if ((str(curr_filepath) < str(filepath_start))
                or (str(curr_filepath) >= str(filepath_end))):
            logger.debug(f"SKIP: File is not in date range asked: {curr_filepath}")
            continue

        # An empty file signifies that there wasn't any valable data for that period/sensor/...
        if os.path.getsize(curr_filepath) == 0:
            logger.info(f"SKIP: file is empty: {curr_filepath}")
            continue

        # Read data, and check if there is enough data in it
        data_read_df = pdh.read_file(curr_filepath)
        nb_data_read = len(data_read_df.index)
        data_available_pct = nb_data_read * 100 / nb_input_parcels
        if data_available_pct < min_parcels_with_data_pct:
            logger.info(
                f"SKIP: only data for {data_available_pct:.2f}% of parcels, should be > {min_parcels_with_data_pct}%: {curr_filepath}")
            continue

        # Start processing the file
        logger.info(f'Process file: {curr_filepath}')
        if data_read_df.index.name != conf.columns['id']:
            data_read_df.set_index(conf.columns['id'], inplace=True)

        # Loop over columns to check if there are columns that need to be dropped.
        for column in data_read_df.columns:
            # If it is the id column, continue
            if column == conf.columns['id']:
                continue

            # Drop the column if it doesn't end on an aggregation that is asked
            column_ok = False
            for parceldata_aggregation in parceldata_aggregations_to_use:
                if column.endswith('_' + parceldata_aggregation):
                    column_ok = True
            if column_ok is False:
                logger.debug(
                    f"Drop column as it's column aggregation isn't to be used: {column}")
                data_read_df.drop(column, axis=1, inplace=True)
                continue

            # Check if the column contains data for enough parcels
            valid_input_data_pct = (
                1 - (data_read_df[column].isnull().sum() / nb_input_parcels)) * 100
            if valid_input_data_pct < min_parcels_with_data_pct:
                # If the number of nan values for the column > x %, drop column.
                # BUG FIX: logger.warn is deprecated -> logger.warning
                logger.warning(
                    f"Drop column as it contains only {valid_input_data_pct:.2f}% real data compared to input (= not nan) which is < {min_parcels_with_data_pct}%!: {column}")
                data_read_df.drop(column, axis=1, inplace=True)

        # If S2, rescale data
        if sensor_type.startswith(SENSORDATA_S2):
            for column in data_read_df.columns:
                logger.info(
                    f"Column contains S2 data, so scale it by dividing by 10.000: {column}")
                data_read_df[column] = data_read_df[column] / 10000

        # If S1 coherence, rescale data
        if sensor_type == SENSORDATA_S1_COHERENCE:
            for column in data_read_df.columns:
                logger.info(
                    f"Column contains S1 Coherence data, so scale it by dividing by 300: {column}")
                data_read_df[column] = data_read_df[column] / 300

        # Join the data to the result...
        result_df = result_df.join(data_read_df, how='left')

    # Remove rows with many null values from result
    max_number_null = int(0.6 * len(result_df.columns))
    parcel_many_null_df = result_df[result_df.isnull().sum(axis=1) > max_number_null]
    if len(parcel_many_null_df.index) > 0:
        # Write the rows with empty data to a file
        parcel_many_null_filepath = Path(f'{str(output_filepath)}_rows_many_null.sqlite')
        # BUG FIX: logger.warn is deprecated -> logger.warning
        logger.warning(
            f"Write {len(parcel_many_null_df.index)} rows with > {max_number_null} of {len(result_df.columns)} columns==null to {parcel_many_null_filepath}")
        pdh.to_file(parcel_many_null_df, parcel_many_null_filepath)

    # Now remove them from result (no-op when no such rows exist)
    result_df = result_df[result_df.isnull().sum(axis=1) <= max_number_null]

    # For rows with some null values, set them to 0
    # TODO: first rough test of using interpolation doesn't give a difference, maybe better if
    # smarter interpolation is used (= only between the different types of data:
    # S1_GRD_VV, S1_GRD_VH, S1_COH_VV, S1_COH_VH, ASC?, DESC?, S2
    #result_df.interpolate(inplace=True)
    result_df.fillna(0, inplace=True)

    # Write output file...
    logger.info(f"Write output to file, start: {output_filepath}")
    pdh.to_file(result_df, output_filepath)
    logger.info(f"Write output to file, ready (with shape: {result_df.shape})")
def create_train_test_sample(input_parcel_filepath: str,
                             output_parcel_train_filepath: str,
                             output_parcel_test_filepath: str,
                             balancing_strategy: str,
                             force: bool = False):
    """
    Create a seperate train and test sample from the general input file.

    Args:
        input_parcel_filepath: the input file with all parcels (with pixcount and class columns).
        output_parcel_train_filepath: file to write the train sample to.
        output_parcel_test_filepath: file to write the test sample to.
        balancing_strategy: one of the BALANCING_STRATEGY_* values, determines how the
            train sample is balanced over the classes.
        force: if True, overwrite existing output files.

    Raises:
        Exception: if balancing_strategy is unknown.
    """
    # If force == False Check and the output files exist already, stop.
    if (force is False
            and os.path.exists(output_parcel_train_filepath) is True
            and os.path.exists(output_parcel_test_filepath) is True):
        logger.warning(
            f"create_train_test_sample: output files already exist and force == False, so stop: {output_parcel_train_filepath}, {output_parcel_test_filepath}")
        return

    # Load input data...
    logger.info(
        f"Start create_train_test_sample with balancing_strategy {balancing_strategy}")
    logger.info(f"Read input file {input_parcel_filepath}")
    df_in = pdh.read_file(input_parcel_filepath)
    logger.debug(f"Read input file ready, shape: {df_in.shape}")

    # Init some many-used variables from config
    class_balancing_column = conf.columns['class_balancing']
    class_column = conf.columns['class']

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        count_per_class = df_in.groupby(class_balancing_column, as_index=False).size()
        logger.info(
            f"Number of elements per classname in input dataset:\n{count_per_class}")

    # The test dataset should be as representative as possible for the entire dataset, so create
    # this first as a 20% sample of each class without any additional checks...
    # Remark: group_keys=False evades that apply creates an extra index-level of the groups above
    # the data and evades having to do .reset_index(level=class_balancing_column_NAME, drop=True)
    # to get rid of the group level
    df_test = df_in.groupby(class_balancing_column, group_keys=False).apply(
        pd.DataFrame.sample, frac=0.20)
    logger.debug(
        f"df_test after sampling 20% of data per class, shape: {df_test.shape}")

    # The candidate parcel for training are all non-test parcel
    df_train_base = df_in[~df_in.index.isin(df_test.index)]
    logger.debug(f"df_train_base after isin\n{df_train_base}")

    # Remove parcel with too few pixels from the train sample
    min_pixcount = int(conf.marker['min_nb_pixels_train'])
    df_train_base = df_train_base[
        df_train_base[conf.columns['pixcount_s1s2']] >= min_pixcount]
    logger.debug(
        f"Number of parcels in df_train_base after filter on pixcount >= {min_pixcount}: {len(df_train_base)}")

    # Some classes shouldn't be used for training... so remove them!
    logger.info(
        f"Remove 'classes_to_ignore_for_train' from train sample (= where {class_column} is in: {conf.marker.getlist('classes_to_ignore_for_train')}")
    df_train_base = df_train_base[~df_train_base[class_column].isin(
        conf.marker.getlist('classes_to_ignore_for_train'))]

    # All classes_to_ignore aren't meant for training either...
    logger.info(
        f"Remove 'classes_to_ignore' from train sample (= where {class_column} is in: {conf.marker.getlist('classes_to_ignore')}")
    df_train_base = df_train_base[~df_train_base[class_column].isin(
        conf.marker.getlist('classes_to_ignore'))]

    # Print the train base result before applying any balancing
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        count_per_class = df_train_base.groupby(class_balancing_column,
                                                as_index=False).size()
        logger.info(
            f"Number of elements per classname for train dataset, before balancing:\n{count_per_class}")

    # Depending on the balancing_strategy, use different way to get a training sample.
    # Remark: DataFrame.append was removed in pandas 2.0, so samples of the different
    # class-size tiers are combined with pd.concat instead.
    if balancing_strategy == 'BALANCING_STRATEGY_NONE':
        # Just use 25% of all non-test data as train data -> 25% of 80% of data -> 20% of all data
        # will be training date
        # Remark: - this is very unbalanced, eg. classes with 10.000 times the input size than other
        #           classes
        #         - this results in a relatively high accuracy in overall numbers, but the small
        #           classes are not detected at all
        df_train = (df_train_base.groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, frac=0.25))

    elif balancing_strategy == 'BALANCING_STRATEGY_MEDIUM':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset.
        # For the larger classes, favor them by leaving the samples larger but cap at upper_limit
        upper_limit = 10000
        lower_limit = 1000
        logger.info(
            f"Cap over {upper_limit}, keep the full number of training sample till {lower_limit}, samples smaller than that are oversampled")
        df_train = (df_train_base.groupby(class_balancing_column)
                    .filter(lambda x: len(x) >= upper_limit)
                    .groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, upper_limit))
        # Middle classes use the number as they are
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < upper_limit)
                         .groupby(class_balancing_column)
                         .filter(lambda x: len(x) >= lower_limit)])
        # For smaller classes, oversample...
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < lower_limit)
                         .groupby(class_balancing_column, group_keys=False)
                         .apply(pd.DataFrame.sample, lower_limit, replace=True)])

    elif balancing_strategy == 'BALANCING_STRATEGY_MEDIUM2':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset.
        # For the larger classes, leave the samples larger but cap them, tier per tier.
        cap_count_limit1 = 100000
        cap_train_limit1 = 30000
        logger.info(
            f"Cap balancing classes over {cap_count_limit1} to {cap_train_limit1}")
        df_train = (df_train_base.groupby(class_balancing_column)
                    .filter(lambda x: len(x) >= cap_count_limit1)
                    .groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, cap_train_limit1))

        cap_count_limit2 = 50000
        cap_train_limit2 = 20000
        logger.info(
            f"Cap balancing classes between {cap_count_limit2} and {cap_count_limit1} to {cap_train_limit2}")
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < cap_count_limit1)
                         .groupby(class_balancing_column)
                         .filter(lambda x: len(x) >= cap_count_limit2)
                         .groupby(class_balancing_column, group_keys=False)
                         .apply(pd.DataFrame.sample, cap_train_limit2)])

        cap_count_limit3 = 20000
        cap_train_limit3 = 10000
        logger.info(
            f"Cap balancing classes between {cap_count_limit3} and {cap_count_limit2} to {cap_train_limit3}")
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < cap_count_limit2)
                         .groupby(class_balancing_column)
                         .filter(lambda x: len(x) >= cap_count_limit3)
                         .groupby(class_balancing_column, group_keys=False)
                         .apply(pd.DataFrame.sample, cap_train_limit3)])

        cap_count_limit4 = 10000
        cap_train_limit4 = 10000
        logger.info(
            f"Cap balancing classes between {cap_count_limit4} and {cap_count_limit3} to {cap_train_limit4}")
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < cap_count_limit3)
                         .groupby(class_balancing_column)
                         .filter(lambda x: len(x) >= cap_count_limit4)
                         .groupby(class_balancing_column, group_keys=False)
                         .apply(pd.DataFrame.sample, cap_train_limit4)])

        oversample_count = 1000
        # Middle classes use the number as they are
        logger.info(
            f"For classes between {cap_count_limit4} and {oversample_count}, just use all samples")
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < cap_count_limit4)
                         .groupby(class_balancing_column)
                         .filter(lambda x: len(x) >= oversample_count)])

        # For smaller classes, oversample...
        logger.info(
            f"For classes smaller than {oversample_count}, oversample to {oversample_count}")
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < oversample_count)
                         .groupby(class_balancing_column, group_keys=False)
                         .apply(pd.DataFrame.sample, oversample_count, replace=True)])

    elif balancing_strategy == 'BALANCING_STRATEGY_PROPORTIONAL_GROUPS':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset.
        # For the larger classes, leave the samples larger but cap them, tier per tier.
        upper_count_limit1 = 100000
        upper_train_limit1 = 30000
        logger.info(
            f"Cap balancing classes over {upper_count_limit1} to {upper_train_limit1}")
        df_train = (df_train_base.groupby(class_balancing_column)
                    .filter(lambda x: len(x) >= upper_count_limit1)
                    .groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, upper_train_limit1))

        upper_count_limit2 = 50000
        upper_train_limit2 = 20000
        logger.info(
            f"Cap balancing classes between {upper_count_limit2} and {upper_count_limit1} to {upper_train_limit2}")
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < upper_count_limit1)
                         .groupby(class_balancing_column)
                         .filter(lambda x: len(x) >= upper_count_limit2)
                         .groupby(class_balancing_column, group_keys=False)
                         .apply(pd.DataFrame.sample, upper_train_limit2)])

        upper_count_limit3 = 20000
        upper_train_limit3 = 10000
        logger.info(
            f"Cap balancing classes between {upper_count_limit3} and {upper_count_limit2} to {upper_train_limit3}")
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < upper_count_limit2)
                         .groupby(class_balancing_column)
                         .filter(lambda x: len(x) >= upper_count_limit3)
                         .groupby(class_balancing_column, group_keys=False)
                         .apply(pd.DataFrame.sample, upper_train_limit3)])

        upper_count_limit4 = 10000
        upper_train_limit4 = 5000
        logger.info(
            f"Cap balancing classes between {upper_count_limit4} and {upper_count_limit3} to {upper_train_limit4}")
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < upper_count_limit3)
                         .groupby(class_balancing_column)
                         .filter(lambda x: len(x) >= upper_count_limit4)
                         .groupby(class_balancing_column, group_keys=False)
                         .apply(pd.DataFrame.sample, upper_train_limit4)])

        # For smaller balancing classes, just use all samples
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < upper_count_limit4)])

    elif balancing_strategy == 'BALANCING_STRATEGY_UPPER_LIMIT':
        # For the larger classes, favor them by leaving the samples larger but cap at upper_limit
        upper_limit = 10000
        logger.info(f"Cap over {upper_limit}...")
        df_train = (df_train_base.groupby(class_balancing_column)
                    .filter(lambda x: len(x) >= upper_limit)
                    .groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, upper_limit))
        # For smaller classes, just use all samples
        df_train = pd.concat([
            df_train,
            df_train_base.groupby(class_balancing_column)
                         .filter(lambda x: len(x) < upper_limit)])

    elif balancing_strategy == 'BALANCING_STRATEGY_EQUAL':
        # In theory the most logical way to balance: make sure all classes have the same amount of
        # training data by undersampling the largest classes and oversampling the small classes.
        df_train = (df_train_base.groupby(class_balancing_column, group_keys=False)
                    .apply(pd.DataFrame.sample, 2000, replace=True))

    else:
        # BUG FIX: previously this branch only logged and fell through, which crashed
        # later with a NameError on the undefined df_train. Raise instead, consistent
        # with how unknown parameter values are treated elsewhere in this project.
        message = f"Unknown balancing strategy, STOP!: {balancing_strategy}"
        logger.critical(message)
        raise Exception(message)

    # Log the resulting numbers per class in the train sample
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        count_per_class = df_train.groupby(class_balancing_column, as_index=False).size()
        logger.info(
            f'Number of elements per class_balancing_column in train dataset:\n{count_per_class}')
        if class_balancing_column != class_column:
            count_per_class = df_train.groupby(class_column, as_index=False).size()
            logger.info(
                f'Number of elements per class_column in train dataset:\n{count_per_class}')

    # Log the resulting numbers per class in the test sample
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        count_per_class = df_test.groupby(class_balancing_column, as_index=False).size()
        logger.info(
            f'Number of elements per class_balancing_column in test dataset:\n{count_per_class}')
        if class_balancing_column != class_column:
            count_per_class = df_test.groupby(class_column, as_index=False).size()
            logger.info(
                f'Number of elements per class_column in test dataset:\n{count_per_class}')

    # Write to output files
    logger.info('Write the output files')
    df_train.set_index(conf.columns['id'], inplace=True)
    df_test.set_index(conf.columns['id'], inplace=True)
    pdh.to_file(df_train, output_parcel_train_filepath)  # The ID column is the index...
    pdh.to_file(df_test, output_parcel_test_filepath)  # The ID column is the index...
def predict(input_parcel_filepath: Path,
            input_parcel_classification_data_filepath: Path,
            input_classifier_basefilepath: Path,
            input_classifier_filepath: Path,
            output_predictions_filepath: Path,
            force: bool = False,
            input_parcel_classification_data_df: pd.DataFrame = None):
    """
    Predict the classes for the input data.

    Args:
        input_parcel_filepath: file with the parcels to predict, without data.
        input_parcel_classification_data_filepath: file with the classification data,
            only read when input_parcel_classification_data_df is None.
        input_classifier_basefilepath: base file path of the classifier.
        input_classifier_filepath: the (trained) classifier to use.
        output_predictions_filepath: file to write the predictions to.
        force: if True, overwrite an existing output file.
        input_parcel_classification_data_df: the classification data as a dataframe,
            passed in to avoid re-reading the file.

    Raises:
        FileNotFoundError: if no *datacolumns.txt file is found next to the classifier.
    """
    # If force is False, and the output file exist already, return
    if force is False and output_predictions_filepath.exists():
        logger.warning(
            f"predict: predictions output file already exists and force is false, so stop: {output_predictions_filepath}")
        return

    # Read the input parcels
    logger.info(f"Read input file: {input_parcel_filepath}")
    input_parcel_df = pdh.read_file(
        input_parcel_filepath,
        columns=[conf.columns['id'],
                 conf.columns['class'],
                 conf.columns['class_declared']])
    if input_parcel_df.index.name != conf.columns['id']:
        input_parcel_df.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read train file ready')

    # For parcels of a class that should be ignored, don't predict
    input_parcel_df = input_parcel_df.loc[
        ~input_parcel_df[conf.columns['class_declared']]
        .isin(conf.marker.getlist('classes_to_ignore'))]

    # Get the expected columns from the classifier.
    # IMPROVED: use pathlib (the parameter is a Path already) instead of glob + os.path,
    # sort for a deterministic pick, and raise a clear error when the file is missing
    # (previously this failed with an obscure IndexError).
    datacolumns_filepaths = sorted(
        input_classifier_filepath.parent.glob("*datacolumns.txt"))
    if not datacolumns_filepaths:
        raise FileNotFoundError(
            f"No *datacolumns.txt file found next to classifier: {input_classifier_filepath}")
    with open(datacolumns_filepaths[0], "r", encoding="utf-8") as f:
        input_classifier_datacolumns = ast.literal_eval(f.readline())

    # If the classification data isn't passed as dataframe, read it from the csv
    if input_parcel_classification_data_df is None:
        logger.info(
            f"Read classification data file: {input_parcel_classification_data_filepath}")
        input_parcel_classification_data_df = pdh.read_file(
            input_parcel_classification_data_filepath)
        if input_parcel_classification_data_df.index.name != conf.columns['id']:
            input_parcel_classification_data_df.set_index(conf.columns['id'],
                                                          inplace=True)
        logger.debug('Read classification data file ready')

    # Only take the required columns as expected by the classifier
    input_parcel_classification_data_df = input_parcel_classification_data_df[
        input_classifier_datacolumns]

    # Join the data to send to prediction logic
    logger.info("Join input parcels with the classification data")
    input_parcel_for_predict_df = input_parcel_df.join(
        input_parcel_classification_data_df, how='inner')

    # Predict!
    logger.info(f"Predict using this model: {input_classifier_filepath}")
    if conf.classifier['classifier_type'].lower() == 'keras_multilayer_perceptron':
        import cropclassification.predict.classification_keras as class_core_keras
        class_core_keras.predict_proba(
            parcel_df=input_parcel_for_predict_df,
            classifier_basefilepath=input_classifier_basefilepath,
            classifier_filepath=input_classifier_filepath,
            output_parcel_predictions_filepath=output_predictions_filepath)
    else:
        import cropclassification.predict.classification_sklearn as class_core_sklearn
        class_core_sklearn.predict_proba(
            parcel_df=input_parcel_for_predict_df,
            classifier_basefilepath=input_classifier_basefilepath,
            classifier_filepath=input_classifier_filepath,
            output_parcel_predictions_filepath=output_predictions_filepath)
def calc_top3_and_consolidation(input_parcel_filepath: str,
                                input_parcel_probabilities_filepath: str,
                                output_predictions_filepath: str,
                                output_predictions_output_filepath: str = None,
                                force: bool = False):
    """
    Calculate the top3 prediction and a consolidation prediction.

    Remark: in this logic the declared crop/class (class_declared) is used, as we want to compare
    with the declaration of the farmer, rather than taking into account corrections already.

    Args:
        input_parcel_filepath (str): file with all input parcels.
        input_parcel_probabilities_filepath (str): file with the class probabilities per parcel.
        output_predictions_filepath (str): file to write the full prediction data to.
        output_predictions_output_filepath (str, optional): file to write the reduced, final
            output to; an oracle sqlldr .ctl file is written next to it for known marker
            types. Defaults to None (= skip this export).
        force (bool, optional): if True, overwrite an existing output file. Defaults to False.
    """
    # If force is false and output exists, already, return
    if force is False and os.path.exists(output_predictions_filepath):
        logger.warning(
            f"calc_top3_and_consolidation: output file exist and force is False, so stop: {output_predictions_filepath}")
        return

    # Read input files
    logger.info("Read input file")
    proba_df = pdh.read_file(input_parcel_probabilities_filepath)
    top3_df = calc_top3(proba_df)

    # Read input files
    logger.info("Read input file")
    input_parcel_df = pdh.read_file(input_parcel_filepath)

    # All input parcels must stay in the output, so left join input with pred
    top3_df.set_index(conf.columns['id'], inplace=True)
    if input_parcel_df.index.name != conf.columns['id']:
        input_parcel_df.set_index(conf.columns['id'], inplace=True)
    cols_to_join = top3_df.columns.difference(input_parcel_df.columns)
    pred_df = input_parcel_df.join(top3_df[cols_to_join], how='left')

    # The parcels added by the join don't have a prediction yet, so apply it
    # For the ignore classes, set the prediction to the ignore type
    classes_to_ignore = conf.marker.getlist('classes_to_ignore')
    pred_df.loc[pred_df[conf.columns['class_declared']].isin(classes_to_ignore),
                'pred1'] = pred_df[conf.columns['class_declared']]

    # For all other parcels without prediction there must have been no data
    # available for a classification, so set prediction to NODATA.
    # BUG FIX: fillna(inplace=True) on a column selection is chained assignment and is
    # deprecated (a silent no-op under pandas copy-on-write) -> assign the result back.
    pred_df['pred1'] = pred_df['pred1'].fillna('NODATA')

    # Add doubt columns
    add_doubt_column(pred_df=pred_df,
                     new_pred_column=conf.columns['prediction_cons'],
                     apply_doubt_min_nb_pixels=True)
    add_doubt_column(pred_df=pred_df,
                     new_pred_column=conf.columns['prediction_full_alpha'],
                     apply_doubt_min_nb_pixels=True,
                     apply_doubt_marker_specific=True)

    # Calculate the status of the consolidated prediction (OK=usable, NOK=not)
    pred_df.loc[pred_df[conf.columns['prediction_cons']].isin(proba_df.columns.to_list()),
                conf.columns['prediction_cons_status']] = 'OK'
    # BUG FIX: same chained-assignment fillna issue as above
    pred_df[conf.columns['prediction_cons_status']] = (
        pred_df[conf.columns['prediction_cons_status']].fillna('NOK'))

    logger.info("Write full prediction data to file")
    pdh.to_file(pred_df, output_predictions_filepath)

    # Create final output file with the most important info
    if output_predictions_output_filepath is not None:

        # First add some aditional columns specific for the export
        pred_df['markercode'] = conf.marker['markertype']
        pred_df['run_id'] = conf.general['run_id']
        today = datetime.date.today()
        pred_df['cons_date'] = today
        pred_df['modify_date'] = today
        logger.info("Write final output prediction data to file")
        pred_df.reset_index(inplace=True)
        pred_df = pred_df[conf.columns.getlist('output_columns')]
        pdh.to_file(pred_df, output_predictions_output_filepath, index=False)

        # Write oracle sqlldr file
        if conf.marker['markertype'] in ['LANDCOVER', 'LANDCOVER_EARLY']:
            table_name = 'mon_marker_landcover'
            table_columns = (
                "layer_id, prc_id, versienummer, markercode, run_id, cons_landcover, "
                + "cons_status, cons_date date 'yyyy-mm-dd', landcover1, probability1, "
                + "landcover2, probability2, landcover3, probability3, "
                + "modify_date date 'yyyy-mm-dd'")
        elif conf.marker['markertype'] in ['CROPGROUP', 'CROPGROUP_EARLY']:
            table_name = 'mon_marker_cropgroup'
            table_columns = (
                "layer_id, prc_id, versienummer, markercode, run_id, cons_cropgroup, "
                + "cons_status, cons_date date 'yyyy-mm-dd', cropgroup1, probability1, "
                + "cropgroup2, probability2, cropgroup3, probability3, "
                + "modify_date date 'yyyy-mm-dd'")
        else:
            table_name = None
            logger.warning(
                f"Table unknown for marker type {conf.marker['markertype']}, so cannot write .ctl file")

        if table_name is not None:
            with open(output_predictions_output_filepath + '.ctl', 'w') as ctlfile:
                # SKIP=1 to skip the columns names line, the other ones to evade
                # more commits than needed
                ctlfile.write(
                    "OPTIONS (SKIP=1, ROWS=10000, BINDSIZE=40000000, READSIZE=40000000)\n")
                ctlfile.write("LOAD DATA\n")
                ctlfile.write(
                    f"INFILE '{os.path.basename(output_predictions_output_filepath)}' \"str '\\n'\"\n")
                ctlfile.write(f"INSERT INTO TABLE {table_name} APPEND\n")
                # A tab as seperator is apparently X'9'
                ctlfile.write("FIELDS TERMINATED BY X'9'\n")
                ctlfile.write(f"({table_columns})\n")
def write_full_report(parcel_predictions_filepath: str,
                      output_report_txt: str,
                      parcel_ground_truth_filepath: str = None,
                      force: bool = None):
    """Writes a report about the accuracy of the predictions to a file.

    Args:
        parcel_predictions_filepath: File name of csv file with the parcel with their predictions.
        output_report_txt: File name of txt file the report will be written to.
        parcel_ground_truth_filepath: List of parcels with ground truth to calculate
            eg. alfa and beta errors. If None, the part of the report that is
            based on this data is skipped.
        force: If False and the output file exists already, nothing is done.

    TODO: refactor function to split logic more...
    """
    # If force == False and the output file exists already, stop.
    if force is False and os.path.exists(output_report_txt):
        # Fixed: the message used to reference collect_and_prepare_timeseries_data
        logger.warning(
            f"write_full_report: output file already exists and force == False, so stop: {output_report_txt}"
        )
        return

    logger.info("Start write_full_report")

    # Pandas display options used when rendering dataframes to the text report
    pandas_option_context_list = [
        'display.max_rows', None, 'display.max_columns', None,
        'display.max_colwidth', 300, 'display.width', 2000,
        'display.colheader_justify', 'left'
    ]
    logger.info(f"Read file with predictions: {parcel_predictions_filepath}")
    df_predict = pdh.read_file(parcel_predictions_filepath)
    df_predict.set_index(conf.columns['id'], inplace=True)

    # Python template engine expects all values to be present, so initialize to empty
    empty_string = "''"
    html_data = {
        'GENERAL_ACCURACIES_TABLE': empty_string,
        'GENERAL_ACCURACIES_TEXT': empty_string,
        'GENERAL_ACCURACIES_DATA': empty_string,
        'CONFUSION_MATRICES_TABLE': empty_string,
        'CONFUSION_MATRICES_DATA': empty_string,
        'CONFUSION_MATRICES_CONSOLIDATED_TABLE': empty_string,
        'CONFUSION_MATRICES_CONSOLIDATED_DATA': empty_string,
        'PREDICTION_QUALITY_CONS_OVERVIEW_TEXT': empty_string,
        'PREDICTION_QUALITY_CONS_OVERVIEW_TABLE': empty_string,
        'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TEXT': empty_string,
        'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TABLE': empty_string,
        'PREDICTION_QUALITY_ALPHA_TEXT': empty_string,
        'PREDICTION_QUALITY_BETA_TEXT': empty_string,
        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TEXT': empty_string,
        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TABLE': empty_string
    }

    # Build and write report...
    with open(output_report_txt, 'w') as outputfile:
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "********************* PARAMETERS USED **********************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        message = "Main parameters used for the marker"
        outputfile.write(f"\n{message}\n")
        html_data['PARAMETERS_USED_TEXT'] = message

        logger.info(f"{dict(conf.marker)}")
        parameter_list = [['marker', key, value]
                          for key, value in conf.marker.items()]
        parameter_list += [['timeseries', key, value]
                           for key, value in conf.timeseries.items()]
        parameter_list += [['preprocess', key, value]
                           for key, value in conf.preprocess.items()]
        parameter_list += [['classifier', key, value]
                           for key, value in conf.classifier.items()]
        parameter_list += [['postprocess', key, value]
                           for key, value in conf.postprocess.items()]

        parameters_used_df = pd.DataFrame(
            parameter_list, columns=['parameter_type', 'parameter', 'value'])
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{parameters_used_df}\n")
            logger.info(f"{parameters_used_df}\n")
            html_data['PARAMETERS_USED_TABLE'] = parameters_used_df.to_html(
                index=False)

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "**************** RECAP OF GENERAL RESULTS ******************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "* GENERAL CONSOLIDATED CONCLUSIONS *\n")
        outputfile.write(
            "************************************************************\n")
        # Calculate + write general conclusions for consolidated prediction
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.columns['prediction_conclusion_cons'],
            prediction_column_to_use=conf.columns['prediction_cons'],
            detailed=False)

        # Get the number of 'unimportant' ignore parcels and report them here
        df_predict_unimportant = df_predict[df_predict[
            conf.columns['prediction_conclusion_cons']] == 'IGNORE_UNIMPORTANT']
        # Now they can be removed for the rest of the reportings...
        df_predict = df_predict[
            df_predict[conf.columns['prediction_conclusion_cons']] !=
            'IGNORE_UNIMPORTANT']

        message = (
            f"Prediction conclusions cons general overview, for {len(df_predict.index)} predicted cases."
            + f"The {len(df_predict_unimportant.index)} IGNORE_UNIMPORTANT parcels are excluded from the reporting!"
        )
        outputfile.write(f"\n{message}\n")
        html_data['GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_cons'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum()
        count_per_class.insert(loc=1, column='pct', value=values)
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_TABLE'] = count_per_class.to_html()
            html_data[
                'GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_DATA'] = count_per_class.to_dict()

        # Output general accuracies
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "* OVERALL ACCURACIES *\n")
        outputfile.write(
            "************************************************************\n")
        overall_accuracies_list = []

        # Calculate overall accuracies for all parcels
        try:
            oa = skmetrics.accuracy_score(df_predict[conf.columns['class']],
                                          df_predict['pred1'],
                                          normalize=True,
                                          sample_weight=None) * 100
            overall_accuracies_list.append({
                'parcels': 'All',
                'prediction_type': 'standard',
                'accuracy': oa
            })
            oa = skmetrics.accuracy_score(
                df_predict[conf.columns['class']],
                df_predict[conf.columns['prediction_cons']],
                normalize=True,
                sample_weight=None) * 100
            overall_accuracies_list.append({
                'parcels': 'All',
                'prediction_type': 'consolidated',
                'accuracy': oa
            })
        except Exception:
            # Fixed: was a bare except, which also swallowed KeyboardInterrupt
            logger.exception("Error calculating overall accuracies!")

        # Calculate while ignoring the classes to be ignored...
        df_predict_accuracy_no_ignore = df_predict[
            ~df_predict[conf.columns['class']].isin(
                conf.marker.getlist('classes_to_ignore_for_train'))]
        df_predict_accuracy_no_ignore = df_predict_accuracy_no_ignore[
            ~df_predict_accuracy_no_ignore[conf.columns['class']].isin(
                conf.marker.getlist('classes_to_ignore'))]

        oa = skmetrics.accuracy_score(
            df_predict_accuracy_no_ignore[conf.columns['class']],
            df_predict_accuracy_no_ignore['pred1'],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels': 'Exclude classes_to_ignore(_for_train) classes',
            'prediction_type': 'standard',
            'accuracy': oa
        })
        oa = skmetrics.accuracy_score(
            df_predict_accuracy_no_ignore[conf.columns['class']],
            df_predict_accuracy_no_ignore[conf.columns['prediction_cons']],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels': 'Exclude classes_to_ignore(_for_train) classes',
            'prediction_type': 'consolidated',
            'accuracy': oa
        })

        # Calculate ignoring both classes to ignored + parcels not having a valid prediction
        df_predict_no_ignore_has_prediction = df_predict_accuracy_no_ignore.loc[
            (df_predict_accuracy_no_ignore[conf.columns['prediction_cons']] !=
             'NODATA')
            & (df_predict_accuracy_no_ignore[conf.columns['prediction_cons']] !=
               'DOUBT:NOT_ENOUGH_PIXELS')]
        oa = skmetrics.accuracy_score(
            df_predict_no_ignore_has_prediction[conf.columns['class']],
            df_predict_no_ignore_has_prediction['pred1'],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels': 'Exclude ignored ones + with prediction (= excl. NODATA, NOT_ENOUGH_PIXELS)',
            'prediction_type': 'standard',
            'accuracy': oa
        })
        oa = skmetrics.accuracy_score(
            df_predict_no_ignore_has_prediction[conf.columns['class']],
            df_predict_no_ignore_has_prediction[conf.columns['prediction_cons']],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels': 'Exclude ignored ones + with prediction (= excl. NODATA, NOT_ENOUGH_PIXELS)',
            'prediction_type': 'consolidated',
            'accuracy': oa
        })

        # Output the resulting overall accuracies
        message = 'Overall accuracies for different sub-groups of the data'
        outputfile.write(f"\n{message}\n")
        html_data['OVERALL_ACCURACIES_TEXT'] = message
        overall_accuracies_df = pd.DataFrame(
            overall_accuracies_list,
            columns=['parcels', 'prediction_type', 'accuracy'])
        overall_accuracies_df.set_index(keys=['parcels', 'prediction_type'],
                                        inplace=True)
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{overall_accuracies_df}\n")
            logger.info(f"{overall_accuracies_df}\n")
            html_data[
                'OVERALL_ACCURACIES_TABLE'] = overall_accuracies_df.to_html()

        # Write the recall, F1 score,... per class
        #message = skmetrics.classification_report(df_predict[gs.class_column]
        #        , df_predict[gs.prediction_column]
        #        , labels=classes)
        #outputfile.write(message)

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "********************* DETAILED RESULTS *********************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "* DETAILED PREDICTION CONCLUSIONS *\n")
        outputfile.write(
            "************************************************************\n")

        # Calculate detailed conclusions for the predictions
        logger.info("Calculate the detailed conclusions for the predictions")

        # Write the conclusions for the consolidated predictions
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.columns['prediction_conclusion_detail_cons'],
            prediction_column_to_use=conf.columns['prediction_cons'],
            detailed=True)
        message = f"Prediction conclusions cons (doubt + not_enough_pixels) overview, for {len(df_predict.index)} predicted cases:"
        outputfile.write(f"\n{message}\n")
        html_data['PREDICTION_CONCLUSION_DETAIL_CONS_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_detail_cons'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum()
        count_per_class.insert(loc=1, column='pct', value=values)
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'PREDICTION_CONCLUSION_DETAIL_CONS_OVERVIEW_TABLE'] = count_per_class.to_html()

        # Write the conclusions for the full-alpha predictions
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.columns['prediction_conclusion_detail_full_alpha'],
            prediction_column_to_use=conf.columns['prediction_full_alpha'],
            detailed=True)
        message = f"Prediction conclusions full alpha (doubt + not_enough_pixels) overview, for {len(df_predict.index)} predicted cases:"
        outputfile.write(f"\n{message}\n")
        html_data[
            'PREDICTION_CONCLUSION_DETAIL_FULL_ALPHA_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_detail_full_alpha'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum()
        count_per_class.insert(loc=1, column='pct', value=values)
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'PREDICTION_CONCLUSION_DETAIL_FULL_ALPHA_OVERVIEW_TABLE'] = count_per_class.to_html()

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "* CONFUSION MATRICES FOR PARCELS WITH PREDICTIONS *\n")
        outputfile.write(
            "************************************************************\n")
        # Calculate an extended confusion matrix with the standard prediction column and write
        # it to output...
        df_confmatrix_ext = _get_confusion_matrix_ext(df_predict, 'pred1')
        outputfile.write(
            "\nExtended confusion matrix of the predictions: Rows: true/input classes, columns: predicted classes\n"
        )
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None, 'display.width', 2000):
            outputfile.write(f"{df_confmatrix_ext}\n")
        html_data['CONFUSION_MATRICES_TABLE'] = df_confmatrix_ext.to_html()
        html_data['CONFUSION_MATRICES_DATA'] = df_confmatrix_ext.to_json()

        # Calculate an extended confusion matrix with the consolidated prediction column and write
        # it to output...
        df_confmatrix_ext = _get_confusion_matrix_ext(
            df_predict, conf.columns['prediction_cons'])
        outputfile.write(
            "\nExtended confusion matrix of the consolidated predictions: Rows: true/input classes, columns: predicted classes\n"
        )
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None, 'display.width', 2000):
            outputfile.write(f"{df_confmatrix_ext}\n\n")
        html_data[
            'CONFUSION_MATRICES_CONSOLIDATED_TABLE'] = df_confmatrix_ext.to_html()
        html_data[
            'CONFUSION_MATRICES_CONSOLIDATED_DATA'] = df_confmatrix_ext.to_json()

        # If the pixcount is available, write the OA per pixcount
        if conf.columns['pixcount_s1s2'] in df_predict.columns:
            pixcount_output_report_txt = output_report_txt + '_OA_per_pixcount.txt'
            _write_OA_per_pixcount(
                df_parcel_predictions=df_predict,
                output_report_txt=pixcount_output_report_txt,
                force=force)

        # If a ground truth file is provided, report on the ground truth
        if parcel_ground_truth_filepath is not None:
            outputfile.write(
                "************************************************************\n")
            outputfile.write(
                "* REPORTING ON PREDICTION QUALITY BASED ON GROUND TRUTH *\n")
            outputfile.write(
                "************************************************************\n")

            # Read ground truth
            logger.info(
                f"Read csv with ground truth (with their classes): {parcel_ground_truth_filepath}"
            )
            df_parcel_gt = pdh.read_file(parcel_ground_truth_filepath)
            df_parcel_gt.set_index(conf.columns['id'], inplace=True)
            logger.info(
                f"Read csv with ground truth ready, shape: {df_parcel_gt.shape}")

            # Join the prediction data
            cols_to_join = df_predict.columns.difference(df_parcel_gt.columns)
            df_parcel_gt = df_predict[cols_to_join].join(df_parcel_gt,
                                                         how='inner')
            logger.info(
                f"After join of ground truth with predictions, shape: {df_parcel_gt.shape}"
            )

            if len(df_parcel_gt.index) == 0:
                message = "After join of ground truth with predictions the result was empty, so probably a wrong ground truth file was used!"
                logger.critical(message)
                raise Exception(message)

            # General ground truth statistics
            # ******************************************************************
            # Calculate the conclusions based on ground truth

            # Calculate and write the result for the consolidated predictions
            _add_gt_conclusions(df_parcel_gt, conf.columns['prediction_cons'])
            message = f"Prediction quality cons (doubt + not_enough_pixels) overview, for {len(df_parcel_gt.index)} predicted cases in ground truth:"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_CONS_OVERVIEW_TEXT'] = message

            count_per_class = (df_parcel_gt.groupby(
                f"gt_conclusion_{conf.columns['prediction_cons']}",
                as_index=False).size().to_frame('count'))
            values = 100 * count_per_class['count'] / count_per_class['count'].sum()
            count_per_class.insert(loc=1, column='pct', value=values)
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                outputfile.write(f"\n{count_per_class}\n")
                logger.info(f"{count_per_class}\n")
                html_data[
                    'PREDICTION_QUALITY_CONS_OVERVIEW_TABLE'] = count_per_class.to_html()

            # Calculate and write the result for the full-alpha predictions
            _add_gt_conclusions(df_parcel_gt,
                                conf.columns['prediction_full_alpha'])
            # Fixed: the message said "cons" although this overview is about the
            # full-alpha prediction (copy-paste error).
            message = f"Prediction quality full alpha (doubt + not_enough_pixels) overview, for {len(df_parcel_gt.index)} predicted cases in ground truth:"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TEXT'] = message

            count_per_class = (df_parcel_gt.groupby(
                f"gt_conclusion_{conf.columns['prediction_full_alpha']}",
                as_index=False).size().to_frame('count'))
            values = 100 * count_per_class['count'] / count_per_class['count'].sum()
            count_per_class.insert(loc=1, column='pct', value=values)
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                outputfile.write(f"\n{count_per_class}\n")
                logger.info(f"{count_per_class}\n")
                html_data[
                    'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TABLE'] = count_per_class.to_html()

            # Write the ground truth conclusions to file
            pdh.to_file(
                df_parcel_gt,
                output_report_txt + "_groundtruth_pred_quality_details.tsv")

            # Alpha and beta error statistics based on CONS prediction
            # ******************************************************************
            # Pct Alpha errors=alpha errors/(alpha errors + real errors)
            columnname = f"gt_conclusion_{conf.columns['prediction_cons']}"
            alpha_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA'].index)
            alpha_error_denominator = (
                alpha_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].isin([
                    'FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG'
                ])].index))
            if alpha_error_denominator > 0:
                message = (
                    f"Alpha error for cons: {alpha_error_numerator}/{alpha_error_denominator} = "
                    + f"{(alpha_error_numerator/alpha_error_denominator):.02f}")
            else:
                message = f"Alpha error for cons: {alpha_error_numerator}/{alpha_error_denominator} = ?"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_ALPHA_TEXT'] = message

            beta_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA'].index)
            beta_error_denominator = (
                beta_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].str.startswith(
                    'FARMER-WRONG_PRED-')].index))
            if beta_error_denominator > 0:
                message = (
                    f"Beta error for cons: {beta_error_numerator}/{beta_error_denominator} = "
                    + f"{(beta_error_numerator/beta_error_denominator):.02f}")
            else:
                message = f"Beta error for cons: {beta_error_numerator}/{beta_error_denominator} = ?"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_BETA_TEXT'] = message

            # Alpha and beta error statistics based on FULL ALPHA prediction
            # ******************************************************************
            # Pct Alpha errors=alpha errors/(alpha errors + real errors)
            columnname = f"gt_conclusion_{conf.columns['prediction_full_alpha']}"
            alpha_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA'].index)
            alpha_error_denominator = (
                alpha_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].isin([
                    'FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG'
                ])].index))
            if alpha_error_denominator > 0:
                message = (
                    f"Alpha error full: {alpha_error_numerator}/{alpha_error_denominator} = "
                    + f"{(alpha_error_numerator/alpha_error_denominator):.02f}")
            else:
                message = f"Alpha error full: {alpha_error_numerator}/{alpha_error_denominator} = ?"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_ALPHA_TEXT'] += '<br/>' + message

            beta_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA'].index)
            beta_error_denominator = (
                beta_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].str.startswith(
                    'FARMER-WRONG_PRED-')].index))
            if beta_error_denominator > 0:
                message = (
                    f"Beta error full: {beta_error_numerator}/{beta_error_denominator} = "
                    + f"{(beta_error_numerator/beta_error_denominator):.02f}")
            else:
                message = f"Beta error full: {beta_error_numerator}/{beta_error_denominator} = ?"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_BETA_TEXT'] += '<br/>' + message

            # If the pixcount is available, write the number of ALFA errors per pixcount (for the prediction with doubt)
            if conf.columns['pixcount_s1s2'] in df_parcel_gt.columns:
                # Get data, drop empty lines and write
                message = "Number of ERROR_ALFA parcels for the 'prediction full alpha without NOT_ENOUGH_PIX' per pixcount for the ground truth parcels:"
                outputfile.write(f"\n{message}\n")
                html_data['PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TEXT'] = message

                # To get the number of alpha errors per pixcount, we also need alpha errors
                # also for parcels that had not_enough_pixels, so we need prediction_withdoubt
                # If they don't exist, calculate
                class_postpr.add_doubt_column(
                    pred_df=df_parcel_gt,
                    new_pred_column='pred_cons_no_min_pix',
                    apply_doubt_marker_specific=True)
                _add_gt_conclusions(df_parcel_gt, 'pred_cons_no_min_pix')

                df_per_pixcount = _get_alfa_errors_per_pixcount(
                    df_predquality_pixcount=df_parcel_gt,
                    pred_quality_column="gt_conclusion_" + "pred_cons_no_min_pix",
                    error_alpha_code='FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA')
                df_per_pixcount.dropna(inplace=True)
                with pd.option_context('display.max_rows', None,
                                       'display.max_columns', None,
                                       'display.width', 2000):
                    outputfile.write(f"\n{df_per_pixcount}\n")
                    logger.info(f"{df_per_pixcount}\n")
                    html_data[
                        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TABLE'] = df_per_pixcount.to_html()

    # Render the html version of the report
    with open(output_report_txt.replace('.txt', '.html'), 'w') as outputfile:
        # Fixed: the template file handle was opened without ever being closed;
        # use a with-statement so it is released deterministically.
        with open('./cropclassification/postprocess/html_rapport_template.html'
                  ) as template_file:
            html_template_file = template_file.read()
        src = Template(html_template_file)
        # replace strings and write to file
        output = src.substitute(html_data)
        outputfile.write(output)
def prepare_input_latecrop(parceldata_df,
                           column_BEFL_cropcode: str,
                           column_output_class: str,
                           classes_refe_filepath: Path,
                           min_parcels_in_class: int = 50):
    """
    Prepare raw parcel data so it is compliant with the assumptions used by the
    rest of the classification functionality.

    The result contains the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the classes that
          will be classified to

    This specific implementation converts the typical export format used in
    BE-Flanders to this format.

    Args:
        parceldata_df: DataFrame with the raw parcel data.
        column_BEFL_cropcode: column in parceldata_df with the BEFL crop code.
        column_output_class: name of the column the output class is put in.
        classes_refe_filepath: path to the reference file mapping crop codes
            to classes.
        min_parcels_in_class: classes with at most this number of parcels are
            set to IGNORE_NOT_ENOUGH_SAMPLES. Defaults to 50 (the previously
            hard-coded value).

    Returns:
        The prepared parcel DataFrame.

    Raises:
        Exception: if classes_refe_filepath doesn't exist.
    """
    # Check if parameters are OK and init some extra params
    # --------------------------------------------------------------------------
    if not classes_refe_filepath.exists():
        raise Exception(
            f"Input classes file doesn't exist: {classes_refe_filepath}")

    # Convert the crop to unicode, in case the input is int...
    if column_BEFL_cropcode in parceldata_df.columns:
        parceldata_df[column_BEFL_cropcode] = (
            parceldata_df[column_BEFL_cropcode].astype('unicode'))

    # Read and cleanup the mapping table from crop codes to classes
    # --------------------------------------------------------------------------
    logger.info(f"Read classes conversion table from {classes_refe_filepath}")
    classes_df = pdh.read_file(classes_refe_filepath)
    # Fixed: DataFrame.info() prints to stdout and returns None, so the old
    # f-string only ever logged "None": log the shape instead.
    logger.info(f"Read classes conversion table ready, shape: {classes_df.shape}")

    # Because the file was read as ansi, and gewas is int, so the data needs to be converted to
    # unicode to be able to do comparisons with the other data
    classes_df[column_BEFL_cropcode] = classes_df['CROPCODE'].astype('unicode')

    # Map column with the classname to orig classname
    column_output_class_orig = conf.columns['class'] + '_orig'
    classes_df[column_output_class_orig] = classes_df[
        conf.columns['class_refe']]

    # Remove unneeded columns. Collect the columns to drop first instead of
    # dropping inplace while iterating over classes_df.columns.
    refe_columns_needed = [
        column_output_class_orig, column_BEFL_cropcode,
        conf.columns['class_declared'], conf.columns['class']
    ]
    columns_to_drop = [
        column for column in classes_df.columns
        if column not in refe_columns_needed
        and column not in columns_BEFL_to_keep
    ]
    classes_df.drop(columns_to_drop, axis=1, inplace=True)

    # Set the index
    classes_df.set_index(column_BEFL_cropcode,
                         inplace=True,
                         verify_integrity=True)

    # Get only the columns in the classes_df that don't exist yet in parceldata_df
    cols_to_join = classes_df.columns.difference(parceldata_df.columns)

    # Join/merge the classname
    logger.info('Add the classes to the parceldata')
    parceldata_df = parceldata_df.merge(classes_df[cols_to_join],
                                        how='left',
                                        left_on=column_BEFL_cropcode,
                                        right_index=True,
                                        validate='many_to_one')

    # Copy orig classname to classification classname
    parceldata_df.insert(loc=0,
                         column=column_output_class,
                         value=parceldata_df[column_output_class_orig])

    # For rows with no class, set to UNKNOWN
    parceldata_df.fillna(value={column_output_class: 'UNKNOWN'}, inplace=True)

    # If a column with extra info exists, use it as well to fine-tune the classification classes.
    if column_BEFL_gesp_pm in parceldata_df.columns:
        # Greenhouses, temporary coverings and warehouses
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['SER', 'SGM']),
            column_output_class] = 'MON_OVERK_LOO'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['PLA', 'PLO', 'NPO']),
            column_output_class] = 'MON_OVERK_LOO'
        # A warehouse is the same as a stable...
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm] == 'LOO',
            column_output_class] = 'MON_OVERK_LOO'
        # Containers, not on open soil...
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm] == 'CON',
            column_output_class] = 'MON_CONTAINERS'
        # TODO: CIV, containers on open soil, doesn't seem to be that specific...
        #parceldata_df.loc[parceldata_df[column_BEFL_gesp_pm] == 'CIV', class_columnname] = 'MON_CONTAINERS'
    else:
        logger.warning(
            f"The column {column_BEFL_gesp_pm} doesn't exist, so this part of the code was skipped!"
        )

    # Set classes with very few elements to IGNORE_NOT_ENOUGH_SAMPLES!
    for _, row in (parceldata_df.groupby(column_output_class).size()
                   .reset_index(name='count').iterrows()):
        if row['count'] <= min_parcels_in_class:
            logger.info(
                f"Class <{row[column_output_class]}> only contains {row['count']} elements, so put them to IGNORE_NOT_ENOUGH_SAMPLES"
            )
            parceldata_df.loc[
                parceldata_df[column_output_class] == row[column_output_class],
                column_output_class] = 'IGNORE_NOT_ENOUGH_SAMPLES'

    # Drop the columns that aren't useful at all. Iterate over a snapshot of
    # the columns so the inplace drops don't interfere with the iteration.
    for column in list(parceldata_df.columns):
        if (column not in [
                column_output_class, conf.columns['id'],
                conf.columns['class_groundtruth'],
                conf.columns['class_declared']
        ] and column not in conf.preprocess.getlist('extra_export_columns')
                and column not in columns_BEFL_to_keep):
            parceldata_df.drop(column, axis=1, inplace=True)
        elif column == column_BEFL_gesp_pm:
            # Replace ',' by ';' in this field — presumably to avoid clashes
            # with a comma-separated export format; TODO confirm.
            parceldata_df[column_BEFL_gesp_pm] = parceldata_df[
                column_BEFL_gesp_pm].str.replace(',', ';')

    return parceldata_df