Example 1
def show(input_parcel_filepath: Path,
         filter_id: str):

    # Load input data...
    df = pdh.read_file(input_parcel_filepath)

    # Just keep one parcel
    id_column = 'UID'
    df = df[df[id_column] == filter_id]

    # Remove all unnecessary columns
    '''
    for column in df:
        if(not column.startswith(SENSORDATA_VV + '_')
           and not column.startswith(SENSORDATA_VH + '_')
           and not column == conf.columns['id']):
            df = df.drop(columns=column)
    '''

    # Set index for transpose
    df.set_index(id_column, inplace=True)

    # Transpose columns to rows to create time series
    df = df.transpose()
    df.reset_index(inplace=True)

    '''    
    df.rename(columns={'index':'polarization_date'}, inplace=True)

    # Split the date and the polarization, then drop the original column
    df.insert(0, 'polarization', df['polarization_date'].str.split('_').str.get(0))
    #df['polarization'] = df['polarization_date'].str.split('_').str.get(0)
    df.insert(1, 'date', df['polarization_date'].str.split('_').str.get(1))
    #df['date'] = df['polarization_date'].str.split('_').str.get(1)
    df = df.drop(columns='polarization_date')

    logger.info(df)

    # Pivot to put VV and VH in separate columns
    df = df.pivot(index='date', columns='polarization')
    #df.unstack(level=-1)
    #df.set_index('date', inplace=True)

    df = df[filter_id]
    df[SENSORDATA_VH + '/' + SENSORDATA_VV] = np.log10(df[SENSORDATA_VH] / df[SENSORDATA_VV])*10
    df[SENSORDATA_VH] = np.log10(df[SENSORDATA_VH])*10
    df[SENSORDATA_VV] = np.log10(df[SENSORDATA_VV])*10

    for column in df:
        logger.info(column)
    '''
    
    logger.info(df)

    # Plot
    df.plot()
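
A minimal usage sketch for show() above; the path and parcel id are hypothetical, and the module-level helpers of the original project (pdh, logger) plus matplotlib are assumed to be importable:

import matplotlib.pyplot as plt
from pathlib import Path

# Plot the time series for one (hypothetical) parcel id
show(input_parcel_filepath=Path("//share/markers/parcels_2019_timeseries.sqlite"),
     filter_id="0000280862B72549")
plt.show()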
Example 2
def prepare_input(input_parcel_filepath: str,
                  input_parcel_filetype: str,
                  input_parcel_pixcount_filepath: str,
                  classtype_to_prepare: str,
                  output_parcel_filepath: str,
                  force: bool = False):
    """
    Prepare a raw input file, e.g. by adding the classification classes to use for the
    classification, ...
    """

    # If force is False and the output file already exists, stop.
    if force is False and os.path.exists(output_parcel_filepath) is True:
        logger.warning(
            f"prepare_input: output file already exists and force == False, so stop: {output_parcel_filepath}"
        )
        return

    if input_parcel_filetype == 'BEFL':
        output_dir, _ = os.path.split(output_parcel_filepath)
        df_parceldata = befl.prepare_input(
            input_parcel_filepath=input_parcel_filepath,
            classtype_to_prepare=classtype_to_prepare,
            output_dir=output_dir)
    else:
        message = f"Unknown value for parameter input_parcel_filetype: {input_parcel_filetype}"
        logger.critical(message)
        raise Exception(message)

    # Load pixcount data and join it
    logger.info(f"Read pixcount file {input_parcel_pixcount_filepath}")
    df_pixcount = pdh.read_file(input_parcel_pixcount_filepath)
    logger.debug(f"Read pixcount file ready, shape: {df_pixcount.shape}")
    if df_pixcount.index.name != conf.columns['id']:
        df_pixcount.set_index(conf.columns['id'], inplace=True)

    df_parceldata.set_index(conf.columns['id'], inplace=True)
    df_parceldata = df_parceldata.join(
        df_pixcount[conf.columns['pixcount_s1s2']], how='left')

    # Export result to file
    output_ext = os.path.splitext(output_parcel_filepath)[1]
    for column in df_parceldata.columns:
        # If the requested output is a csv, we don't need the geometry column...
        if column == conf.columns['geom'] and output_ext == '.csv':
            df_parceldata.drop(column, axis=1, inplace=True)

    logger.info(f"Write output to {output_parcel_filepath}")
    # If extension is not .shp, write using pandas (=a lot faster!)
    if output_ext.lower() != '.shp':
        pdh.to_file(df_parceldata, output_parcel_filepath)
    else:
        df_parceldata.to_file(output_parcel_filepath, index=False)
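
A minimal usage sketch for prepare_input() above, with hypothetical file paths; conf, pdh and the befl helper module are assumed to be set up as in the original project:

prepare_input(input_parcel_filepath="C:/temp/parcels_2019.shp",
              input_parcel_filetype="BEFL",
              input_parcel_pixcount_filepath="C:/temp/parcels_2019_pixcount.sqlite",
              classtype_to_prepare="CROPGROUP",
              output_parcel_filepath="C:/temp/parcels_2019_prepared.sqlite",
              force=False)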
Example 3
def main():

    in_dir = Path(
        "X:/Monitoring/Markers/playground/pierog/tmp/Run_2019-06-25_007_imported"
    )
    in_filepaths = in_dir.glob("*.parquet")

    # Convert all files found
    for in_filepath in in_filepaths:

        # Read input file
        print(f"Read {in_filepath}")
        df = pdh.read_file(in_filepath)

        # Write to new file
        out_filepath = in_filepath.parent / f"{in_filepath.stem}.sqlite"
        print(f"Write {out_filepath}")
        pdh.to_file(df, out_filepath)
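
The pdh helper used throughout these examples isn't shown. A rough stand-in for the two calls used above (read_file on a .parquet file, to_file to a .sqlite file), purely as an assumption about what such a helper could look like:

import sqlite3
from pathlib import Path
import pandas as pd

def read_file(path: Path) -> pd.DataFrame:
    # Assumption: .parquet input is read directly with pandas
    return pd.read_parquet(path)

def to_file(df: pd.DataFrame, path: Path, table_name: str = "info"):
    # Assumption: .sqlite output is written as a single table via sqlite3
    conn = sqlite3.connect(str(path))
    try:
        df.to_sql(table_name, conn, if_exists="replace", index=False)
    finally:
        conn.close()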
Example 4
def calculate_periodic_data(input_parcel_filepath: str,
                            input_base_dir: str,
                            start_date_str: str,
                            end_date_str: str,
                            sensordata_to_get: list,
                            dest_data_dir: str,
                            force: bool = False):
    """
    This function creates a file that is a weekly summary of the timeseries images from DIAS.

    TODO: add possibility to choose which values to extract (mean, min, max,...)?
        
    Args:
        input_parcel_filepath (str): [description]
        input_base_dir (str): [description]
        start_date_str (str): Start date in format %Y-%m-%d. Needs to be aligned already on the 
                periods wanted.
        end_date_str (str): End date in format %Y-%m-%d. Needs to be aligned already on the 
                periods wanted.
        sensordata_to_get ([]): 
        dest_data_dir (str): [description]
        force (bool, optional): [description]. Defaults to False.
    """
    logger.info('calculate_periodic_data')

    # Init
    input_parcels_filename = os.path.basename(input_parcel_filepath)
    input_parcels_filename_noext, _ = os.path.splitext(input_parcels_filename)
    input_dir = os.path.join(input_base_dir, input_parcels_filename_noext)

    # TODO: in config?
    input_ext = ".sqlite"
    output_ext = ".sqlite"

    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    year = start_date_str.split("-")[0]

    # Prepare output dir
    test = False
    if test is True:
        dest_data_dir += "_test"
    if not os.path.exists(dest_data_dir):
        os.mkdir(dest_data_dir)

    # Create Dataframe with all files with their info
    logger.debug('Create Dataframe with all files and their properties')
    file_info_list = []
    for filename in os.listdir(input_dir):
        if filename.endswith(input_ext):
            # Get the separate filename parts
            file_info = get_file_info(os.path.join(input_dir, filename))
            file_info_list.append(file_info)

    all_inputfiles_df = pd.DataFrame(file_info_list)

    # Loop over the data we need to get
    id_column = conf.columns['id']
    for sensordata_type in sensordata_to_get:

        logger.debug(
            'Get files we need based on start- & stopdates, sensordata_to_get,...'
        )
        orbits = [None]
        if sensordata_type == conf.general['SENSORDATA_S1_ASCDESC']:
            # Filter files to the ones we need
            satellitetype = 'S1'
            imagetype = IMAGETYPE_S1_GRD
            bands = ['VV', 'VH']
            orbits = ['ASC', 'DESC']
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))
                & (all_inputfiles_df.orbit.isin(orbits))]
        elif sensordata_type == conf.general['SENSORDATA_S2gt95']:
            satellitetype = 'S2'
            imagetype = IMAGETYPE_S2_L2A
            bands = ['B02-10m', 'B03-10m', 'B04-10m', 'B08-10m']
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))]
        elif sensordata_type == conf.general['SENSORDATA_S1_COHERENCE']:
            satellitetype = 'S1'
            imagetype = IMAGETYPE_S1_COHERENCE
            bands = ['VV', 'VH']
            orbits = ['ASC', 'DESC']
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))]
        else:
            raise Exception(f"Unsupported sensordata_type: {sensordata_type}")

        # There should also be one pixcount file
        pixcount_filename = f"{input_parcels_filename_noext}_weekly_pixcount{output_ext}"
        pixcount_filepath = os.path.join(dest_data_dir, pixcount_filename)

        # For each week
        start_week = int(datetime.strftime(start_date, '%W'))
        end_week = int(datetime.strftime(end_date, '%W'))
        for period_index in range(start_week, end_week):

            # Get the date of the first day of period period_index (e.g. Monday for a week)
            period_date = datetime.strptime(
                str(year) + '_' + str(period_index) + '_1', '%Y_%W_%w')

            # New file name
            period_date_str_long = period_date.strftime('%Y-%m-%d')
            period_data_filename = f"{input_parcels_filename_noext}_weekly_{period_date_str_long}_{sensordata_type}{output_ext}"
            period_data_filepath = os.path.join(dest_data_dir,
                                                period_data_filename)

            # Check if output file exists already
            if os.path.exists(period_data_filepath):
                if force is False:
                    logger.info(
                        f"SKIP: force is False and file exists: {period_data_filepath}"
                    )
                    continue
                else:
                    os.remove(period_data_filepath)

            # Loop over bands and orbits (all combinations of bands and orbits!)
            logger.info(f"Calculate file: {period_data_filename}")
            period_data_df = None
            for band, orbit in [(band, orbit) for band in bands
                                for orbit in orbits]:

                # Get list of files needed for this period, band
                period_files_df = needed_inputfiles_df.loc[
                    (needed_inputfiles_df.week == period_index)
                    & (needed_inputfiles_df.band == band)]

                # If an orbit to be filtered was specified, filter
                if orbit is not None:
                    period_files_df = period_files_df.loc[(
                        period_files_df.orbit == orbit)]

                # Loop all period_files
                period_band_data_df = None
                statistic_columns_dict = {
                    'count': [],
                    'max': [],
                    'mean': [],
                    'min': [],
                    'std': []
                }
                for j, imagedata_filepath in enumerate(
                        period_files_df.filepath.tolist()):

                    # If file has filesize == 0, skip
                    if os.path.getsize(imagedata_filepath) == 0:
                        continue

                    # Read the file (but only the columns we need)
                    columns = list(statistic_columns_dict) + [id_column]
                    image_data_df = pdh.read_file(imagedata_filepath,
                                                  columns=columns)
                    image_data_df.set_index(id_column, inplace=True)
                    image_data_df.index.name = id_column

                    # Remove rows with nan values
                    nb_before_dropna = len(image_data_df.index)
                    image_data_df.dropna(inplace=True)
                    nb_after_dropna = len(image_data_df.index)
                    if nb_after_dropna != nb_before_dropna:
                        logger.warning(
                            f"Before dropna: {nb_before_dropna}, after: {nb_after_dropna} for file {imagedata_filepath}"
                        )
                    if nb_after_dropna == 0:
                        continue

                    # Rename columns so column names stay unique
                    for statistic_column in statistic_columns_dict:
                        new_column_name = statistic_column + str(j + 1)
                        image_data_df.rename(
                            columns={statistic_column: new_column_name},
                            inplace=True)
                        image_data_df[new_column_name] = image_data_df[
                            new_column_name].astype(float)
                        statistic_columns_dict[statistic_column].append(
                            new_column_name)

                    # Create 1 dataframe for all weekfiles - one row for each code_obj - using concat (code_obj = index)
                    if period_band_data_df is None:
                        period_band_data_df = image_data_df
                    else:
                        period_band_data_df = pd.concat(
                            [period_band_data_df, image_data_df],
                            axis=1,
                            sort=False)
                        # Apparently concat removes the index name in some situations
                        period_band_data_df.index.name = id_column

                # Calculate max, mean, min, ...
                if period_band_data_df is not None:
                    logger.debug('Calculate max, mean, min, ...')
                    period_date_str_short = period_date.strftime('%Y%m%d')
                    # Remark: prefix column names: sqlite doesn't like a numeric start
                    if orbit is None:
                        column_basename = f"TS_{period_date_str_short}_{imagetype}_{band}"
                    else:
                        column_basename = f"TS_{period_date_str_short}_{imagetype}_{orbit}_{band}"

                    # Number of pixels
                    # TODO: investigate how the number of pixels is best tracked: leave out outliers? take the max? ...
                    period_band_data_df[f"{column_basename}_count"] = np.nanmax(
                        period_band_data_df[statistic_columns_dict['count']],
                        axis=1)
                    # Maximum of all max columns
                    period_band_data_df[f"{column_basename}_max"] = np.nanmax(
                        period_band_data_df[statistic_columns_dict['max']],
                        axis=1)
                    # Mean of all mean columns
                    period_band_data_df[f"{column_basename}_mean"] = np.nanmean(
                        period_band_data_df[statistic_columns_dict['mean']],
                        axis=1)
                    # Minimum of all min columns
                    period_band_data_df[f"{column_basename}_min"] = np.nanmin(
                        period_band_data_df[statistic_columns_dict['min']],
                        axis=1)
                    # Mean of all std columns
                    period_band_data_df[f"{column_basename}_std"] = np.nanmean(
                        period_band_data_df[statistic_columns_dict['std']],
                        axis=1)
                    # Number of Files used
                    period_band_data_df[
                        f"{column_basename}_used_files"] = period_band_data_df[
                            statistic_columns_dict['max']].count(axis=1)

                    # Only keep the columns we want to keep
                    columns_to_keep = [
                        f"{column_basename}_count", f"{column_basename}_max",
                        f"{column_basename}_mean", f"{column_basename}_min",
                        f"{column_basename}_std",
                        f"{column_basename}_used_files"
                    ]
                    period_band_data_df = period_band_data_df[columns_to_keep]

                    # Merge the data with the other bands/orbits for this period
                    if period_data_df is None:
                        period_data_df = period_band_data_df
                    else:
                        period_data_df = pd.concat(
                            [period_band_data_df, period_data_df],
                            axis=1,
                            sort=False)
                        # Apparently concat removes the index name in some situations
                        period_data_df.index.name = id_column

            if period_data_df is not None:
                logger.info(f"Write new file: {period_data_filename}")
                pdh.to_file(period_data_df, period_data_filepath)

                if not os.path.exists(pixcount_filepath):
                    pixcount_s1s2_column = conf.columns['pixcount_s1s2']
                    for column in period_data_df.columns:
                        if column.endswith('_count'):
                            period_data_df.rename(
                                columns={column: pixcount_s1s2_column},
                                inplace=True)
                            break
                    pixcount_df = period_data_df[pixcount_s1s2_column]
                    pixcount_df.fillna(value=0, inplace=True)
                    pdh.to_file(pixcount_df, pixcount_filepath)
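
A minimal usage sketch for calculate_periodic_data() above. The paths are hypothetical and the start/end dates must already be aligned on the weekly periods (here: Mondays):

calculate_periodic_data(
    input_parcel_filepath="C:/temp/parcels_2019.sqlite",
    input_base_dir="C:/temp/timeseries_per_image",
    start_date_str="2019-03-18",
    end_date_str="2019-08-19",
    sensordata_to_get=[conf.general['SENSORDATA_S1_ASCDESC'],
                       conf.general['SENSORDATA_S2gt95']],
    dest_data_dir="C:/temp/timeseries_periodic",
    force=False)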
Example 5
def prepare_input(input_parcel_filepath: str, classtype_to_prepare: str,
                  output_dir: str):
    """
    This function creates a file that is compliant with the assumptions used by the rest of the
    classification functionality.

    It should be a csv file with the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the classes that will be classified to
    """
    # Check if input parameters are OK
    if not os.path.exists(input_parcel_filepath):
        raise Exception(f"Input file doesn't exist: {input_parcel_filepath}")
    else:
        logger.info(f"Process input file {input_parcel_filepath}")

    # Read input file
    logger.info(f"Read parceldata from {input_parcel_filepath}")
    if geofile_util.is_geofile(input_parcel_filepath):
        parceldata_df = geofile_util.read_file(input_parcel_filepath)
    else:
        parceldata_df = pdh.read_file(input_parcel_filepath)
    logger.info(f"Read Parceldata ready, info(): {parceldata_df.info()}")

    # Check if the id column is present...
    if conf.columns['id'] not in parceldata_df.columns:
        message = f"Column {conf.columns['id']} not found in input parcel file: {input_parcel_filepath}. Make sure the column is present or change the column name in global_constants.py"
        logger.critical(message)
        raise Exception(message)

    # Copy the refe file to the run dir, so we always know which refe was used
    input_classes_filepath = conf.preprocess[
        'classtype_to_prepare_refe_filepath']
    if not os.path.exists(input_classes_filepath):
        raise Exception(
            f"Input classes file doesn't exist: {input_classes_filepath}")
    shutil.copy(input_classes_filepath, output_dir)

    # Now start prepare
    if classtype_to_prepare == 'CROPGROUP':
        parceldata_df = prepare_input_cropgroup(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_declared,
            column_output_class=conf.columns['class_declared'])
        return prepare_input_cropgroup(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop,
            column_output_class=conf.columns['class'])
    elif classtype_to_prepare == 'CROPGROUP_GROUNDTRUTH':
        return prepare_input_cropgroup(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_gt_verified,
            column_output_class=conf.columns['class_groundtruth'])
    elif classtype_to_prepare == 'CROPGROUP_EARLY':
        parceldata_df = prepare_input_cropgroup_early(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_declared,
            column_output_class=conf.columns['class_declared'])
        return prepare_input_cropgroup_early(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop,
            column_output_class=conf.columns['class'])
    elif classtype_to_prepare == 'CROPGROUP_EARLY_GROUNDTRUTH':
        return prepare_input_cropgroup_early(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_gt_verified,
            column_output_class=conf.columns['class_groundtruth'])
    elif classtype_to_prepare == 'LANDCOVER':
        parceldata_df = prepare_input_landcover(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_declared,
            column_output_class=conf.columns['class_declared'])
        return prepare_input_landcover(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop,
            column_output_class=conf.columns['class'])
    elif classtype_to_prepare == 'LANDCOVER_GROUNDTRUTH':
        return prepare_input_landcover(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_gt_verified,
            column_output_class=conf.columns['class_groundtruth'])
    elif classtype_to_prepare == 'LANDCOVER_EARLY':
        parceldata_df = prepare_input_landcover_early(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_declared,
            column_output_class=conf.columns['class_declared'])
        return prepare_input_landcover_early(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop,
            column_output_class=conf.columns['class'])
    elif classtype_to_prepare == 'LANDCOVER_EARLY_GROUNDTRUTH':
        return prepare_input_landcover_early(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_gt_verified,
            column_output_class=conf.columns['class_groundtruth'])
    elif classtype_to_prepare == 'POPULAR_CROP':
        parceldata_df = prepare_input_most_popular_crop(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_declared,
            column_output_class=conf.columns['class_declared'])
        return prepare_input_most_popular_crop(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop,
            column_output_class=conf.columns['class'])
    elif classtype_to_prepare == 'POPULAR_CROP_GROUNDTRUTH':
        return prepare_input_most_popular_crop(
            parceldata_df=parceldata_df,
            column_BEFL_cropcode=column_BEFL_crop_gt_verified,
            column_output_class=conf.columns['class_groundtruth'])
    else:
        message = f"Unknown value for parameter classtype_to_prepare: {classtype_to_prepare}"
        logger.fatal(message)
        raise Exception(message)
Example 6
def prepare_input_landcover(parceldata_df,
                            column_BEFL_cropcode: str = 'GWSCOD_H',
                            column_output_class: str = None):
    """
    This function creates a file that is compliant with the assumptions used by the rest of the
    classification functionality.

    It should be a csv file with the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the classes that will be classified to

    This specific implementation converts the typical export format used in BE-Flanders to
    this format.
    """
    # Check if parameters are OK and init some extra params
    #--------------------------------------------------------------------------
    input_classes_filepath = conf.preprocess[
        'classtype_to_prepare_refe_filepath']
    if not os.path.exists(input_classes_filepath):
        raise Exception(
            f"Input classes file doesn't exist: {input_classes_filepath}")

    # Convert the crop to unicode, in case the input is int...
    if column_BEFL_cropcode in parceldata_df.columns:
        parceldata_df[column_BEFL_cropcode] = parceldata_df[
            column_BEFL_cropcode].astype('unicode')

    # Read and cleanup the mapping table from crop codes to classes
    #--------------------------------------------------------------------------
    logger.info(f"Read classes conversion table from {input_classes_filepath}")
    classes_df = pdh.read_file(input_classes_filepath)
    logger.info(
        f"Read classes conversion table ready, info(): {classes_df.info()}")

    # Because the file was read as ANSI and the crop code (gewas) is an int, the data needs to be
    # converted to unicode to be able to do comparisons with the other data
    classes_df[column_BEFL_cropcode] = classes_df['CROPCODE'].astype('unicode')

    # Map the MON_LC_GROUP column to the original classname
    column_output_class_orig = column_output_class + '_orig'
    classes_df[column_output_class_orig] = classes_df['MON_LC_GROUP']

    # Remove unneeded columns
    for column in classes_df.columns:
        if (column not in [column_output_class_orig, column_BEFL_cropcode]
                and column not in columns_BEFL_to_keep):
            classes_df.drop(column, axis=1, inplace=True)

    # Set the index
    classes_df.set_index(column_BEFL_cropcode,
                         inplace=True,
                         verify_integrity=True)

    # Get only the columns in the classes_df that don't exist yet in parceldata_df
    cols_to_join = classes_df.columns.difference(parceldata_df.columns)

    # Join/merge the classname
    logger.info('Add the classes to the parceldata')
    parceldata_df = parceldata_df.merge(classes_df[cols_to_join],
                                        how='left',
                                        left_on=column_BEFL_cropcode,
                                        right_index=True,
                                        validate='many_to_one')

    # Copy orig classname to classification classname
    parceldata_df.insert(loc=0,
                         column=column_output_class,
                         value=parceldata_df[column_output_class_orig])

    # If a column with extra info exists, use it as well to fine-tune the classification classes.
    if column_BEFL_gesp_pm in parceldata_df.columns:
        # Greenhouses, temporary coverings and sheds (serres, tijdelijke overkappingen en loodsen)
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['SER', 'PLA', 'PLO']),
            column_output_class] = 'MON_LC_IGNORE_DIFFICULT_PERMANENT_CLASS'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].
            isin(['SGM', 'NPO', 'LOO', 'CON']),
            column_output_class] = 'MON_LC_IGNORE_DIFFICULT_PERMANENT_CLASS_NS'
        # TODO: CIV, containers in open ground, doesn't seem to be that specific...
        #parceldata_df.loc[parceldata_df[column_BEFL_gesp_pm] == 'CIV', class_columnname] = 'MON_CONTAINERS'   # Containers, not in open ground...
    else:
        logger.warning(
            f"The column {column_BEFL_gesp_pm} doesn't exist, so this part of the code was skipped!"
        )

    # Some extra cleanup: classes starting with 'nvt' or empty ones
    logger.info(
        "Set classes that are still empty, not specific enough or that contain to little values to 'UNKNOWN'"
    )
    parceldata_df.loc[
        parceldata_df[column_output_class].str.startswith('nvt', na=True),
        column_output_class] = 'MON_LC_UNKNOWN'

    # Drop the columns that aren't useful at all
    for column in parceldata_df.columns:
        if (column not in [
                column_output_class, conf.columns['id'],
                conf.columns['class_groundtruth'],
                conf.columns['class_declared']
        ] and column not in conf.preprocess.getlist('extra_export_columns')
                and column not in columns_BEFL_to_keep):
            parceldata_df.drop(column, axis=1, inplace=True)
        elif column == column_BEFL_gesp_pm:
            parceldata_df[column_BEFL_gesp_pm] = parceldata_df[
                column_BEFL_gesp_pm].str.replace(',', ';')

    return parceldata_df
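
The core of prepare_input_landcover() above (and of prepare_input_cropgroup() below) is a many-to-one merge of the parcels with a crop code reference table. A self-contained sketch of that pattern with made-up codes and classes:

import pandas as pd

# Hypothetical parcels and crop code reference table
parceldata_df = pd.DataFrame({"UID": [1, 2, 3], "GWSCOD_H": [60, 201, 9999]})
classes_df = pd.DataFrame({"CROPCODE": [60, 201],
                           "MON_LC_GROUP": ["MON_LC_GRASSES", "MON_LC_ARABLE"]})

# Crop codes are compared as strings, so normalize the dtype on both sides
parceldata_df["GWSCOD_H"] = parceldata_df["GWSCOD_H"].astype(str)
classes_df["CROPCODE"] = classes_df["CROPCODE"].astype(str)
classes_df = classes_df.set_index("CROPCODE", verify_integrity=True)

# many_to_one: every parcel gets at most one class; unknown codes end up as NaN
parceldata_df = parceldata_df.merge(classes_df, how="left",
                                    left_on="GWSCOD_H", right_index=True,
                                    validate="many_to_one")
print(parceldata_df)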
Example 7
def prepare_input_cropgroup(parceldata_df, column_BEFL_cropcode: str,
                            column_output_class: str):
    """
    This function creates a file that is compliant with the assumptions used by the rest of the
    classification functionality.

    It should be a csv file with the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the classes that will be classified to

    This specific implementation converts the typical export format used in BE-Flanders to
    this format.
    """
    # Check if parameters are OK and init some extra params
    #--------------------------------------------------------------------------
    input_classes_filepath = conf.preprocess[
        'classtype_to_prepare_refe_filepath']
    if not os.path.exists(input_classes_filepath):
        raise Exception(
            f"Input classes file doesn't exist: {input_classes_filepath}")

    # Convert the crop to unicode, in case the input is int...
    if column_BEFL_cropcode in parceldata_df.columns:
        parceldata_df[column_BEFL_cropcode] = parceldata_df[
            column_BEFL_cropcode].astype('unicode')

    # Read and cleanup the mapping table from crop codes to classes
    #--------------------------------------------------------------------------
    logger.info(f"Read classes conversion table from {input_classes_filepath}")
    classes_df = pdh.read_file(input_classes_filepath)
    logger.info(
        f"Read classes conversion table ready, info(): {classes_df.info()}")

    # Because the file was read as ANSI and the crop code (gewas) is an int, the data needs to be
    # converted to unicode to be able to do comparisons with the other data
    classes_df[column_BEFL_cropcode] = classes_df['CROPCODE'].astype('unicode')

    # Map column with the classname to orig classname
    column_output_class_orig = conf.columns['class'] + '_orig'
    classes_df[column_output_class_orig] = classes_df['MON_CROPGROUP']

    # Remove unneeded columns
    for column in classes_df.columns:
        if (column not in [column_output_class_orig, column_BEFL_cropcode]
                and column not in columns_BEFL_to_keep):
            classes_df.drop(column, axis=1, inplace=True)

    # Set the index
    classes_df.set_index(column_BEFL_cropcode,
                         inplace=True,
                         verify_integrity=True)

    # Get only the columns in the classes_df that don't exist yet in parceldata_df
    cols_to_join = classes_df.columns.difference(parceldata_df.columns)

    # Join/merge the classname
    logger.info('Add the classes to the parceldata')
    parceldata_df = parceldata_df.merge(classes_df[cols_to_join],
                                        how='left',
                                        left_on=column_BEFL_cropcode,
                                        right_index=True,
                                        validate='many_to_one')

    # Copy orig classname to classification classname
    parceldata_df.insert(loc=0,
                         column=column_output_class,
                         value=parceldata_df[column_output_class_orig])

    # For rows with no class, set to UNKNOWN
    parceldata_df.fillna(value={column_output_class: 'UNKNOWN'}, inplace=True)

    # If a column with extra info exists, use it as well to fine-tune the classification classes.
    if column_BEFL_gesp_pm in parceldata_df.columns:
        # Greenhouses, temporary coverings and sheds (serres, tijdelijke overkappingen en loodsen)
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['SER', 'SGM']),
            column_output_class] = 'MON_STAL_SER'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['PLA', 'PLO', 'NPO']),
            column_output_class] = 'MON_STAL_SER'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm] == 'LOO',
            column_output_class] = 'MON_STAL_SER'  # A shed (loods) is the same as a stable (stal)...
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm] == 'CON',
            column_output_class] = 'MON_CONTAINERS'  # Containers, not in open ground...
        # TODO: CIV, containers in open ground, doesn't seem to be that specific...
        #parceldata_df.loc[parceldata_df[column_BEFL_gesp_pm] == 'CIV', class_columnname] = 'MON_CONTAINERS'   # Containers, not in open ground...
    else:
        logger.warning(
            f"The column {column_BEFL_gesp_pm} doesn't exist, so this part of the code was skipped!"
        )

    # Some extra cleanup: classes starting with 'nvt' or empty ones
    #logger.info("Set classes that are still empty, not specific enough or that contain to little values to 'UNKNOWN'")
    #parceldata_df.loc[parceldata_df[column_output_class].str.startswith('nvt', na=True),
    #                  column_output_class] = 'UNKNOWN'

    # 'MON_ANDERE_SUBSID_GEWASSEN': very low classification rate (< 1%), as it is a group with several very different classes in it
    # 'MON_AARDBEIEN': low classification rate (~10%), as many parcels actually are temporary greenhouses but aren't correctly applied
    # 'MON_BRAAK': very low classification rate (< 1%), spread over a lot of classes, but most popular are MON_BOOM, MON_GRASSEN, MON_FRUIT
    # 'MON_KLAVER': low classification rate (25%), spread over quite some classes, but MON_GRASSEN has 20% as well.
    # 'MON_MENGSEL': 25% correct classifications: rest spread over many other classes. Too heterogeneous a group?
    # 'MON_POEL': 0% correct classifications: most are classified as MON_CONTAINER, MON_FRUIT. Almost nothing was misclassified as being POEL
    # 'MON_RAAPACHTIGEN': 25% correct classifications: rest spread over many other classes
    # 'MON_STRUIK': 10%
    #    TODO: check; probably split up or add to MON_BOOMKWEEK???
    #classes_badresults = ['MON_ANDERE_SUBSID_GEWASSEN', 'MON_AARDBEIEN', 'MON_BRAAK', 'MON_KLAVER',
    #                      'MON_MENGSEL', 'MON_POEL', 'MON_RAAPACHTIGEN', 'MON_STRUIK']
    #parceldata_df.loc[parceldata_df[column_output_class].isin(classes_badresults),
    #                  column_output_class] = 'UNKNOWN'

    # MON_BONEN and MON_WIKKEN have amongst each other a very large percentage of false
    # positives/negatives, so they seem very similar... let's create a class that combines both
    #parceldata_df.loc[parceldata_df[column_output_class].isin(['MON_BONEN', 'MON_WIKKEN']),
    #                  column_output_class] = 'MON_BONEN_WIKKEN'

    # MON_BOOM now also includes the growing of new plants/trees, which is too different from grown
    # trees -> put growing new trees in a separate group
    #parceldata_df.loc[parceldata_df[column_BEFL_cropcode].isin(['9602', '9603', '9604', '9560']),
    #                  column_output_class] = 'MON_BOOMKWEEK'

    # 'MON_FRUIT': has a good accuracy (91%), but also has as many false positives (115% -> mainly
    #              'MON_GRASSEN' that are (mis)classified as 'MON_FRUIT')
    # 'MON_BOOM': has very bad accuracy (28%) and also very many false positives (450% -> mainly
    #              'MON_GRASSEN' that are misclassified as 'MON_BOOM')
    # MON_FRUIT and MON_BOOM are permanent anyway, so not mandatory that they are checked in
    # monitoring process.
    # Conclusion: put MON_BOOM and MON_FRUIT to IGNORE_DIFFICULT_PERMANENT_CLASS
    #parceldata_df.loc[parceldata_df[column_output_class].isin(['MON_BOOM', 'MON_FRUIT']),
    #                  column_output_class] = 'IGNORE_DIFFICULT_PERMANENT_CLASS'

    # Set classes with very few elements to IGNORE_NOT_ENOUGH_SAMPLES!
    for _, row in parceldata_df.groupby(
            column_output_class).size().reset_index(name='count').iterrows():
        if row['count'] <= 50:
            logger.info(
                f"Class <{row[column_output_class]}> only contains {row['count']} elements, so put them to IGNORE_NOT_ENOUGH_SAMPLES"
            )
            parceldata_df.loc[
                parceldata_df[column_output_class] == row[column_output_class],
                column_output_class] = 'IGNORE_NOT_ENOUGH_SAMPLES'

    # Drop the columns that aren't useful at all
    for column in parceldata_df.columns:
        if (column not in [
                column_output_class, conf.columns['id'],
                conf.columns['class_groundtruth'],
                conf.columns['class_declared']
        ] and column not in conf.preprocess.getlist('extra_export_columns')
                and column not in columns_BEFL_to_keep):
            parceldata_df.drop(column, axis=1, inplace=True)
        elif column == column_BEFL_gesp_pm:
            parceldata_df[column_BEFL_gesp_pm] = parceldata_df[
                column_BEFL_gesp_pm].str.replace(',', ';')

    return parceldata_df
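
The 'not enough samples' cleanup in prepare_input_cropgroup() boils down to a groupby/size check; the same logic in a short, vectorized form with dummy data:

import pandas as pd

# Hypothetical training parcels with their class
df = pd.DataFrame({"classname": ["MON_GRASSEN"] * 60 + ["MON_POEL"] * 3})

# Classes with 50 parcels or fewer can't be trained reliably, so flag them
MIN_SAMPLES = 50
counts = df.groupby("classname").size()
too_small = counts[counts <= MIN_SAMPLES].index
df.loc[df["classname"].isin(too_small), "classname"] = "IGNORE_NOT_ENOUGH_SAMPLES"
print(df["classname"].value_counts())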
Example 8
def train_test_predict(input_parcel_train_filepath: str,
                       input_parcel_test_filepath: str,
                       input_parcel_all_filepath: str,
                       input_parcel_classification_data_filepath: str,
                       output_classifier_filepath: str,
                       output_predictions_test_filepath: str,
                       output_predictions_all_filepath: str,
                       force: bool = False):
    """ Train a classifier, test it and do full predictions.

    Args:
        input_parcel_train_filepath: the list of parcels with classes to train the classifier, without data!
        input_parcel_test_filepath: the list of parcels with classes to test the classifier, without data!
        input_parcel_all_filepath: the list of parcels with classes that need to be classified, without data!
        input_parcel_classification_data_filepath: the data to be used for the classification for all parcels.
        output_classifier_filepath: the file path where to save the classifier.
        output_predictions_test_filepath: the file path where to save the test predictions.
        output_predictions_all_filepath: the file path where to save the predictions for all parcels.
        force: if True, overwrite all existing output files; if False, don't overwrite them.
    """

    logger.info("train_test_predict: Start")

    if (force is False and os.path.exists(output_classifier_filepath)
            and os.path.exists(output_predictions_test_filepath)
            and os.path.exists(output_predictions_all_filepath)):
        logger.warning(
            f"predict: output files exist and force is False, so stop: {output_classifier_filepath}, {output_predictions_test_filepath}, {output_predictions_all_filepath}"
        )
        return

    # Read the classification data from the csv so we can pass it on to the other functions to improve performance...
    logger.info(
        f"Read classification data file: {input_parcel_classification_data_filepath}"
    )
    input_parcel_classification_data_df = pdh.read_file(
        input_parcel_classification_data_filepath)
    if input_parcel_classification_data_df.index.name != conf.columns['id']:
        input_parcel_classification_data_df.set_index(conf.columns['id'],
                                                      inplace=True)
    logger.debug('Read classification data file ready')

    # Train the classification
    train(
        input_parcel_train_filepath=input_parcel_train_filepath,
        input_parcel_test_filepath=input_parcel_test_filepath,
        input_parcel_classification_data_filepath=
        input_parcel_classification_data_filepath,
        output_classifier_filepath=output_classifier_filepath,
        force=force,
        input_parcel_classification_data_df=input_parcel_classification_data_df
    )

    # Predict the test parcels
    predict(
        input_parcel_filepath=input_parcel_test_filepath,
        input_parcel_classification_data_filepath=
        input_parcel_classification_data_filepath,
        input_classifier_filepath=output_classifier_filepath,
        output_predictions_filepath=output_predictions_test_filepath,
        force=force,
        input_parcel_classification_data_df=input_parcel_classification_data_df
    )

    # Predict all parcels
    predict(
        input_parcel_filepath=input_parcel_all_filepath,
        input_parcel_classification_data_filepath=
        input_parcel_classification_data_filepath,
        input_classifier_filepath=output_classifier_filepath,
        output_predictions_filepath=output_predictions_all_filepath,
        force=force,
        input_parcel_classification_data_df=input_parcel_classification_data_df
    )
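
A minimal usage sketch for train_test_predict() above, with hypothetical paths (the classifier file extension depends on the classifier type that is configured):

train_test_predict(
    input_parcel_train_filepath="C:/temp/parcels_train.sqlite",
    input_parcel_test_filepath="C:/temp/parcels_test.sqlite",
    input_parcel_all_filepath="C:/temp/parcels_all.sqlite",
    input_parcel_classification_data_filepath="C:/temp/parcels_timeseries.sqlite",
    output_classifier_filepath="C:/temp/classifier.pkl",
    output_predictions_test_filepath="C:/temp/predictions_test.sqlite",
    output_predictions_all_filepath="C:/temp/predictions_all.sqlite",
    force=False)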
Example 9
def predict(input_parcel_filepath: str,
            input_parcel_classification_data_filepath: str,
            input_classifier_filepath: str,
            output_predictions_filepath: str,
            force: bool = False,
            input_parcel_classification_data_df: pd.DataFrame = None):
    """ Predict the classes for the input data. """

    # If force is False and the output file exists already, return
    if (force is False and os.path.exists(output_predictions_filepath)):
        logger.warning(
            f"predict: predictions output file already exists and force is false, so stop: {output_predictions_filepath}"
        )
        return

    # Read the input parcels
    logger.info(f"Read input file: {input_parcel_filepath}")
    input_parcel_df = pdh.read_file(input_parcel_filepath,
                                    columns=[
                                        conf.columns['id'],
                                        conf.columns['class'],
                                        conf.columns['class_declared']
                                    ])
    if input_parcel_df.index.name != conf.columns['id']:
        input_parcel_df.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read input parcel file ready')

    # For parcels of a class that should be ignored, don't predict
    input_parcel_df = input_parcel_df.loc[
        ~input_parcel_df[conf.columns['class_declared']].
        isin(conf.marker.getlist('classes_to_ignore'))]

    # If the classification data isn't passed as dataframe, read it from the csv
    if input_parcel_classification_data_df is None:
        logger.info(
            f"Read classification data file: {input_parcel_classification_data_filepath}"
        )
        input_parcel_classification_data_df = pdh.read_file(
            input_parcel_classification_data_filepath)
        if input_parcel_classification_data_df.index.name != conf.columns['id']:
            input_parcel_classification_data_df.set_index(conf.columns['id'],
                                                          inplace=True)
        logger.debug('Read classification data file ready')

    # Join the data to send to prediction logic...
    logger.info("Join input parcels with the classification data")
    input_parcel_for_predict_df = input_parcel_df.join(
        input_parcel_classification_data_df, how='inner')

    # Predict!
    if conf.classifier['classifier_type'].lower(
    ) == 'keras_multilayer_perceptron':
        import cropclassification.predict.classification_keras as class_core_keras
        class_core_keras.predict_proba(
            parcel_df=input_parcel_for_predict_df,
            classifier_filepath=input_classifier_filepath,
            output_parcel_predictions_filepath=output_predictions_filepath)
    else:
        import cropclassification.predict.classification_sklearn as class_core_sklearn
        class_core_sklearn.predict_proba(
            parcel_df=input_parcel_for_predict_df,
            classifier_filepath=input_classifier_filepath,
            output_parcel_predictions_filepath=output_predictions_filepath)
Example 10
def train(input_parcel_train_filepath: str,
          input_parcel_test_filepath: str,
          input_parcel_classification_data_filepath: str,
          output_classifier_filepath: str,
          force: bool = False,
          input_parcel_classification_data_df: pd.DataFrame = None):
    """ Train a classifier and test it by predicting the test cases. """

    logger.info("train_and_test: Start")
    if (force is False and os.path.exists(output_classifier_filepath)):
        logger.warning(
            f"predict: classifier already exist and force == False, so don't retrain: {output_classifier_filepath}"
        )
        return

    # If the classification data isn't passed as dataframe, read it from file
    if input_parcel_classification_data_df is None:
        logger.info(
            f"Read classification data file: {input_parcel_classification_data_filepath}"
        )
        input_parcel_classification_data_df = pdh.read_file(
            input_parcel_classification_data_filepath)
        if input_parcel_classification_data_df.index.name != conf.columns['id']:
            input_parcel_classification_data_df.set_index(conf.columns['id'],
                                                          inplace=True)
        logger.debug('Read classification data file ready')

    # Read the train parcels
    logger.info(f"Read train file: {input_parcel_train_filepath}")
    train_df = pdh.read_file(
        input_parcel_train_filepath,
        columns=[conf.columns['id'], conf.columns['class']])
    if train_df.index.name != conf.columns['id']:
        train_df.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read train file ready')

    # Join the columns of input_parcel_classification_data_df that aren't yet in train_df
    logger.info("Join train sample with the classification data")
    train_df = (train_df.join(input_parcel_classification_data_df,
                              how='inner'))

    # Read the test/validation data
    logger.info(f"Read test file: {input_parcel_test_filepath}")
    test_df = pdh.read_file(
        input_parcel_test_filepath,
        columns=[conf.columns['id'], conf.columns['class']])
    if test_df.index.name != conf.columns['id']:
        test_df.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read test file ready')

    # Join the columns of input_parcel_classification_data_df that aren't yet in test_df
    logger.info("Join test sample with the classification data")
    test_df = (test_df.join(input_parcel_classification_data_df, how='inner'))

    # Train
    if conf.classifier['classifier_type'].lower(
    ) == 'keras_multilayer_perceptron':
        import cropclassification.predict.classification_keras as class_core_keras
        class_core_keras.train(
            train_df=train_df,
            test_df=test_df,
            output_classifier_filepath=output_classifier_filepath)
    else:
        import cropclassification.predict.classification_sklearn as class_core_sklearn
        class_core_sklearn.train(
            train_df=train_df,
            output_classifier_filepath=output_classifier_filepath)
Example 11
def collect_and_prepare_timeseries_data(
        input_parcel_filepath: Path,
        timeseries_dir: Path,
        base_filename: str,
        output_filepath: Path,
        start_date_str: str,
        end_date_str: str,
        sensordata_to_use: List[str],
        parceldata_aggregations_to_use: List[str],
        force: bool = False):
    """
    Collect all timeseries data to use for the classification and prepare it by applying
    scaling,... as needed.
    """

    # Some constants to choose which type of data to use in the marker.
    # Remark: the string needs to be the same as the end of the name of the columns in the csv files!
    # TODO: I'm not really happy with both a list in the ini file + here... not sure what the
    #       cleanest solution is though...
    PARCELDATA_AGGRAGATION_MEAN = conf.general[
        'PARCELDATA_AGGRAGATION_MEAN']  # Mean value of the pixels values in a parcel.
    PARCELDATA_AGGRAGATION_STDDEV = conf.general[
        'PARCELDATA_AGGRAGATION_STDDEV']  # std dev of the values of the pixels in a parcel

    # Constants for types of sensor data
    SENSORDATA_S1 = conf.general['SENSORDATA_S1']  # Sentinel 1 data
    SENSORDATA_S1DB = conf.general['SENSORDATA_S1DB']  # Sentinel 1 data, in dB
    SENSORDATA_S1_ASCDESC = conf.general[
        'SENSORDATA_S1_ASCDESC']  # Sentinel 1 data, divided in Ascending and Descending passes
    SENSORDATA_S1DB_ASCDESC = conf.general[
        'SENSORDATA_S1DB_ASCDESC']  # Sentinel 1 data, in dB, divided in Ascending and Descending passes
    SENSORDATA_S2 = conf.general['SENSORDATA_S2']  # Sentinel 2 data
    SENSORDATA_S2gt95 = conf.general[
        'SENSORDATA_S2gt95']  # Sentinel 2 data (B2,B3,B4,B8), if available for 95% of the area
    SENSORDATA_S1_COHERENCE = conf.general['SENSORDATA_S1_COHERENCE']

    # If force is False and the output file already exists, stop.
    if (force is False and output_filepath.exists() is True):
        logger.warning(
            f"Output file already exists and force == False, so stop: {output_filepath}"
        )
        return

    # Init the result with the id's of the parcels we want to treat
    result_df = pdh.read_file(input_parcel_filepath,
                              columns=[conf.columns['id']])
    if result_df.index.name != conf.columns['id']:
        result_df.set_index(conf.columns['id'], inplace=True)
    nb_input_parcels = len(result_df.index)
    logger.info(
        f"Parceldata aggregations that need to be used: {parceldata_aggregations_to_use}"
    )
    logger.setLevel(logging.DEBUG)

    # Loop over all input timeseries data to find the data we really need
    data_ext = conf.general['data_ext']
    filepath_start = timeseries_dir / f"{base_filename}_{start_date_str}{data_ext}"
    filepath_end = timeseries_dir / f"{base_filename}_{end_date_str}{data_ext}"
    logger.debug(f'filepath_start_date: {filepath_start}')
    logger.debug(f'filepath_end_date: {filepath_end}')

    ts_data_files = timeseries_dir.glob(f"{base_filename}_*{data_ext}")
    for curr_filepath in sorted(ts_data_files):

        # Only process data that is of the right sensor types
        sensor_type = curr_filepath.stem.split('_')[-1]
        if sensor_type not in sensordata_to_use:
            logger.debug(
                f"SKIP: file is not in sensor types asked ({sensordata_to_use}): {curr_filepath}"
            )
            continue
        # The only data we want to process is the data in the range of dates
        if ((str(curr_filepath) < str(filepath_start))
                or (str(curr_filepath) >= str(filepath_end))):
            logger.debug(
                f"SKIP: File is not in date range asked: {curr_filepath}")
            continue
        # An empty file signifies that there wasn't any valid data for that period/sensor/...
        if os.path.getsize(curr_filepath) == 0:
            logger.info(f"SKIP: file is empty: {curr_filepath}")
            continue

        # Read data, and check if there is enough data in it
        data_read_df = pdh.read_file(curr_filepath)
        nb_data_read = len(data_read_df.index)
        data_available_pct = nb_data_read * 100 / nb_input_parcels
        min_parcels_with_data_pct = conf.timeseries.getfloat(
            'min_parcels_with_data_pct')
        if data_available_pct < min_parcels_with_data_pct:
            logger.info(
                f"SKIP: only data for {data_available_pct:.2f}% of parcels, should be > {min_parcels_with_data_pct}%: {curr_filepath}"
            )
            continue

        # Start processing the file
        logger.info(f'Process file: {curr_filepath}')
        if data_read_df.index.name != conf.columns['id']:
            data_read_df.set_index(conf.columns['id'], inplace=True)

        # Loop over columns to check if there are columns that need to be dropped.
        for column in data_read_df.columns:

            # If it is the id column, continue
            if column == conf.columns['id']:
                continue

            # Check if the column is "asked"
            column_ok = False
            for parceldata_aggregation in parceldata_aggregations_to_use:
                if column.endswith('_' + parceldata_aggregation):
                    column_ok = True
            if column_ok is False:
                # Drop column if it doesn't end with something in parcel_data_aggregations_to_use
                logger.debug(
                    f"Drop column as it's column aggregation isn't to be used: {column}"
                )
                data_read_df.drop(column, axis=1, inplace=True)
                continue

            # Check if the column contains data for enough parcels
            valid_input_data_pct = (
                1 -
                (data_read_df[column].isnull().sum() / nb_input_parcels)) * 100
            if valid_input_data_pct < min_parcels_with_data_pct:
                # If the number of nan values for the column > x %, drop column
                logger.warning(
                    f"Drop column as it contains only {valid_input_data_pct:.2f}% non-null data compared to the input parcels, which is < {min_parcels_with_data_pct}%: {column}"
                )
                data_read_df.drop(column, axis=1, inplace=True)

        # If S2, rescale data
        if sensor_type.startswith(SENSORDATA_S2):
            for column in data_read_df.columns:
                logger.info(
                    f"Column contains S2 data, so scale it by dividing by 10.000: {column}"
                )
                data_read_df[column] = data_read_df[column] / 10000

        # If S1 coherence, rescale data
        if sensor_type == SENSORDATA_S1_COHERENCE:
            for column in data_read_df.columns:
                logger.info(
                    f"Column contains S1 Coherence data, so scale it by dividing by 300: {column}"
                )
                data_read_df[column] = data_read_df[column] / 300

        # Join the data to the result...
        result_df = result_df.join(data_read_df, how='left')

    # Remove rows with many null values from result
    max_number_null = int(0.6 * len(result_df.columns))
    parcel_many_null_df = result_df[result_df.isnull().sum(
        axis=1) > max_number_null]
    if len(parcel_many_null_df.index) > 0:
        # Write the rows with empty data to a file
        parcel_many_null_filepath = Path(
            f'{str(output_filepath)}_rows_many_null.sqlite')
        logger.warning(
            f"Write {len(parcel_many_null_df.index)} rows with > {max_number_null} of {len(result_df.columns)} columns==null to {parcel_many_null_filepath}"
        )
        pdh.to_file(parcel_many_null_df, parcel_many_null_filepath)

        # Now remove them from result
        result_df = result_df[result_df.isnull().sum(
            axis=1) <= max_number_null]

    # For rows with some null values, set them to 0
    # TODO: first rough test of using interpolation doesn't give a difference, maybe better if
    #       smarter interpolation is used (= only between the different types of data:
    #       S1_GRD_VV, S1_GRD_VH, S1_COH_VV, S1_COH_VH, ASC?, DESC?, S2
    #result_df.interpolate(inplace=True)
    result_df.fillna(0, inplace=True)

    # Write output file...
    logger.info(f"Write output to file, start: {output_filepath}")
    pdh.to_file(result_df, output_filepath)
    logger.info(f"Write output to file, ready (with shape: {result_df.shape})")
def create_train_test_sample(input_parcel_filepath: str,
                             output_parcel_train_filepath: str,
                             output_parcel_test_filepath: str,
                             balancing_strategy: str,
                             force: bool = False):
    """ Create a seperate train and test sample from the general input file. """

    # If force is False and the output files already exist, stop.
    if (force is False and os.path.exists(output_parcel_train_filepath) is True
            and os.path.exists(output_parcel_test_filepath) is True):
        logger.warning(
            f"create_train_test_sample: output files already exist and force == False, so stop: {output_parcel_train_filepath}, {output_parcel_test_filepath}"
        )
        return

    # Load input data...
    logger.info(
        f"Start create_train_test_sample with balancing_strategy {balancing_strategy}"
    )
    logger.info(f"Read input file {input_parcel_filepath}")
    df_in = pdh.read_file(input_parcel_filepath)
    logger.debug(f"Read input file ready, shape: {df_in.shape}")

    # Init some often-used variables from config
    class_balancing_column = conf.columns['class_balancing']
    class_column = conf.columns['class']

    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        count_per_class = df_in.groupby(class_balancing_column,
                                        as_index=False).size()
        logger.info(
            f"Number of elements per classname in input dataset:\n{count_per_class}"
        )

    # The test dataset should be as representative as possible for the entire dataset, so create
    # this first as a 20% sample of each class without any additional checks...
    # Remark: group_keys=False avoids apply creating an extra index level for the groups above
    #         the data, so there is no need to do .reset_index(level=class_balancing_column, drop=True)
    #         to get rid of that group level
    df_test = df_in.groupby(class_balancing_column,
                            group_keys=False).apply(pd.DataFrame.sample,
                                                    frac=0.20)
    logger.debug(
        f"df_test after sampling 20% of data per class, shape: {df_test.shape}"
    )

    # The candidate parcels for training are all non-test parcels
    df_train_base = df_in[~df_in.index.isin(df_test.index)]
    logger.debug(f"df_train_base after isin\n{df_train_base}")

    # Remove parcels with too few pixels from the train sample
    min_pixcount = int(conf.marker['min_nb_pixels_train'])
    df_train_base = df_train_base[
        df_train_base[conf.columns['pixcount_s1s2']] >= min_pixcount]
    logger.debug(
        f"Number of parcels in df_train_base after filter on pixcount >= {min_pixcount}: {len(df_train_base)}"
    )

    # Some classes shouldn't be used for training... so remove them!
    logger.info(
        f"Remove 'classes_to_ignore_for_train' from train sample (= where {class_column} is in: {conf.marker.getlist('classes_to_ignore_for_train')})"
    )
    df_train_base = df_train_base[~df_train_base[class_column].isin(
        conf.marker.getlist('classes_to_ignore_for_train'))]

    # All classes_to_ignore aren't meant for training either...
    logger.info(
        f"Remove 'classes_to_ignore' from train sample (= where {class_column} is in: {conf.marker.getlist('classes_to_ignore')})"
    )
    df_train_base = df_train_base[~df_train_base[class_column].isin(
        conf.marker.getlist('classes_to_ignore'))]

    # Print the train base result before applying any balancing
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        count_per_class = df_train_base.groupby(class_balancing_column,
                                                as_index=False).size()
        logger.info(
            f"Number of elements per classname for train dataset, before balancing:\n{count_per_class}"
        )

    # Depending on the balancing_strategy, use a different way to get a training sample
    if balancing_strategy == 'BALANCING_STRATEGY_NONE':
        # Just use 25% of all non-test data as train data -> 25% of 80% of data -> 20% of all data
        # will be training data
        # Remark: - this is very unbalanced, e.g. some classes have 10 000 times more input samples
        #           than other classes
        #         - this results in a relatively high accuracy in overall numbers, but the small
        #           classes are not detected at all
        df_train = (df_train_base.groupby(class_balancing_column,
                                          group_keys=False).apply(
                                              pd.DataFrame.sample, frac=0.25))

    elif balancing_strategy == 'BALANCING_STRATEGY_MEDIUM':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with an upper limit of 10 000 this still gives OK results overall, and the
        #         smaller classes give some results as well; with an upper limit of 4 000 the
        #         results are significantly worse.

        # For the larger classes, favor them by leaving the samples larger but cap at upper_limit
        upper_limit = 10000
        lower_limit = 1000
        logger.info(
            f"Cap classes over {upper_limit}, keep the full number of training samples down to {lower_limit}; classes smaller than that are oversampled"
        )
        df_train = (df_train_base.groupby(class_balancing_column).filter(
            lambda x: len(x) >= upper_limit).groupby(class_balancing_column,
                                                     group_keys=False).apply(
                                                         pd.DataFrame.sample,
                                                         upper_limit))
        # For the middle classes, use the samples as they are
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < upper_limit).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= lower_limit))
        # For smaller classes, oversample...
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(
                lambda x: len(x) < lower_limit).groupby(
                    class_balancing_column,
                    group_keys=False).apply(pd.DataFrame.sample,
                                            lower_limit,
                                            replace=True))

    elif balancing_strategy == 'BALANCING_STRATEGY_MEDIUM2':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with an upper limit of 10 000 this still gives OK results overall, and the
        #         smaller classes give some results as well; with an upper limit of 4 000 the
        #         results are significantly worse.

        # For the larger classes, leave the samples larger but cap
        cap_count_limit1 = 100000
        cap_train_limit1 = 30000
        logger.info(
            f"Cap balancing classes over {cap_count_limit1} to {cap_train_limit1}"
        )
        df_train = (df_train_base.groupby(class_balancing_column).filter(
            lambda x: len(x) >= cap_count_limit1).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample, cap_train_limit1))
        cap_count_limit2 = 50000
        cap_train_limit2 = 20000
        logger.info(
            f"Cap balancing classes between {cap_count_limit2} and {cap_count_limit1} to {cap_train_limit2}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < cap_count_limit1).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= cap_count_limit2).groupby(
                        class_balancing_column,
                        group_keys=False).apply(pd.DataFrame.sample,
                                                cap_train_limit2))
        cap_count_limit3 = 20000
        cap_train_limit3 = 10000
        logger.info(
            f"Cap balancing classes between {cap_count_limit3} and {cap_count_limit2} to {cap_train_limit3}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < cap_count_limit2).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= cap_count_limit3).groupby(
                        class_balancing_column,
                        group_keys=False).apply(pd.DataFrame.sample,
                                                cap_train_limit3))
        cap_count_limit4 = 10000
        cap_train_limit4 = 10000
        logger.info(
            f"Cap balancing classes between {cap_count_limit4} and {cap_count_limit3} to {cap_train_limit4}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < cap_count_limit3).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= cap_count_limit4).groupby(
                        class_balancing_column,
                        group_keys=False).apply(pd.DataFrame.sample,
                                                cap_train_limit4))
        oversample_count = 1000
        # For the middle classes, use the samples as they are
        logger.info(
            f"For classes between {cap_count_limit4} and {oversample_count}, just use all samples"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < cap_count_limit4).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= oversample_count))
        # For smaller classes, oversample...
        logger.info(
            f"For classes smaller than {oversample_count}, oversample to {oversample_count}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(
                lambda x: len(x) < oversample_count).groupby(
                    class_balancing_column,
                    group_keys=False).apply(pd.DataFrame.sample,
                                            oversample_count,
                                            replace=True))

    elif balancing_strategy == 'BALANCING_STRATEGY_PROPORTIONAL_GROUPS':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with an upper limit of 10 000 this still gives OK results overall, and the
        #         smaller classes give some results as well; with an upper limit of 4 000 the
        #         results are significantly worse.

        # For the larger classes, leave the samples larger but cap
        upper_count_limit1 = 100000
        upper_train_limit1 = 30000
        logger.info(
            f"Cap balancing classes over {upper_count_limit1} to {upper_train_limit1}"
        )
        df_train = (df_train_base.groupby(class_balancing_column).filter(
            lambda x: len(x) >= upper_count_limit1).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample,
                                        upper_train_limit1))
        upper_count_limit2 = 50000
        upper_train_limit2 = 20000
        logger.info(
            f"Cap balancing classes between {upper_count_limit2} and {upper_count_limit1} to {upper_train_limit2}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < upper_count_limit1).groupby(class_balancing_column).
            filter(lambda x: len(x) >= upper_count_limit2).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample,
                                        upper_train_limit2))
        upper_count_limit3 = 20000
        upper_train_limit3 = 10000
        logger.info(
            f"Cap balancing classes between {upper_count_limit3} and {upper_count_limit2} to {upper_train_limit3}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < upper_count_limit2).groupby(class_balancing_column).
            filter(lambda x: len(x) >= upper_count_limit3).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample,
                                        upper_train_limit3))
        upper_count_limit4 = 10000
        upper_train_limit4 = 5000
        logger.info(
            f"Cap balancing classes between {upper_count_limit4} and {upper_count_limit3} to {upper_train_limit4}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < upper_count_limit3).groupby(class_balancing_column).
            filter(lambda x: len(x) >= upper_count_limit4).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample,
                                        upper_train_limit4))

        # For smaller balancing classes, just use all samples
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(
                lambda x: len(x) < upper_count_limit4))

    elif balancing_strategy == 'BALANCING_STRATEGY_UPPER_LIMIT':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with an upper limit of 10 000 this still gives OK results overall, and the
        #         smaller classes give some results as well; with an upper limit of 4 000 the
        #         results are significantly worse.

        # For the larger classes, favor them by leaving the samples larger but cap at upper_limit
        upper_limit = 10000
        logger.info(f"Cap over {upper_limit}...")
        df_train = (df_train_base.groupby(class_balancing_column).filter(
            lambda x: len(x) >= upper_limit).groupby(class_balancing_column,
                                                     group_keys=False).apply(
                                                         pd.DataFrame.sample,
                                                         upper_limit))
        # For smaller classes, just use all samples
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(
                lambda x: len(x) < upper_limit))

    elif balancing_strategy == 'BALANCING_STRATEGY_EQUAL':
        # In theory the most logical way to balance: make sure all classes have the same amount of
        # training data by undersampling the largest classes and oversampling the small classes.
        df_train = (df_train_base.groupby(class_balancing_column,
                                          group_keys=False).apply(
                                              pd.DataFrame.sample,
                                              2000,
                                              replace=True))

    else:
        message = f"Unknown balancing strategy, STOP!: {balancing_strategy}"
        logger.critical(message)
        raise Exception(message)

    # Log the resulting numbers per class in the train sample
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        count_per_class = df_train.groupby(class_balancing_column,
                                           as_index=False).size()
        logger.info(
            f'Number of elements per class_balancing_column in train dataset:\n{count_per_class}'
        )
        if class_balancing_column != class_column:
            count_per_class = df_train.groupby(class_column,
                                               as_index=False).size()
            logger.info(
                f'Number of elements per class_column in train dataset:\n{count_per_class}'
            )

    # Log the resulting numbers per class in the test sample
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        count_per_class = df_test.groupby(class_balancing_column,
                                          as_index=False).size()
        logger.info(
            f'Number of elements per class_balancing_column in test dataset:\n{count_per_class}'
        )
        if class_balancing_column != class_column:
            count_per_class = df_test.groupby(class_column,
                                              as_index=False).size()
            logger.info(
                f'Number of elements per class_column in test dataset:\n{count_per_class}'
            )

    # Write to output files
    logger.info('Write the output files')
    df_train.set_index(conf.columns['id'], inplace=True)
    df_test.set_index(conf.columns['id'], inplace=True)
    pdh.to_file(df_train,
                output_parcel_train_filepath)  # The ID column is the index...
    pdh.to_file(df_test,
                output_parcel_test_filepath)  # The ID column is the index...
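
# The balancing above uses pd.DataFrame.append, which is deprecated in recent pandas
# versions. A minimal sketch of the same idea using groupby().sample() and a single
# pd.concat (the class column and the 1000 limit are just illustrative assumptions):
# a stratified 20% test sample per class, larger classes kept as-is for training and
# smaller classes oversampled with replacement.
import pandas as pd

def sample_train_test_sketch(df: pd.DataFrame,
                             class_column: str,
                             oversample_below: int = 1000):
    # Stratified test sample: 20% of each class
    test_df = df.groupby(class_column).sample(frac=0.20)
    train_base_df = df[~df.index.isin(test_df.index)]

    # Classes with enough samples are used as they are...
    grouped = train_base_df.groupby(class_column, group_keys=False)
    large_df = grouped.filter(lambda x: len(x) >= oversample_below)
    # ...smaller classes are oversampled (with replacement) up to the limit
    small_df = grouped.filter(lambda x: len(x) < oversample_below)
    small_oversampled_df = (small_df.groupby(class_column)
                            .sample(n=oversample_below, replace=True))

    # One concat at the end avoids the repeated copying that chained appends cause
    train_df = pd.concat([large_df, small_oversampled_df])
    return train_df, test_df
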
Example no. 13
def predict(input_parcel_filepath: Path,
            input_parcel_classification_data_filepath: Path,
            input_classifier_basefilepath: Path,
            input_classifier_filepath: Path,
            output_predictions_filepath: Path,
            force: bool = False,
            input_parcel_classification_data_df: pd.DataFrame = None):
    """ Predict the classes for the input data. """

    # If force is False and the output file exists already, return
    if (force is False and output_predictions_filepath.exists()):
        logger.warning(
            f"predict: predictions output file already exists and force is false, so stop: {output_predictions_filepath}"
        )
        return

    # Read the input parcels
    logger.info(f"Read input file: {input_parcel_filepath}")
    input_parcel_df = pdh.read_file(input_parcel_filepath,
                                    columns=[
                                        conf.columns['id'],
                                        conf.columns['class'],
                                        conf.columns['class_declared']
                                    ])
    if input_parcel_df.index.name != conf.columns['id']:
        input_parcel_df.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read input parcel file ready')

    # For parcels of a class that should be ignored, don't predict
    input_parcel_df = input_parcel_df.loc[
        ~input_parcel_df[conf.columns['class_declared']].
        isin(conf.marker.getlist('classes_to_ignore'))]

    # get the expected columns from the classifier
    datacolumns_filepath = glob.glob(
        os.path.join(os.path.dirname(input_classifier_filepath),
                     "*datacolumns.txt"))[0]
    with open(datacolumns_filepath, "r") as f:
        input_classifier_datacolumns = ast.literal_eval(f.readline())

    # If the classification data isn't passed as a dataframe, read it from file
    if input_parcel_classification_data_df is None:
        logger.info(
            f"Read classification data file: {input_parcel_classification_data_filepath}"
        )
        input_parcel_classification_data_df = pdh.read_file(
            input_parcel_classification_data_filepath)
        if input_parcel_classification_data_df.index.name != conf.columns['id']:
            input_parcel_classification_data_df.set_index(conf.columns['id'],
                                                          inplace=True)
        logger.debug('Read classification data file ready')

    # only take the required columns as expected by the classifier
    input_parcel_classification_data_df = input_parcel_classification_data_df[
        input_classifier_datacolumns]

    # Join the data to send to prediction logic
    logger.info("Join input parcels with the classification data")
    input_parcel_for_predict_df = input_parcel_df.join(
        input_parcel_classification_data_df, how='inner')

    # Predict!
    logger.info(f"Predict using this model: {input_classifier_filepath}")
    if conf.classifier['classifier_type'].lower(
    ) == 'keras_multilayer_perceptron':
        import cropclassification.predict.classification_keras as class_core_keras
        class_core_keras.predict_proba(
            parcel_df=input_parcel_for_predict_df,
            classifier_basefilepath=input_classifier_basefilepath,
            classifier_filepath=input_classifier_filepath,
            output_parcel_predictions_filepath=output_predictions_filepath)
    else:
        import cropclassification.predict.classification_sklearn as class_core_sklearn
        class_core_sklearn.predict_proba(
            parcel_df=input_parcel_for_predict_df,
            classifier_basefilepath=input_classifier_basefilepath,
            classifier_filepath=input_classifier_filepath,
            output_parcel_predictions_filepath=output_predictions_filepath)
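
# predict() above expects a "*datacolumns.txt" file next to the classifier that
# contains a python-style list of the column names the model was trained on, read
# back with ast.literal_eval. A minimal sketch of writing and reading such a file
# (the function names and the file location are just illustrative assumptions):
import ast
from pathlib import Path

def save_datacolumns_sketch(columns, filepath: Path):
    # Write the column list as its python repr, e.g. "['column_a', 'column_b']"
    filepath.write_text(repr(list(columns)))

def load_datacolumns_sketch(filepath: Path) -> list:
    # ast.literal_eval safely parses the python-style list back into a list object
    return ast.literal_eval(filepath.read_text())
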
def calc_top3_and_consolidation(input_parcel_filepath: str,
                                input_parcel_probabilities_filepath: str,
                                output_predictions_filepath: str,
                                output_predictions_output_filepath: str = None,
                                force: bool = False):
    """
    Calculate the top3 prediction and a consolidation prediction.

    Remark: in this logic the declared crop/class (class_declared) is used, as we want to compare
    with the declaration of the farmer, rather than already taking corrections into account.
    
    Args:
        input_parcel_filepath (str): [description]
        input_parcel_probabilities_filepath (str): [description]
        output_predictions_filepath (str): [description]
        output_predictions_output_filepath (str, optional): [description]. Defaults to None.
        force (bool, optional): [description]. Defaults to False.
    """
    # If force is False and the output exists already, return
    if(force is False
       and os.path.exists(output_predictions_filepath)):
        logger.warning(f"calc_top3_and_consolidation: output file exist and force is False, so stop: {output_predictions_filepath}")
        return

    # Read the probabilities file
    logger.info("Read probabilities file")
    proba_df = pdh.read_file(input_parcel_probabilities_filepath)

    top3_df = calc_top3(proba_df)

    # Read the input parcel file
    logger.info("Read input parcel file")
    input_parcel_df = pdh.read_file(input_parcel_filepath)

    # All input parcels must stay in the output, so left join input with pred
    top3_df.set_index(conf.columns['id'], inplace=True)
    if input_parcel_df.index.name != conf.columns['id']:
        input_parcel_df.set_index(conf.columns['id'], inplace=True)
    cols_to_join = top3_df.columns.difference(input_parcel_df.columns)
    pred_df = input_parcel_df.join(top3_df[cols_to_join], how='left')

    # The parcels added by the join don't have a prediction yet, so apply it
    # For the ignore classes, set the prediction to the ignore type
    classes_to_ignore = conf.marker.getlist('classes_to_ignore')
    pred_df.loc[pred_df[conf.columns['class_declared']].isin(classes_to_ignore), 
                'pred1'] = pred_df[conf.columns['class_declared']]
    # For all other parcels without prediction there must have been no data 
    # available for a classification, so set prediction to NODATA
    pred_df['pred1'].fillna('NODATA', inplace=True)
     
    # Add doubt columns
    add_doubt_column(pred_df=pred_df, 
                     new_pred_column=conf.columns['prediction_cons'],
                     apply_doubt_min_nb_pixels=True)
    add_doubt_column(pred_df=pred_df, 
                     new_pred_column=conf.columns['prediction_full_alpha'],
                     apply_doubt_min_nb_pixels=True,
                     apply_doubt_marker_specific=True)

    # Calculate the status of the consolidated prediction (OK=usable, NOK=not)
    pred_df.loc[pred_df[conf.columns['prediction_cons']].isin(proba_df.columns.to_list()), 
                conf.columns['prediction_cons_status']] = 'OK'
    pred_df[conf.columns['prediction_cons_status']].fillna('NOK', inplace=True)    

    logger.info("Write full prediction data to file")
    pdh.to_file(pred_df, output_predictions_filepath)

    # Create final output file with the most important info
    if output_predictions_output_filepath is not None:

        # First add some additional columns specific for the export
        pred_df['markercode'] = conf.marker['markertype']
        pred_df['run_id'] = conf.general['run_id']
        today = datetime.date.today()
        pred_df['cons_date'] = today
        pred_df['modify_date'] = today
        logger.info("Write final output prediction data to file")
        pred_df.reset_index(inplace=True)
        pred_df = pred_df[conf.columns.getlist('output_columns')] 
        pdh.to_file(pred_df, output_predictions_output_filepath, index=False) 

        # Write oracle sqlldr file
        if conf.marker['markertype'] in ['LANDCOVER', 'LANDCOVER_EARLY']:
            table_name = 'mon_marker_landcover'
            table_columns = ("layer_id, prc_id, versienummer, markercode, run_id, cons_landcover, "
                             + "cons_status, cons_date date 'yyyy-mm-dd', landcover1, probability1, "
                             + "landcover2, probability2, landcover3, probability3, "
                             + "modify_date date 'yyyy-mm-dd'")
        elif conf.marker['markertype'] in ['CROPGROUP', 'CROPGROUP_EARLY']:
            table_name = 'mon_marker_cropgroup'
            table_columns = ("layer_id, prc_id, versienummer, markercode, run_id, cons_cropgroup, "
                             + "cons_status, cons_date date 'yyyy-mm-dd', cropgroup1, probability1, "
                             + "cropgroup2, probability2, cropgroup3, probability3, "
                             + "modify_date date 'yyyy-mm-dd'")
        else: 
            table_name = None
            logger.warning(f"Table unknown for marker type {conf.marker['markertype']}, so cannot write .ctl file")

        if table_name is not None:
            with open(output_predictions_output_filepath + '.ctl', 'w') as ctlfile:
                # SKIP=1 to skip the column names line, the other options avoid
                # more commits than needed
                ctlfile.write("OPTIONS (SKIP=1, ROWS=10000, BINDSIZE=40000000, READSIZE=40000000)\n")     
                ctlfile.write("LOAD DATA\n")
                ctlfile.write(f"INFILE '{os.path.basename(output_predictions_output_filepath)}'  \"str '\\n'\"\n")
                ctlfile.write(f"INSERT INTO TABLE {table_name} APPEND\n")
                # A tab as separator is apparently X'9'
                ctlfile.write("FIELDS TERMINATED BY X'9'\n")
                ctlfile.write(f"({table_columns})\n")
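
# calc_top3() itself isn't shown in this example. A minimal sketch of what such a
# helper could look like, assuming the probabilities dataframe contains the id column
# plus one probability column per class (at least three classes); the pred1/pred2/pred3
# and *_prob column names are assumptions for illustration:
import numpy as np
import pandas as pd

def calc_top3_sketch(proba_df: pd.DataFrame, id_column: str) -> pd.DataFrame:
    proba_columns = [col for col in proba_df.columns if col != id_column]
    proba = proba_df[proba_columns].to_numpy()

    # Indices of the 3 highest probabilities per row, best first
    top3_idx = np.argsort(proba, axis=1)[:, ::-1][:, :3]
    classes = np.array(proba_columns)

    result_df = pd.DataFrame({id_column: proba_df[id_column].to_numpy()})
    for i in range(3):
        result_df[f"pred{i+1}"] = classes[top3_idx[:, i]]
        result_df[f"pred{i+1}_prob"] = np.take_along_axis(
            proba, top3_idx[:, i:i+1], axis=1).ravel()
    return result_df
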
def write_full_report(parcel_predictions_filepath: str,
                      output_report_txt: str,
                      parcel_ground_truth_filepath: str = None,
                      force: bool = None):
    """Writes a report about the accuracy of the predictions to a file.

    Args:
        parcel_predictions_filepath: File name of csv file with the parcels and their predictions.
        output_report_txt: File name of txt file the report will be written to.
        parcel_ground_truth_filepath: List of parcels with ground truth to calculate e.g. alpha and
            beta errors. If None, the part of the report that is based on this data is skipped.
        force: If True, overwrite the output file if it exists already.

    TODO: refactor function to split logic more...
    """

    # If force is False and the output file already exists, stop.
    if force is False and os.path.exists(output_report_txt):
        logger.warning(
            f"write_full_report: output file already exists and force == False, so stop: {output_report_txt}"
        )
        return

    logger.info("Start write_full_report")

    pandas_option_context_list = [
        'display.max_rows', None, 'display.max_columns', None,
        'display.max_colwidth', 300, 'display.width', 2000,
        'display.colheader_justify', 'left'
    ]
    logger.info(f"Read file with predictions: {parcel_predictions_filepath}")
    df_predict = pdh.read_file(parcel_predictions_filepath)
    df_predict.set_index(conf.columns['id'], inplace=True)

    # Python template engine expects all values to be present, so initialize to empty
    empty_string = "''"
    html_data = {
        'GENERAL_ACCURACIES_TABLE': empty_string,
        'GENERAL_ACCURACIES_TEXT': empty_string,
        'GENERAL_ACCURACIES_DATA': empty_string,
        'CONFUSION_MATRICES_TABLE': empty_string,
        'CONFUSION_MATRICES_DATA': empty_string,
        'CONFUSION_MATRICES_CONSOLIDATED_TABLE': empty_string,
        'CONFUSION_MATRICES_CONSOLIDATED_DATA': empty_string,
        'PREDICTION_QUALITY_CONS_OVERVIEW_TEXT': empty_string,
        'PREDICTION_QUALITY_CONS_OVERVIEW_TABLE': empty_string,
        'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TEXT': empty_string,
        'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TABLE': empty_string,
        'PREDICTION_QUALITY_ALPHA_TEXT': empty_string,
        'PREDICTION_QUALITY_BETA_TEXT': empty_string,
        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TEXT': empty_string,
        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TABLE': empty_string
    }

    # Build and write report...
    with open(output_report_txt, 'w') as outputfile:

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "********************* PARAMETERS USED **********************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        message = "Main parameters used for the marker"
        outputfile.write(f"\n{message}\n")
        html_data['PARAMETERS_USED_TEXT'] = message

        logger.info(f"{dict(conf.marker)}")
        parameter_list = [['marker', key, value]
                          for key, value in conf.marker.items()]
        parameter_list += [['timeseries', key, value]
                           for key, value in conf.timeseries.items()]
        parameter_list += [['preprocess', key, value]
                           for key, value in conf.preprocess.items()]
        parameter_list += [['classifier', key, value]
                           for key, value in conf.classifier.items()]
        parameter_list += [['postprocess', key, value]
                           for key, value in conf.postprocess.items()]

        parameters_used_df = pd.DataFrame(
            parameter_list, columns=['parameter_type', 'parameter', 'value'])
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{parameters_used_df}\n")
            logger.info(f"{parameters_used_df}\n")
            html_data['PARAMETERS_USED_TABLE'] = parameters_used_df.to_html(
                index=False)

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "**************** RECAP OF GENERAL RESULTS ******************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "*             GENERAL CONSOLIDATED CONCLUSIONS             *\n")
        outputfile.write(
            "************************************************************\n")
        # Calculate + write general conclusions for consolidated prediction
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.columns['prediction_conclusion_cons'],
            prediction_column_to_use=conf.columns['prediction_cons'],
            detailed=False)

        # Get the number of 'unimportant' ignore parcels and report them here
        df_predict_unimportant = df_predict[df_predict[
            conf.columns['prediction_conclusion_cons']] ==
                                            'IGNORE_UNIMPORTANT']
        # Now they can be removed for the rest of the reporting...
        df_predict = df_predict[
            df_predict[conf.columns['prediction_conclusion_cons']] !=
            'IGNORE_UNIMPORTANT']

        message = (
            f"Prediction conclusions cons general overview, for {len(df_predict.index)} predicted cases. "
            +
            f"The {len(df_predict_unimportant.index)} IGNORE_UNIMPORTANT parcels are excluded from the reporting!"
        )
        outputfile.write(f"\n{message}\n")
        html_data['GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_cons'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum(
        )
        count_per_class.insert(loc=1, column='pct', value=values)

        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_TABLE'] = count_per_class.to_html(
                )
            html_data[
                'GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_DATA'] = count_per_class.to_dict(
                )

        # Output general accuracies
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "*                   OVERALL ACCURACIES                     *\n")
        outputfile.write(
            "************************************************************\n")
        overall_accuracies_list = []

        # Calculate overall accuracies for all parcels
        try:
            oa = skmetrics.accuracy_score(df_predict[conf.columns['class']],
                                          df_predict['pred1'],
                                          normalize=True,
                                          sample_weight=None) * 100
            overall_accuracies_list.append({
                'parcels': 'All',
                'prediction_type': 'standard',
                'accuracy': oa
            })

            oa = skmetrics.accuracy_score(
                df_predict[conf.columns['class']],
                df_predict[conf.columns['prediction_cons']],
                normalize=True,
                sample_weight=None) * 100
            overall_accuracies_list.append({
                'parcels': 'All',
                'prediction_type': 'consolidated',
                'accuracy': oa
            })
        except Exception:
            logger.exception("Error calculating overall accuracies!")

        # Calculate while ignoring the classes to be ignored...
        df_predict_accuracy_no_ignore = df_predict[
            ~df_predict[conf.columns['class']].
            isin(conf.marker.getlist('classes_to_ignore_for_train'))]
        df_predict_accuracy_no_ignore = df_predict_accuracy_no_ignore[
            ~df_predict_accuracy_no_ignore[conf.columns['class']].
            isin(conf.marker.getlist('classes_to_ignore'))]

        oa = skmetrics.accuracy_score(
            df_predict_accuracy_no_ignore[conf.columns['class']],
            df_predict_accuracy_no_ignore['pred1'],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels': 'Exclude classes_to_ignore(_for_train) classes',
            'prediction_type': 'standard',
            'accuracy': oa
        })

        oa = skmetrics.accuracy_score(
            df_predict_accuracy_no_ignore[conf.columns['class']],
            df_predict_accuracy_no_ignore[conf.columns['prediction_cons']],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels': 'Exclude classes_to_ignore(_for_train) classes',
            'prediction_type': 'consolidated',
            'accuracy': oa
        })

        # Calculate while ignoring both the classes to be ignored + parcels without a valid prediction
        df_predict_no_ignore_has_prediction = df_predict_accuracy_no_ignore.loc[
            (df_predict_accuracy_no_ignore[conf.columns['prediction_cons']] !=
             'NODATA')
            & (df_predict_accuracy_no_ignore[conf.columns['prediction_cons']]
               != 'DOUBT:NOT_ENOUGH_PIXELS')]
        oa = skmetrics.accuracy_score(
            df_predict_no_ignore_has_prediction[conf.columns['class']],
            df_predict_no_ignore_has_prediction['pred1'],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels':
            'Exclude ignored ones + with prediction (= excl. NODATA, NOT_ENOUGH_PIXELS)',
            'prediction_type': 'standard',
            'accuracy': oa
        })

        oa = skmetrics.accuracy_score(
            df_predict_no_ignore_has_prediction[conf.columns['class']],
            df_predict_no_ignore_has_prediction[
                conf.columns['prediction_cons']],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels':
            'Exclude ignored ones + with prediction (= excl. NODATA, NOT_ENOUGH_PIXELS)',
            'prediction_type': 'consolidated',
            'accuracy': oa
        })

        # Output the resulting overall accuracies
        message = 'Overall accuracies for different sub-groups of the data'
        outputfile.write(f"\n{message}\n")
        html_data['OVERALL_ACCURACIES_TEXT'] = message

        overall_accuracies_df = pd.DataFrame(
            overall_accuracies_list,
            columns=['parcels', 'prediction_type', 'accuracy'])
        overall_accuracies_df.set_index(keys=['parcels', 'prediction_type'],
                                        inplace=True)
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{overall_accuracies_df}\n")
            logger.info(f"{overall_accuracies_df}\n")
            html_data[
                'OVERALL_ACCURACIES_TABLE'] = overall_accuracies_df.to_html()

        # Write the recall, F1 score,... per class
        #message = skmetrics.classification_report(df_predict[gs.class_column]
        #                                                , df_predict[gs.prediction_column]
        #                                                , labels=classes)
        #outputfile.write(message)

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "********************* DETAILED RESULTS *********************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "*             DETAILED PREDICTION CONCLUSIONS              *\n")
        outputfile.write(
            "************************************************************\n")

        # Calculate detailed conclusions for the predictions
        logger.info("Calculate the detailed conclusions for the predictions")

        # Write the conclusions for the consolidated predictions
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.columns['prediction_conclusion_detail_cons'],
            prediction_column_to_use=conf.columns['prediction_cons'],
            detailed=True)
        message = f"Prediction conclusions cons (doubt + not_enough_pixels) overview, for {len(df_predict.index)} predicted cases:"
        outputfile.write(f"\n{message}\n")
        html_data['PREDICTION_CONCLUSION_DETAIL_CONS_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_detail_cons'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum(
        )
        count_per_class.insert(loc=1, column='pct', value=values)

        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'PREDICTION_CONCLUSION_DETAIL_CONS_OVERVIEW_TABLE'] = count_per_class.to_html(
                )

        # Calculate detailed conclusions for the predictions
        logger.info("Calculate the detailed conclusions for the predictions")

        # Write the conclusions for the consolidated predictions
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.
            columns['prediction_conclusion_detail_full_alpha'],
            prediction_column_to_use=conf.columns['prediction_full_alpha'],
            detailed=True)
        message = f"Prediction conclusions full alpha (doubt + not_enough_pixels) overview, for {len(df_predict.index)} predicted cases:"
        outputfile.write(f"\n{message}\n")
        html_data[
            'PREDICTION_CONCLUSION_DETAIL_FULL_ALPHA_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_detail_full_alpha'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum(
        )
        count_per_class.insert(loc=1, column='pct', value=values)

        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'PREDICTION_CONCLUSION_DETAIL_FULL_ALPHA_OVERVIEW_TABLE'] = count_per_class.to_html(
                )

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "*     CONFUSION MATRICES FOR PARCELS WITH PREDICTIONS      *\n")
        outputfile.write(
            "************************************************************\n")
        # Calculate an extended confusion matrix with the standard prediction column and write
        # it to output...
        df_confmatrix_ext = _get_confusion_matrix_ext(df_predict, 'pred1')
        outputfile.write(
            "\nExtended confusion matrix of the predictions: Rows: true/input classes, columns: predicted classes\n"
        )
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None, 'display.width', 2000):
            outputfile.write(f"{df_confmatrix_ext}\n")
            html_data['CONFUSION_MATRICES_TABLE'] = df_confmatrix_ext.to_html()
            html_data['CONFUSION_MATRICES_DATA'] = df_confmatrix_ext.to_json()

        # Calculate an extended confusion matrix with the consolidated prediction column and write
        # it to output...
        df_confmatrix_ext = _get_confusion_matrix_ext(
            df_predict, conf.columns['prediction_cons'])
        outputfile.write(
            "\nExtended confusion matrix of the consolidated predictions: Rows: true/input classes, columns: predicted classes\n"
        )
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None, 'display.width', 2000):
            outputfile.write(f"{df_confmatrix_ext}\n\n")
            html_data[
                'CONFUSION_MATRICES_CONSOLIDATED_TABLE'] = df_confmatrix_ext.to_html(
                )
            html_data[
                'CONFUSION_MATRICES_CONSOLIDATED_DATA'] = df_confmatrix_ext.to_json(
                )

        # If the pixcount is available, write the OA per pixcount
        if conf.columns['pixcount_s1s2'] in df_predict.columns:
            pixcount_output_report_txt = output_report_txt + '_OA_per_pixcount.txt'
            _write_OA_per_pixcount(
                df_parcel_predictions=df_predict,
                output_report_txt=pixcount_output_report_txt,
                force=force)

        # If a ground truth file is provided, report on the ground truth
        if parcel_ground_truth_filepath is not None:
            outputfile.write(
                "************************************************************\n"
            )
            outputfile.write(
                "*   REPORTING ON PREDICTION QUALITY BASED ON GROUND TRUTH  *\n"
            )
            outputfile.write(
                "************************************************************\n"
            )

            # Read ground truth
            logger.info(
                f"Read csv with ground truth (with their classes): {parcel_ground_truth_filepath}"
            )
            df_parcel_gt = pdh.read_file(parcel_ground_truth_filepath)
            df_parcel_gt.set_index(conf.columns['id'], inplace=True)
            logger.info(
                f"Read csv with ground truth ready, shape: {df_parcel_gt.shape}"
            )

            # Join the prediction data
            cols_to_join = df_predict.columns.difference(df_parcel_gt.columns)
            df_parcel_gt = df_predict[cols_to_join].join(df_parcel_gt,
                                                         how='inner')
            logger.info(
                f"After join of ground truth with predictions, shape: {df_parcel_gt.shape}"
            )

            if len(df_parcel_gt.index) == 0:
                message = "After join of ground truth with predictions the result was empty, so probably a wrong ground truth file was used!"
                logger.critical(message)
                raise Exception(message)

            # General ground truth statistics
            # ******************************************************************
            # Calculate the conclusions based on ground truth

            # Calculate and write the result for the consolidated predictions
            _add_gt_conclusions(df_parcel_gt, conf.columns['prediction_cons'])
            message = f"Prediction quality cons (doubt + not_enough_pixels) overview, for {len(df_parcel_gt.index)} predicted cases in ground truth:"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_CONS_OVERVIEW_TEXT'] = message

            count_per_class = (df_parcel_gt.groupby(
                f"gt_conclusion_{conf.columns['prediction_cons']}",
                as_index=False).size().to_frame('count'))
            values = 100 * count_per_class['count'] / count_per_class[
                'count'].sum()
            count_per_class.insert(loc=1, column='pct', value=values)

            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                outputfile.write(f"\n{count_per_class}\n")
                logger.info(f"{count_per_class}\n")
                html_data[
                    'PREDICTION_QUALITY_CONS_OVERVIEW_TABLE'] = count_per_class.to_html(
                    )

            # Calculate and write the result for the full-alpha predictions
            _add_gt_conclusions(df_parcel_gt,
                                conf.columns['prediction_full_alpha'])
            message = f"Prediction quality full alpha (doubt + not_enough_pixels) overview, for {len(df_parcel_gt.index)} predicted cases in ground truth:"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TEXT'] = message

            count_per_class = (df_parcel_gt.groupby(
                f"gt_conclusion_{conf.columns['prediction_full_alpha']}",
                as_index=False).size().to_frame('count'))
            values = 100 * count_per_class['count'] / count_per_class[
                'count'].sum()
            count_per_class.insert(loc=1, column='pct', value=values)

            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                outputfile.write(f"\n{count_per_class}\n")
                logger.info(f"{count_per_class}\n")
                html_data[
                    'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TABLE'] = count_per_class.to_html(
                    )

            # Write the ground truth conclusions to file
            pdh.to_file(
                df_parcel_gt,
                output_report_txt + "_groundtruth_pred_quality_details.tsv")

            # Alpha and beta error statistics based on CONS prediction
            # ******************************************************************
            # Pct Alpha errors=alpha errors/(alpha errors + real errors)
            columnname = f"gt_conclusion_{conf.columns['prediction_cons']}"
            alpha_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA'].index)
            alpha_error_denominator = (
                alpha_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].isin([
                    'FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG'
                ])].index))
            if alpha_error_denominator > 0:
                message = (
                    f"Alpha error for cons: {alpha_error_numerator}/{alpha_error_denominator} = "
                    +
                    f"{(alpha_error_numerator/alpha_error_denominator):.02f}")
            else:
                message = f"Alpha error for cons: {alpha_error_numerator}/{alpha_error_denominator} = ?"

            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_ALPHA_TEXT'] = message

            beta_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA'].index)
            beta_error_denominator = (
                beta_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].str.startswith(
                    'FARMER-WRONG_PRED-')].index))
            if beta_error_denominator > 0:
                message = (
                    f"Beta error for cons: {beta_error_numerator}/{beta_error_denominator} = "
                    + f"{(beta_error_numerator/beta_error_denominator):.02f}")
            else:
                message = f"Beta error for cons: {beta_error_numerator}/{beta_error_denominator} = ?"

            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_BETA_TEXT'] = message

            # Alpha and beta error statistics based on the FULL ALPHA prediction
            # ******************************************************************
            # Pct Alpha errors=alpha errors/(alpha errors + real errors)
            columnname = f"gt_conclusion_{conf.columns['prediction_full_alpha']}"
            alpha_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA'].index)
            alpha_error_denominator = (
                alpha_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].isin([
                    'FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG'
                ])].index))
            if alpha_error_denominator > 0:
                message = (
                    f"Alpha error full: {alpha_error_numerator}/{alpha_error_denominator} = "
                    +
                    f"{(alpha_error_numerator/alpha_error_denominator):.02f}")
            else:
                message = f"Alpha error full: {alpha_error_numerator}/{alpha_error_denominator} = ?"

            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_ALPHA_TEXT'] += '<br/>' + message

            beta_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA'].index)
            beta_error_denominator = (
                beta_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].str.startswith(
                    'FARMER-WRONG_PRED-')].index))
            if beta_error_denominator > 0:
                message = (
                    f"Beta error full: {beta_error_numerator}/{beta_error_denominator} = "
                    + f"{(beta_error_numerator/beta_error_denominator):.02f}")
            else:
                message = f"Beta error full: {beta_error_numerator}/{beta_error_denominator} = ?"

            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_BETA_TEXT'] += '<br/>' + message

            # If the pixcount is available, write the number of ALFA errors per pixcount (for the prediction with doubt)
            if conf.columns['pixcount_s1s2'] in df_parcel_gt.columns:
                # Get data, drop empty lines and write
                message = "Number of ERROR_ALPHA parcels for the 'prediction full alpha without NOT_ENOUGH_PIX' per pixcount for the ground truth parcels:"
                outputfile.write(f"\n{message}\n")
                html_data[
                    'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TEXT'] = message

                # To get the number of alpha errors per pixcount, we also need the alpha errors
                # for parcels that had not_enough_pixels, so we need a prediction with doubt applied
                # but without the minimum pixel check. If it doesn't exist yet, calculate it.
                class_postpr.add_doubt_column(
                    pred_df=df_parcel_gt,
                    new_pred_column='pred_cons_no_min_pix',
                    apply_doubt_marker_specific=True)
                _add_gt_conclusions(df_parcel_gt, 'pred_cons_no_min_pix')

                df_per_pixcount = _get_alfa_errors_per_pixcount(
                    df_predquality_pixcount=df_parcel_gt,
                    pred_quality_column="gt_conclusion_" +
                    "pred_cons_no_min_pix",
                    error_alpha_code='FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA')
                df_per_pixcount.dropna(inplace=True)
                with pd.option_context('display.max_rows', None,
                                       'display.max_columns', None,
                                       'display.width', 2000):
                    outputfile.write(f"\n{df_per_pixcount}\n")
                    logger.info(f"{df_per_pixcount}\n")
                    html_data[
                        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TABLE'] = df_per_pixcount.to_html(
                        )

    with open(output_report_txt.replace('.txt', '.html'), 'w') as outputfile:
        with open('./cropclassification/postprocess/html_rapport_template.html'
                  ) as template_file:
            html_template_file = template_file.read()
        src = Template(html_template_file)
        # replace strings and write to file
        output = src.substitute(html_data)
        outputfile.write(output)
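
# The alpha/beta error ratios computed above, in compact form: a minimal sketch that
# assumes a dataframe with a ground-truth-conclusion column using the same FARMER-...
# codes, and that mirrors the numerators/denominators used in write_full_report().
import pandas as pd

def calc_alpha_beta_error_sketch(gt_df: pd.DataFrame, conclusion_column: str):
    conclusions = gt_df[conclusion_column]

    # Alpha error: the prediction flags the parcel although the farmer was correct,
    # compared against the cases where the prediction found a real farmer error
    alpha_errors = (conclusions == 'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA').sum()
    real_errors_found = conclusions.isin(
        ['FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG']).sum()
    alpha_denominator = alpha_errors + real_errors_found
    alpha_pct = alpha_errors / alpha_denominator if alpha_denominator > 0 else None

    # Beta error: the farmer was wrong but the prediction doesn't oppose it, compared
    # against all FARMER-WRONG_PRED-* parcels (the same denominator as used above)
    beta_errors = (conclusions ==
                   'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA').sum()
    farmer_wrong = conclusions.str.startswith('FARMER-WRONG_PRED-', na=False).sum()
    beta_denominator = beta_errors + farmer_wrong
    beta_pct = beta_errors / beta_denominator if beta_denominator > 0 else None

    return alpha_pct, beta_pct
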
Example no. 16
def prepare_input_latecrop(parceldata_df, column_BEFL_cropcode: str,
                           column_output_class: str,
                           classes_refe_filepath: Path):
    """
    This function creates a file that is compliant with the assumptions used by the rest of the
    classification functionality.

    It should be a csv file with the following columns:
        - object_id: column with a unique identifier
        - classname: a string column with a readable name of the class that will be classified to

    This specific implementation converts the typical export format used in BE-Flanders to
    this format.
    """
    # Check if parameters are OK and init some extra params
    #--------------------------------------------------------------------------
    if not classes_refe_filepath.exists():
        raise Exception(
            f"Input classes file doesn't exist: {classes_refe_filepath}")

    # Convert the crop to unicode, in case the input is int...
    if column_BEFL_cropcode in parceldata_df.columns:
        parceldata_df[column_BEFL_cropcode] = parceldata_df[
            column_BEFL_cropcode].astype('unicode')

    # Read and cleanup the mapping table from crop codes to classes
    #--------------------------------------------------------------------------
    logger.info(f"Read classes conversion table from {classes_refe_filepath}")
    classes_df = pdh.read_file(classes_refe_filepath)
    logger.info(
        f"Read classes conversion table ready, info(): {classes_df.info()}")

    # Because the file was read as ANSI and the crop code (gewas) is an int, the data needs to be
    # converted to unicode to be able to compare it with the other data
    classes_df[column_BEFL_cropcode] = classes_df['CROPCODE'].astype('unicode')

    # Map column with the classname to orig classname
    column_output_class_orig = conf.columns['class'] + '_orig'
    classes_df[column_output_class_orig] = classes_df[
        conf.columns['class_refe']]

    # Remove unneeded columns
    for column in classes_df.columns:
        if (column not in [
                column_output_class_orig, column_BEFL_cropcode,
                conf.columns['class_declared'], conf.columns['class']
        ] and column not in columns_BEFL_to_keep):
            classes_df.drop(column, axis=1, inplace=True)

    # Set the index
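    # (verify_integrity=True makes pandas raise a ValueError if the same crop code
    # occurs more than once in the reference table)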
    classes_df.set_index(column_BEFL_cropcode,
                         inplace=True,
                         verify_integrity=True)

    # Get only the columns in the classes_df that don't exist yet in parceldata_df
    cols_to_join = classes_df.columns.difference(parceldata_df.columns)

    # Join/merge the classname
    logger.info('Add the classes to the parceldata')
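    # validate='many_to_one' makes the merge fail fast (pandas raises a MergeError)
    # if a crop code would match more than one row in classes_df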
    parceldata_df = parceldata_df.merge(classes_df[cols_to_join],
                                        how='left',
                                        left_on=column_BEFL_cropcode,
                                        right_index=True,
                                        validate='many_to_one')

    # Copy orig classname to classification classname
    parceldata_df.insert(loc=0,
                         column=column_output_class,
                         value=parceldata_df[column_output_class_orig])

    # For rows with no class, set to UNKNOWN
    parceldata_df.fillna(value={column_output_class: 'UNKNOWN'}, inplace=True)

    # If a column with extra info exists, use it as well to fine-tune the classification classes.
    if column_BEFL_gesp_pm in parceldata_df.columns:
        # Greenhouses, temporary coverings and sheds
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['SER', 'SGM']),
            column_output_class] = 'MON_OVERK_LOO'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm].isin(['PLA', 'PLO', 'NPO']),
            column_output_class] = 'MON_OVERK_LOO'
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm] == 'LOO',
            column_output_class] = 'MON_OVERK_LOO'  # A shed is the same as a stable...
        parceldata_df.loc[
            parceldata_df[column_BEFL_gesp_pm] == 'CON',
            column_output_class] = 'MON_CONTAINERS'  # Containers, not in open ground...
        # TODO: CIV, containers in open ground, doesn't seem to be that specific...
        #parceldata_df.loc[parceldata_df[column_BEFL_gesp_pm] == 'CIV', class_columnname] = 'MON_CONTAINERS'   # Containers, not in open ground...
    else:
        logger.warning(
            f"The column {column_BEFL_gesp_pm} doesn't exist, so this part of the code was skipped!"
        )

    # Set classes with very few elements to IGNORE_NOT_ENOUGH_SAMPLES!
    for _, row in parceldata_df.groupby(
            column_output_class).size().reset_index(name='count').iterrows():
        if row['count'] <= 50:
            logger.info(
                f"Class <{row[column_output_class]}> only contains {row['count']} elements, so they are set to IGNORE_NOT_ENOUGH_SAMPLES"
            )
            parceldata_df.loc[
                parceldata_df[column_output_class] == row[column_output_class],
                column_output_class] = 'IGNORE_NOT_ENOUGH_SAMPLES'

    # Drop the columns that aren't useful at all
    for column in parceldata_df.columns:
        if (column not in [
                column_output_class, conf.columns['id'],
                conf.columns['class_groundtruth'],
                conf.columns['class_declared']
        ] and column not in conf.preprocess.getlist('extra_export_columns')
                and column not in columns_BEFL_to_keep):
            parceldata_df.drop(column, axis=1, inplace=True)
        elif column == column_BEFL_gesp_pm:
            parceldata_df[column_BEFL_gesp_pm] = parceldata_df[
                column_BEFL_gesp_pm].str.replace(',', ';')

    return parceldata_df
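
# A hypothetical invocation sketch, assuming the module-level helpers (conf, pdh, logger,
# columns_BEFL_to_keep, column_BEFL_gesp_pm) are initialised as elsewhere in this module.
# The column name 'GWSCOD_H' and the file paths below are illustrative assumptions, not
# the project's actual configuration.
from pathlib import Path

parceldata_df = pdh.read_file(Path('input/parcels_befl.sqlite'))        # hypothetical path
parceldata_df = prepare_input_latecrop(
    parceldata_df=parceldata_df,
    column_BEFL_cropcode='GWSCOD_H',                                    # assumed crop code column
    column_output_class=conf.columns['class'],
    classes_refe_filepath=Path('refe/classes_latecrop.csv'))            # hypothetical path
logger.info(f"Class counts:\n{parceldata_df[conf.columns['class']].value_counts()}")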