Python to_file Examples

Programming Language: Python

Namespace/Package Name: cropclassification.helpers.pandas_helper

Method/Function: to_file

Examples at hotexamples.com: 12

Python to_file - 12 examples found. These are the top rated real world Python examples of cropclassification.helpers.pandas_helper.to_file extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: timeseries_calc_gee.py Project: draadzic/cropclassification

def clean_gee_downloaded_csv(csv_file: str,
                             remove_orig_csv: bool = False):
    """ 
    Cleans a csv downloaded from gee by removing gee specific columns... 
    """

    try:
        # Prepare output filename
        file_noext, _ = os.path.splitext(csv_file)
        output_file = f"{file_noext}{conf.general['data_ext']}"

        # Check if output file exists already even though it is different from input file
        if output_file != csv_file and os.path.exists(output_file):
            logger.warning(f"Output file exists already, so don't create it again: {output_file}")
        elif os.path.getsize(csv_file) == 0:
            # If input file is empty...
            logger.info(f"File is empty, so just create new empty output file: {output_file}")
            open(output_file, 'w').close()
        else:
            # Read the file
            logger.info(f"Read file and remove gee specifice columns from {csv_file}")

            # Sample 100 rows of data to determine dtypes, so floats can be read as float32 instead of the 
            # default float64. Writing those to eg. parquet is a lot more efficiënt.
            test_df = pd.read_csv(csv_file, nrows=100)
            float_cols = [c for c in test_df if test_df[c].dtype == "float64"]
            float32_cols = {c: np.float32 for c in float_cols}

            # Now read entire file
            data_read_df = pd.read_csv(csv_file, engine='c', dtype=float32_cols)
            
            # Drop unnecessary gee specific columns...
            for column in data_read_df.columns:
                if column in ['system:index', '.geo']:
                    data_read_df.drop(column, axis=1, inplace=True)
                elif column == 'count':
                    logger.info(f"Rename count column to {conf.columns['pixcount_s1s2']}")
                    data_read_df.rename(columns={'count': conf.columns['pixcount_s1s2']}, inplace=True)

            # Set the id column as index
            data_read_df.set_index(conf.columns['id'], inplace=True)

            # If there are data columns, write to output file
            if len(data_read_df.columns) > 0:
                # Replace the original file by the cleaned one
                logger.info(f"Write the file with the gee specific columns removed to a new file: {output_file}")
                pdh.to_file(data_read_df, output_file, index=True)
            else:
                logger.warning(f"No data columns found in file {csv_file}, so return!!!")
                return            

        # If remove_orig_csv is True and the output filepath is different from orig filepath, 
        # remove orig file. 
        if remove_orig_csv and output_file != csv_file:
            logger.info(f"Remove orig csv file: {csv_file}")
            os.remove(csv_file)

    except Exception as ex:
        raise Exception(f"Error processing file {csv_file}") from ex

Example #2

Show file

File: classification_preprocess.py Project: draadzic/cropclassification

def prepare_input(input_parcel_filepath: str,
                  input_parcel_filetype: str,
                  input_parcel_pixcount_filepath: str,
                  classtype_to_prepare: str,
                  output_parcel_filepath: str,
                  force: bool = False):
    """
    Prepare a raw input file by eg. adding the classification classes to use for the
    classification,...
    """

    # If force == False Check and the output file exists already, stop.
    if force is False and os.path.exists(output_parcel_filepath) is True:
        logger.warning(
            f"prepare_input: output file already exists and force == False, so stop: {output_parcel_filepath}"
        )
        return

    if input_parcel_filetype == 'BEFL':
        output_dir, _ = os.path.split(output_parcel_filepath)
        df_parceldata = befl.prepare_input(
            input_parcel_filepath=input_parcel_filepath,
            classtype_to_prepare=classtype_to_prepare,
            output_dir=output_dir)
    else:
        message = f"Unknown value for parameter input_parcel_filetype: {input_parcel_filetype}"
        logger.critical(message)
        raise Exception(message)

    # Load pixcount data and join it
    logger.info(f"Read pixcount file {input_parcel_pixcount_filepath}")
    df_pixcount = pdh.read_file(input_parcel_pixcount_filepath)
    logger.debug(f"Read pixcount file ready, shape: {df_pixcount.shape}")
    if df_pixcount.index.name != conf.columns['id']:
        df_pixcount.set_index(conf.columns['id'], inplace=True)

    df_parceldata.set_index(conf.columns['id'], inplace=True)
    df_parceldata = df_parceldata.join(
        df_pixcount[conf.columns['pixcount_s1s2']], how='left')

    # Export result to file
    output_ext = os.path.splitext(output_parcel_filepath)[1]
    for column in df_parceldata.columns:
        # if the output asked is a csv... we don't need the geometry...
        if column == conf.columns['geom'] and output_ext == '.csv':
            df_parceldata.drop(column, axis=1, inplace=True)

    logger.info(f"Write output to {output_parcel_filepath}")
    # If extension is not .shp, write using pandas (=a lot faster!)
    if output_ext.lower() != '.shp':
        pdh.to_file(df_parceldata, output_parcel_filepath)
    else:
        df_parceldata.to_file(output_parcel_filepath, index=False)

Example #3

Show file

def detect_multicrop(input_parcel_filepath: Path,
                     input_parcel_timeseries_data_filepath: Path):
    '''
    logger.info(f"Read input file: {input_parcel_filepath}")
    df_input_parcel = pd.read_csv(input_parcel_filepath, low_memory=False)
    logger.debug('Read train file ready')
    '''

    # If the classification data isn't passed as dataframe, read it from the csv
    logger.info(
        f"Read classification data file: {input_parcel_timeseries_data_filepath}"
    )
    df_timeseries_data = pd.read_csv(input_parcel_timeseries_data_filepath,
                                     low_memory=False)
    df_timeseries_data.set_index(conf.columns['id'], inplace=True)
    logger.debug('Read classification data file ready')

    # Add column with the max of all columns (= all stdDev's)
    df_timeseries_data['max_stddev'] = df_timeseries_data.max(axis=1)
    '''
    # Prepare the data to send to prediction logic...
    logger.info("Join train sample with the classification data")
    df_input_parcel_for_detect = (df_input_parcel#[[gs.id_column, gs.class_column]]
                                   .join(df_timeseries_data
                                         , how='inner', on=gs.id_column))

    # Only keep the parcels with relevant crops/production types
    productiontype_column = 'GESP_PM'
    if productiontype_column in df_input_parcel_for_detect.columns:
        # Serres, tijdelijke overkappingen en loodsen
        df_input_parcel_for_detect.loc[~df_input_parcel_for_detect[productiontype_column].isin(['SER', 'SGM'])]
        df_input_parcel_for_detect.loc[~df_input_parcel_for_detect[productiontype_column].isin(['PLA', 'PLO', 'NPO'])]
        df_input_parcel_for_detect.loc[df_input_parcel_for_detect[productiontype_column] != 'LOO']     # Een loods is hetzelfde als een stal...
        df_input_parcel_for_detect.loc[df_input_parcel_for_detect[productiontype_column] != 'CON']    # Containers, niet op volle grond...

    crop_columnname = 'GWSCOD_H'
    df_input_parcel_for_detect.loc[~df_input_parcel_for_detect[crop_columnname].isin(['1', '2', '3'])]

    # Keep the parcels with the 1000 largest stdDev
    df_largest = df_input_parcel_for_detect.nlargest(1000, columns='max_stddev', keep='first')
    '''

    #df_result = df_timeseries_data['max_stddev'].to_frame()
    df_result = df_timeseries_data
    logger.info(df_result)

    # Write to file
    output_filepath = Path(
        str(input_parcel_timeseries_data_filepath) + '_largestStdDev.csv')
    logger.info(f"Write output file: {output_filepath}")
    pdh.to_file(df_result, output_filepath)

Example #4

Show file

def main():

    dir = Path(
        "X:/Monitoring/Markers/playground/pierog/tmp/Run_2019-06-25_007_imported"
    )
    in_filepaths = dir.glob("*.parquet")

    # Convert all files found
    for in_filepath in in_filepaths:

        # Read input file
        print(f"Read {in_filepath}")
        df = pdh.read_file(in_filepath)

        # Write to new file
        out_filepath = in_filepath.parent / f"{in_filepath.stem}.sqlite"
        print(f"Write {out_filepath}")
        pdh.to_file(df, out_filepath)

Example #5

Show file

def prepare_input(input_parcel_filepath: str,
                  output_imagedata_parcel_input_filepath: str,
                  output_parcel_nogeo_filepath: str = None,
                  force: bool = False):
    """
    This function creates a file that is preprocessed to be a good input file for
    timeseries extraction of sentinel images.

    Args
        input_parcel_filepath: input file
        output_imagedata_parcel_input_filepath: prepared output file
        output_parcel_nogeo_filepath: output file with a copy of the non-geo data
        force: force creation, even if output file(s) exist already

    """
    ##### Check if parameters are OK and init some extra params #####
    if not os.path.exists(input_parcel_filepath):
        raise Exception(f"Input file doesn't exist: {input_parcel_filepath}")

    # Check if the input file has a projection specified
    if geofile_util.get_crs(input_parcel_filepath) is None:
        message = f"The parcel input file doesn't have a projection/crs specified, so STOP: {input_parcel_filepath}"
        logger.critical(message)
        raise Exception(message)

    # If force == False Check and the output file exists already, stop.
    if (force is False
            and os.path.exists(output_imagedata_parcel_input_filepath)
            and (output_parcel_nogeo_filepath is None
                 or os.path.exists(output_parcel_nogeo_filepath))):
        logger.warning(
            "prepare_input: force == False and output files exist, so stop: " +
            f"{output_imagedata_parcel_input_filepath}, " +
            f"{output_parcel_nogeo_filepath}")
        return

    logger.info(f"Process input file {input_parcel_filepath}")

    # Create temp dir to store temporary data for tracebility
    output_dir, output_filename = os.path.split(
        output_imagedata_parcel_input_filepath)
    output_filename_noext = os.path.splitext(output_filename)[0]
    temp_output_dir = os.path.join(output_dir, 'temp')
    if not os.path.exists(temp_output_dir):
        os.mkdir(temp_output_dir)

    ##### Read the parcel data and write nogeo version #####
    parceldata_gdf = geofile_util.read_file(input_parcel_filepath)
    logger.info(f'Parceldata read, shape: {parceldata_gdf.shape}')

    # Check if the id column is present and set as index
    if conf.columns['id'] in parceldata_gdf.columns:
        parceldata_gdf.set_index(conf.columns['id'], inplace=True)
    else:
        message = f"STOP: Column {conf.columns['id']} not found in input parcel file: {input_parcel_filepath}. Make sure the column is present or change the column name in global_constants.py"
        logger.critical(message)
        raise Exception(message)

    if force is True or os.path.exists(output_parcel_nogeo_filepath) == False:
        logger.info(f"Save non-geo data to {output_parcel_nogeo_filepath}")
        parceldata_nogeo_df = parceldata_gdf.drop(['geometry'], axis=1)
        pdh.to_file(parceldata_nogeo_df, output_parcel_nogeo_filepath)

    ##### Do the necessary conversions and write buffered file #####

    # If force == False Check and the output file exists already, stop.
    if (force is False
            and os.path.exists(output_imagedata_parcel_input_filepath)):
        logger.warning(
            "prepare_input: force == False and output files exist, so stop: " +
            f"{output_imagedata_parcel_input_filepath}")
        return

    logger.info('Apply buffer on parcel')
    parceldata_buf_gdf = parceldata_gdf.copy()

    # resolution = number of segments per circle
    buffer_size = -conf.marker.getint('buffer')
    parceldata_buf_gdf[conf.columns['geom']] = (
        parceldata_buf_gdf[conf.columns['geom']].buffer(buffer_size,
                                                        resolution=5))

    # Export buffered geometries that result in empty geometries
    logger.info('Export parcels that are empty after buffer')
    parceldata_buf_empty_df = parceldata_buf_gdf.loc[parceldata_buf_gdf[
        conf.columns['geom']].is_empty == True]
    if len(parceldata_buf_empty_df.index) > 0:
        parceldata_buf_empty_df.drop(conf.columns['geom'],
                                     axis=1,
                                     inplace=True)
        temp_empty_filepath = os.path.join(
            temp_output_dir, f"{output_filename_noext}_empty.sqlite")
        pdh.to_file(parceldata_buf_empty_df, temp_empty_filepath)

    # Export parcels that don't result in a (multi)polygon
    parceldata_buf_notempty_gdf = parceldata_buf_gdf.loc[parceldata_buf_gdf[
        conf.columns['geom']].is_empty == False]
    parceldata_buf_nopoly_gdf = parceldata_buf_notempty_gdf.loc[
        ~parceldata_buf_notempty_gdf[conf.columns['geom']].geom_type.
        isin(['Polygon', 'MultiPolygon'])]
    if len(parceldata_buf_nopoly_gdf.index) > 0:
        logger.info('Export parcels that are no (multi)polygons after buffer')
        parceldata_buf_nopoly_gdf.drop(conf.columns['geom'],
                                       axis=1,
                                       inplace=True)
        temp_nopoly_filepath = os.path.join(
            temp_output_dir, f"{output_filename_noext}_nopoly.sqlite")
        geofile_util.to_file(parceldata_buf_nopoly_gdf, temp_nopoly_filepath)

    # Export parcels that are (multi)polygons after buffering
    parceldata_buf_poly_gdf = parceldata_buf_notempty_gdf.loc[
        parceldata_buf_notempty_gdf[conf.columns['geom']].geom_type.isin(
            ['Polygon', 'MultiPolygon'])]
    for column in parceldata_buf_poly_gdf.columns:
        if column not in [conf.columns['id'], conf.columns['geom']]:
            parceldata_buf_poly_gdf.drop(column, axis=1, inplace=True)
    logger.info(
        f"Export parcels that are (multi)polygons after buffer to {output_imagedata_parcel_input_filepath}"
    )
    geofile_util.to_file(parceldata_buf_poly_gdf,
                         output_imagedata_parcel_input_filepath)
    logger.info(parceldata_buf_poly_gdf)

Example #6

Show file

def calculate_periodic_data(input_parcel_filepath: str,
                            input_base_dir: str,
                            start_date_str: str,
                            end_date_str: str,
                            sensordata_to_get: [],
                            dest_data_dir: str,
                            force: bool = False):
    """
    This function creates a file that is a weekly summarize of timeseries images from DIAS.

    TODO: add possibility to choose which values to extract (mean, min, max,...)?
        
    Args:
        input_parcel_filepath (str): [description]
        input_base_dir (str): [description]
        start_date_str (str): Start date in format %Y-%m-%d. Needs to be aligned already on the 
                periods wanted.
        end_date_str (str): End date in format %Y-%m-%d. Needs to be aligned already on the 
                periods wanted.
        sensordata_to_get ([]): 
        dest_data_dir (str): [description]
        force (bool, optional): [description]. Defaults to False.
    """
    logger.info('calculate_periodic_data')

    # Init
    input_parcels_filename = os.path.basename(input_parcel_filepath)
    input_parcels_filename_noext, _ = os.path.splitext(input_parcels_filename)
    input_dir = os.path.join(input_base_dir, input_parcels_filename_noext)

    # TODO: in config?
    input_ext = ".sqlite"
    output_ext = ".sqlite"

    start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
    end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
    year = start_date_str.split("-")[0]

    # Prepare output dir
    test = False
    if test is True:
        dest_data_dir += "_test"
    if not os.path.exists(dest_data_dir):
        os.mkdir(dest_data_dir)

    # Create Dataframe with all files with their info
    logger.debug('Create Dataframe with all files and their properties')
    file_info_list = []
    for filename in os.listdir(input_dir):
        if filename.endswith(input_ext):
            # Get seperate filename parts
            file_info = get_file_info(os.path.join(input_dir, filename))
            file_info_list.append(file_info)

    all_inputfiles_df = pd.DataFrame(file_info_list)

    # Loop over the data we need to get
    id_column = conf.columns['id']
    for sensordata_type in sensordata_to_get:

        logger.debug(
            'Get files we need based on start- & stopdates, sensordata_to_get,...'
        )
        orbits = [None]
        if sensordata_type == conf.general['SENSORDATA_S1_ASCDESC']:
            # Filter files to the ones we need
            satellitetype = 'S1'
            imagetype = IMAGETYPE_S1_GRD
            bands = ['VV', 'VH']
            orbits = ['ASC', 'DESC']
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))
                & (all_inputfiles_df.orbit.isin(orbits))]
        elif sensordata_type == conf.general['SENSORDATA_S2gt95']:
            satellitetype = 'S2'
            imagetype = IMAGETYPE_S2_L2A
            bands = ['B02-10m', 'B03-10m', 'B04-10m', 'B08-10m']
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))]
        elif sensordata_type == conf.general['SENSORDATA_S1_COHERENCE']:
            satellitetype = 'S1'
            imagetype = IMAGETYPE_S1_COHERENCE
            bands = ['VV', 'VH']
            orbits = ['ASC', 'DESC']
            needed_inputfiles_df = all_inputfiles_df.loc[
                (all_inputfiles_df.date >= start_date)
                & (all_inputfiles_df.date < end_date)
                & (all_inputfiles_df.imagetype == imagetype)
                & (all_inputfiles_df.band.isin(bands))]
        else:
            raise Exception(f"Unsupported sensordata_type: {sensordata_type}")

        # There should also be one pixcount file
        pixcount_filename = f"{input_parcels_filename_noext}_weekly_pixcount{output_ext}"
        pixcount_filepath = os.path.join(dest_data_dir, pixcount_filename)

        # For each week
        start_week = int(datetime.strftime(start_date, '%W'))
        end_week = int(datetime.strftime(end_date, '%W'))
        for period_index in range(start_week, end_week):

            # Get the date of the first day of period period_index (eg. monday for a week)
            period_date = datetime.strptime(
                str(year) + '_' + str(period_index) + '_1', '%Y_%W_%w')

            # New file name
            period_date_str_long = period_date.strftime('%Y-%m-%d')
            period_data_filename = f"{input_parcels_filename_noext}_weekly_{period_date_str_long}_{sensordata_type}{output_ext}"
            period_data_filepath = os.path.join(dest_data_dir,
                                                period_data_filename)

            # Check if output file exists already
            if os.path.exists(period_data_filepath):
                if force is False:
                    logger.info(
                        f"SKIP: force is False and file exists: {period_data_filepath}"
                    )
                    continue
                else:
                    os.remove(period_data_filepath)

            # Loop over bands and orbits (all combinations of bands and orbits!)
            logger.info(f"Calculate file: {period_data_filename}")
            period_data_df = None
            for band, orbit in [(band, orbit) for band in bands
                                for orbit in orbits]:

                # Get list of files needed for this period, band
                period_files_df = needed_inputfiles_df.loc[
                    (needed_inputfiles_df.week == period_index)
                    & (needed_inputfiles_df.band == band)]

                # If an orbit to be filtered was specified, filter
                if orbit is not None:
                    period_files_df = period_files_df.loc[(
                        period_files_df.orbit == orbit)]

                # Loop all period_files
                period_band_data_df = None
                statistic_columns_dict = {
                    'count': [],
                    'max': [],
                    'mean': [],
                    'min': [],
                    'std': []
                }
                for j, imagedata_filepath in enumerate(
                        period_files_df.filepath.tolist()):

                    # If file has filesize == 0, skip
                    if os.path.getsize(imagedata_filepath) == 0:
                        continue

                    # Read the file (but only the columns we need)
                    columns = [column for column in statistic_columns_dict
                               ].append(id_column)
                    image_data_df = pdh.read_file(imagedata_filepath,
                                                  columns=columns)
                    image_data_df.set_index(id_column, inplace=True)
                    image_data_df.index.name = id_column

                    # Remove rows with nan values
                    nb_before_dropna = len(image_data_df.index)
                    image_data_df.dropna(inplace=True)
                    nb_after_dropna = len(image_data_df.index)
                    if nb_after_dropna != nb_before_dropna:
                        logger.warning(
                            f"Before dropna: {nb_before_dropna}, after: {nb_after_dropna} for file {imagedata_filepath}"
                        )
                    if nb_after_dropna == 0:
                        continue

                    # Rename columns so column names stay unique
                    for statistic_column in statistic_columns_dict:
                        new_column_name = statistic_column + str(j + 1)
                        image_data_df.rename(
                            columns={statistic_column: new_column_name},
                            inplace=True)
                        image_data_df[new_column_name] = image_data_df[
                            new_column_name].astype(float)
                        statistic_columns_dict[statistic_column].append(
                            new_column_name)

                    # Create 1 dataframe for all weekfiles - one row for each code_obj - using concat (code_obj = index)
                    if period_band_data_df is None:
                        period_band_data_df = image_data_df
                    else:
                        period_band_data_df = pd.concat(
                            [period_band_data_df, image_data_df],
                            axis=1,
                            sort=False)
                        # Apparently concat removes the index name in some situations
                        period_band_data_df.index.name = id_column

                # Calculate max, mean, min, ...
                if period_band_data_df is not None:
                    logger.debug('Calculate max, mean, min, ...')
                    period_date_str_short = period_date.strftime('%Y%m%d')
                    # Remark: prefix column names: sqlite doesn't like a numeric start
                    if orbit is None:
                        column_basename = f"TS_{period_date_str_short}_{imagetype}_{band}"
                    else:
                        column_basename = f"TS_{period_date_str_short}_{imagetype}_{orbit}_{band}"

                    # Number of pixels
                    # TODO: onderzoeken hoe aantal pixels best bijgehouden wordt : afwijkingen weglaten ? max nemen ? ...
                    period_band_data_df[f"{column_basename}_count"] = np.nanmax(
                        period_band_data_df[statistic_columns_dict['count']],
                        axis=1)
                    # Maximum of all max columns
                    period_band_data_df[f"{column_basename}_max"] = np.nanmax(
                        period_band_data_df[statistic_columns_dict['max']],
                        axis=1)
                    # Mean of all mean columns
                    period_band_data_df[f"{column_basename}_mean"] = np.nanmean(
                        period_band_data_df[statistic_columns_dict['mean']],
                        axis=1)
                    # Minimum of all min columns
                    period_band_data_df[f"{column_basename}_min"] = np.nanmin(
                        period_band_data_df[statistic_columns_dict['min']],
                        axis=1)
                    # Mean of all std columns
                    period_band_data_df[f"{column_basename}_std"] = np.nanmean(
                        period_band_data_df[statistic_columns_dict['std']],
                        axis=1)
                    # Number of Files used
                    period_band_data_df[
                        f"{column_basename}_used_files"] = period_band_data_df[
                            statistic_columns_dict['max']].count(axis=1)

                    # Only keep the columns we want to keep
                    columns_to_keep = [
                        f"{column_basename}_count", f"{column_basename}_max",
                        f"{column_basename}_mean", f"{column_basename}_min",
                        f"{column_basename}_std",
                        f"{column_basename}_used_files"
                    ]
                    period_band_data_df = period_band_data_df[columns_to_keep]

                    # Merge the data with the other bands/orbits for this period
                    if period_data_df is None:
                        period_data_df = period_band_data_df
                    else:
                        period_data_df = pd.concat(
                            [period_band_data_df, period_data_df],
                            axis=1,
                            sort=False)
                        # Apparently concat removes the index name in some situations
                        period_data_df.index.name = id_column

            if period_data_df is not None:
                logger.info(f"Write new file: {period_data_filename}")
                pdh.to_file(period_data_df, period_data_filepath)

                if not os.path.exists(pixcount_filepath):
                    pixcount_s1s2_column = conf.columns['pixcount_s1s2']
                    for column in period_data_df.columns:
                        if column.endswith('_count'):
                            period_data_df.rename(
                                columns={column: pixcount_s1s2_column},
                                inplace=True)
                            break
                    pixcount_df = period_data_df[pixcount_s1s2_column]
                    pixcount_df.fillna(value=0, inplace=True)
                    pdh.to_file(pixcount_df, pixcount_filepath)

Example #7

Show file

File: classification_sklearn.py Project: draadzic/cropclassification

def predict_proba(parcel_df: pd.DataFrame, classifier_filepath: str,
                  output_parcel_predictions_filepath: str) -> pd.DataFrame:
    """
    Predict the probabilities for all input data using the classifier provided and write it
    to the output file.

    Args
        parcel_df: pandas DataFrame containing the data to classify. Columns:
            * global_settings.id_column: the id of the parcel.
            * global_settings.class_column: the class of the parcel. Isn't really used.
            * ... all columns that will be used as classification data.
        classifier_filepath: the filepath where the classifier can be written.
        output_parcel_predictions_filepath: file to write the predictions to.
    """

    # Some basic checks that input is ok
    parcel_df.reset_index(inplace=True)
    if (conf.columns['id'] not in parcel_df.columns
            or conf.columns['class'] not in parcel_df.columns):
        message = f"Columns {conf.columns['id']} and {conf.columns['class']} are mandatory for input parameter parcel_df!"
        logger.critical(message)
        raise Exception(message)

    # Now do final preparation for the classification
    parcel_classes_df = parcel_df[conf.columns['class']]
    cols_to_keep = parcel_df.columns.difference(
        [conf.columns['id'], conf.columns['class']])
    parcel_data_df = parcel_df[cols_to_keep]

    logger.info(
        f"Train file processed and rows with missing data removed, data shape: {parcel_data_df.shape}, labels shape: {parcel_classes_df.shape}"
    )
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        logger.info(
            f"Resulting Columns for training data: {parcel_data_df.columns}")

    # Load the classifier
    classifier = joblib.load(classifier_filepath)
    logger.info(f"Classifier has the following columns: {classifier.classes_}")

    logger.info(
        f"Predict classes with probabilities: {len(parcel_df.index)} rows")
    class_proba = classifier.predict_proba(parcel_data_df)
    logger.info(f"Predict classes with probabilities ready")

    # Convert probabilities to dataframe, combine with input data and write to file
    id_class_proba = np.concatenate([
        parcel_df[[conf.columns['id'], conf.columns['class']]].values,
        class_proba
    ],
                                    axis=1)
    cols = [conf.columns['id'], conf.columns['class']]
    cols.extend(classifier.classes_)
    proba_df = pd.DataFrame(id_class_proba, columns=cols)

    # If output path provided, write results
    if output_parcel_predictions_filepath:
        pdh.to_file(proba_df, output_parcel_predictions_filepath)

    return proba_df

Example #8

Show file

File: timeseries.py Project: theroggy/cropclassification

def collect_and_prepare_timeseries_data(
        input_parcel_filepath: Path,
        timeseries_dir: Path,
        base_filename: str,
        output_filepath: Path,
        start_date_str: str,
        end_date_str: str,
        sensordata_to_use: List[str],
        parceldata_aggregations_to_use: List[str],
        force: bool = False):
    """
    Collect all timeseries data to use for the classification and prepare it by applying
    scaling,... as needed.
    """

    # Some constants to choose which type of data to use in the marker.
    # Remark: the string needs to be the same as the end of the name of the columns in the csv files!
    # TODO: I'm not really happy with both a list in the ini file + here... not sure what the
    #       cleanest solution is though...
    PARCELDATA_AGGRAGATION_MEAN = conf.general[
        'PARCELDATA_AGGRAGATION_MEAN']  # Mean value of the pixels values in a parcel.
    PARCELDATA_AGGRAGATION_STDDEV = conf.general[
        'PARCELDATA_AGGRAGATION_STDDEV']  # std dev of the values of the pixels in a parcel

    # Constants for types of sensor data
    SENSORDATA_S1 = conf.general['SENSORDATA_S1']  # Sentinel 1 data
    SENSORDATA_S1DB = conf.general['SENSORDATA_S1DB']  # Sentinel 1 data, in dB
    SENSORDATA_S1_ASCDESC = conf.general[
        'SENSORDATA_S1_ASCDESC']  # Sentinel 1 data, divided in Ascending and Descending passes
    SENSORDATA_S1DB_ASCDESC = conf.general[
        'SENSORDATA_S1DB_ASCDESC']  # Sentinel 1 data, in dB, divided in Ascending and Descending passes
    SENSORDATA_S2 = conf.general['SENSORDATA_S2']  # Sentinel 2 data
    SENSORDATA_S2gt95 = conf.general[
        'SENSORDATA_S2gt95']  # Sentinel 2 data (B2,B3,B4,B8) IF available for 95% or area
    SENSORDATA_S1_COHERENCE = conf.general['SENSORDATA_S1_COHERENCE']

    # If force == False Check and the output file exists already, stop.
    if (force is False and output_filepath.exists() is True):
        logger.warning(
            f"Output file already exists and force == False, so stop: {output_filepath}"
        )
        return

    # Init the result with the id's of the parcels we want to treat
    result_df = pdh.read_file(input_parcel_filepath,
                              columns=[conf.columns['id']])
    if result_df.index.name != conf.columns['id']:
        result_df.set_index(conf.columns['id'], inplace=True)
    nb_input_parcels = len(result_df.index)
    logger.info(
        f"Parceldata aggregations that need to be used: {parceldata_aggregations_to_use}"
    )
    logger.setLevel(logging.DEBUG)

    # Loop over all input timeseries data to find the data we really need
    data_ext = conf.general['data_ext']
    filepath_start = timeseries_dir / f"{base_filename}_{start_date_str}{data_ext}"
    filepath_end = timeseries_dir / f"{base_filename}_{end_date_str}{data_ext}"
    logger.debug(f'filepath_start_date: {filepath_start}')
    logger.debug(f'filepath_end_date: {filepath_end}')

    ts_data_files = timeseries_dir.glob(f"{base_filename}_*{data_ext}")
    for curr_filepath in sorted(ts_data_files):

        # Only process data that is of the right sensor types
        sensor_type = curr_filepath.stem.split('_')[-1]
        if sensor_type not in sensordata_to_use:
            logger.debug(
                f"SKIP: file is not in sensor types asked ({sensordata_to_use}): {curr_filepath}"
            )
            continue
        # The only data we want to process is the data in the range of dates
        if ((str(curr_filepath) < str(filepath_start))
                or (str(curr_filepath) >= str(filepath_end))):
            logger.debug(
                f"SKIP: File is not in date range asked: {curr_filepath}")
            continue
        # An empty file signifies that there wasn't any valable data for that period/sensor/...
        if os.path.getsize(curr_filepath) == 0:
            logger.info(f"SKIP: file is empty: {curr_filepath}")
            continue

        # Read data, and check if there is enough data in it
        data_read_df = pdh.read_file(curr_filepath)
        nb_data_read = len(data_read_df.index)
        data_available_pct = nb_data_read * 100 / nb_input_parcels
        min_parcels_with_data_pct = conf.timeseries.getfloat(
            'min_parcels_with_data_pct')
        if data_available_pct < min_parcels_with_data_pct:
            logger.info(
                f"SKIP: only data for {data_available_pct:.2f}% of parcels, should be > {min_parcels_with_data_pct}%: {curr_filepath}"
            )
            continue

        # Start processing the file
        logger.info(f'Process file: {curr_filepath}')
        if data_read_df.index.name != conf.columns['id']:
            data_read_df.set_index(conf.columns['id'], inplace=True)

        # Loop over columns to check if there are columns that need to be dropped.
        for column in data_read_df.columns:

            # If it is the id column, continue
            if column == conf.columns['id']:
                continue

            # Check if the column is "asked"
            column_ok = False
            for parceldata_aggregation in parceldata_aggregations_to_use:
                if column.endswith('_' + parceldata_aggregation):
                    column_ok = True
            if column_ok is False:
                # Drop column if it doesn't end with something in parcel_data_aggregations_to_use
                logger.debug(
                    f"Drop column as it's column aggregation isn't to be used: {column}"
                )
                data_read_df.drop(column, axis=1, inplace=True)
                continue

            # Check if the column contains data for enough parcels
            valid_input_data_pct = (
                1 -
                (data_read_df[column].isnull().sum() / nb_input_parcels)) * 100
            if valid_input_data_pct < min_parcels_with_data_pct:
                # If the number of nan values for the column > x %, drop column
                logger.warn(
                    f"Drop column as it contains only {valid_input_data_pct:.2f}% real data compared to input (= not nan) which is < {min_parcels_with_data_pct}%!: {column}"
                )
                data_read_df.drop(column, axis=1, inplace=True)

        # If S2, rescale data
        if sensor_type.startswith(SENSORDATA_S2):
            for column in data_read_df.columns:
                logger.info(
                    f"Column contains S2 data, so scale it by dividing by 10.000: {column}"
                )
                data_read_df[column] = data_read_df[column] / 10000

        # If S1 coherence, rescale data
        if sensor_type == SENSORDATA_S1_COHERENCE:
            for column in data_read_df.columns:
                logger.info(
                    f"Column contains S1 Coherence data, so scale it by dividing by 300: {column}"
                )
                data_read_df[column] = data_read_df[column] / 300

        # Join the data to the result...
        result_df = result_df.join(data_read_df, how='left')

    # Remove rows with many null values from result
    max_number_null = int(0.6 * len(result_df.columns))
    parcel_many_null_df = result_df[result_df.isnull().sum(
        axis=1) > max_number_null]
    if len(parcel_many_null_df.index) > 0:
        # Write the rows with empty data to a file
        parcel_many_null_filepath = Path(
            f'{str(output_filepath)}_rows_many_null.sqlite')
        logger.warn(
            f"Write {len(parcel_many_null_df.index)} rows with > {max_number_null} of {len(result_df.columns)} columns==null to {parcel_many_null_filepath}"
        )
        pdh.to_file(parcel_many_null_df, parcel_many_null_filepath)

        # Now remove them from result
        result_df = result_df[result_df.isnull().sum(
            axis=1) <= max_number_null]

    # For rows with some null values, set them to 0
    # TODO: first rough test of using interpolation doesn't give a difference, maybe better if
    #       smarter interpolation is used (= only between the different types of data:
    #       S1_GRD_VV, S1_GRD_VH, S1_COH_VV, S1_COH_VH, ASC?, DESC?, S2
    #result_df.interpolate(inplace=True)
    result_df.fillna(0, inplace=True)

    # Write output file...
    logger.info(f"Write output to file, start: {output_filepath}")
    pdh.to_file(result_df, output_filepath)
    logger.info(f"Write output to file, ready (with shape: {result_df.shape})")

Example #9

Show file

File: classification_keras.py Project: draadzic/cropclassification

def predict_proba(parcel_df: pd.DataFrame, classifier_filepath: str,
                  output_parcel_predictions_filepath: str) -> pd.DataFrame:
    """
    Predict the probabilities for all input data using the classifier provided and write it
    to the output file.

    Args
        parcel_df: pandas DataFrame containing the data to classify. Columns:
            * global_settings.id_column: the id of the parcel.
            * global_settings.class_column: the class of the parcel. Isn't really used.
            * ... all columns that will be used as classification data.
        classifier_filepath: the filepath where the classifier can be written.
    """

    # Some basic checks that input is ok
    column_class = conf.columns['class']
    column_class_declared = conf.columns['class_declared']
    parcel_df.reset_index(inplace=True)
    if (conf.columns['id'] not in parcel_df.columns
            or column_class not in parcel_df.columns):
        message = f"Columns {conf.columns['id']} and {column_class} are mandatory for input parameter parcel_df!"
        logger.critical(message)
        raise Exception(message)

    # Now do final preparation for the classification
    parcel_classes_df = parcel_df[column_class]
    cols_to_keep = parcel_df.columns.difference(
        [conf.columns['id'], column_class, column_class_declared])
    parcel_data_df = parcel_df[cols_to_keep]
    parcel_data_df.sort_index(axis=1, inplace=True)

    logger.info(
        f"Input predict file processed and rows with missing data removed, data shape: {parcel_data_df.shape}, labels shape: {parcel_classes_df.shape}"
    )

    # Check of the input data columns match the columns needed for the neural net
    classifier_filepath_noext, _ = os.path.splitext(classifier_filepath)
    classifier_datacolumns_filepath = classifier_filepath_noext + '_datacolumns.txt'
    with open(classifier_datacolumns_filepath, "r") as file:
        classifier_datacolumns = eval(file.readline())
    if classifier_datacolumns != list(parcel_data_df.columns):
        raise Exception(
            f"Input datacolumns for predict don't match needed columns for neural net: \ninput: {parcel_data_df.columns}, \nneeded: {classifier_datacolumns}"
        )

    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        logger.info(
            f"Resulting Columns for predicting data: {parcel_data_df.columns}")

    # Load the classifier and predict
    model = keras.models.load_model(classifier_filepath)
    logger.info(
        f"Predict classes with probabilities: {len(parcel_df.index)} rows")
    class_proba = model.predict_proba(parcel_data_df)
    logger.info(f"Predict classes with probabilities ready")

    # Convert probabilities to dataframe, combine with input data and write to file
    # Load the classes from the classes file
    classifier_filepath_noext, _ = os.path.splitext(classifier_filepath)
    classifier_classes_filepath = classifier_filepath_noext + '_classes.txt'
    with open(classifier_classes_filepath, "r") as file:
        classes_dict = eval(file.readline())
    id_class_proba = np.concatenate([
        parcel_df[[conf.columns['id'], column_class, column_class_declared
                   ]].values, class_proba
    ],
                                    axis=1)
    cols = [conf.columns['id'], column_class, column_class_declared]
    cols.extend(classes_dict)
    proba_df = pd.DataFrame(id_class_proba, columns=cols)
    proba_df.set_index(keys=conf.columns['id'], inplace=True)

    # If output path provided, write results
    if output_parcel_predictions_filepath:
        pdh.to_file(proba_df, output_parcel_predictions_filepath)

    return proba_df

Example #10

Show file

File: classification_preprocess.py Project: draadzic/cropclassification

def create_train_test_sample(input_parcel_filepath: str,
                             output_parcel_train_filepath: str,
                             output_parcel_test_filepath: str,
                             balancing_strategy: str,
                             force: bool = False):
    """ Create a seperate train and test sample from the general input file. """

    # If force == False Check and the output files exist already, stop.
    if (force is False and os.path.exists(output_parcel_train_filepath) is True
            and os.path.exists(output_parcel_test_filepath) is True):
        logger.warning(
            f"create_train_test_sample: output files already exist and force == False, so stop: {output_parcel_train_filepath}, {output_parcel_test_filepath}"
        )
        return

    # Load input data...
    logger.info(
        f"Start create_train_test_sample with balancing_strategy {balancing_strategy}"
    )
    logger.info(f"Read input file {input_parcel_filepath}")
    df_in = pdh.read_file(input_parcel_filepath)
    logger.debug(f"Read input file ready, shape: {df_in.shape}")

    # Init some many-used variables from config
    class_balancing_column = conf.columns['class_balancing']
    class_column = conf.columns['class']

    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        count_per_class = df_in.groupby(class_balancing_column,
                                        as_index=False).size()
        logger.info(
            f"Number of elements per classname in input dataset:\n{count_per_class}"
        )

    # The test dataset should be as representative as possible for the entire dataset, so create
    # this first as a 20% sample of each class without any additional checks...
    # Remark: group_keys=False evades that apply creates an extra index-level of the groups above
    #         the data and evades having to do .reset_index(level=class_balancing_column_NAME, drop=True)
    #         to get rid of the group level
    df_test = df_in.groupby(class_balancing_column,
                            group_keys=False).apply(pd.DataFrame.sample,
                                                    frac=0.20)
    logger.debug(
        f"df_test after sampling 20% of data per class, shape: {df_test.shape}"
    )

    # The candidate parcel for training are all non-test parcel
    df_train_base = df_in[~df_in.index.isin(df_test.index)]
    logger.debug(f"df_train_base after isin\n{df_train_base}")

    # Remove parcel with too few pixels from the train sample
    min_pixcount = int(conf.marker['min_nb_pixels_train'])
    df_train_base = df_train_base[
        df_train_base[conf.columns['pixcount_s1s2']] >= min_pixcount]
    logger.debug(
        f"Number of parcels in df_train_base after filter on pixcount >= {min_pixcount}: {len(df_train_base)}"
    )

    # Some classes shouldn't be used for training... so remove them!
    logger.info(
        f"Remove 'classes_to_ignore_for_train' from train sample (= where {class_column} is in: {conf.marker.getlist('classes_to_ignore_for_train')}"
    )
    df_train_base = df_train_base[~df_train_base[class_column].isin(
        conf.marker.getlist('classes_to_ignore_for_train'))]

    # All classes_to_ignore aren't meant for training either...
    logger.info(
        f"Remove 'classes_to_ignore' from train sample (= where {class_column} is in: {conf.marker.getlist('classes_to_ignore')}"
    )
    df_train_base = df_train_base[~df_train_base[class_column].isin(
        conf.marker.getlist('classes_to_ignore'))]

    # Print the train base result before applying any balancing
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        count_per_class = df_train_base.groupby(class_balancing_column,
                                                as_index=False).size()
        logger.info(
            f"Number of elements per classname for train dataset, before balancing:\n{count_per_class}"
        )

    # Depending on the balancing_strategy, use different way to get a training sample
    if balancing_strategy == 'BALANCING_STRATEGY_NONE':
        # Just use 25% of all non-test data as train data -> 25% of 80% of data -> 20% of all data
        # will be training date
        # Remark: - this is very unbalanced, eg. classes with 10.000 times the input size than other
        #           classes
        #         - this results in a relatively high accuracy in overall numbers, but the small
        #           classes are not detected at all
        df_train = (df_train_base.groupby(class_balancing_column,
                                          group_keys=False).apply(
                                              pd.DataFrame.sample, frac=0.25))

    elif balancing_strategy == 'BALANCING_STRATEGY_MEDIUM':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with the upper limit of 10.000 this gives still OK results overall, and also the
        #         smaller classes give some results with upper limit of 4000 results significantly
        #         less good.

        # For the larger classes, favor them by leaving the samples larger but cap at upper_limit
        upper_limit = 10000
        lower_limit = 1000
        logger.info(
            f"Cap over {upper_limit}, keep the full number of training sample till {lower_limit}, samples smaller than that are oversampled"
        )
        df_train = (df_train_base.groupby(class_balancing_column).filter(
            lambda x: len(x) >= upper_limit).groupby(class_balancing_column,
                                                     group_keys=False).apply(
                                                         pd.DataFrame.sample,
                                                         upper_limit))
        # Middle classes use the number as they are
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < upper_limit).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= lower_limit))
        # For smaller classes, oversample...
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(
                lambda x: len(x) < lower_limit).groupby(
                    class_balancing_column,
                    group_keys=False).apply(pd.DataFrame.sample,
                                            lower_limit,
                                            replace=True))

    elif balancing_strategy == 'BALANCING_STRATEGY_MEDIUM2':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with the upper limit of 10.000 this gives still OK results overall, and also the
        #         smaller classes give some results with upper limit of 4000 results significantly
        #         less good.

        # For the larger classes, leave the samples larger but cap
        cap_count_limit1 = 100000
        cap_train_limit1 = 30000
        logger.info(
            f"Cap balancing classes over {cap_count_limit1} to {cap_train_limit1}"
        )
        df_train = (df_train_base.groupby(class_balancing_column).filter(
            lambda x: len(x) >= cap_count_limit1).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample, cap_train_limit1))
        cap_count_limit2 = 50000
        cap_train_limit2 = 20000
        logger.info(
            f"Cap balancing classes between {cap_count_limit2} and {cap_count_limit1} to {cap_train_limit2}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < cap_count_limit1).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= cap_count_limit2).groupby(
                        class_balancing_column,
                        group_keys=False).apply(pd.DataFrame.sample,
                                                cap_train_limit2))
        cap_count_limit3 = 20000
        cap_train_limit3 = 10000
        logger.info(
            f"Cap balancing classes between {cap_count_limit3} and {cap_count_limit2} to {cap_train_limit3}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < cap_count_limit2).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= cap_count_limit3).groupby(
                        class_balancing_column,
                        group_keys=False).apply(pd.DataFrame.sample,
                                                cap_train_limit3))
        cap_count_limit4 = 10000
        cap_train_limit4 = 10000
        logger.info(
            f"Cap balancing classes between {cap_count_limit4} and {cap_count_limit3} to {cap_train_limit4}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < cap_count_limit3).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= cap_count_limit4).groupby(
                        class_balancing_column,
                        group_keys=False).apply(pd.DataFrame.sample,
                                                cap_train_limit4))
        oversample_count = 1000
        # Middle classes use the number as they are
        logger.info(
            f"For classes between {cap_count_limit4} and {oversample_count}, just use all samples"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < cap_count_limit4).groupby(class_balancing_column).filter(
                    lambda x: len(x) >= oversample_count))
        # For smaller classes, oversample...
        logger.info(
            f"For classes smaller than {oversample_count}, oversample to {oversample_count}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(
                lambda x: len(x) < oversample_count).groupby(
                    class_balancing_column,
                    group_keys=False).apply(pd.DataFrame.sample,
                                            oversample_count,
                                            replace=True))

    elif balancing_strategy == 'BALANCING_STRATEGY_PROPORTIONAL_GROUPS':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with the upper limit of 10.000 this gives still OK results overall, and also the
        #         smaller classes give some results with upper limit of 4000 results significantly
        #         less good.

        # For the larger classes, leave the samples larger but cap
        upper_count_limit1 = 100000
        upper_train_limit1 = 30000
        logger.info(
            f"Cap balancing classes over {upper_count_limit1} to {upper_train_limit1}"
        )
        df_train = (df_train_base.groupby(class_balancing_column).filter(
            lambda x: len(x) >= upper_count_limit1).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample,
                                        upper_train_limit1))
        upper_count_limit2 = 50000
        upper_train_limit2 = 20000
        logger.info(
            f"Cap balancing classes between {upper_count_limit2} and {upper_count_limit1} to {upper_train_limit2}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < upper_count_limit1).groupby(class_balancing_column).
            filter(lambda x: len(x) >= upper_count_limit2).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample,
                                        upper_train_limit2))
        upper_count_limit3 = 20000
        upper_train_limit3 = 10000
        logger.info(
            f"Cap balancing classes between {upper_count_limit3} and {upper_count_limit2} to {upper_train_limit3}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < upper_count_limit2).groupby(class_balancing_column).
            filter(lambda x: len(x) >= upper_count_limit3).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample,
                                        upper_train_limit3))
        upper_count_limit4 = 10000
        upper_train_limit4 = 5000
        logger.info(
            f"Cap balancing classes between {upper_count_limit4} and {upper_count_limit3} to {upper_train_limit4}"
        )
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(lambda x: len(
                x) < upper_count_limit3).groupby(class_balancing_column).
            filter(lambda x: len(x) >= upper_count_limit4).groupby(
                class_balancing_column,
                group_keys=False).apply(pd.DataFrame.sample,
                                        upper_train_limit4))

        # For smaller balancing classes, just use all samples
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(
                lambda x: len(x) < upper_count_limit4))

    elif balancing_strategy == 'BALANCING_STRATEGY_UPPER_LIMIT':
        # Balance the train data, but still use some larger samples for the classes that have a lot
        # of members in the input dataset
        # Remark: with the upper limit of 10.000 this gives still OK results overall, and also the
        #         smaller classes give some results with upper limit of 4000 results significantly
        #         less good.

        # For the larger classes, favor them by leaving the samples larger but cap at upper_limit
        upper_limit = 10000
        logger.info(f"Cap over {upper_limit}...")
        df_train = (df_train_base.groupby(class_balancing_column).filter(
            lambda x: len(x) >= upper_limit).groupby(class_balancing_column,
                                                     group_keys=False).apply(
                                                         pd.DataFrame.sample,
                                                         upper_limit))
        # For smaller classes, just use all samples
        df_train = df_train.append(
            df_train_base.groupby(class_balancing_column).filter(
                lambda x: len(x) < upper_limit))

    elif balancing_strategy == 'BALANCING_STRATEGY_EQUAL':
        # In theory the most logical way to balance: make sure all classes have the same amount of
        # training data by undersampling the largest classes and oversampling the small classes.
        df_train = (df_train_base.groupby(class_balancing_column,
                                          group_keys=False).apply(
                                              pd.DataFrame.sample,
                                              2000,
                                              replace=True))

    else:
        logger.fatal(
            f"Unknown balancing strategy, STOP!: {balancing_strategy}")

    # Log the resulting numbers per class in the train sample
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        count_per_class = df_train.groupby(class_balancing_column,
                                           as_index=False).size()
        logger.info(
            f'Number of elements per class_balancing_column in train dataset:\n{count_per_class}'
        )
        if class_balancing_column != class_column:
            count_per_class = df_train.groupby(class_column,
                                               as_index=False).size()
            logger.info(
                f'Number of elements per class_column in train dataset:\n{count_per_class}'
            )

    # Log the resulting numbers per class in the test sample
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):
        count_per_class = df_test.groupby(class_balancing_column,
                                          as_index=False).size()
        logger.info(
            f'Number of elements per class_balancing_column in test dataset:\n{count_per_class}'
        )
        if class_balancing_column != class_column:
            count_per_class = df_test.groupby(class_column,
                                              as_index=False).size()
            logger.info(
                f'Number of elements per class_column in test dataset:\n{count_per_class}'
            )

    # Write to output files
    logger.info('Write the output files')
    df_train.set_index(conf.columns['id'], inplace=True)
    df_test.set_index(conf.columns['id'], inplace=True)
    pdh.to_file(df_train,
                output_parcel_train_filepath)  # The ID column is the index...
    pdh.to_file(df_test,
                output_parcel_test_filepath)  # The ID column is the index...

Example #11

Show file

File: classification_postprocess.py Project: draadzic/cropclassification

def calc_top3_and_consolidation(input_parcel_filepath: str,
                                input_parcel_probabilities_filepath: str,
                                output_predictions_filepath: str,
                                output_predictions_output_filepath: str = None,
                                force: bool = False):
    """
    Calculate the top3 prediction and a consolidation prediction.

    Remark: in this logic the declared crop/class (class_declared) is used, as we want to compare 
    with the declaration of the farmer, rather than taking into account corrections already.
    
    Args:
        input_parcel_filepath (str): [description]
        input_parcel_probabilities_filepath (str): [description]
        output_predictions_filepath (str): [description]
        output_predictions_output_filepath (str, optional): [description]. Defaults to None.
        force (bool, optional): [description]. Defaults to False.
    """
    # If force is false and output exists, already, return
    if(force is False
       and os.path.exists(output_predictions_filepath)):
        logger.warning(f"calc_top3_and_consolidation: output file exist and force is False, so stop: {output_predictions_filepath}")
        return

    # Read input files
    logger.info("Read input file")
    proba_df = pdh.read_file(input_parcel_probabilities_filepath)

    top3_df = calc_top3(proba_df)

    # Read input files
    logger.info("Read input file")
    input_parcel_df = pdh.read_file(input_parcel_filepath)

    # All input parcels must stay in the output, so left join input with pred
    top3_df.set_index(conf.columns['id'], inplace=True)
    if input_parcel_df.index.name != conf.columns['id']:
        input_parcel_df.set_index(conf.columns['id'], inplace=True)
    cols_to_join = top3_df.columns.difference(input_parcel_df.columns)
    pred_df = input_parcel_df.join(top3_df[cols_to_join], how='left')

    # The parcels added by the join don't have a prediction yet, so apply it
    # For the ignore classes, set the prediction to the ignore type
    classes_to_ignore = conf.marker.getlist('classes_to_ignore')
    pred_df.loc[pred_df[conf.columns['class_declared']].isin(classes_to_ignore), 
                'pred1'] = pred_df[conf.columns['class_declared']]
    # For all other parcels without prediction there must have been no data 
    # available for a classification, so set prediction to NODATA
    pred_df['pred1'].fillna('NODATA', inplace=True)
     
    # Add doubt columns
    add_doubt_column(pred_df=pred_df, 
                     new_pred_column=conf.columns['prediction_cons'],
                     apply_doubt_min_nb_pixels=True)
    add_doubt_column(pred_df=pred_df, 
                     new_pred_column=conf.columns['prediction_full_alpha'],
                     apply_doubt_min_nb_pixels=True,
                     apply_doubt_marker_specific=True)

    # Calculate the status of the consolidated prediction (OK=usable, NOK=not)
    pred_df.loc[pred_df[conf.columns['prediction_cons']].isin(proba_df.columns.to_list()), 
                conf.columns['prediction_cons_status']] = 'OK'
    pred_df[conf.columns['prediction_cons_status']].fillna('NOK', inplace=True)    

    logger.info("Write full prediction data to file")
    pdh.to_file(pred_df, output_predictions_filepath)

    # Create final output file with the most important info
    if output_predictions_output_filepath is not None:

        # First add some aditional columns specific for the export
        pred_df['markercode'] = conf.marker['markertype']
        pred_df['run_id'] = conf.general['run_id']
        today = datetime.date.today()
        pred_df['cons_date'] = today
        pred_df['modify_date'] = today
        logger.info("Write final output prediction data to file")
        pred_df.reset_index(inplace=True)
        pred_df = pred_df[conf.columns.getlist('output_columns')] 
        pdh.to_file(pred_df, output_predictions_output_filepath, index=False) 

        # Write oracle sqlldr file
        if conf.marker['markertype'] in ['LANDCOVER', 'LANDCOVER_EARLY']:
            table_name = 'mon_marker_landcover'
            table_columns = ("layer_id, prc_id, versienummer, markercode, run_id, cons_landcover, "
                             + "cons_status, cons_date date 'yyyy-mm-dd', landcover1, probability1, "
                             + "landcover2, probability2, landcover3, probability3, "
                             + "modify_date date 'yyyy-mm-dd'")
        elif conf.marker['markertype'] in ['CROPGROUP', 'CROPGROUP_EARLY']:
            table_name = 'mon_marker_cropgroup'
            table_columns = ("layer_id, prc_id, versienummer, markercode, run_id, cons_cropgroup, "
                             + "cons_status, cons_date date 'yyyy-mm-dd', cropgroup1, probability1, "
                             + "cropgroup2, probability2, cropgroup3, probability3, "
                             + "modify_date date 'yyyy-mm-dd'")
        else: 
            table_name = None
            logger.warning(f"Table unknown for marker type {conf.marker['markertype']}, so cannot write .ctl file")

        if table_name is not None:
            with open(output_predictions_output_filepath + '.ctl', 'w') as ctlfile:
                # SKIP=1 to skip the columns names line, the other ones to evade 
                # more commits than needed
                ctlfile.write("OPTIONS (SKIP=1, ROWS=10000, BINDSIZE=40000000, READSIZE=40000000)\n")     
                ctlfile.write("LOAD DATA\n")
                ctlfile.write(f"INFILE '{os.path.basename(output_predictions_output_filepath)}'  \"str '\\n'\"\n")
                ctlfile.write(f"INSERT INTO TABLE {table_name} APPEND\n")
                # A tab as seperator is apparently X'9'  
                ctlfile.write("FIELDS TERMINATED BY X'9'\n")
                ctlfile.write(f"({table_columns})\n")

Example #12

Show file

File: classification_reporting.py Project: draadzic/cropclassification

def write_full_report(parcel_predictions_filepath: str,
                      output_report_txt: str,
                      parcel_ground_truth_filepath: str = None,
                      force: bool = None):
    """Writes a report about the accuracy of the predictions to a file.

    Args:
        parcel_predictions_filepath: File name of csv file with the parcel with their predictions.
        prediction_columnname: Column name of the column that contains the predictions.
        output_report_txt: File name of txt file the report will be written to.
        parcel_ground_truth_filepath: List of parcels with ground truth to calculate eg. alfa and
            beta errors. If None, the part of the report that is based on this data is skipped

    TODO: refactor function to split logic more...
    """

    # If force == False Check and the output file exists already, stop.
    if force is False and os.path.exists(output_report_txt):
        logger.warning(
            f"collect_and_prepare_timeseries_data: output file already exists and force == False, so stop: {output_report_txt}"
        )
        return

    logger.info("Start write_full_report")

    pandas_option_context_list = [
        'display.max_rows', None, 'display.max_columns', None,
        'display.max_colwidth', 300, 'display.width', 2000,
        'display.colheader_justify', 'left'
    ]
    logger.info(f"Read file with predictions: {parcel_predictions_filepath}")
    df_predict = pdh.read_file(parcel_predictions_filepath)
    df_predict.set_index(conf.columns['id'], inplace=True)

    # Python template engine expects all values to be present, so initialize to empty
    empty_string = "''"
    html_data = {
        'GENERAL_ACCURACIES_TABLE': empty_string,
        'GENERAL_ACCURACIES_TEXT': empty_string,
        'GENERAL_ACCURACIES_DATA': empty_string,
        'CONFUSION_MATRICES_TABLE': empty_string,
        'CONFUSION_MATRICES_DATA': empty_string,
        'CONFUSION_MATRICES_CONSOLIDATED_TABLE': empty_string,
        'CONFUSION_MATRICES_CONSOLIDATED_DATA': empty_string,
        'PREDICTION_QUALITY_CONS_OVERVIEW_TEXT': empty_string,
        'PREDICTION_QUALITY_CONS_OVERVIEW_TABLE': empty_string,
        'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TEXT': empty_string,
        'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TABLE': empty_string,
        'PREDICTION_QUALITY_ALPHA_TEXT': empty_string,
        'PREDICTION_QUALITY_BETA_TEXT': empty_string,
        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TEXT': empty_string,
        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TABLE': empty_string
    }

    # Build and write report...
    with open(output_report_txt, 'w') as outputfile:

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "********************* PARAMETERS USED **********************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        message = "Main parameters used for the marker"
        outputfile.write(f"\n{message}\n")
        html_data['PARAMETERS_USED_TEXT'] = message

        logger.info(f"{dict(conf.marker)}")
        parameter_list = [['marker', key, value]
                          for key, value in conf.marker.items()]
        parameter_list += [['timeseries', key, value]
                           for key, value in conf.timeseries.items()]
        parameter_list += [['preprocess', key, value]
                           for key, value in conf.preprocess.items()]
        parameter_list += [['classifier', key, value]
                           for key, value in conf.classifier.items()]
        parameter_list += [['postprocess', key, value]
                           for key, value in conf.postprocess.items()]

        parameters_used_df = pd.DataFrame(
            parameter_list, columns=['parameter_type', 'parameter', 'value'])
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{parameters_used_df}\n")
            logger.info(f"{parameters_used_df}\n")
            html_data['PARAMETERS_USED_TABLE'] = parameters_used_df.to_html(
                index=False)

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "**************** RECAP OF GENERAL RESULTS ******************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "*             GENERAL CONSOLIDATED CONCLUSIONS             *\n")
        outputfile.write(
            "************************************************************\n")
        # Calculate + write general conclusions for consolidated prediction
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.columns['prediction_conclusion_cons'],
            prediction_column_to_use=conf.columns['prediction_cons'],
            detailed=False)

        # Get the number of 'unimportant' ignore parcels and report them here
        df_predict_unimportant = df_predict[df_predict[
            conf.columns['prediction_conclusion_cons']] ==
                                            'IGNORE_UNIMPORTANT']
        # Now they can be removed for the rest of the reportings...
        df_predict = df_predict[
            df_predict[conf.columns['prediction_conclusion_cons']] !=
            'IGNORE_UNIMPORTANT']

        message = (
            f"Prediction conclusions cons general overview, for {len(df_predict.index)} predicted cases."
            +
            f"The {len(df_predict_unimportant.index)} IGNORE_UNIMPORTANT parcels are excluded from the reporting!"
        )
        outputfile.write(f"\n{message}\n")
        html_data['GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_cons'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum(
        )
        count_per_class.insert(loc=1, column='pct', value=values)

        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_TABLE'] = count_per_class.to_html(
                )
            html_data[
                'GENERAL_PREDICTION_CONCLUSION_CONS_OVERVIEW_DATA'] = count_per_class.to_dict(
                )

        # Output general accuracies
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "*                   OVERALL ACCURACIES                     *\n")
        outputfile.write(
            "************************************************************\n")
        overall_accuracies_list = []

        # Calculate overall accuracies for all parcels
        try:
            oa = skmetrics.accuracy_score(df_predict[conf.columns['class']],
                                          df_predict['pred1'],
                                          normalize=True,
                                          sample_weight=None) * 100
            overall_accuracies_list.append({
                'parcels': 'All',
                'prediction_type': 'standard',
                'accuracy': oa
            })

            oa = skmetrics.accuracy_score(
                df_predict[conf.columns['class']],
                df_predict[conf.columns['prediction_cons']],
                normalize=True,
                sample_weight=None) * 100
            overall_accuracies_list.append({
                'parcels': 'All',
                'prediction_type': 'consolidated',
                'accuracy': oa
            })
        except:
            logger.exception("Error calculating overall accuracies!")

        # Calculate while ignoring the classes to be ignored...
        df_predict_accuracy_no_ignore = df_predict[
            ~df_predict[conf.columns['class']].
            isin(conf.marker.getlist('classes_to_ignore_for_train'))]
        df_predict_accuracy_no_ignore = df_predict_accuracy_no_ignore[
            ~df_predict_accuracy_no_ignore[conf.columns['class']].
            isin(conf.marker.getlist('classes_to_ignore'))]

        oa = skmetrics.accuracy_score(
            df_predict_accuracy_no_ignore[conf.columns['class']],
            df_predict_accuracy_no_ignore['pred1'],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels': 'Exclude classes_to_ignore(_for_train) classes',
            'prediction_type': 'standard',
            'accuracy': oa
        })

        oa = skmetrics.accuracy_score(
            df_predict_accuracy_no_ignore[conf.columns['class']],
            df_predict_accuracy_no_ignore[conf.columns['prediction_cons']],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels': 'Exclude classes_to_ignore(_for_train) classes',
            'prediction_type': 'consolidated',
            'accuracy': oa
        })

        # Calculate ignoring both classes to ignored + parcels not having a valid prediction
        df_predict_no_ignore_has_prediction = df_predict_accuracy_no_ignore.loc[
            (df_predict_accuracy_no_ignore[conf.columns['prediction_cons']] !=
             'NODATA')
            & (df_predict_accuracy_no_ignore[conf.columns['prediction_cons']]
               != 'DOUBT:NOT_ENOUGH_PIXELS')]
        oa = skmetrics.accuracy_score(
            df_predict_no_ignore_has_prediction[conf.columns['class']],
            df_predict_no_ignore_has_prediction['pred1'],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels':
            'Exclude ignored ones + with prediction (= excl. NODATA, NOT_ENOUGH_PIXELS)',
            'prediction_type': 'standard',
            'accuracy': oa
        })

        oa = skmetrics.accuracy_score(
            df_predict_no_ignore_has_prediction[conf.columns['class']],
            df_predict_no_ignore_has_prediction[
                conf.columns['prediction_cons']],
            normalize=True,
            sample_weight=None) * 100
        overall_accuracies_list.append({
            'parcels':
            'Exclude ignored ones + with prediction (= excl. NODATA, NOT_ENOUGH_PIXELS)',
            'prediction_type': 'consolidated',
            'accuracy': oa
        })

        # Output the resulting overall accuracies
        message = 'Overall accuracies for different sub-groups of the data'
        outputfile.write(f"\n{message}\n")
        html_data['OVERALL_ACCURACIES_TEXT'] = message

        overall_accuracies_df = pd.DataFrame(
            overall_accuracies_list,
            columns=['parcels', 'prediction_type', 'accuracy'])
        overall_accuracies_df.set_index(keys=['parcels', 'prediction_type'],
                                        inplace=True)
        with pd.option_context(*pandas_option_context_list):
            outputfile.write(f"\n{overall_accuracies_df}\n")
            logger.info(f"{overall_accuracies_df}\n")
            html_data[
                'OVERALL_ACCURACIES_TABLE'] = overall_accuracies_df.to_html()

        # Write the recall, F1 score,... per class
        #message = skmetrics.classification_report(df_predict[gs.class_column]
        #                                                , df_predict[gs.prediction_column]
        #                                                , labels=classes)
        #outputfile.write(message)

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "********************* DETAILED RESULTS *********************\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write("\n")
        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "*             DETAILED PREDICTION CONCLUSIONS              *\n")
        outputfile.write(
            "************************************************************\n")

        # Calculate detailed conclusions for the predictions
        logger.info("Calculate the detailed conclusions for the predictions")

        # Write the conclusions for the consolidated predictions
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.columns['prediction_conclusion_detail_cons'],
            prediction_column_to_use=conf.columns['prediction_cons'],
            detailed=True)
        message = f"Prediction conclusions cons (doubt + not_enough_pixels) overview, for {len(df_predict.index)} predicted cases:"
        outputfile.write(f"\n{message}\n")
        html_data['PREDICTION_CONCLUSION_DETAIL_CONS_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_detail_cons'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum(
        )
        count_per_class.insert(loc=1, column='pct', value=values)

        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'PREDICTION_CONCLUSION_DETAIL_CONS_OVERVIEW_TABLE'] = count_per_class.to_html(
                )

        # Calculate detailed conclusions for the predictions
        logger.info("Calculate the detailed conclusions for the predictions")

        # Write the conclusions for the consolidated predictions
        _add_prediction_conclusion(
            in_df=df_predict,
            new_columnname=conf.
            columns['prediction_conclusion_detail_full_alpha'],
            prediction_column_to_use=conf.columns['prediction_full_alpha'],
            detailed=True)
        message = f"Prediction conclusions full alpha (doubt + not_enough_pixels) overview, for {len(df_predict.index)} predicted cases:"
        outputfile.write(f"\n{message}\n")
        html_data[
            'PREDICTION_CONCLUSION_DETAIL_FULL_ALPHA_OVERVIEW_TEXT'] = message

        count_per_class = (df_predict.groupby(
            conf.columns['prediction_conclusion_detail_full_alpha'],
            as_index=False).size().to_frame('count'))
        values = 100 * count_per_class['count'] / count_per_class['count'].sum(
        )
        count_per_class.insert(loc=1, column='pct', value=values)

        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):
            outputfile.write(f"\n{count_per_class}\n")
            logger.info(f"{count_per_class}\n")
            html_data[
                'PREDICTION_CONCLUSION_DETAIL_FULL_ALPHA_OVERVIEW_TABLE'] = count_per_class.to_html(
                )

        outputfile.write(
            "************************************************************\n")
        outputfile.write(
            "*     CONFUSION MATRICES FOR PARCELS WITH PREDICTIONS      *\n")
        outputfile.write(
            "************************************************************\n")
        # Calculate an extended confusion matrix with the standard prediction column and write
        # it to output...
        df_confmatrix_ext = _get_confusion_matrix_ext(df_predict, 'pred1')
        outputfile.write(
            "\nExtended confusion matrix of the predictions: Rows: true/input classes, columns: predicted classes\n"
        )
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None, 'display.width', 2000):
            outputfile.write(f"{df_confmatrix_ext}\n")
            html_data['CONFUSION_MATRICES_TABLE'] = df_confmatrix_ext.to_html()
            html_data['CONFUSION_MATRICES_DATA'] = df_confmatrix_ext.to_json()

        # Calculate an extended confusion matrix with the consolidated prediction column and write
        # it to output...
        df_confmatrix_ext = _get_confusion_matrix_ext(
            df_predict, conf.columns['prediction_cons'])
        outputfile.write(
            "\nExtended confusion matrix of the consolidated predictions: Rows: true/input classes, columns: predicted classes\n"
        )
        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None, 'display.width', 2000):
            outputfile.write(f"{df_confmatrix_ext}\n\n")
            html_data[
                'CONFUSION_MATRICES_CONSOLIDATED_TABLE'] = df_confmatrix_ext.to_html(
                )
            html_data[
                'CONFUSION_MATRICES_CONSOLIDATED_DATA'] = df_confmatrix_ext.to_json(
                )

        # If the pixcount is available, write the OA per pixcount
        if conf.columns['pixcount_s1s2'] in df_predict.columns:
            pixcount_output_report_txt = output_report_txt + '_OA_per_pixcount.txt'
            _write_OA_per_pixcount(
                df_parcel_predictions=df_predict,
                output_report_txt=pixcount_output_report_txt,
                force=force)

        # If a ground truth file is provided, report on the ground truth
        if parcel_ground_truth_filepath is not None:
            outputfile.write(
                "************************************************************\n"
            )
            outputfile.write(
                "*   REPORTING ON PREDICTION QUALITY BASED ON GROUND TRUTH  *\n"
            )
            outputfile.write(
                "************************************************************\n"
            )

            # Read ground truth
            logger.info(
                f"Read csv with ground truth (with their classes): {parcel_ground_truth_filepath}"
            )
            df_parcel_gt = pdh.read_file(parcel_ground_truth_filepath)
            df_parcel_gt.set_index(conf.columns['id'], inplace=True)
            logger.info(
                f"Read csv with ground truth ready, shape: {df_parcel_gt.shape}"
            )

            # Join the prediction data
            cols_to_join = df_predict.columns.difference(df_parcel_gt.columns)
            df_parcel_gt = df_predict[cols_to_join].join(df_parcel_gt,
                                                         how='inner')
            logger.info(
                f"After join of ground truth with predictions, shape: {df_parcel_gt.shape}"
            )

            if len(df_parcel_gt.index) == 0:
                message = "After join of ground truth with predictions the result was empty, so probably a wrong ground truth file was used!"
                logger.critical(message)
                raise Exception(message)

            # General ground truth statistics
            # ******************************************************************
            # Calculate the conclusions based on ground truth

            # Calculate and write the result for the consolidated predictions
            _add_gt_conclusions(df_parcel_gt, conf.columns['prediction_cons'])
            message = f"Prediction quality cons (doubt + not_enough_pixels) overview, for {len(df_parcel_gt.index)} predicted cases in ground truth:"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_CONS_OVERVIEW_TEXT'] = message

            count_per_class = (df_parcel_gt.groupby(
                f"gt_conclusion_{conf.columns['prediction_cons']}",
                as_index=False).size().to_frame('count'))
            values = 100 * count_per_class['count'] / count_per_class[
                'count'].sum()
            count_per_class.insert(loc=1, column='pct', value=values)

            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                outputfile.write(f"\n{count_per_class}\n")
                logger.info(f"{count_per_class}\n")
                html_data[
                    'PREDICTION_QUALITY_CONS_OVERVIEW_TABLE'] = count_per_class.to_html(
                    )

            # Calculate and write the result for the consolidated predictions
            _add_gt_conclusions(df_parcel_gt,
                                conf.columns['prediction_full_alpha'])
            message = f"Prediction quality cons (doubt + not_enough_pixels) overview, for {len(df_parcel_gt.index)} predicted cases in ground truth:"
            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TEXT'] = message

            count_per_class = (df_parcel_gt.groupby(
                f"gt_conclusion_{conf.columns['prediction_full_alpha']}",
                as_index=False).size().to_frame('count'))
            values = 100 * count_per_class['count'] / count_per_class[
                'count'].sum()
            count_per_class.insert(loc=1, column='pct', value=values)

            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                outputfile.write(f"\n{count_per_class}\n")
                logger.info(f"{count_per_class}\n")
                html_data[
                    'PREDICTION_QUALITY_FULL_ALPHA_OVERVIEW_TABLE'] = count_per_class.to_html(
                    )

            # Write the ground truth conclusions to file
            pdh.to_file(
                df_parcel_gt,
                output_report_txt + "_groundtruth_pred_quality_details.tsv")

            # Alpha and beta error statistics based on CONS prediction
            # ******************************************************************
            # Pct Alpha errors=alpha errors/(alpha errors + real errors)
            columnname = f"gt_conclusion_{conf.columns['prediction_cons']}"
            alpha_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA'].index)
            alpha_error_denominator = (
                alpha_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].isin([
                    'FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG'
                ])].index))
            if alpha_error_denominator > 0:
                message = (
                    f"Alpha error for cons: {alpha_error_numerator}/{alpha_error_denominator} = "
                    +
                    f"{(alpha_error_numerator/alpha_error_denominator):.02f}")
            else:
                message = f"Alpha error for cons: {alpha_error_numerator}/{alpha_error_denominator} = ?"

            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_ALPHA_TEXT'] = message

            beta_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA'].index)
            beta_error_denominator = (
                beta_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].str.startswith(
                    'FARMER-WRONG_PRED-')].index))
            if beta_error_denominator > 0:
                message = (
                    f"Beta error for cons: {beta_error_numerator}/{beta_error_denominator} = "
                    + f"{(beta_error_numerator/beta_error_denominator):.02f}")
            else:
                message = f"Beta error for cons: {beta_error_numerator}/{beta_error_denominator} = ?"

            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_BETA_TEXT'] = message

            # Alpha and beta error statistics based on CONS prediction
            # ******************************************************************
            # Pct Alpha errors=alpha errors/(alpha errors + real errors)
            columnname = f"gt_conclusion_{conf.columns['prediction_full_alpha']}"
            alpha_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA'].index)
            alpha_error_denominator = (
                alpha_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].isin([
                    'FARMER-WRONG_PRED-CORRECT', 'FARMER-WRONG_PRED-WRONG'
                ])].index))
            if alpha_error_denominator > 0:
                message = (
                    f"Alpha error full: {alpha_error_numerator}/{alpha_error_denominator} = "
                    +
                    f"{(alpha_error_numerator/alpha_error_denominator):.02f}")
            else:
                message = f"Alpha error full: {alpha_error_numerator}/{alpha_error_denominator} = ?"

            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_ALPHA_TEXT'] += '<br/>' + message

            beta_error_numerator = len(df_parcel_gt.loc[
                df_parcel_gt[columnname] ==
                'FARMER-WRONG_PRED-DOESNT_OPPOSE:ERROR_BETA'].index)
            beta_error_denominator = (
                beta_error_numerator +
                len(df_parcel_gt.loc[df_parcel_gt[columnname].str.startswith(
                    'FARMER-WRONG_PRED-')].index))
            if beta_error_denominator > 0:
                message = (
                    f"Beta error full: {beta_error_numerator}/{beta_error_denominator} = "
                    + f"{(beta_error_numerator/beta_error_denominator):.02f}")
            else:
                message = f"Beta error full: {beta_error_numerator}/{beta_error_denominator} = ?"

            outputfile.write(f"\n{message}\n")
            html_data['PREDICTION_QUALITY_BETA_TEXT'] += '<br/>' + message

            # If the pixcount is available, write the number of ALFA errors per pixcount (for the prediction with doubt)
            if conf.columns['pixcount_s1s2'] in df_parcel_gt.columns:
                # Get data, drop empty lines and write
                message = f"Number of ERROR_ALFA parcels for the 'prediction full alpha without NOT_ENOUGH_PIX' per pixcount for the ground truth parcels:"
                outputfile.write(f"\n{message}\n")
                html_data[
                    'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TEXT'] = message

                # To get the number of alpha errors per pixcount, we also need alpha errors
                # also for parcels that had not_enough_pixels, so we need prediction_withdoubt
                # If they don't exist, calculate
                class_postpr.add_doubt_column(
                    pred_df=df_parcel_gt,
                    new_pred_column='pred_cons_no_min_pix',
                    apply_doubt_marker_specific=True)
                _add_gt_conclusions(df_parcel_gt, 'pred_cons_no_min_pix')

                df_per_pixcount = _get_alfa_errors_per_pixcount(
                    df_predquality_pixcount=df_parcel_gt,
                    pred_quality_column="gt_conclusion_" +
                    "pred_cons_no_min_pix",
                    error_alpha_code='FARMER-CORRECT_PRED-WRONG:ERROR_ALPHA')
                df_per_pixcount.dropna(inplace=True)
                with pd.option_context('display.max_rows', None,
                                       'display.max_columns', None,
                                       'display.width', 2000):
                    outputfile.write(f"\n{df_per_pixcount}\n")
                    logger.info(f"{df_per_pixcount}\n")
                    html_data[
                        'PREDICTION_QUALITY_ALPHA_PER_PIXCOUNT_TABLE'] = df_per_pixcount.to_html(
                        )

    with open(output_report_txt.replace('.txt', '.html'), 'w') as outputfile:
        html_template_file = open(
            './cropclassification/postprocess/html_rapport_template.html'
        ).read()
        src = Template(html_template_file)
        # replace strings and write to file
        output = src.substitute(html_data)
        outputfile.write(output)