Exemple #1
0
    # testing destination directory
    if not os.path.isdir(args.destination):
        os.mkdir(args.destination)

    export_idx = 0
    for idx, occurrence in enumerate(df.iterrows()):
        # adding an occurrence latitude and longitude
        positions.append((occurrence[1].Latitude, occurrence[1].Longitude))

        # if the batch is full, extract and export
        if len(positions) == batch_size or idx == len(df) - 1:
            variables = []
            for i, raster in enumerate(sorted(raster_metadata.keys())):
                if raster in exception:
                    continue
                ext.clean()
                ext.append(raster, normalized=args.norm)
                variable = np.stack([ext[p] for p in positions])

                variables.append(variable)

            variables = np.concatenate(variables, axis=1)
            # the shape of variables is (batch_size, nb_rasters, size, size)

            for p_idx in range(variables.shape[0]):

                np.save(args.destination + '/' + str(export_idx),
                        variables[p_idx])

                export_idx += 1
            # resetting positions for new batch
def _centered_slice(length, window):
    """Return a slice of ``window`` cells centred on an axis of size ``length``.

    The start is clamped at 0 so a window wider than the axis never produces
    a negative (wrap-around) start index.
    """
    start = max(length // 2 - window // 2, 0)
    return slice(start, start + window)


def extract_environmental_data(dataset,
                               rasters,
                               destination=None,
                               mean_window_size=None,
                               patch_size=1,
                               row_number_limit=None):
    """Build a dataframe of occurrences with their environmental vectors.

    The occurrences are read from a csv file, the environmental patch of each
    raster listed in ``raster_metadata`` is extracted at every location, and
    the result is returned as one row vector per occurrence (optionally also
    saved as a csv file). Intended to feed Scikit-Learn models.

    If the patches are scalars (``patch_size == 1``) the values are used as
    is. Otherwise each channel is either reduced to the mean of a centred
    window (``mean_window_size``) or flattened into a long row vector. The
    flattening option is very expensive in memory and will not work on
    datasets containing 250k+ occurrences.

    :param dataset: path of the ';'-separated occurrences csv file. It must
        contain the Latitude and Longitude columns and a label column named
        either glc19SpId or glc19TestOccId.
    :param rasters: forwarded to ``PatchExtractor`` (the directory where the
        rasters are located).
    :param destination: an optional csv file where to save the data. The
        extraction takes quite some time so it is useful to save the result.
    :param mean_window_size: if not None, each channel is replaced by its
        mean over a centred window of this size.
    :param patch_size: size of the channels in the patches. 1 means channels
        are scalar values at the position, >1 means they are arrays around
        the position.
    :param row_number_limit: maximum number of rows to extract. By default
        all the rows are extracted.

    :return: a new dataframe containing the locations concatenated with
        their environmental vectors and the label column.
    :raises ValueError: if a window size is given together with
        ``patch_size == 1``, or if no known label column is found.
    :raises MemoryError: if the extraction runs out of memory.
    """
    if patch_size == 1 and mean_window_size:
        raise ValueError(
            'Patches are already vectors of scalars (size 1), cannot provide a window size'
        )

    # Rasters to skip, e.g. ('proxi_eau_fast', 'alti', 'clc') when they do
    # not fit into memory.
    exception = tuple()
    # Filter once so that the extraction loop and the column header always
    # agree on the list of rasters.
    rasters_names = [
        name for name in sorted(raster_metadata) if name not in exception
    ]
    n_env_features = len(rasters_names)

    if patch_size == 1 or mean_window_size:
        shape_env = n_env_features
    else:
        shape_env = n_env_features * patch_size * patch_size

    print('Will build row vectors of size', shape_env)

    # Reads the csv file containing the occurrences
    df = pd.read_csv(dataset, sep=';', header='infer', quotechar='"')\
           .dropna(axis=0, how='all')

    # The test data file uses a different label column name
    if 'glc19SpId' in df.columns:
        target_column = 'glc19SpId'
    elif 'glc19TestOccId' in df.columns:
        target_column = 'glc19TestOccId'
    else:
        raise ValueError('Unknown target column in the data')

    df = df.astype({target_column: 'int64'})

    # keep only the required columns, to free up memory
    df = df[['Latitude', 'Longitude', target_column]]

    if not row_number_limit:
        row_number_limit = len(df)
    # Truncate positionally, once, so that positions and labels are taken
    # from exactly the same rows (a label-based .loc slice is inclusive and
    # would keep one row too many).
    df = df.iloc[:row_number_limit].reset_index(drop=True)

    ext = PatchExtractor(rasters, size=patch_size, verbose=True)

    env_vectors = []
    # number of values per channel, 1 if patches are vectors
    n_features_per_channel = 1

    print('Starting')
    try:
        positions = list(zip(df.Latitude, df.Longitude))
        print('Loading rasters and extract..')
        variables = []
        for raster in rasters_names:
            # only one raster is kept loaded at a time
            ext.clean()
            ext.append(raster)
            variables.append(np.stack([ext[p] for p in positions]))
        ext.clean()

        variables = np.concatenate(variables, axis=1)
        # the shape of variables is (batch_size, nb_rasters, size, size)
        print('Build env vectors..')

        if mean_window_size:
            # Window of exactly mean_window_size cells around the centre;
            # computed once since every patch has the same shape.
            rows = _centered_slice(variables.shape[2], mean_window_size)
            cols = _centered_slice(variables.shape[3], mean_window_size)
        elif variables.ndim > 2:
            n_features_per_channel = int(np.prod(variables.shape[2:]))

        # build the env vector of each occurrence
        for (lat, lng), patch in zip(positions, variables):
            if mean_window_size:
                patch = patch[:, rows, cols].mean(axis=(1, 2))
            # axis=None flattens everything into a single row vector
            env_vectors.append(np.concatenate(([lat, lng], patch), axis=None))

        print('Done! building dataframe')
    except MemoryError as e:
        raise MemoryError(
            f'Reached out of memory, was able to extract {len(env_vectors)} rows'
        ) from e

    if n_features_per_channel == 1:
        header_env = rasters_names
    else:
        header_env = [
            f'{name}__{i}'
            for name in rasters_names
            for i in range(n_features_per_channel)
        ]
    header = ['Latitude', 'Longitude'] + header_env

    env_df = pd.DataFrame(env_vectors, columns=header, dtype='float64')

    # concatenate the column holding the species label
    env_df = pd.concat((env_df, df[target_column]), axis=1)

    if destination:
        print('Saving on disk')
        env_df.to_csv(destination, sep=';', index=False, quotechar='"')

    return env_df