Exemple #1
    def cleanup_missing_data(self, sensor):
        """ This function ensure that incomplete scenes are removed : for a given date, if an
        input data layer is missing, all other layers are also removed for this date. """
        # NOTE(review): several statements in this method look truncated in this copy
        # of the file (calls without their last argument / closing parenthesis, and
        # bare f-strings whose enclosing call is absent). The comments below describe
        # only what the remaining lines show; the missing lines must be restored
        # from the upstream source before this can run.

        # get all available dates
        alldates = []

        # self['input'][sensor]['_names'] is iterated as a collection of paths; each
        # path is concatenated to ['input', sensor] to address one input layer.
        for path in self['input'][sensor]['_names']:
            # NOTE(review): the key argument and closing ")" of this call are missing.
            alldates += get_param_in_tree(self, ['input', sensor] + path,
        # make it a set (unique value and allow to compute difference with "-")
        alldates = set(alldates)
        # find missing data dates
        toremove = []
        for path in self['input'][sensor]['_names']:

            # NOTE(review): the key argument and closing ")" of this call are missing.
            data_type = get_param_in_tree(self, ['input', sensor] + path,
            logging.debug(f'data type in cleanup_missing_data {data_type}')

            # Only layers whose data type is 'dynamic' are checked for missing dates.
            if data_type == 'dynamic':
                dates_for_this_path = get_param_in_tree(
                    self, ['input', sensor] + path, '_scenes_dates')
                # Dates present in at least one layer but absent from this one.
                missing = alldates - set(dates_for_this_path)
                # NOTE(review): orphan f-string; presumably the argument of a
                # logging call whose opening line was lost - confirm upstream.
                    f'find missing : {path} :  {len(dates_for_this_path)} : {len(missing)}'
                if missing:
                    # NOTE(review): orphan f-string, same remark as above.
                        f'{sensor} : Removing date {missing} because {path} data is missing'
                # Accumulate across layers; duplicates are harmless because
                # toremove is converted to a set below.
                toremove += missing
        # Dates that are available in every 'dynamic' layer, in sorted order.
        fulldates = sorted(list(alldates - set(toremove)))

        for path in self['input'][sensor]['_names']:
            # NOTE(review): the key argument and closing ")" of this call are missing.
            data_type = get_param_in_tree(self, ['input', sensor] + path,
            if data_type == 'dynamic':
                # NOTE(review): truncated call; presumably stores fulldates for this
                # layer - the key and value arguments are not visible here.
                set_param_in_tree(self, ['input', sensor] + path,
                    f"{len(fulldates)} dates actually will be used for {sensor} {path}"
                # NOTE(review): the lines below read a list, repeat it
                # len(fulldates) times and store it back. The log text speaks of
                # "artificially replicated" dates, so this looks like the branch
                # for non-dynamic layers, but the branch header is not visible -
                # confirm against the upstream source.
                dateslist = get_param_in_tree(self, ['input', sensor] + path,
                dateslist = dateslist * len(fulldates)
                set_param_in_tree(self, ['input', sensor] + path,
                    f"{len(dateslist)} dates actually will be used for {sensor} {path}"
                    f"Some dates were actually artificially replicated for {sensor} {path}"
Exemple #2
    def get_data_per_band(self, sensor, internalkey):
        """ This function uses the config dictionnary to locate the appropriate code that
        must be used to read the data. The piece of code to run is located in the folder
        'readers'. The function 'get_data_reader()' (which is in readers/__init__.py) is
        responsible to transform the string "readername" into data_reader_class (code
        that can read the data) and data_params (list params required by
        data_reader_class). Then actual data is read and the return value of the get_data
        function is an object containing the data (along with some logging information).
        This function is used when each band needs to be read separately. Then it
        aggregate them together """
        # NOTE(review): this copy of the method is truncated in several places
        # (unclosed calls, an unclosed dict comprehension, bare f-strings). The
        # comments below only describe what the remaining lines show.

        # there are multiple values for this layer, one for each band.
        # Find the paths to the config for each band
        # NOTE(review): the key argument and closing ")" of this call are missing.
        band_keys = get_param_in_tree(self, ['input', sensor, internalkey],
        # One configuration path per band: ['input', sensor, internalkey, band_key].
        paths = [['input', sensor, internalkey, band_key]
                 for band_key in band_keys]

        data_objects = []
        for path in paths:
            # get the reader from the config
            readername = get_param_in_tree(self, path, 'data_reader_name')
            data_reader_class, data_params = get_data_reader(readername)

            # get the actual value of the parameters required by this reader
            # NOTE(review): the closing "}" of this dict comprehension is missing.
            data_params_dict = {
                p: get_param_in_tree(self, path, p)
                for p in data_params
            # The reader is instantiated with the layer name only.
            data_object = data_reader_class(name=internalkey)
            # read the data
            # NOTE(review): orphan f-string; presumably the tail of a logging call.
                f' using {readername} {path}, and parameters {data_params_dict.keys()}'
            # NOTE(review): the opening of the argument mapping and the end of this
            # load(...) call are missing; only four of its entries are visible.
            data_object = data_object.load(
                    'scenes_dates': get_param_in_tree(self, path,
                    'xslice': get_param_in_tree(self, [], 'xslice'),
                    'yslice': get_param_in_tree(self, [], 'yslice'),
                    'dataloc': get_param_in_tree(self, path, '_dataloc'),
            # NOTE(review): no visible line appends data_object to data_objects;
            # presumably that statement is among the lost lines - confirm upstream.

        # now the list "data_objects" contains a list of matrix, let us merge it into one unique matrix
        # a drawback of moving the data around like this is slightly slower than loading
        # directly into the final matrix, but this is not a real problem considering
        # that the moves performed in memory are very fast compared to reading from disk.
        # the main advantage of this is to simplify the I/O code (and to allow easily a different
        # configuration for each band if needed)
        data_object = stack_it(data_objects, sensorname=sensor)

        # NOTE(review): orphan f-string; presumably the argument of a logging call.
            f'Data loaded {internalkey} : {data_object.values.shape} matrix for sensor {sensor}'
        return data_object
Exemple #3
    def get_data(self, internalkey, sensor):
        """Read the input layer ``internalkey`` of ``sensor`` with the matching reader method.

        The layer's ``'band_indexing_method'`` setting selects how its bands are read:

        * ``'full'``     -> ``get_data_per_band`` (one read per band)
        * ``'sparse'``   -> ``get_data_sparse``
        * ``'constant'`` -> ``get_data_all_bands`` (all bands read together)

        :param internalkey: name of the input layer, used as key under
            ``self['input'][sensor]``
        :param sensor: name of the sensor whose configuration is used
        :return: whatever the selected reader method returns, or ``None`` when
            the configured indexing method is not one of the three above
        """
        # NOTE: multi-sensor processing is not implemented; per the original
        # comment, dbox['input']['sensors'] should have only one element.

        logging.debug(f'Reading {internalkey} data for sensor {sensor}')
        # The same key is read from the same config path (and compared against
        # the same three values) when the list of input names is built in
        # setup_input_one_sensor.
        indexing_method = get_param_in_tree(self,
                                            ['input', sensor, internalkey],
                                            'band_indexing_method')
        if indexing_method == 'full':
            return self.get_data_per_band(sensor, internalkey)
        elif indexing_method == 'sparse':
            return self.get_data_sparse(sensor, internalkey)
        elif indexing_method == 'constant':
            return self.get_data_all_bands(sensor, internalkey)

        # Unknown method: keep the historical behaviour (return None) but make
        # it visible in the logs instead of failing silently.
        logging.warning(
            f'Unknown band_indexing_method {indexing_method!r} for {sensor}/{internalkey}')
        return None
Exemple #4
    def get_checkpoint_data(self, dbox, date, sensor):
        """ To read previous BRDF parameters estimation for Kalman filter """
        # NOTE(review): this copy of the method is truncated (unclosed
        # get_data_reader( call, unclosed dict comprehension, unclosed load_brdf(
        # call, and apparently a missing "else:" line). The comments below only
        # describe what the remaining lines show.

        # Settings read from the dbox mapping.
        inputcheckpoint = dbox['inputcheckpoint']
        logging.debug(f'Getting checkpoint from {inputcheckpoint}')
        xslice = dbox['xslice']
        yslice = dbox['yslice']
        # NOTE(review): model_len is not used in the visible lines; it may be one
        # of the lost arguments of load_brdf(...) below - confirm upstream.
        model_len = dbox['model_len']
        n_channels_ref = dbox['n_channels_ref']

        if inputcheckpoint and not dbox[f'{sensor}']['startseries']:
            # Taken when an input checkpoint is configured AND the sensor's
            # 'startseries' flag is falsy.
            self.current_startseries = False
            # NOTE(review): the argument and closing ")" of this call are missing.
            data_reader_class, data_params = get_data_reader(
            # Reader parameters are looked up at the root of inputcheckpoint.
            # NOTE(review): the closing "}" of this dict comprehension is missing.
            data_params_dict = {
                p: get_param_in_tree(inputcheckpoint, [], p)
                for p in data_params

            # NOTE(review): data_params_dict is not passed anywhere in the visible
            # lines; a load call on the reader was presumably lost here.
            reader = data_reader_class()
            # NOTE(review): an "else:" line appears to be missing before the
            # statements below, which handle the no-checkpoint case.
            # no check point, create empty initial state
            logging.debug('Setting up empty initial brdf')
            # NOTE(review): message is emitted with print() in addition to logging.
            print('Setting up empty initial brdf, often due to error')
            self.current_startseries = True
            # NOTE(review): remaining arguments and closing ")" are missing.
            reader = EmptyBrdf().load_brdf(xslice, yslice, n_channels_ref,
        # Expose the state read from the checkpoint (or the empty state) on self.
        self.quality_in = reader.quality
        self.age_obs_in = reader.age_obs
        self.brdf_in = reader.brdf
        self.covariance_in = reader.covariance

        # Whole days between the reader's previous_date and the requested date.
        self.days_last_in = (date - reader.previous_date).days
Exemple #5
 def get_data_all_bands(self, sensor, internalkey):
     """ This function uses the config dictionnary to locate the appropriate
     code that must be used to read the data.  The piece of code to run is
     located in the folder 'readers'.  The function 'get_data_reader()'
     (which is in readers/__init__.py) is responsible to transform the string
     "readername" into data_reader_class  (code that can read the data) and
     data_params (list params required by data_reader_class).  Then actual data is
     read and the return value of the get_data function is an object
     containing the data and some logging information. This function is used
     when all bands need to be read together """
     # NOTE(review): this copy of the method is truncated (unclosed dict
     # comprehension, unclosed load(...) call, bare f-strings). The comments
     # below only describe what the remaining lines show.
     # Single configuration path for the whole layer (no per-band sub-key).
     path = ['input', sensor, internalkey]
     # get the reader from the config
     readername = get_param_in_tree(self, path, 'data_reader_name')
     data_reader_class, data_params = get_data_reader(readername)
     # get the actual value of the parameters required by this reader
     # NOTE(review): the closing "}" of this dict comprehension is missing.
     data_params_dict = {
         p: get_param_in_tree(self, path, p)
         for p in data_params
     # The reader is instantiated with the layer name only.
     data_object = data_reader_class(name=internalkey)
     # read the data
     # NOTE(review): orphan f-string; presumably the tail of a logging call.
         f' using {readername} {path}, and parameters {data_params_dict.keys()}'
     # NOTE(review): the opening of the argument mapping and the end of this
     # load(...) call are missing; only four of its entries are visible.
     data_object = data_object.load(
             'scenes_dates': get_param_in_tree(self, path, '_scenes_dates'),
             'xslice': get_param_in_tree(self, [], 'xslice'),
             'yslice': get_param_in_tree(self, [], 'yslice'),
             'dataloc': get_param_in_tree(self, path, '_dataloc'),
     # NOTE(review): orphan f-string; presumably the argument of a logging call.
         f'Data loaded {internalkey} : {data_object.values.shape} matrix for sensor {sensor}'
     return data_object
Exemple #6
    def filter_scenes_dates(self, dstore, sensor, outputdate):
        """ This function filters the scenes dates available from the dstore.
        It keeps only the dates that are relevant for the current time step and
        to store them in the newly created DataBox object """

        # create a deep copy of the dstore dict because we will delete the information
        # about files that are irrelevant for the current date.


        for path in self['input'][sensor][
                '_names']:  # the variable "path" loops through all input data layers
            if (path[-1] is 'brdf_clim') or (path[-1] is 'brdf_clim_cov'):
            # We there exclude the BRDF_clim criteria to select the available dates for the calculation of Albedo
            # The concequence could be that no BRDF clim is found but we continue anyway to process.

            data_type = get_param_in_tree(self, ['input', sensor] + path,

                f' in data_box filter_scenes_dates we now treat {path}')
            # get the data location : filenames for this input layer
            dataloc = get_param_in_tree(self, ['input', sensor] + path,
            # get ALL the dates that are available for this layer

            availabledates = list(dataloc.keys())

            # using the time_span from the config, select only
            # the relevant dates in relevant_scene_dates
            time_span_for_composition_in_days = self[
            relevant_scene_dates = compute_relevant_scenes_dates(
                availabledates, outputdate, time_span_for_composition_in_days,
                [sensor, path])
                f'{sensor}/{path}: {len(relevant_scene_dates)} dates ({len(availabledates)} available)'
            # remove the dates that are not relevant
            if data_type == 'dynamic':
                for toremove in (set(availabledates) -
                    del dataloc[toremove]
            elif data_type == 'static':
                relevant_scene_dates = availabledates

                f' Length of the relevant_scene_dates is now {len(relevant_scene_dates)} '
            # and save the list of dates in the DataBox object
            # to keep the information about their order (even if we can alway reorder them)
            set_param_in_tree(self, ['input', sensor] + path,
                self, ['input', sensor] + path,
                f'This list contains all the relevant scene dates to compute output for date ({outputdate})'
Exemple #7
    def setup_input_one_sensor(self, inputconfig, sensor):
        Quick parse of all the available data and create dict of
        how to find data with date for each variable and each file.

        This function will created dictionaries following the scheme :

        :param inputconfig: input configuration for one sensor
        :type inputconfig: dict
        :rtype: dict of dict of dict date -> { sensor -> scene -> band -> reflectance,etc}
                                                               -> angle
                                                               -> lat/lon}

        See the output file in .yaml format to understand/check
        the structure of the nested dictionnary

        # make a copy of the config dic because we are going to populate
        # it an we don't want to change the original input
        self['input'][sensor] = deepcopy(inputconfig)

        # The number of bands is infered from the input files :
        # This is the number of input layers starting with "band" followed by a number
        band_keys = [k for k in inputconfig['toc_reflectance'].keys() if re.match('^band[0-9]*$', k) ]
        self['input'][sensor]['n_channels_meas'] = len(band_keys)
        self['input'][sensor]['band_keys'] = band_keys
        logging.info(f'Sensor {sensor} : {band_keys}')

        # the list of input names will be used in data_manager_one_step.
        self['input'][sensor]['_names'] = []

        list_param = ["toc_reflectance", "toc_reflectance_cov", "lwcs_mask", "azimuth_sol", "azimuth_sat" \
                  , "zenith_sol", "zenith_sat", "latitude", "longitude"]

        if self.acf['nam_inv']['brdf_clim_activated'] is True:

        for k in list_param:
            per_band = get_param_in_tree(self, ['input', sensor, k], 'band_indexing_method')
            if per_band == 'full':
                self['input'][sensor]['_names'] += [list(i) for i in itertools.product([k], band_keys)]
            elif per_band == 'sparse':
                band_keys = get_param_in_tree(self, ['input', sensor, k], 'band_indexing_list')
                self['input'][sensor]['_names'] += [list(i) for i in itertools.product([k], band_keys)]
            elif per_band == 'constant':
                self['input'][sensor]['_names'] += [[k]]
        self['input'][sensor]['_names'] = tuple(self['input'][sensor]['_names'])

        # initialize empty cache.
        cache = {}

        for sensorpath in self['input'][sensor]['_names']:

            # for each input data path
            path = [*sensorpath]
            # get the code and list of parameters names that are needed to read the data files
            logging.debug(f'Reading metadata {path}')
            #if sensorpath == 'brdf_clim':import ipdb; ipdb.set_trace()
            readername = get_param_in_tree(inputconfig, path, 'dataloc_reader_name')
            logging.info(f'Reading metadata for path {path}, using {readername}')
            dataloc_reader_function, required_params, datetime_params  = get_dataloc_reader(readername)
            logging.debug(f'The dataloc_reader_function ({dataloc_reader_function.__doc__}) needs the parameters {datetime_params}, {required_params}')

            if not inputconfig['use_cache']:
                required_params_dict = {p:get_param_in_tree(inputconfig, path, p) for p in required_params}
                dataloc, metadata = dataloc_reader_function(**{'output_dates':self[sensor]['output_dates'], **required_params_dict})
                # get cache key in case the files have aleady been opened before
                cache_dict = {p:get_param_in_tree(inputconfig, path, p) for p in datetime_params}
                cachekey, filecachekey = get_frozen_keys({'dataloc_reader_name':readername, **cache_dict})
                cachefile = f'cache/dataloc/{filecachekey}'
                logging.debug(f'Using cache {path}: f{filecachekey} : {cachekey}')

                if cachekey in cache:
                    # the (unique) cachekey has been found, reuse the value in cache
                    dataloc = cache[cachekey]
                elif os.path.exists(cachefile):
                    dataloc, metadata = from_yaml_function(cachefile)
                    cache[cachekey] = dataloc
                    # the cache key has not been found, run the actual function to read
                    # data location from the files get the actual values for the required_parameters

                    required_params_dict = {p:get_param_in_tree(inputconfig, path, p) for p in required_params}

                    dataloc, metadata = dataloc_reader_function(**{'output_dates':self[sensor]['output_dates'], **required_params_dict})

                    # save into the cache
                    cache[cachekey] = dataloc
                    save_yaml([dataloc, metadata], filename = cachefile)

            # dataloc has been found, set the value in the appropriate place in the nested dictionary
            set_param_in_tree(self['input'][sensor], path, '_dataloc', value=dataloc)
            if not 'xoutputsize' in self:
                # do this only once
                # set also the sizes of the output
                self['xoutputsize'] = metadata['xoutputsize']
                self['youtputsize'] = metadata['youtputsize']
                self['xfullsize'], self['yfullsize'] = self['xoutputsize'], self['youtputsize']
                self['xfullslice'] = slice(0,self['xfullsize'])
                self['yfullslice'] = slice(0,self['yfullsize'])
                self['.xoutputsize'] = 'Comment: xoutputsize should be the size \
                of the output file, xfullslice should be the size of the input file. Currently, they are identical.'
                self['.xoutputsize.'] = 'When processing only 1/10 pixels or 1/100 pixels, \