Example #1
def get_scheduler_standard_assimilation_dict():
    dico_assim = load_yaml("""
statvars:
  soil_moisture: true
  water_baseflow_reservoir: true
  water_subsurface_reservoir: true
  water_surface_volume: true
  temperature: true
  water_canopy_interception_volume: true
  runoff_generation: true
  streamflow_catchment: true
  water_depth_catchment: true
  previous_water_storage_catchment: true
  flooded_area_catchment: true
  updated_flow_interconnections: true
parameters:
  activate: true
  param_list:
    manning_coefficient: true
    river_depth: true
    river_width: true
    main_river_slope: false
localization: 
  activate: false
  method: velocity
  velocity_file: ${hyfaa_run_dir}/data/mean_velocity.csv
  lengthscale: 10
""", env_vars=False, string_input=True)
    return dico_assim
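
A minimal usage sketch (hedged: it assumes this helper and the scheduler functions are importable from the hyfaa package; the exact import path is not shown in these examples). The returned dict can be edited in place before being handed to the scheduler, e.g. to change which state variables and static parameters enter the control vector.

# hypothetical usage sketch, not part of the library
dico_assim = get_scheduler_standard_assimilation_dict()
dico_assim['statvars']['water_canopy_interception_volume'] = False  # drop a state variable from the control vector
dico_assim['parameters']['param_list']['main_river_slope'] = True   # also control the main river slope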
Example #2
    def __init__(self, assim_data, n_ensemble, nC, nU, lon, lat,
                 model_timestep):
        '''
        Initializes the EnKF variables as defined in Evensen (2003).

        Inputs :
        assim_data : DA parameters (dict or path to the DA parameters yaml file)
        n_ensemble : size of the control ensemble
        nC : number of unit catchments (cells)
        nU : number of hydrological response units
        lon, lat : mesh cell center coordinates
        model_timestep : model minimum time step

        Outputs :
        self.Nens_i : size of the control ensemble
        self.list_contr_vars_s : list of control variables
        self.Nstatvars_i : total size of the control vector (sum of the sizes of all control variables)
        self.Ak_old : control ensemble
        self.Ak_new : analysis ensemble
        self.HA : matrix HA (product of the observation operator H and the control ensemble matrix A)
        self.Dprim : innovation matrix (difference between the observation ensemble and HA)
        self.Sk : Sk = HA' = HA - median(HA)
        self.Rk : observation error covariance matrix
        '''

        if hasattr(assim_data, 'keys'):
            dico = assim_data
        elif os.path.exists(assim_data):
            dico = yaml_parser.load_yaml(assim_data, env_vars=True)
        else:
            raise Exception('There is no file for DA parameters')
        self.Nens_i = n_ensemble
        self.mesh_lon_r = lon
        self.mesh_lat_r = lat
        self.list_contr_vars_s = []
        self.dict_sizes = dict()
        self.mesh_size = nC

        temp = 0
        #statvars assimilation (hydrological state variables)
        for field, activated in dico['statvars'].items():
            if not activated:
                continue
            self.list_contr_vars_s.append(field)
            if field in ['soil_moisture', 'water_canopy_interception_volume']:
                self.dict_sizes[field] = {'size': nC * nU}
                self.dict_sizes[field].update({'hru': nU})
                self.dict_sizes[field].update({'hru_unit_size': nC})
            else:
                self.dict_sizes[field] = {'size': nC}
                self.dict_sizes[field].update({'hru': 1})
            self.dict_sizes[field].update({
                'start_index': temp,
                'end_index': temp + self.dict_sizes[field]['size']
            })

            temp += self.dict_sizes[field]['size']

        #static parameter assimilation
        if dico['parameters']['activate']:
            for field, activated in dico['parameters']['param_list'].items():
                if not activated:
                    continue
                self.list_contr_vars_s.append(field)
                self.dict_sizes[field] = {'size': nC}
                self.dict_sizes[field].update({'hru': 1})
                self.dict_sizes[field].update({
                    'start_index': temp,
                    'end_index': temp + self.dict_sizes[field]['size']
                })
                temp += self.dict_sizes[field]['size']

        self.Nstatvars_i = temp

        self.Ak_old = None
        self.Ak_new = None
        self.HA = None
        self.Dprim = None
        self.Sk = None
        self.Rk = None

        self.loc_dico = dico['localization']
        if self.loc_dico['activate']:
            self.loc = np.loadtxt(self.loc_dico['loc_file'])
        self.model_timestep = model_timestep
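
The dict_sizes bookkeeping above lays the control variables out contiguously in a single state vector: HRU-dependent fields (soil_moisture, water_canopy_interception_volume) occupy nC*nU slots, every other field or parameter occupies nC slots. A standalone toy sketch of that layout logic (illustrative only, with made-up sizes):

# toy illustration of the control-vector layout, not part of EnKFobj
nC, nU = 4, 3
fields = ['soil_moisture', 'water_surface_volume', 'streamflow_catchment']
offset, layout = 0, {}
for field in fields:
    size = nC * nU if field in ('soil_moisture', 'water_canopy_interception_volume') else nC
    layout[field] = {'start_index': offset, 'end_index': offset + size}
    offset += size
# layout == {'soil_moisture': {'start_index': 0, 'end_index': 12},
#            'water_surface_volume': {'start_index': 12, 'end_index': 16},
#            'streamflow_catchment': {'start_index': 16, 'end_index': 20}}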
Example #3
def hyfaa_processing(yaml_file_or_dict, verbose=None):
    """main scheduler processing function
    
    :param yaml_file_or_dict: python dict or path to a yaml file
    """

    tim = Timer()

    #check if input is a python dict or a yaml file
    valid_obj = False

    if isinstance(yaml_file_or_dict, dict):
        valid_obj = True
        dico = yaml_file_or_dict
    else:
        #duck-typing check for a string (python3 str, python2 str or python2 unicode)
        if hasattr(yaml_file_or_dict, 'replace'):
            if os.path.exists(yaml_file_or_dict):
                valid_obj = True
                dico = load_yaml(yaml_file_or_dict, env_vars=True)
    if not valid_obj:
        raise Exception(
            'input must be a python dict or a path to an existing yaml file')

    #check parameters
    dico = check_parameters(dico)
    exec_time = dico['exec_time']
    if verbose is None:
        if 'verbose' in dico:
            verbose = dico['verbose']
    if verbose is None:
        verbose = 1

    #load assimilation parameters
    if isinstance(dico['assim_params_file'], dict):
        dico_assim = dico['assim_params_file']
    elif os.path.exists(dico['assim_params_file']):
        dico_assim = load_yaml(dico['assim_params_file'], env_vars=True)
    else:
        raise Exception('There is no file for DA parameters')
    #be sure to remove parameters from control list if their perturbation is not activated
    if not dico['perturb_static_data']['activate']:
        dico_assim['parameters']['activate'] = False

    #make temporary folder and hydro state database folders if they do not exist
    for fol in [
            dico['hydrological_states_database_directory'],
            dico['temporary_files_directory']
    ]:
        os.makedirs(fol, exist_ok=True)
    main_temp_dir = tempfile.mkdtemp(dir=dico['temporary_files_directory'],
                                     prefix='temphyfaamain')

    #rain_uncertainty
    prec_error = dico['rain_uncertainty']

    #get mgb iph input model:
    mgb_iph_input_model = load_yaml(dico['mgb']['input_model'], env_vars=True)
    alpha_inertial = float(mgb_iph_input_model['alpha'])

    #get some dimensions from static_data.nc file
    with netCDF4.Dataset(dico['mgb']['static_data_file']) as ds:
        n_cells = ds.dimensions['n_cells'].size
        n_soil_types = ds.dimensions['n_soil_types'].size
        river_length = ds.variables['longest_river_length'][:]
        mesh_lon = ds.variables['longitude_center'][:]
        mesh_lat = ds.variables['latitude_center'][:]

    if dico['n_ensemble'] < 2:
        print(
            'Ensemble size of 1 => setting to false all options to perturb parameters or input forcing, as well as assimilation.'
        )
        dico['activate_assimilation'] = False
        dico['perturb_static_data']['activate'] = False
        dico_assim['parameters']['activate'] = False
    if dico['perturb_static_data']['activate']:
        static_data_ensemble_files = [
            '%s/static_data_%d.nc' %
            (dico['perturb_static_data']['folder_store'], ii)
            for ii in range(dico['n_ensemble'])
        ]
        files_present = [
            el for el in static_data_ensemble_files if os.path.exists(el)
        ]
        if len(files_present) == 0:
            print('Generating perturbed static_data files ...')
            assim_tools.generate_perturbed_static_data_files(dico['mgb']['static_data_file'], dico['perturb_static_data']['folder_store'], \
                dico['perturb_static_data']['varying_parameters'], dico['n_ensemble'],dico['perturb_static_data']['type'],dico['perturb_static_data']['mode'])
        elif len(files_present) != dico['n_ensemble']:
            raise Exception('some ensemble files missing')
    else:
        static_data_ensemble_files = [dico['mgb']['static_data_file']]

    #get list of control parameters
    list_contr_params = []
    if dico_assim['parameters']['activate']:
        print(
            'control parameters exist and will be added to hydrological states files'
        )
        for elem, activated in dico_assim['parameters']['param_list'].items():
            if activated:
                list_contr_params.append(elem)
    if len(list_contr_params) == 0:
        print(
            'No parameters are listed, control parameters option is deactivated'
        )

    #open forcing on-mesh database
    forcing_db = ForcingOnMesh_DBManager(
        dico['forcing_onmesh_database_directory'], mode='r', verbose=verbose)

    ##create assim filter object
    Assimilation_filter = EnKF_filter.EnKFobj(dico_assim, dico['n_ensemble'],
                                              n_cells, n_soil_types, mesh_lon,
                                              mesh_lat,
                                              dico['model_min_time_step'])

    #open assimilation database
    Assim_db = Assimilation_Database(dico['assimilation_database_directory'],
                                     mode='r',
                                     verbose=verbose)
    #open hydrological state database and simulation task manager
    with HydroStates_DBManager(dico['hydrological_states_database_directory'], mode='w', verbose=verbose) as hydrostates_db, \
        SimulationTasks(dico['nprocs'], dico['mgb']['executable'], verbose=0) as sim_tasks:

        if dico['operational_mode']:
            #remove all dates after exec_time - retreatment_time_span => so it takes into account new assimilation data
            hydrostates_db.remove_after_date(
                exec_time - timedelta(dico['retreatment_time_span']))
        #get last date (if it exists)
        _, last_date_hydrostate = hydrostates_db.get_minmax_dates()

        ###############
        #check start mode: restart from an existing calculation, initialization from an initial solution, or a cold start
        if last_date_hydrostate is None:
            start_mode = 'null'
            last_date_hydrostate = dico['init_conditions']['time_start']
            hydrological_state_start_path = dico['init_conditions'][
                'hydrological_state_start']
            if hydrological_state_start_path is not None:
                start_mode = 'initial_solution'
        else:
            start_mode = 'restart'
            _, files_match = hydrostates_db.get_paths_matching_dates(
                [last_date_hydrostate], dt_max=0., type_request='control')
            files_match = files_match[0]
            #pick last date
            hydrological_state_start_path = files_match[0]
            #add parameters to hydrological states

        #split ensemble files
        last_hydrological_state_ensemble_paths, temp_input_files_created = generate_ensemble_files(
            hydrological_state_start_path, dico['n_ensemble'], main_temp_dir)

        #if the ensemble has more than one member and control parameters are requested, make sure they are present in the hydrological state files
        if dico['n_ensemble'] > 1 and len(list_contr_params) > 0:
            if start_mode == 'initial_solution':
                add_assimilation_static_vars_to_hydrological_states(
                    last_hydrological_state_ensemble_paths,
                    static_data_ensemble_files,
                    list_contr_params,
                    verbose=1)
            elif start_mode == 'restart':
                condinit_param = check_assimilation_static_vars_are_in_hydrological_states(
                    last_hydrological_state_ensemble_paths, list_contr_params)
                if not condinit_param:
                    add_assimilation_static_vars_to_hydrological_states(
                        last_hydrological_state_ensemble_paths,
                        static_data_ensemble_files,
                        list_contr_params,
                        verbose=1)

        #dates to compute
        #from beginning
        dates_compute = np.arange(datetime_to_julianday(dico['init_conditions']['time_start']), \
            datetime_to_julianday(exec_time) + dico['forecast_time_span']+dico['scheduler_time_step'], dico['scheduler_time_step'])
        #select only those > last_date_hydrostate + model_min_time_step/3
        dates_compute = dates_compute[
            dates_compute > datetime_to_julianday(last_date_hydrostate) +
            dico['model_min_time_step'] / 3.]
        #select only those <= last_date_forcing + model_min_time_step
        dates_forcing_available, _ = forcing_db.get_dates()
        dates_forcing_available = set(dates_forcing_available)
        last_date_forcing = max(dates_forcing_available)
        dates_compute = dates_compute[
            dates_compute <= datetime_to_julianday(last_date_forcing) +
            dico['model_min_time_step']]
        #convert to datetimes
        dates_compute = [julianday_to_datetime(el) for el in dates_compute]
        last_date = last_date_hydrostate

        ############################
        #LOOP ON SCHEDULER TIME STEPS
        if verbose >= 1:
            print('HYFAA initialized: %s' % tim)
        for i_compute, date_compute in enumerate(dates_compute):
            tim.level_up()
            common_temp_dir = tempfile.mkdtemp(dir=main_temp_dir,
                                               prefix='temphyfaa_%s' %
                                               date2strtag(date_compute))

            print('Computing hydrological state %s -> %s (step %d/%d):' %
                  (date2str(last_date), date2str(date_compute), i_compute + 1,
                   len(dates_compute)))

            if verbose >= 1:
                print(
                    '  0) Writing MGB parameter files and dynamic forcing files'
                )
            #configure common (date) parameters
            param_dict = copy.deepcopy(mgb_iph_input_model)
            param_dict['forcing_dates_dt_max'] = 0.
            param_dict['day'] = last_date.day
            param_dict['month'] = last_date.month
            param_dict['year'] = last_date.year
            param_dict['hour'] = last_date.hour
            dt_full = (date_compute - last_date).total_seconds() / (24. *
                                                                    3600.)
            param_dict['nt'] = int(
                np.ceil(dt_full / dico['model_min_time_step']))
            dt_days = dt_full / (1. * param_dict['nt'])
            param_dict['dt'] = dt_days * 24. * 3600.
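            # Worked example with hypothetical values: for a 3.5-day window between
            # last_date and date_compute and model_min_time_step = 1., nt = ceil(3.5/1.) = 4
            # sub-steps, so dt_days = 3.5/4 = 0.875 and param_dict['dt'] = 0.875*86400 = 75600. seconds.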

            #forcing dates
            dates_forcing_in = [
                last_date + timedelta(ii * dt_days)
                for ii in range(param_dict['nt'])
            ]
            overide_dates_forcing = [
                datetime(param_dict['year'], param_dict['month'],
                         param_dict['day'], param_dict['hour']) +
                timedelta(ii * dt_days) for ii in range(param_dict['nt'])
            ]

            #forcing perturbations
            _, rain_data = forcing_db.build_forcing_data(
                dates_forcing_in,
                data_type='rain',
                search_delta_before=dico['forcing_dates_dt_max'])

            #############################################
            #get fields of multiplication factors
            perturbed_rain_vectors = np.empty(
                (dico['n_ensemble'], np.shape(rain_data)[0],
                 np.shape(rain_data)[1]),
                dtype=rain_data.dtype)

            if dico['n_ensemble'] == 1:
                print('    => ensemble size = 1, no rain perturbation')
                for ii in range(dico['n_ensemble']):
                    perturbed_rain_vectors[ii, :, :] = rain_data
            else:
                perturbed_multfact_vector = assim_tools.build_gaussian_error_fields(
                    n_cells, dico['n_ensemble'],
                    dico['forcing_grid_geo_selection'], mesh_lon, mesh_lat)
                #build fields of perturbed precipitation

                for ii in range(np.shape(rain_data)[1]):
                    perturbed_rain_vectors[:, :,
                                           ii] = assim_tools.rain_perturbation(
                                               perturbed_multfact_vector,
                                               rain_data[:, ii], prec_error)

            perturbed_rain_vectors[perturbed_rain_vectors < 0.] = 0.
            #############################################

            #############################################
            #perform analysis
            if verbose >= 1:
                print('    => %s' % tim)
                print('  1) Analysis step')
            if dico['activate_assimilation']:
                Assim_obs = Assim_db.get_values_between_dates(
                    date_start=last_date,
                    date_end=date_compute,
                    dt_max=0.0,
                    start_strict=False,
                    end_strict=True)
                if len(Assim_obs) > 0 and (start_mode == 'start_mode' or
                                           (start_mode != 'start_mode'
                                            and i_compute > 0)):
                    if verbose >= 2:
                        print(
                            '   => Observation data available => performing analysis...'
                        )
                    #make sure that all ensemble files in the process of being joined were indeed joined before the assimilation process modifies them
                    hydrostates_db.update_joined_ensemble_files_from_worker()
                    #perform analysis
                    Assimilation_filter.perform_analysis(
                        Assim_obs,
                        last_hydrological_state_ensemble_paths,
                        inplace=True,
                        timestep_analysis=None)
                    #add corrected hydro states to DB
                    hydrostates_db.add({'date_data': last_date, 'forcing_confidence_coefficient': 1.0, 'number_obs_used': len(Assim_obs),'type': 'analysis'}, \
                        last_hydrological_state_ensemble_paths)
                    if verbose >= 1:
                        print('    => Assimilated data: %s' % tim)
                else:
                    if verbose >= 1:
                        print('    => No data assimilated: %s' % tim)
            else:
                if verbose >= 1:
                    print(
                        '    => Assimilation not activated, no data assimilation: %s'
                        % tim)

            #############################################
            #BUILD MGB JOBS
            if verbose >= 1:
                print('  2) Preparing %d MGB simulations...' %
                      (dico['n_ensemble']))
            mgb_output_files_ordered = []
            job_params = []

            for i_ensemble in range(dico['n_ensemble']):
                #configure
                local_temp_dir = '%s/ENS%d' % (common_temp_dir, i_ensemble)
                os.makedirs('%s/output' % local_temp_dir, exist_ok=True)
                param_dict_loc = copy.deepcopy(param_dict)
                if dico['perturb_static_data']['activate']:
                    param_dict_loc[
                        'static_data_file'] = static_data_ensemble_files[
                            i_ensemble]
                else:
                    param_dict_loc['static_data_file'] = dico['mgb'][
                        'static_data_file']
                param_dict_loc[
                    'output_directory'] = '%s/output/' % local_temp_dir
                param_dict_loc[
                    'hydrological_state_read_file'] = last_hydrological_state_ensemble_paths[
                        i_ensemble]
                param_dict_loc['hydrological_state_write_file'] = os.path.join(
                    main_temp_dir, 'hydrostate_out_%d.nc' % i_ensemble)
                mgb_output_files_ordered.append(
                    param_dict_loc['hydrological_state_write_file'])
                param_dict_loc[
                    'forcing_file'] = '%s/dynamic_forcing.nc' % local_temp_dir
                #write param file
                param_file = '%s/mgb_iph_input.yaml' % local_temp_dir

                write_simple_1level_dict_to_yaml_file(param_dict_loc,
                                                      param_file)

                #make precipitation file
                forcing_db.write_forcing_file(dates_forcing_in, param_dict_loc['forcing_file'], data_type='rain', \
                    forcing_data=perturbed_rain_vectors[i_ensemble,:,:], overide_dates=overide_dates_forcing)

                job_params.append([param_file])
            if verbose >= 1:
                print('    => MGB simulations prepared successfully: %s' % tim)

            ############################
            #LAUNCH MGB SIMULATION
            if verbose >= 1:
                print('  3) Launching %d MGB simulations...' %
                      (len(job_params)))

            job_output_dict = sim_tasks.run(job_params)  #MGB simulation !!!
            # ~ print(job_output_dict[0]['output'])

            missing_output_files = [
                el for el in mgb_output_files_ordered if not os.path.exists(el)
            ]
            if len(missing_output_files) > 0:
                print('    => %d MGB simulations failed: %s' %
                      (len(missing_output_files), tim))
                print(
                    'MGB calculations failed, simulation output files missing:\n%s'
                    % ('\n'.join([' - %s' % el
                                  for el in missing_output_files])))
                error_log_file = os.path.join(
                    dico['temporary_files_directory'], 'error_jobs_yaml.log')
                print('Dumping job dict to %s' % error_log_file)
                write_dict_to_yaml_file(job_output_dict, error_log_file)
                sys.exit(1)
            if verbose >= 1:
                print('    => MGB simulations ended successfully: %s' % tim)
            ########################

            #if the scheduler was started without an initial hydrological state file and static parameters are selected for assimilation,
            #the hydrological state files produced by the first MGB job do not contain those parameters yet, so add them here
            if dico['n_ensemble'] > 1 and len(
                    list_contr_params
            ) > 0 and i_compute == 0 and start_mode == 'null':
                add_assimilation_static_vars_to_hydrological_states(
                    mgb_output_files_ordered,
                    static_data_ensemble_files,
                    list_contr_params,
                    verbose=1)

            #make sure that all ensemble files in the process of being joined were indeed joined before removing them
            hydrostates_db.update_joined_ensemble_files_from_worker()
            if temp_input_files_created:
                for filepath in last_hydrological_state_ensemble_paths:
                    os.unlink(filepath)
            #replace output file paths by input file paths for next iteration
            last_hydrological_state_ensemble_paths = []
            for i_ensemble, filepath in enumerate(mgb_output_files_ordered):
                filepath_new = os.path.join(main_temp_dir,
                                            'hydrostate_in_%d.nc' % i_ensemble)
                shutil.move(filepath, filepath_new)
                last_hydrological_state_ensemble_paths.append(filepath_new)
            temp_input_files_created = True
            ############################

            if verbose >= 1:
                print(
                    '  4) Adding last_rain_data_loaded vector to MGB simulation outputs and adding resulting hydrological state to database...'
                )

            #add last_rain_data_loaded vector to MGB simulation outputs (and compute inertial time step condition to raise weird cases)
            for i_ensemble, filepath in enumerate(
                    last_hydrological_state_ensemble_paths):
                with netCDF4.Dataset(filepath, mode='a') as ds:
                    var_data = ds.createVariable('last_rain_data_loaded',
                                                 'f4', ('n_meshes', ),
                                                 zlib=True,
                                                 complevel=4,
                                                 shuffle=True)
                    var_data[:] = perturbed_rain_vectors[i_ensemble, :,
                                                         -1].astype(np.float32)

            #add to database
            hydrostates_db.add(
                {
                    'date_data': date_compute,
                    'forcing_confidence_coefficient': 1.0
                }, last_hydrological_state_ensemble_paths)

            last_date = date_compute
            if verbose >= 1:
                print('    => %s' % tim)

            shutil.rmtree(common_temp_dir)
            tim_dico = tim.get_full_info()
            tim.level_down()
            print(
                '  => step complete : %.2f seconds (%.2f from beginning at %s)'
                %
                (tim_dico['level'], tim_dico['start'], tim_dico['start_date']))

    forcing_db._close_()
    Assim_db._close_()
    sim_tasks.close()

    if temp_input_files_created:
        for filepath in last_hydrological_state_ensemble_paths:
            os.unlink(filepath)
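
A standalone sketch of the scheduler date grid built in the function above (hedged: real runs use the project's datetime_to_julianday / julianday_to_datetime helpers; here the dates are already expressed as julian days and the values are made up). Dates are generated every scheduler_time_step days from time_start to exec_time plus the forecast window, then restricted to dates strictly after the last stored hydrological state and no later than the last available forcing date plus one model time step.

# illustrative sketch of the date selection, with hypothetical values
import numpy as np
time_start_jd, exec_time_jd = 100.0, 110.0           # julian days (hypothetical)
scheduler_time_step, forecast_time_span = 1.0, 3.0
model_min_time_step = 1.0
last_hydrostate_jd, last_forcing_jd = 104.0, 108.0   # julian days (hypothetical)
dates = np.arange(time_start_jd, exec_time_jd + forecast_time_span + scheduler_time_step, scheduler_time_step)
dates = dates[dates > last_hydrostate_jd + model_min_time_step / 3.]
dates = dates[dates <= last_forcing_jd + model_min_time_step]
# dates -> array([105., 106., 107., 108., 109.])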
Example #4
def hyfaa_preprocessing_assimilation(yaml_file_or_dict, verbose=None):
    """main scheduler processing function
    
    :param yaml_file_or_dict: python dict or yaml file
    """
    
    #check if input is a python dict or a yaml file
    valid_obj = False
    if isinstance(yaml_file_or_dict, dict):
        valid_obj = True
        dico = yaml_file_or_dict
    else:
        if hasattr(yaml_file_or_dict, 'replace'): #pythonic way of checking if this is a python3 string, python2 string or python2 unicode
            if os.path.exists(yaml_file_or_dict):
                valid_obj = True
                dico = load_yaml(yaml_file_or_dict, env_vars=True)
    if not valid_obj:
        raise Exception('input must be a python dict or a path to an existing yaml file')
        
    #check parameters
    dico = check_parameters(dico)
    exec_time = dico['exec_time']
    if verbose is None:
        if 'verbose' in dico:
            verbose = dico['verbose']
    if verbose is None:
        verbose = 0
    
    #if no assimilation sources, exit
    if len(dico['assimilation_sources']) == 0:
        return
        
    if verbose >= 1:
        print('Assimilation pre-processing launched...')
    
    if verbose > 1:
        print('  Assimilation pre-proc: Parameters and mesh coordinates loaded...')
    os.makedirs(dico['assimilation_database_directory'], exist_ok=True)
    
    min_time = dico['init_conditions']['time_start']
    max_time = dico['exec_time']
    
    if verbose > 1:
        print('  Assimilation pre-proc: Opening assimilation database...')


    #update forcing mesh DB
    with Assimilation_Database(dico['assimilation_database_directory'], mode='w', verbose=0) as db:

        for source_file in dico['assimilation_sources']:
            
            if verbose > 1:
                print('  Assimilation pre-proc: loading DB data')
            
            db_data = db.get_values_between_dates(date_start=min_time, date_end=max_time)
            db_id_tuples = set([(db_data['mesh_id'][ii], db_data['sv_name'][ii], db.str2date(db_data['date_data'][ii]), round(db_data['value'][ii],3), round(db_data['uncertainty'][ii],3)) \
                for ii in range(len(db_data['mesh_id']))])
            db_simpleid_tuples = set([(el[0], el[1], el[2]) for el in db_id_tuples])
            del db_data
            
            if verbose > 1:
                print('  Assimilation pre-proc: loading source data')
            
            if isinstance(source_file, dict):
                dico_source = source_file
            else:
                dico_source = load_yaml(source_file, env_vars=False)
            
            if verbose > 1:
                print('  Assimilation pre-proc: executing get_assimilation_data function')
            
            info_data = get_assimilation_data(dico_source, min_date=min_time, max_date=max_time, verbose=verbose)
            
            if verbose > 1:
                print('  Assimilation pre-proc: adding new data to DB')
            
            info_add = []
            info_remove = []
            for elem in info_data:
                try:
                    id_tuple = (elem['mesh_id'], elem['sv_name'], elem['date_data'], round(elem['value'],3), round(elem['uncertainty'],3))
                except:
                    print(elem)
                    raise
                simpleid_tuple = (id_tuple[0], id_tuple[1], id_tuple[2])
                if simpleid_tuple in db_simpleid_tuples:
                    if id_tuple in db_id_tuples:
                        #already present
                        continue
                    else:
                        #already present but data changed => replacing
                        info_remove.append(simpleid_tuple)
                info_add.append(elem)
            db.remove(info_remove)
            db.add(info_add)

            if verbose >= 1:
                print('Updated database with data requested from conf file %s: %d points added, %d removed'%(source_file, len(info_add), len(info_remove)))
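
The update logic above deduplicates on two levels: a "simple" key (mesh_id, sv_name, date) decides whether a record already exists, and the full key including the rounded value and uncertainty decides whether it changed and must be replaced. A small standalone illustration with toy records (not real observation data; in the actual code the dates are datetime objects):

# toy illustration of the two-level deduplication, not part of the scheduler
db_id_tuples = {(12, 'WSE', '2020-01-01', 3.141, 0.2)}
db_simpleid_tuples = {(12, 'WSE', '2020-01-01')}
elem = {'mesh_id': 12, 'sv_name': 'WSE', 'date_data': '2020-01-01', 'value': 3.2, 'uncertainty': 0.2}
id_tuple = (elem['mesh_id'], elem['sv_name'], elem['date_data'],
            round(elem['value'], 3), round(elem['uncertainty'], 3))
simpleid_tuple = id_tuple[:3]
if simpleid_tuple in db_simpleid_tuples and id_tuple not in db_id_tuples:
    # same observation point and date but the value changed => remove the old record, add the new one
    print('replace', simpleid_tuple)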
Example #5
def check_parameters(dico):
    #1st level
    check_dict(dico, ['init_conditions', 'scheduler_time_step', 'forecast_time_span', 'n_ensemble', 'operational_mode', 'retreatment_time_span', \
        'hydrological_states_database_directory', 'activate_assimilation', 'perturb_static_data', 'forcing_source', 'forcing_grid_database_directory', 'forcing_onmesh_database_directory', \
        'forcing_dates_dt_max', 'rain_uncertainty', 'assimilation_database_directory', 'assimilation_sources', 'assim_params_file', \
        'mgb', 'model_min_time_step', 'post_processing'], check_none=True)
    check_dict(dico, ['exec_time', 'forcing_grid_geo_selection', 'nprocs', 'temporary_files_directory'], check_none=False, prefix=None)
    if dico['temporary_files_directory'] is None:
        if 'TMPDIR' in os.environ:
            dico['temporary_files_directory'] = os.path.abspath(os.environ['TMPDIR'])
        else:
            dico['temporary_files_directory'] = os.path.abspath('.')
    for el in ['scheduler_time_step', 'retreatment_time_span', 'forecast_time_span', 'forcing_dates_dt_max', 'model_min_time_step']:
        dico[el] = float(dico[el])
        if dico[el] <= 0.:
            raise Exception('parameter %s must be > 0'%el)
    dico['n_ensemble'] = int(dico['n_ensemble'])
    if dico['n_ensemble'] < 1:
        raise Exception('n_ensemble must be >= 1')
    if dico['nprocs'] is None:
        dico['nprocs'] = max([1, cpu_count()-1])
    if 'verbose' not in dico:
        dico['verbose'] = 1
    if dico['verbose'] is None:
        dico['verbose'] = 1
        
    if dico['assimilation_sources'] is None:
        dico['assimilation_sources'] = []
    dico['assimilation_sources'] = list_form(dico['assimilation_sources'])
    
    #mgb executable and paths
    check_dict(dico['mgb'], ['executable', 'static_data_file', 'input_model'], check_none=True, prefix='in mgb:')
    
    subdict = dico['forcing_grid_geo_selection']
    if subdict == 'auto':
        #read mini.gtp file and guess minmax
        dico['forcing_grid_geo_selection'] = get_lonlat_minmax_from_mgb_static_file(dico['mgb']['static_data_file'])
        subdict = dico['forcing_grid_geo_selection']
    if subdict is not None:
        check_dict(subdict, ['latmin', 'latmax', 'lonmin', 'lonmax'], check_none=False, prefix='in forcing_grid_geo_selection:')
        for el in ['latmin', 'latmax', 'lonmin', 'lonmax']:
            if subdict[el] is not None:
                subdict[el] = float(subdict[el])
        if subdict['latmin'] is None:
            subdict['latmin'] = -90.
        if subdict['latmax'] is None:
            subdict['latmax'] = 90.
        if (subdict['lonmin'] is None) and (subdict['lonmax'] is None):
            subdict['lonmin'], subdict['lonmax'] = 0.,360.
        elif (subdict['lonmin'] is None) or (subdict['lonmax'] is None):
            raise Exception('in forcing_grid_geo_selection: lonmin and lonmax must either both be None or both filled')
    
    #2nd level
    #init conditions
    check_dict(dico['init_conditions'], ['time_start'], check_none=True, prefix='in init_conditions: ')
    check_dict(dico['init_conditions'], ['hydrological_state_start'], check_none=False, prefix='in init_conditions: ')
    if not hasattr(dico['init_conditions']['time_start'], 'strftime'):
        dico['init_conditions']['time_start'] = datetime.strptime(dico['init_conditions']['time_start'], '%Y-%m-%dT%H:%M:%S.%f')
    
    if dico['exec_time'] is None:
        dico['exec_time'] = datetime.utcnow()
    elif not hasattr(dico['exec_time'], 'strftime'):
        dico['exec_time'] = datetime.strptime(dico['exec_time'], '%Y-%m-%dT%H:%M:%S.%f')
        
    #activate assimilation
    assert isinstance(dico['activate_assimilation'], bool)
    
    #perturb static_data
    check_dict(dico['perturb_static_data'], ['activate', 'folder_store', 'varying_parameters','type','mode'], check_none=True, prefix='in perturb_static_data: ')
    if not isinstance(dico['perturb_static_data']['activate'], bool):
        raise Exception('perturb_static_data:activate must be a boolean')
    if (dico['n_ensemble'] < 2) and (dico['perturb_static_data']['activate']):
        print('Cannot perturb static data with an ensemble size of 1 => setting perturb_static_data:activate to false')
        dico['perturb_static_data']['activate'] = False
    if dico['perturb_static_data']['activate']:
        dico['perturb_static_data']['varying_parameters'] = check_perturb_static_data_varying_parameters(dico['perturb_static_data']['varying_parameters'])
        var_params_save_path = '%s/varying_parameters.yaml'%dico['perturb_static_data']['folder_store']
        if os.path.exists(var_params_save_path):
            var_params_saved = check_perturb_static_data_varying_parameters(load_yaml(var_params_save_path), comparison_list=dico['perturb_static_data']['varying_parameters'])
            if len(os.listdir(dico['perturb_static_data']['folder_store'])) != dico['n_ensemble']+1:
                print(dico['n_ensemble'])
                print(os.listdir(dico['perturb_static_data']['folder_store']))
                raise Exception('saved ensemble size mismatch with current ensemble')
        if dico['perturb_static_data']['type'] not in ['saltelli', 'normal']:
            raise Exception('type must be in [saltelli, normal], type %s unknown'%dico['perturb_static_data']['type'])
        if dico['perturb_static_data']['mode'] not in ['per_cell', 'per_variable']:
            raise Exception('mode must be in [per_cell, per_variable], mode %s unknown'%dico['perturb_static_data']['mode'])   
            
    #forcing source
    assert dico['forcing_source'] in ['gsmap', 'era5']
            
    #post_processing
    check_dict(dico['post_processing'], ['science_file', 'portal_file', 'variables'], check_none=True, prefix='in post_processing: ')
    if not isinstance(dico['post_processing']['variables'], list):
        assert isinstance(dico['post_processing']['variables'], str)
        dico['post_processing']['variables'] = [dico['post_processing']['variables']]
    
    return dico
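
check_dict itself is not shown in this listing; judging from how it is called above, it verifies that a list of keys is present in a dict (and optionally that their values are not None), raising an exception with an optional message prefix otherwise. A minimal stand-in consistent with that usage (an assumption about its behaviour, not the project's actual implementation):

# hypothetical stand-in for the project's check_dict helper
def check_dict(dico, keys, check_none=True, prefix=None):
    prefix = '' if prefix is None else '%s ' % prefix
    for key in keys:
        if key not in dico:
            raise Exception('%smissing parameter %s' % (prefix, key))
        if check_none and dico[key] is None:
            raise Exception('%sparameter %s must not be None' % (prefix, key))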
Example #6
def get_scheduler_basic_main_parameters_dict():
    dico_input = load_yaml("""
#scheduler main processing input parameters
#main processing = operational analysis and forecasting using MGB-IPH code + assimilation processes
nprocs: 1
verbose: 1

#init conditions (for first init i.e. if hydrological_states_database_directory is empty)
#these conditions are stored into the hydrological states database => if the database exists and the conditions don't match, the program returns an error
init_conditions:
  time_start: '2011-01-01T00:00:00.000000'
  #to start the model with a real initial hydrological state => if None then no hydrological state is loaded
  #default configuration is to use the result of a 1200-day simulation starting 01/01/2011 and ending 15/04/2014, this helps initialize the model faster
  #hydrological_state_start: ${hyfaa_run_dir}/data_niger/default_start_hydrostate_niger_20141504.nc
  hydrological_state_start: 
scheduler_time_step: 1.
forecast_time_span: 3.
#choose an ensemble size of 1 if you do not wish to perturb rain inputs
n_ensemble: 1
exec_time: 

#operational_mode: do not activate, it is only to be used when the hyfaa scheduler processing is launched by an operational routine;
#it deletes the last retreatment_time_span days so that new assimilation data is incorporated
operational_mode: false
retreatment_time_span: 1.


#hydrological state database: used both as input and output, to determine which dates have already been treated
hydrological_states_database_directory: ${hyfaa_run_dir}/data_hydro/hydrological_states_db
  
#perturb parameters within the static_data file (only those that are functions of i_cells): n_ensemble perturbed files will be generated
perturb_static_data:
  activate: false
  type: normal
  mode: per_variable
  folder_store: ${hyfaa_run_dir}/data_hydro/mini_gtp_ensemble
  varying_parameters:
  -
    name: river_width
    error: 0.3
    min: 2.4
    max: 
  -
    name: river_depth
    error: 0.3
    min: 0.2
    max: 
  -
    name: manning_coefficient
    error: 0.3
    min: 0.03
    max: 0.25
  -
    name: main_river_slope
    error: 0.3  
    min: 0.001
    max: 

#input databases
#forcing source: gsmap, era5, (imerg, arpege : not implemented yet)
forcing_source: gsmap
forcing_grid_geo_selection:
  lonmin: -12.5
  lonmax: 16.5
  latmin: 4.
  latmax: 25.
forcing_grid_database_directory: ${hyfaa_run_dir}/data_niger/forcing_grid_db
forcing_onmesh_database_directory: ${hyfaa_run_dir}/data_niger/forcing_onmesh_db
forcing_dates_dt_max: 1.e-2
rain_uncertainty: 0.5

assimilation_database_directory: ${hyfaa_run_dir}/data_niger/assimilation_db
assimilation_sources:
- ${hyfaa_run_dir}/cmd/hysope_svs.yaml
assim_params_file: ${hyfaa_run_dir}/cmd/assimilation_parameters.yaml



#executable for MGB-IPH simulation
mgb:
  executable: mgb_iph
  static_data_file: ${hyfaa_run_dir}/data_niger/static_data_laetitia.nc
  input_model: ${hyfaa_run_dir}/data_niger/mgb_iph_model_input.yaml
model_min_time_step: 1.
  

#temporary files directory
temporary_files_directory: ${hyfaa_run_dir}/temp



#post-processing
post_processing:
  science_file: ${hyfaa_run_dir}/data_hydro/post_processing_science.nc
  portal_file: ${hyfaa_run_dir}/data_hydro/post_processing_portal.nc
  variables:
    - water_elevation_catchment
    - streamflow_catchment

""", env_vars=True, string_input=True)
    dico_input['assim_params_file'] = get_scheduler_standard_assimilation_dict()
    return dico_input
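
A minimal sketch of how these defaults might be used (the override values are hypothetical): the returned dict can be adjusted before being passed to the scheduler functions, e.g. to enlarge the ensemble or activate assimilation (a key that check_parameters expects).

# hypothetical usage sketch
dico = get_scheduler_basic_main_parameters_dict()
dico['n_ensemble'] = 20
dico['activate_assimilation'] = True
dico['perturb_static_data']['activate'] = True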
Example #7
def hyfaa_postprocessing(yaml_file_or_dict, verbose=None):
    """main scheduler processing function
    
    :param yaml_file_or_dict: python dict or yaml file
    """

    time_now = datetime.utcnow()

    #check if input is a python dict or a yaml file
    valid_obj = False
    if isinstance(yaml_file_or_dict, dict):
        valid_obj = True
        dico = yaml_file_or_dict
    else:
        #duck-typing check for a string (python3 str, python2 str or python2 unicode)
        if hasattr(yaml_file_or_dict, 'replace'):
            if os.path.exists(yaml_file_or_dict):
                valid_obj = True
                dico = load_yaml(yaml_file_or_dict, env_vars=True)
    if not valid_obj:
        raise Exception(
            'input must be a python dict or a path to an existing yaml file')

    #check parameters
    dico = check_parameters(dico)
    exec_time = dico['exec_time']
    if verbose is None:
        if 'verbose' in dico:
            verbose = dico['verbose']
    if verbose is None:
        verbose = 0

    if verbose >= 1:
        print('Launching HYFAA post-processing...')

    #get some dimensions from static_data.nc file
    with netCDF4.Dataset(dico['mgb']['static_data_file']) as ds:
        n_cells = ds.dimensions['n_cells'].size
        mesh_lon = ds.variables['longitude_center'][:]
        mesh_lat = ds.variables['latitude_center'][:]

    #post-processing science file

    #create output directory if it doesn't exist
    if not os.path.exists(
            os.path.dirname(dico['post_processing']['science_file'])):
        os.makedirs(os.path.dirname(dico['post_processing']['science_file']),
                    exist_ok=True)

    #get previous data from science file to avoid reprocessing everything if possible. WARNING: changes in variables requested will trigger full reprocessing
    dico_science_previous, old_science_file = None, None
    vars_expected = {'longitude', 'latitude', 'time'}
    for el in ['control', 'analysis']:
        vars_expected |= {
            '%s_%s' % (el1, el)
            for el1 in dico['post_processing']['variables'] +
            ['time_added_to_hydb']
        }
    if os.path.exists(dico['post_processing']['science_file']):
        #read old file and then move it to a temporary location (to be erased after new file is created)
        try:
            old_science_file = dico['post_processing'][
                'science_file'] + '_old%s' % time_now.strftime('%Y%m%d%H%M%S')
            shutil.move(dico['post_processing']['science_file'],
                        old_science_file)
            with netCDF4.Dataset(old_science_file, mode='r') as ds_science_old:
                assert ds_science_old.dimensions['n_cells'].size == n_cells, \
                    'n_cells mismatch with old science file: %d vs %d' % (ds_science_old.dimensions['n_cells'].size, n_cells)
                assert ds_science_old.dimensions['n_ensemble'].size == dico['n_ensemble'], \
                    'n_ensemble mismatch with old science file: %d vs %d' % (ds_science_old.dimensions['n_ensemble'].size, dico['n_ensemble'])
                assert np.all(
                    ds_science_old.variables['longitude'][:] == mesh_lon
                ) and np.all(ds_science_old.variables['latitude'][:] ==
                             mesh_lat), 'lonlat mismatch with old science file'
            ds_science_old = netCDF4.Dataset(old_science_file, mode='r')
        except:
            raise
            #if input file is unreadable or variables don't match, reprocess everything
            os.unlink(old_science_file)
            old_science_file = None

    #get main information from hydrological state database
    with HydroStates_DBManager(dico['hydrological_states_database_directory'],
                               mode='r',
                               verbose=verbose) as hydrostates_db:

        file_info = dict()
        for index, row in hydrostates_db.read_as_pandas_dataframe(
                "SELECT * FROM FILEINFO WHERE file_status=?",
                params=['added']).iterrows():
            date_loc = hydrostates_db.str2date(row['date_data'])
            if date_loc not in file_info:
                file_info[date_loc] = dict()
            file_info[date_loc][row['type']] = {
                'file_path':
                hydrostates_db.get_full_path(row),
                'date_added_to_db':
                hydrostates_db.str2date(row['date_added_to_db'])
            }
        time_new = sorted(list(file_info.keys()))
        n_time = len(time_new)

    #pre-allocate array that will load each variable for each time step
    data_loc = np.ma.masked_invalid(
        np.zeros((n_cells, dico['n_ensemble']), dtype=np.float64))

    #open science and portal files and add information to them, either from the old science file (if no modifications since last time) or from the hydrological state database
    with netCDF4.Dataset(dico['post_processing']['science_file'],
                         mode='w') as ds_science, netCDF4.Dataset(
                             dico['post_processing']['portal_file'],
                             mode='w') as ds_portal:

        #get existing dates and date_added_to_db in old science file
        if old_science_file is not None:
            dict_times_old = {
                'time_added_to_hydb_%s' % type_loc:
                ds_science_old.variables['time_added_to_hydb_%s' % type_loc][:]
                for type_loc in ['control', 'analysis']
            }
            dict_times_old['time'] = ds_science_old.variables['time'][:]

        #initialize science file variables
        ds_science.createDimension('n_time', n_time)
        ds_science.createDimension('n_cells', n_cells)
        ds_science.createDimension('n_ensemble', dico['n_ensemble'])
        ds_science.createVariable('time', np.float64, ['n_time'])
        ds_science.variables['time'][:] = np.ma.masked_invalid(
            np.array([datetime_to_julianday(el) for el in time_new],
                     dtype=np.float64))
        ds_science.createVariable('longitude', np.float64, ['n_cells'])
        ds_science.variables['longitude'][:] = np.ma.masked_invalid(mesh_lon)
        ds_science.createVariable('latitude', np.float64, ['n_cells'])
        ds_science.variables['latitude'][:] = np.ma.masked_invalid(mesh_lat)
        for type_loc in ['control', 'analysis']:
            var_name = 'time_added_to_hydb_%s' % type_loc
            ds_science.createVariable(var_name, np.float64, ['n_time'])
            ds_science.variables[var_name].setncattr('longname', var_name)
            for var_name in [
                    '%s_%s' % (elem, type_loc)
                    for elem in dico['post_processing']['variables']
            ]:
                ds_science.createVariable(var_name, np.float64,
                                          ['n_time', 'n_cells', 'n_ensemble'])
                ds_science.variables[var_name].setncattr('longname', var_name)
        ds_science.setncattr('date_created',
                             time_now.strftime('%Y-%m-%dT%H:%M:%S'))

        #initialize portal file variables
        ds_portal.createDimension('n_time', n_time)
        ds_portal.createDimension('n_cells', n_cells)
        ds_portal.createDimension('n_ensemble', dico['n_ensemble'])
        ds_portal.createVariable('time', np.float64, ['n_time'])
        ds_portal.variables['time'][:] = np.ma.masked_invalid(
            np.array([datetime_to_julianday(el) for el in time_new],
                     dtype=np.float64))
        ds_portal.variables['time'].setncattr(
            'units', 'days since 1950-01-01 00:00:00.0')
        ds_portal.variables['time'].setncattr('long_name',
                                              'time (days since 1950-01-01)')
        ds_portal.variables['time'].setncattr('standard_name', 'time')
        ds_portal.variables['time'].setncattr('calendar', 'gregorian')
        ds_portal.createVariable('longitude', np.float64, ['n_cells'])
        ds_portal.variables['longitude'][:] = np.ma.masked_invalid(mesh_lon)
        ds_portal.variables['longitude'].setncattr('units', "degrees_east")
        ds_portal.variables['longitude'].setncattr('long_name', "longitude")
        ds_portal.variables['longitude'].setncattr('standard_name',
                                                   "longitude")
        ds_portal.variables['longitude'].setncattr(
            'comments', "East longitude relative to Greenwich meridian")
        ds_portal.createVariable('latitude', np.float64, ['n_cells'])
        ds_portal.variables['latitude'][:] = np.ma.masked_invalid(mesh_lat)
        ds_portal.variables['latitude'].setncattr('units', "degrees_north")
        ds_portal.variables['latitude'].setncattr('long_name', "latitude")
        ds_portal.variables['latitude'].setncattr('standard_name', "latitude")
        ds_portal.variables['latitude'].setncattr(
            'comments',
            "Positive latitude is North latitude, negative latitude is South latitude."
        )
        var_name = 'time_added_to_hydb'
        ds_portal.createVariable(var_name, np.float64, ['n_time'])
        ds_portal.variables[var_name].setncattr(
            'units', 'days since 1950-01-01 00:00:00.0')
        ds_portal.variables[var_name].setncattr(
            'long_name',
            'time added to hydrological database (days since 1950-01-01)')
        ds_portal.variables[var_name].setncattr('standard_name', var_name)
        ds_portal.variables[var_name].setncattr('calendar', 'gregorian')
        var_name = 'is_analysis'
        ds_portal.createVariable(var_name, np.uint8, ['n_time'])
        ds_portal.variables[var_name].setncattr('units', '0,1')
        ds_portal.variables[var_name].setncattr('long_name',
                                                'control (0) or analysis (1)')
        ds_portal.variables[var_name].setncattr('standard_name', var_name)

        for var_name in dico['post_processing']['variables']:
            if dico['n_ensemble'] > 1:
                stat_types_portal = ['mean', 'median', 'std', 'mad']
            else:
                stat_types_portal = ['mean']
            for stat_type in stat_types_portal:
                ds_portal.createVariable(var_name + '_' + stat_type,
                                         np.float64, ['n_time', 'n_cells'])
                if var_name in vars_info_portal:
                    ds_portal.variables[var_name + '_' + stat_type].setncattr(
                        'units',
                        vars_info_portal[var_name]['info_dict']['units'])
                    ds_portal.variables[var_name + '_' + stat_type].setncattr(
                        'long_name',
                        vars_info_portal[var_name]['info_dict']['long_name'] +
                        ' (%s)' % stat_type)
                    ds_portal.variables[var_name + '_' + stat_type].setncattr(
                        'standard_name', vars_info_portal[var_name]
                        ['info_dict']['standard_name'] + '_%s' % stat_type)
                    if stat_type == 'mad':
                        ds_portal.variables[var_name + '_' +
                                            stat_type].setncattr(
                                                'comments',
                                                vars_info_portal[var_name]
                                                ['info_dict']['comments'] +
                                                ' (contains sqrt(2)*%s)' %
                                                stat_type)
                    else:
                        ds_portal.variables[var_name + '_' +
                                            stat_type].setncattr(
                                                'comments',
                                                vars_info_portal[var_name]
                                                ['info_dict']['comments'] +
                                                ' (%s)' % stat_type)
                else:
                    ds_portal.variables[var_name + '_' + stat_type].setncattr(
                        'long_name', var_name + ' (%s)' % stat_type)
                    ds_portal.variables[var_name + '_' + stat_type].setncattr(
                        'standard_name', var_name + '_%s' % stat_type)
        ds_portal.setncattr('date_created',
                            time_now.strftime('%Y-%m-%dT%H:%M:%S'))
        ds_portal.setncattr('n_ensemble', dico['n_ensemble'])

        #iterate over each time step and read information either from the old science file (if unchanged since last time), or from hydrological state database
        it_decile = 1
        tstart_decile = datetime.utcnow()
        for it, time_loc in enumerate(time_new):

            ratio_done = (it + 1) * 1. / (len(time_new) * 1.)
            if ratio_done * 10. >= it_decile:
                print('%d%% done after %s, %s time remaining (estimation)' %
                      (ratio_done * 100., datetime.utcnow() - tstart_decile,
                       (datetime.utcnow() - tstart_decile) *
                       (1. - ratio_done) / ratio_done))
                it_decile = max(it_decile + 1, int(np.floor(ratio_done * 10.)))

            time_loc_jday = datetime_to_julianday(time_loc)

            best_type_loc = 'control'
            if 'analysis' in file_info[time_loc]:
                best_type_loc = 'analysis'

            for type_loc in ['control', 'analysis']:

                #time_loc must be at least in control results, if it is not in analysis step then skip (analysis data for this time step will be masked in science and portal files)
                if type_loc == 'control':
                    assert type_loc in file_info[time_loc]
                elif type_loc not in file_info[time_loc]:
                    continue

                date_added_loc = datetime_to_julianday(
                    file_info[time_loc][type_loc]['date_added_to_db'])

                #get matching id in old science file
                if old_science_file is not None:
                    ids_match = np.where(
                        np.logical_and(
                            dict_times_old['time'] == time_loc_jday,
                            dict_times_old['time_added_to_hydb_%s' %
                                           type_loc] == date_added_loc))[0]
                    if len(ids_match) == 0:
                        it0 = None
                    elif len(ids_match) == 1:
                        it0 = ids_match[0]
                    else:
                        raise Exception(
                            '%d matches found in old science file, this should not happen'
                            % len(ids_match))
                else:
                    it0 = None

                if it0 is None:
                    ds_in = netCDF4.Dataset(file_info[time_loc][type_loc]
                                            ['file_path'])  #open database file
                for elem in dico['post_processing']['variables']:
                    var_name = '%s_%s' % (elem, type_loc)
                    if it0 is None:
                        for i_ensemble in range(dico['n_ensemble']):
                            data_loc[:, i_ensemble] = np.ma.masked_invalid(
                                ds_in.variables['%s_%d' %
                                                (elem, i_ensemble)][:])
                    else:
                        data_loc = ds_science_old.variables[var_name][
                            it0, :, :]
                    ds_science.variables[var_name][it, :, :] = data_loc
                    if type_loc == best_type_loc:
                        if elem in vars_info_portal:
                            round_value_loc = vars_info_portal[elem]['round']
                        else:
                            round_value_loc = portal_default_round
                        ds_portal.variables[elem + '_mean'][it, :] = np.round(
                            np.ma.mean(data_loc, axis=1), round_value_loc)
                        if dico['n_ensemble'] > 1:
                            ds_portal.variables[elem +
                                                '_median'][it, :] = np.round(
                                                    np.ma.median(data_loc,
                                                                 axis=1),
                                                    round_value_loc)
                            ds_portal.variables[elem +
                                                '_std'][it, :] = np.round(
                                                    np.ma.std(data_loc,
                                                              axis=1),
                                                    round_value_loc)
                            ds_portal.variables[elem + '_mad'][
                                it, :] = np.round(
                                    median_abs_deviation_compatible_old_scipy(
                                        data_loc), round_value_loc)

                if it0 is None:
                    ds_in.close()

                ds_science.variables['time_added_to_hydb_%s' %
                                     type_loc][it] = date_added_loc
                if type_loc == best_type_loc:
                    ds_portal.variables['time_added_to_hydb'][
                        it] = date_added_loc
                    if type_loc == 'analysis':
                        ds_portal.variables['is_analysis'][it] = np.uint8(1)
                    else:
                        ds_portal.variables['is_analysis'][it] = np.uint8(0)

    #compress portal file.
    #NB: no need to compress the science file, it would not yield any significant size gains
    nc_compress(dico['post_processing']['portal_file'])

    if old_science_file is not None:
        try:
            ds_science_old.close()
        except:
            pass
        os.unlink(old_science_file)

    if verbose >= 1:
        print('  Post processing complete in %s' %
              (datetime.utcnow() - time_now))
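
The portal file stores per-cell ensemble statistics: the mean, and for ensembles larger than 1 also the median, standard deviation and a MAD-based spread computed by the project's median_abs_deviation_compatible_old_scipy helper. A standalone sketch of the masked-array statistics along the ensemble axis as used above (toy data; the MAD helper itself is not reproduced here):

# toy illustration of the per-cell ensemble statistics (axis 1 = ensemble members)
import numpy as np
data_loc = np.ma.masked_invalid(np.random.rand(5, 8))  # 5 cells, 8 ensemble members
mean_per_cell = np.round(np.ma.mean(data_loc, axis=1), 3)
median_per_cell = np.round(np.ma.median(data_loc, axis=1), 3)
std_per_cell = np.round(np.ma.std(data_loc, axis=1), 3)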
Example #8
def hyfaa_preprocessing_forcing(yaml_file_or_dict,
                                gsmap_folder_local=None,
                                verbose=None):
    """main scheduler processing function
    
    :param yaml_file_or_dict: python dict or yaml file
    """

    #check if input is a python dict or a yaml file
    valid_obj = False
    if isinstance(yaml_file_or_dict, dict):
        valid_obj = True
        dico = yaml_file_or_dict
    else:
        #duck-typing check for a string (python3 str, python2 str or python2 unicode)
        if hasattr(yaml_file_or_dict, 'replace'):
            if os.path.exists(yaml_file_or_dict):
                valid_obj = True
                dico = load_yaml(yaml_file_or_dict, env_vars=True)
    if not valid_obj:
        raise Exception(
            'input must be a python dict or a path to an existing yaml file')

    #check parameters
    dico = check_parameters(dico)
    exec_time = dico['exec_time']
    if verbose is None:
        if 'verbose' in dico:
            verbose = dico['verbose']
    if verbose is None:
        verbose = 1

    #load module for forcing data download
    if dico['forcing_source'] == 'gsmap':
        from hyfaa.database.forcing.gsmap.gsmap_download_and_interpolate_module import retrieve_forcing_data, interpolate_forcing_data
    elif dico['forcing_source'] == 'era5':
        from hyfaa.database.forcing.era5.era5_download_and_interpolate_module import retrieve_forcing_data, interpolate_forcing_data
    elif dico['forcing_source'] in ['imerg', 'arpege']:
        raise NotImplementedError(
            'retrieval from forcing data source %s has not been implemented yet'
            % dico['forcing_source'])
    else:
        raise Exception('forcing data source %s unknown' %
                        dico['forcing_source'])

    mesh_lon, mesh_lat = get_mesh_cell_centers_from_static_data_file(
        dico['mgb']['static_data_file'])

    if verbose >= 2:
        print('Forcing pre-proc: Parameters and mesh coordinates loaded...')

    #make temporary folder and forcing database folders if they do not exist
    for fol in [
            dico['forcing_grid_database_directory'],
            dico['forcing_onmesh_database_directory'],
            dico['temporary_files_directory']
    ]:
        os.makedirs(fol, exist_ok=True)
    main_temp_dir = tempfile.mkdtemp(dir=dico['temporary_files_directory'],
                                     prefix='temphyfaaprefor')

    min_time = dico['init_conditions']['time_start']
    min_day = datetime(min_time.year, min_time.month, min_time.day)
    max_time = dico['exec_time'] + timedelta(dico['forecast_time_span'] + 1.)
    max_day = datetime(max_time.year, max_time.month,
                       max_time.day) + timedelta(1)
    ndays = (max_day - min_day).days
    expected_forcing_dates = min_time + np.array(
        [timedelta(day_loc) for day_loc in range(ndays + 1)])

    if verbose >= 2:
        print('Forcing pre-proc: Opening forcing grid database...')

    #update forcing grid DB
    with ForcingGrid_DBManager(dico['forcing_grid_database_directory'],
                               mode='w',
                               verbose=0) as gr_db:

        gr_db_dates, _ = gr_db.get_dates(date_min=expected_forcing_dates[0],
                                         date_max=expected_forcing_dates[-1])
        missing_dates = set(expected_forcing_dates) - set(gr_db_dates)

        if verbose >= 1:
            print(
                'Forcing pre-proc: Downloading data for forcing grid database')

        for file_info in retrieve_forcing_data(missing_dates, geo_selection=dico['forcing_grid_geo_selection'], gsmap_folder_local=gsmap_folder_local, \
            nprocs=dico['nprocs'], temp_folder_base=main_temp_dir, verbose=verbose):
            file_info['data_type'] = 'rain'
            gr_db.add(file_info)
            if verbose >= 1:
                print('Forcing pre-processing: Added date %s to grid DB' %
                      file_info['date_data'].strftime('%Y-%m-%dT%H:%M:%S'))
        gr_db_dates, gr_files_info = gr_db.get_dates(
            date_min=expected_forcing_dates[0],
            date_max=expected_forcing_dates[-1])
        gr_db_data_path = gr_db.get_data_path()

    if verbose >= 2:
        print('Forcing pre-proc: Opening forcing on-mesh database...')

    #update forcing mesh DB
    with ForcingGrid_DBManager(dico['forcing_onmesh_database_directory'],
                               mode='w',
                               verbose=0) as mesh_db:

        mesh_db_dates, mesh_db_info = mesh_db.get_dates(
            date_min=expected_forcing_dates[0],
            date_max=expected_forcing_dates[-1])
        msh_db_dates_set = set(mesh_db_dates)
        mesh_db_dates = np.array(mesh_db_dates)
        gr_db_dates = np.array(gr_db_dates)

        if verbose >= 1:
            print(
                'Forcing pre-proc: Analysing tasks to complete on-mesh DB from grid DB...'
            )

        files_info_interp_mesh = []
        for ii in range(len(gr_db_dates)):
            file_info_loc = {'file_path': os.path.join(gr_db_data_path, gr_files_info['file_path'][ii]), 'data_type': 'rain', 'date_data': gr_db_dates[ii], \
                    'product_type': gr_files_info['product_type'][ii], 'grid_status': gr_files_info['grid_status'][ii]}
            if gr_db_dates[ii] not in msh_db_dates_set:
                files_info_interp_mesh.append(file_info_loc)
                continue
            ids_match = np.where(mesh_db_dates == gr_db_dates[ii])[0]
            assert len(ids_match
                       ) >= 1, 'date not found when it should have been found'
            exact_match = False
            for i0 in ids_match:
                if all([
                        file_info_loc[el] == mesh_db_info[el][i0]
                        for el in ['data_type', 'product_type', 'grid_status']
                ]):
                    exact_match = True
                    break
            if not exact_match:
                files_info_interp_mesh.append(file_info_loc)

        if verbose >= 1:
            print(
                'Forcing pre-proc: Interpolating %d grid files to mesh files...'
                % (len(files_info_interp_mesh)))

        for file_info in interpolate_forcing_data(files_info_interp_mesh, mesh_lon, mesh_lat, nprocs=dico['nprocs'], \
            temp_folder_base=main_temp_dir, verbose=verbose):

            mesh_db.add(file_info)
            if verbose >= 1:
                print('Forcing pre-proc: Added file %s to on-mesh DB' %
                      file_info['file_path'])
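
Taken together, these examples suggest a typical scheduler run (the calling order below is an assumption inferred from the function names and from which databases each step reads and writes, not something stated in the code): forcing pre-processing fills the grid and on-mesh forcing databases, assimilation pre-processing fills the assimilation database, the main processing step runs the MGB-IPH ensemble with optional analysis, and post-processing builds the science and portal NetCDF files.

# hypothetical end-to-end driver sketch
dico = get_scheduler_basic_main_parameters_dict()
hyfaa_preprocessing_forcing(dico)
hyfaa_preprocessing_assimilation(dico)
hyfaa_processing(dico)
hyfaa_postprocessing(dico)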