def create_dataset_group(spec):
    """Create a dataset group from its specification."""
    group = []
    for dataset_name, selected_variables in spec.items():
        # Select the relevant dataset.
        matching_datasets = [
            d for d in original_datasets if d.__name__ == dataset_name
        ]
        if len(matching_datasets) != 1:
            raise ValueError(
                f"Expected 1 matching dataset for '{dataset_name}', "
                f"got {matching_datasets}."
            )
        # Instantiate the matching Dataset.
        matching_dataset = matching_datasets[0]()
        if selected_variables:
            # There are variables to select.
            group.append(
                Datasets(matching_dataset)
                .select_variables(selected_variables)
                .dataset
            )
        else:
            # There is nothing to select.
            group.append(matching_dataset)
    return group
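# A minimal sketch of the `spec` mapping consumed by `create_dataset_group`
# (illustrative only; it assumes `original_datasets` is available in the
# enclosing scope, as in `_nn_basis_func` below). Keys are Dataset class
# names, values are the variables to select; an empty tuple keeps the full
# dataset:
# >>> example_spec = {
# ...     "ERA5_DryDayPeriod": ("Dry Day Period",),
# ...     "AvitabileThurnerAGB": (),
# ... }
# >>> datasets_to_shift = create_dataset_group(example_spec)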
def get_data(
    shift_months=[1, 3, 6, 9, 12, 18, 24],
    selection_variables=None,
    masks=None,
    n_months=n_months,
):
    target_variable = "GFED4 BA"
    # Variables required for the above.
    required_variables = [target_variable]

    # Dataset selection.
    selection_datasets = [
        AvitabileThurnerAGB(),
        # Copernicus_SWI(),
        ERA5_Temperature(),
        ESA_CCI_Landcover_PFT(),
        GFEDv4(),
        HYDE(),
        WWLLN(),
    ]
    # Datasets subject to temporal interpolation.
    temporal_interp_datasets = [
        Datasets(Copernicus_SWI()).select_variables(("SWI(1)",)).dataset
    ]
    # Datasets subject to interpolation and shifting.
    shift_and_interp_datasets = [
        Datasets(MOD15A2H_LAI_fPAR()).select_variables(("FAPAR", "LAI")).dataset,
        Datasets(VODCA()).select_variables(("VOD Ku-band",)).dataset,
        Datasets(GlobFluo_SIF()).select_variables(("SIF",)).dataset,
    ]
    # These datasets may be shifted.
    datasets_to_shift = [
        Datasets(ERA5_DryDayPeriod()).select_variables(("Dry Day Period",)).dataset
    ]

    # Determine shared temporal extent of the data.
    min_time, max_time = dataset_times(
        selection_datasets
        + temporal_interp_datasets
        + shift_and_interp_datasets
        + datasets_to_shift
    )[:2]
    interp_min_time, interp_max_time = dataset_times(
        temporal_interp_datasets + shift_and_interp_datasets
    )[:2]
    target_timespan = (
        max(min_time, interp_min_time + relativedelta(months=+n_months)),
        min(max_time, interp_max_time - relativedelta(months=+n_months)),
    )

    # Carry out the temporal NN interpolation.
    for i, dataset in enumerate(temporal_interp_datasets):
        temporal_interp_datasets[i] = dataset.get_temporally_interpolated_dataset(
            target_timespan, n_months
        )
    for i, dataset in enumerate(shift_and_interp_datasets):
        shift_and_interp_datasets[i] = dataset.get_temporally_interpolated_dataset(
            target_timespan, n_months
        )

    datasets_to_shift.extend(shift_and_interp_datasets)
    selection_datasets += datasets_to_shift
    selection_datasets += temporal_interp_datasets

    if shift_months is not None:
        for shift in shift_months:
            for shift_dataset in datasets_to_shift:
                selection_datasets.append(
                    shift_dataset.get_temporally_shifted_dataset(
                        months=-shift, deep=False
                    )
                )

    if selection_variables is None:
        selection_variables = [
            "AGB Tree",
            "Diurnal Temp Range",
            "Dry Day Period",
            f"FAPAR {n_months}NN",
            f"LAI {n_months}NN",
            "Max Temp",
            f"SIF {n_months}NN",
            f"SWI(1) {n_months}NN",
            "ShrubAll",
            "TreeAll",
            f"VOD Ku-band {n_months}NN",
            "lightning",
            "pftCrop",
            "pftHerb",
            "popd",
        ]
        if shift_months is not None:
            for shift in shift_months:
                selection_variables.extend(
                    [
                        f"LAI {n_months}NN {-shift} Month",
                        f"FAPAR {n_months}NN {-shift} Month",
                        f"Dry Day Period {-shift} Month",
                        f"VOD Ku-band {n_months}NN {-shift} Month",
                        f"SIF {n_months}NN {-shift} Month",
                    ]
                )

    selection_variables = list(set(selection_variables).union(required_variables))
    selection = Datasets(selection_datasets).select_variables(selection_variables)

    (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    ) = data_processing(
        selection,
        which="climatology",
        transformations={},
        deletions=[],
        use_lat_mask=False,
        use_fire_mask=False,
        target_variable=target_variable,
        masks=masks,
    )
    return (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    )
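# Worked example of the variable-naming scheme used above (values are
# illustrative; `n_months` is whatever the module-level default is): the
# NN-filled variable carries an "{n_months}NN" suffix, and each temporal
# shift appends "{-shift} Month":
# >>> n_months, shift = 3, 12
# >>> f"SIF {n_months}NN {-shift} Month"
# 'SIF 3NN -12 Month'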
def get_data(
    shift_months=[1, 3, 6, 9, 12, 18, 24], selection_variables=None, masks=None
):
    target_variable = "GFED4 BA"
    # Variables required for the above.
    required_variables = [target_variable]

    # Dataset selection.
    selection_datasets = [
        AvitabileThurnerAGB(),
        Copernicus_SWI(),
        ERA5_Temperature(),
        ESA_CCI_Landcover_PFT(),
        GFEDv4(),
        HYDE(),
        WWLLN(),
    ]
    # These datasets will potentially be shifted.
    datasets_to_shift = [
        ERA5_DryDayPeriod(),
        MOD15A2H_LAI_fPAR(),
        VODCA(),
        GlobFluo_SIF(),
    ]
    selection_datasets += datasets_to_shift

    if shift_months is not None:
        for shift in shift_months:
            for shift_dataset in datasets_to_shift:
                selection_datasets.append(
                    shift_dataset.get_temporally_shifted_dataset(
                        months=-shift, deep=False
                    )
                )

    if selection_variables is None:
        selection_variables = [
            "AGB Tree",
            "Diurnal Temp Range",
            "Dry Day Period",
            "FAPAR",
            "LAI",
            "Max Temp",
            "SIF",
            "SWI(1)",
            "ShrubAll",
            "TreeAll",
            "VOD Ku-band",
            "lightning",
            "pftCrop",
            "pftHerb",
            "popd",
        ]
        if shift_months is not None:
            for shift in shift_months:
                selection_variables.extend(
                    [
                        f"LAI {-shift} Month",
                        f"FAPAR {-shift} Month",
                        f"Dry Day Period {-shift} Month",
                        f"VOD Ku-band {-shift} Month",
                        f"SIF {-shift} Month",
                    ]
                )

    selection_variables = list(set(selection_variables).union(required_variables))
    selection = Datasets(selection_datasets).select_variables(selection_variables)

    (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    ) = data_processing(
        selection,
        which="climatology",
        transformations={},
        deletions=[],
        use_lat_mask=False,
        use_fire_mask=False,
        target_variable=target_variable,
        masks=masks,
    )
    return (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    )
def _nn_basis_func(
    *,
    check_max_time,
    check_min_time,
    check_shift_min_time,
    exp_features,
    normal_mask_ignore_n,
    shift_mask_ignore_n,
    max_time=None,
    min_time=None,
    n_months,
    normal_n_time,
    shift_n_time,
    spec_datasets_to_shift,
    spec_selection_datasets,
    spec_shift_and_interp_datasets,
    spec_temporal_interp_datasets,
    target_var,
    which,
    all_shifted_variables=variable.shifted_variables,
    # Store this initially, since this is changed as new datasets (e.g. filled
    # datasets) are derived from the original datasets.
    original_datasets=tuple(sorted(Dataset.datasets, key=attrgetter("__name__"))),
):
    target_variable = target_var.name
    required_variables = [target_variable]

    shifted_variables = {var.parent for var in exp_features if var.shift != 0}
    assert all(
        shifted_var in all_shifted_variables for shifted_var in shifted_variables
    )

    shift_months = [
        shift for shift in sorted({var.shift for var in exp_features}) if shift != 0
    ]

    def year_month_datetime(dt):
        """Use only year and month information to construct a datetime."""
        return datetime(dt.year, dt.month, 1)

    def create_dataset_group(spec):
        """Create a dataset group from its specification."""
        group = []
        for dataset_name, selected_variables in spec.items():
            # Select the relevant dataset.
            matching_datasets = [
                d for d in original_datasets if d.__name__ == dataset_name
            ]
            if len(matching_datasets) != 1:
                raise ValueError(
                    f"Expected 1 matching dataset for '{dataset_name}', "
                    f"got {matching_datasets}."
                )
            # Instantiate the matching Dataset.
            matching_dataset = matching_datasets[0]()
            if selected_variables:
                # There are variables to select.
                group.append(
                    Datasets(matching_dataset)
                    .select_variables(selected_variables)
                    .dataset
                )
            else:
                # There is nothing to select.
                group.append(matching_dataset)
        return group

    selection_datasets = create_dataset_group(spec_selection_datasets)
    temporal_interp_datasets = create_dataset_group(spec_temporal_interp_datasets)
    shift_and_interp_datasets = create_dataset_group(spec_shift_and_interp_datasets)
    datasets_to_shift = create_dataset_group(spec_datasets_to_shift)

    all_datasets = (
        selection_datasets
        + temporal_interp_datasets
        + shift_and_interp_datasets
        + datasets_to_shift
    )

    # Determine shared temporal extent of the data.
    _min_time, _max_time, _times_df = dataset_times(all_datasets)
    print(_times_df)

    if min_time is None:
        min_time = _min_time
    if max_time is None:
        max_time = _max_time
    assert min_time >= _min_time
    assert max_time <= _max_time

    if shift_months:
        _shift_min_time = year_month_datetime(min_time) - relativedelta(
            months=shift_months[-1]
        )
        shift_min_time = PartialDateTime(
            year=_shift_min_time.year, month=_shift_min_time.month
        )
    else:
        shift_min_time = min_time

    # Sanity check.
    assert min_time == check_min_time
    assert shift_min_time == check_shift_min_time
    assert max_time == check_max_time

    for dataset in datasets_to_shift:
        # Apply the longer time limit to the datasets to be shifted.
        dataset.limit_months(shift_min_time, max_time)
        for cube in dataset:
            assert cube.shape[0] == shift_n_time

    for dataset in selection_datasets:
        # Apply the time limit.
        dataset.limit_months(min_time, max_time)
        if dataset.frequency == "monthly":
            for cube in dataset:
                assert cube.shape[0] == normal_n_time

    for dataset in shift_and_interp_datasets:
        # Apply the longer time limit to the datasets to be shifted, extended
        # by `n_months` at either end to accommodate the interpolation.
        dataset.limit_months(
            year_month_datetime(shift_min_time) - relativedelta(months=+n_months),
            year_month_datetime(max_time) + relativedelta(months=+n_months),
        )
        for cube in dataset:
            assert cube.shape[0] == shift_n_time + 2 * n_months

    for dataset in temporal_interp_datasets:
        # Apply the time limit, extended by `n_months` at either end.
        dataset.limit_months(
            year_month_datetime(min_time) - relativedelta(months=+n_months),
            year_month_datetime(max_time) + relativedelta(months=+n_months),
        )
        if dataset.frequency == "monthly":
            for cube in dataset:
                assert cube.shape[0] == normal_n_time + 2 * n_months

    for dataset in all_datasets:
        # Regrid each dataset to the common grid.
        dataset.regrid()

    # Calculate and apply the shared mask.
    total_masks = []

    for dataset in temporal_interp_datasets:
        for cube in dataset.cubes:
            # Ignore areas that are always masked, e.g. water.
            ignore_mask = np.all(cube.data.mask, axis=0)
            # Also ignore those areas with low data availability.
            ignore_mask |= np.sum(cube.data.mask, axis=0) > normal_mask_ignore_n
            total_masks.append(ignore_mask)

    for dataset in shift_and_interp_datasets:
        for cube in dataset.cubes:
            # Ignore areas that are always masked, e.g. water.
            ignore_mask = np.all(cube.data.mask, axis=0)
            # Also ignore those areas with low data availability.
            ignore_mask |= np.sum(cube.data.mask, axis=0) > shift_mask_ignore_n
            total_masks.append(ignore_mask)

    combined_mask = reduce(np.logical_or, total_masks)

    # Apply the mask to all datasets.
    for dataset in all_datasets:
        dataset.apply_masks(combined_mask)

    # Carry out the nearest-neighbour (NN) filling.
    for i, dataset in enumerate(temporal_interp_datasets):
        temporal_interp_datasets[i] = dataset.get_temporally_interpolated_dataset(
            target_timespan=tuple(map(year_month_datetime, (min_time, max_time))),
            n_months=n_months,
            verbose=True,
        )
    for i, dataset in enumerate(shift_and_interp_datasets):
        shift_and_interp_datasets[i] = dataset.get_temporally_interpolated_dataset(
            target_timespan=tuple(
                map(year_month_datetime, (shift_min_time, max_time))
            ),
            n_months=n_months,
            verbose=True,
        )

    datasets_to_shift.extend(shift_and_interp_datasets)
    selection_datasets += datasets_to_shift
    selection_datasets += temporal_interp_datasets

    if shift_months is not None:
        for shift in shift_months:
            for shift_dataset in datasets_to_shift:
                # Remove any temporal coordinates other than 'time' here if
                # needed, since these would otherwise become misaligned when
                # the data is shifted below.
                for cube in shift_dataset:
                    for prune_coord in ("month_number", "year"):
                        if cube.coords(prune_coord):
                            cube.remove_coord(prune_coord)
                selection_datasets.append(
                    shift_dataset.get_temporally_shifted_dataset(
                        months=-shift, deep=False
                    )
                )

    selection_variables = list(
        set(map(lambda v: v.get_standard().raw_nn_filled, exp_features)).union(
            required_variables
        )
    )
    selection = Datasets(selection_datasets).select_variables(selection_variables)

    (
        endog_data,
        exog_data,
        master_mask,
        _,  # We don't need the `filled_datasets`.
        masked_datasets,
        land_mask,
    ) = data_processing(
        selection,
        which=which,
        transformations={},
        deletions=[],
        use_lat_mask=False,
        use_fire_mask=False,
        target_variable=target_variable,
        masks=None,
    )

    def _pandas_string_labels_to_variables(
        x,
        target_var,
        all_features=selected_features[Experiment.ALL],
    ):
        """Transform Series names or DataFrame column labels to variable.Variable instances."""
        all_variables = tuple(
            # Get the instantaneous variables corresponding to all variables.
            list(map(methodcaller("get_standard"), all_features)) + [target_var]
        )
        all_variable_names = tuple(map(attrgetter("raw_nn_filled"), all_variables))
        if isinstance(x, pd.Series):
            x.name = all_variables[all_variable_names.index(x.name)]
        elif isinstance(x, pd.DataFrame):
            x.columns = [
                all_variables[all_variable_names.index(c)] for c in x.columns
            ]
        else:
            raise TypeError(
                f"Expected either a pandas.Series or pandas.DataFrame. Got '{x}'."
            )

    _pandas_string_labels_to_variables(endog_data, target_var)
    _pandas_string_labels_to_variables(exog_data, target_var)

    assert exog_data.shape[1] == len(exp_features)

    # Calculate anomalies for large lags.
    # NOTE: Modifies `exog_data` in place.
    to_delete = []
    for var in exog_data:
        if var.shift < 12:
            continue
        new_var = var.get_offset()
        comp_var = variable.get_matching(
            exog_data.columns, name=new_var.name, shift=new_var.comp_shift
        )
        print(f"{var} - {comp_var} -> {new_var}")
        exog_data[new_var] = exog_data[var] - exog_data[comp_var]
        to_delete.append(var)

    for column in to_delete:
        del exog_data[column]

    # Check again.
    assert exog_data.shape[1] == len(exp_features)

    return (
        endog_data,
        exog_data,
        master_mask,
        masked_datasets,
        land_mask,
        set(exog_data.columns),
    )
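# A self-contained sketch of the large-lag anomaly step above, using plain
# string labels on a toy DataFrame (the `variable.Variable` machinery and the
# `get_offset`/`get_matching` lookups are omitted; column names are
# hypothetical): the anomaly column replaces the raw large-lag column.
# >>> import pandas as pd
# >>> df = pd.DataFrame({"X -12 Month": [2.0, 3.0], "X -1 Month": [1.0, 2.5]})
# >>> df["X -12 Month Anomaly"] = df["X -12 Month"] - df["X -1 Month"]
# >>> del df["X -12 Month"]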
def get_data(
    shift_months=[1, 3, 6, 9, 12, 18, 24],
    selection_variables=None,
    masks=None,
    n_months=n_months,
):
    target_variable = "GFED4 BA"
    # Variables required for the above.
    required_variables = [target_variable]

    # Dataset selection.
    selection_datasets = [
        AvitabileThurnerAGB(),
        ERA5_Temperature(),
        ESA_CCI_Landcover_PFT(),
        GFEDv4(),
        HYDE(),
        WWLLN(),
    ]
    # Datasets subject to temporal interpolation (filling).
    temporal_interp_datasets = [
        Datasets(Copernicus_SWI()).select_variables(("SWI(1)",)).dataset
    ]
    # Datasets subject to temporal interpolation and shifting.
    shift_and_interp_datasets = [
        Datasets(MOD15A2H_LAI_fPAR()).select_variables(("FAPAR", "LAI")).dataset,
        Datasets(VODCA()).select_variables(("VOD Ku-band",)).dataset,
        Datasets(GlobFluo_SIF()).select_variables(("SIF",)).dataset,
    ]
    # Datasets subject to temporal shifting.
    datasets_to_shift = [
        Datasets(ERA5_DryDayPeriod()).select_variables(("Dry Day Period",)).dataset
    ]

    all_datasets = (
        selection_datasets
        + temporal_interp_datasets
        + shift_and_interp_datasets
        + datasets_to_shift
    )

    # Determine shared temporal extent of the data.
    min_time, max_time = dataset_times(all_datasets)[:2]
    shift_min_time = min_time - relativedelta(years=2)

    interp_min_time, interp_max_time = dataset_times(
        temporal_interp_datasets + shift_and_interp_datasets
    )[:2]
    target_timespan = (
        max(shift_min_time, interp_min_time + relativedelta(months=+n_months)),
        min(max_time, interp_max_time - relativedelta(months=+n_months)),
    )

    # Sanity check.
    assert min_time == datetime(2010, 1, 1)
    assert shift_min_time == datetime(2008, 1, 1)
    assert max_time == datetime(2015, 4, 1)

    # Carry out the temporal NN interpolation.
    for datasets in (temporal_interp_datasets, shift_and_interp_datasets):
        for i, dataset in enumerate(datasets):
            datasets[i] = dataset.get_temporally_interpolated_dataset(
                target_timespan, n_months
            )

    datasets_to_shift.extend(shift_and_interp_datasets)
    selection_datasets += datasets_to_shift
    selection_datasets += temporal_interp_datasets

    if shift_months is not None:
        for shift in shift_months:
            for shift_dataset in datasets_to_shift:
                selection_datasets.append(
                    shift_dataset.get_temporally_shifted_dataset(
                        months=-shift, deep=False
                    )
                )

    if selection_variables is None:
        selection_variables = get_filled_names(
            [
                "AGB Tree",
                "Diurnal Temp Range",
                "Dry Day Period",
                "FAPAR",
                "LAI",
                "Max Temp",
                "SIF",
                "SWI(1)",
                "ShrubAll",
                "TreeAll",
                "VOD Ku-band",
                "lightning",
                "pftCrop",
                "pftHerb",
                "popd",
            ]
        )
        if shift_months is not None:
            for shift in shift_months:
                selection_variables.extend(
                    [
                        f"{var} {-shift} Month"
                        for var in get_filled_names(
                            ["LAI", "FAPAR", "Dry Day Period", "VOD Ku-band", "SIF"]
                        )
                    ]
                )

    selection_variables = list(set(selection_variables).union(required_variables))
    selection = Datasets(selection_datasets).select_variables(selection_variables)

    # Ensure the correct number of samples (in time).
    overall_min_time, overall_max_time = dataset_times(selection)[:2]
    for dataset in selection:
        dataset.limit_months(overall_min_time, overall_max_time)
        if dataset.frequency == "monthly":
            for cube in dataset:
                assert cube.shape[0] == 61

    (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    ) = data_processing(
        selection,
        which="climatology",
        transformations={},
        deletions=[],
        use_lat_mask=False,
        use_fire_mask=False,
        target_variable=target_variable,
        masks=masks,
    )
    return (
        endog_data,
        exog_data,
        master_mask,
        filled_datasets,
        masked_datasets,
        land_mask,
    )
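# Example usage (a sketch; the arguments shown are the defaults defined
# above, and the unpacking order matches the return statement):
# >>> (
# ...     endog_data,
# ...     exog_data,
# ...     master_mask,
# ...     filled_datasets,
# ...     masked_datasets,
# ...     land_mask,
# ... ) = get_data(shift_months=[1, 3, 6, 9, 12, 18, 24], masks=None)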