def test_collocate_collapse_expand(self): """Test whether collocating, collapsing and expanding work""" collocator = Collocator() test = xr.Dataset({ "time": ("time", np.arange("2000", "2010", dtype="M8[Y]")), "lat": ("time", np.arange(10)), "lon": ("time", np.arange(10)), }) collocations = collocator.collocate(test, test, max_interval="30 days", max_distance="150 miles") collapsed = collapse(collocations) expanded = expand(collocations)
def test_flat_to_main_coord(self): """Tests Collocator._flat_to_main_coord This method is crucial since it stacks the whole input datasets for the collocating routine and makes them collocateable. """ collocator = Collocator() test = xr.Dataset({ "time": ("time", np.arange(10)), "lat": ("time", np.arange(10)), "lon": ("time", np.arange(10)), }) check = xr.Dataset({ "time": ("collocation", np.arange(10)), "lat": ("collocation", np.arange(10)), "lon": ("collocation", np.arange(10)), }) results = collocator._flat_to_main_coord(test) assert check.equals(results) test = xr.Dataset({ "time": ("main", np.arange(10)), "lat": ("main", np.arange(10)), "lon": ("main", np.arange(10)), }) check = xr.Dataset({ "time": ("collocation", np.arange(10)), "lat": ("collocation", np.arange(10)), "lon": ("collocation", np.arange(10)), }) results = collocator._flat_to_main_coord(test) assert check.equals(results) test = xr.Dataset({ "time": ("scnline", np.arange(5)), "lat": (("scnline", "scnpos"), np.arange(10).reshape(5, 2)), "lon": (("scnline", "scnpos"), np.arange(10).reshape(5, 2)), }) check = test.stack(collocation=("scnline", "scnpos")) results = collocator._flat_to_main_coord(test) assert check.equals(results)
class SPAREICE:
    """Retrieval of IWP from passive radiometers

    Examples:
        .. code-block:: python

            import pandas as pd
            from typhon.retrieval import SPAREICE

            # Create a SPARE-ICE object with the standard weights
            spareice = SPAREICE()

            # Print the required input fields
            print(spareice.inputs)

            # If you want to know the input fields for each component, IWP
            # regressor and ice cloud classifier, you can get them like this:
            print(spareice.iwp.inputs)  # Inputs from IWP regressor
            print(spareice.ice_cloud.inputs)  # Inputs from ice cloud classifier

            # If you have your own input data, you can use :meth:`retrieve`
            # to run SPARE-ICE on it.
            data = pd.DataFrame(...)
            retrieved = spareice.retrieve(data)

            # If your data directly comes from collocations between MHS and
            # AVHRR, you can use :meth:`standardize_collocations` to make it
            # SPARE-ICE compatible.
            collocations = Collocator().collocate(mhs_data, avhrr_data, ...)
            standardized_data = self.standardize_collocations(collocations)
            retrieved = spareice.retrieve(standardized_data)
    """

    def __init__(self, file=None, collocator=None, processes=10, verbose=0,
                 sea_mask_file=None, elevation_file=None):
        """Initialize a SPAREICE object

        Args:
            file: A JSON file with the coefficients of SPAREICE. If not given,
                the standard configuration will be loaded.
            collocator: SPARE-ICE requires a collocator when it should be
                generated from filesets. You can pass your own
                :class:`Collocator` object here if you want.
            processes: Number of processes to parallelize the training or
                collocation search. 10 is the default. Best value depends on
                your machine.
            verbose (int): Control ``GridSearchCV`` verbosity. The higher the
                value, the more debug messages are printed.
            sea_mask_file: Path to an image file with a land/sea mask; white
                pixels (value 255) are interpreted as sea.
            elevation_file: Path to a netCDF file with a global surface
                elevation grid (variable ``data``).
        """
        self.verbose = verbose
        self.processes = processes
        self.name = "SPARE-ICE"

        # Flip the mask vertically so that row 0 corresponds to the southern
        # edge (image row order is top-down):
        if sea_mask_file is None:
            self.sea_mask = None
        else:
            self.sea_mask = np.flip(
                np.array(imageio.imread(sea_mask_file) == 255), axis=0)

        if elevation_file is None:
            self.elevation_grid = None
        else:
            ds = xr.open_dataset(elevation_file, decode_times=False)
            self.elevation_grid = ds.data.squeeze().values

        if collocator is None:
            self.collocator = Collocator()
        else:
            self.collocator = collocator

        # SPARE-ICE consists of two retrievals: one neural network for the IWP
        # and one decision tree classifier for the ice cloud flag
        self._iwp = None
        self._ice_cloud = None

        # The users can load SPARE-ICE from their own training or the standard
        # parameters:
        if file is None:
            try:
                self.load(STANDARD_FILE)
            except Exception as e:
                # Fall back to untrained products so the object is still
                # usable for training:
                warnings.warn(
                    "Could not load the standard parameters of SPARE-ICE!\n"
                    "You need to train SPARE-ICE by yourself.")
                warnings.warn(str(e))
                self._iwp = RetrievalProduct()
                self._ice_cloud = RetrievalProduct()
        else:
            self.load(file)

    def _debug(self, msg):
        # Log a debug message prefixed with the retrieval name.
        logger.debug(f"[{self.name}] {msg}")

    def _info(self, msg):
        # Log an info message prefixed with the retrieval name.
        logger.info(f"[{self.name}] {msg}")

    def _iwp_model(self, processes, cv_folds):
        """Return the default model for the IWP regressor
        """
        # Estimators are normally objects that have a fit and predict method
        # (e.g. MLPRegressor from sklearn). To make their training easier we
        # scale the input data in advance. With Pipeline objects from sklearn
        # we can combine such steps easily since they behave like an
        # estimator object as well.
        estimator = Pipeline([
            # SVM or NN work better if we have scaled the data in the first
            # place. MinMaxScaler is the simplest one. RobustScaler or
            # StandardScaler could be an alternative.
            ("scaler", RobustScaler(quantile_range=(15, 85))),
            # The "real" estimator:
            ("estimator", MLPRegressor(max_iter=6000, early_stopping=True)),
        ])

        # To optimize the results, we try different hyper parameters by
        # using a grid search
        hidden_layer_sizes = [
            (15, 10, 3),
            #(50, 20),
        ]
        hyper_parameter = [
            {
                # Hyper parameter for lbfgs solver
                'estimator__solver': ['lbfgs'],
                'estimator__activation': ['tanh'],
                'estimator__hidden_layer_sizes': hidden_layer_sizes,
                'estimator__random_state': [0, 42, 100, 3452],
                'estimator__alpha': [0.1, 0.001, 0.0001],
            },
        ]

        return GridSearchCV(
            estimator, hyper_parameter, refit=True,
            n_jobs=processes, cv=cv_folds, verbose=self.verbose,
        )

    @staticmethod
    def _ice_cloud_model():
        """Return the default model for the ice cloud classifier"""
        # As simple as it is. We do not need a grid search trainer for the DTC
        # since it has already a good performance.
        return DecisionTreeClassifier(
            max_depth=12, random_state=5,
            # n_estimators=20,
            max_features=9,
        )

    @property
    def inputs(self):
        """Return the input fields of the current configuration"""
        # Union of the input fields of both sub-retrievals:
        return list(set(self.iwp.inputs) | set(self.ice_cloud.inputs))

    @property
    def iwp(self):
        """Return the IWP regressor of SPARE-ICE"""
        return self._iwp

    @property
    def ice_cloud(self):
        """Return the ice cloud classifier of SPARE-ICE"""
        return self._ice_cloud

    def load(self, filename):
        """Load SPARE-ICE from a json file

        Args:
            filename: Path and name of the file.

        Returns:
            None
        """
        # The file is written by :meth:`save` via repr(), hence literal_eval
        # instead of json.loads:
        with open(filename, 'r') as infile:
            parameters = literal_eval(infile.read())
            self._iwp = RetrievalProduct.from_dict(parameters["iwp"])
            self._ice_cloud = RetrievalProduct.from_dict(
                parameters["ice_cloud"])

    def save(self, filename):
        """Save SPARE-ICE to a json file

        Notes:
            The output format is not standard json!

        Args:
            filename: Path and name of the file.

        Returns:
            None
        """
        with open(filename, 'w') as outfile:
            dictionary = {
                "iwp": self.iwp.to_dict(),
                "ice_cloud": self.ice_cloud.to_dict(),
            }
            # repr() output — readable back via ast.literal_eval in load():
            outfile.write(repr(dictionary))

    def standardize_collocations(self, data, fields=None, add_sea_mask=True,
                                 add_elevation=True):
        """Convert collocation fields to standard SPARE-ICE fields.

        Args:
            data: A xarray.Dataset object with collocations either amongst
                2C-ICE, MHS & AVHRR or MHS & AVHRR.
            fields (optional): Fields that will be selected from the
                collocations. If None (default), all fields will be selected.
            add_sea_mask: Add a flag to the data whether the pixel is over
                sea or land.
            add_elevation: Add the surface elevation in meters to each pixel.

        Returns:
            A pandas.DataFrame with all selected fields.
        """
        # Check whether the data is coming from a twice-collocated dataset:
        if "MHS_2C-ICE/MHS/scnpos" in data.variables:
            prefix = "MHS_2C-ICE/"
        else:
            prefix = ""

        # The keys of this dictionary are the new names, while the values are
        # the old names coming from the original collocations. If the value is
        # a list, the variable is 2-dimensional. The first element is the old
        # name, and the rest is the dimension that should be selected.
        mapping = {
            "mhs_channel1": [
                f"{prefix}MHS/Data/btemps", f"{prefix}MHS/channel", 0],
            "mhs_channel2": [
                f"{prefix}MHS/Data/btemps", f"{prefix}MHS/channel", 1],
            "mhs_channel3": [
                f"{prefix}MHS/Data/btemps", f"{prefix}MHS/channel", 2],
            "mhs_channel4": [
                f"{prefix}MHS/Data/btemps", f"{prefix}MHS/channel", 3],
            "mhs_channel5": [
                f"{prefix}MHS/Data/btemps", f"{prefix}MHS/channel", 4],
            "lat": "lat",
            "lon": "lon",
            "time": "time",
            "mhs_scnpos": f"{prefix}MHS/scnpos",
            "solar_azimuth_angle":
                f"{prefix}MHS/Geolocation/Solar_azimuth_angle",
            "solar_zenith_angle":
                f"{prefix}MHS/Geolocation/Solar_zenith_angle",
            "satellite_azimuth_angle":
                f"{prefix}MHS/Geolocation/Satellite_azimuth_angle",
            "satellite_zenith_angle":
                f"{prefix}MHS/Geolocation/Satellite_zenith_angle",
            "avhrr_channel1": ["AVHRR/Data/btemps_mean", "AVHRR/channel", 0],
            "avhrr_channel2": ["AVHRR/Data/btemps_mean", "AVHRR/channel", 1],
            "avhrr_channel3": ["AVHRR/Data/btemps_mean", "AVHRR/channel", 2],
            "avhrr_channel4": ["AVHRR/Data/btemps_mean", "AVHRR/channel", 3],
            "avhrr_channel5": ["AVHRR/Data/btemps_mean", "AVHRR/channel", 4],
            "avhrr_channel1_std": [
                "AVHRR/Data/btemps_std", "AVHRR/channel", 0],
            "avhrr_channel2_std": [
                "AVHRR/Data/btemps_std", "AVHRR/channel", 1],
            "avhrr_channel3_std": [
                "AVHRR/Data/btemps_std", "AVHRR/channel", 2],
            "avhrr_channel4_std": [
                "AVHRR/Data/btemps_std", "AVHRR/channel", 3
            ],
            "avhrr_channel5_std": [
                "AVHRR/Data/btemps_std", "AVHRR/channel", 4
            ],
            "iwp_number": "MHS_2C-ICE/2C-ICE/ice_water_path_number",
            "iwp_std": "MHS_2C-ICE/2C-ICE/ice_water_path_std",
        }

        # These fields need a special treatment
        special_fields = ["avhrr_tir_diff", "mhs_diff", "iwp", "ice_cloud"]

        # Default - take all fields:
        if fields is None:
            fields = list(mapping.keys()) + special_fields

        return_data = {}
        for field in fields:
            if field in special_fields:
                # We will do this later:
                continue
            elif field not in mapping:
                # Some fields might be added later (such as elevation, etc)
                continue
            key = mapping[field]
            try:
                if isinstance(key, list):
                    # 2D variable: select the given index along the given
                    # dimension (e.g. one instrument channel):
                    return_data[field] = data[key[0]].isel(**{key[1]: key[2]})
                else:
                    return_data[field] = data[key]
            except KeyError:
                # Keep things easy. Collocations might contain the target
                # dataset or not. We do not want to have a problem just
                # because we do not have them.
                pass

        return_data = pd.DataFrame(return_data)

        # Derived channel differences:
        if "avhrr_tir_diff" in fields:
            return_data["avhrr_tir_diff"] = \
                return_data["avhrr_channel5"] - return_data["avhrr_channel4"]
        if "mhs_diff" in fields:
            return_data["mhs_diff"] = \
                return_data["mhs_channel5"] - return_data["mhs_channel3"]

        if "iwp" in fields \
                and "MHS_2C-ICE/2C-ICE/ice_water_path_mean" in data:
            # We transform the IWP to log space because it is better for the
            # ANN training. Zero values might trigger warnings and
            # result in -INF. However, we cannot drop them because the ice
            # cloud classifier needs zero values for its training.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                return_data["iwp"] = np.log10(
                    data["MHS_2C-ICE/2C-ICE/ice_water_path_mean"])
                # NOTE(review): chained-assignment replace with inplace=True
                # is deprecated in newer pandas — verify it still takes
                # effect on the DataFrame column.
                return_data["iwp"].replace(
                    [-np.inf, np.inf], np.nan, inplace=True)

        if "ice_cloud" in fields \
                and "MHS_2C-ICE/2C-ICE/ice_water_path_mean" in data:
            # Any non-zero mean IWP counts as an ice cloud:
            return_data["ice_cloud"] = \
                data["MHS_2C-ICE/2C-ICE/ice_water_path_mean"] > 0

        if add_sea_mask:
            return_data["sea_mask"] = sea_mask(
                return_data.lat, return_data.lon, self.sea_mask)

        if add_elevation:
            def get_grid_value(grid, lat, lon):
                # Nearest-cell lookup of a regular global lat/lon grid;
                # assumes row 0 of *grid* is at +90 deg latitude — TODO
                # confirm against the elevation file's orientation.
                lat = to_array(lat)
                lon = to_array(lon)
                if lon.min() < -180 or lon.max() > 180:
                    raise ValueError("Longitudes out of bounds!")
                if lat.min() < -90 or lat.max() > 90:
                    raise ValueError("Latitudes out of bounds!")

                grid_lat_step = 180 / (grid.shape[0] - 1)
                grid_lon_step = 360 / (grid.shape[1] - 1)
                lat_cell = (90 - lat) / grid_lat_step
                lon_cell = lon / grid_lon_step
                return grid[lat_cell.astype(int), lon_cell.astype(int)]

            return_data["elevation"] = get_grid_value(
                self.elevation_grid, return_data.lat, return_data.lon)
            # We do not need the depth of the oceans (this would just
            # confuse the ANN):
            return_data["elevation"][return_data.elevation < 0] = 0

        return return_data

    def retrieve(self, data, as_log10=False):
        """Retrieve SPARE-ICE for the input variables

        Args:
            data: A pandas.DataFrame object with required input fields (see
                above).
            as_log10: If true, the retrieved IWP will be returned as logarithm
                of base 10.

        Returns:
            A pandas DataFrame object with the retrieved IWP and ice cloud
            flag.
        """
        # Retrieve the ice water path:
        retrieved = self.iwp.retrieve(data[self.iwp.inputs])
        if not as_log10 and retrieved is not None:
            # The regressor works in log10 space; convert back:
            retrieved["iwp"] = 10**retrieved["iwp"]

        # Retrieve the ice cloud flag:
        retrieved = retrieved.join(
            self.ice_cloud.retrieve(data[self.ice_cloud.inputs]),
        )

        return retrieved

    @staticmethod
    def _retrieve_from_collocations(collocations, _, spareice):
        # Map-worker: run SPARE-ICE on one file of MHS/AVHRR collocations and
        # return an annotated xarray.Dataset (or None if nothing usable).

        # We need collapsed collocations:
        if "Collocations/pairs" in collocations.variables:
            collocations = collapse(collocations, reference="MHS")

        # However, we do not need the original field names
        collocations = spareice.standardize_collocations(collocations)

        # Remove NaNs from the data:
        collocations = collocations.dropna()
        if collocations.empty:
            return None

        # Retrieve the IWP and the ice cloud flag:
        retrieved = spareice.retrieve(collocations).to_xarray()

        start = collocations.time.min()
        end = collocations.time.max()
        spareice._debug(f"Retrieve SPARE-ICE from {start} to {end}")

        # Add more information:
        retrieved["iwp"].attrs = {
            "units": "g/m^2",
            "name": "Ice Water Path",
            "description": "Ice Water Path (retrieved by SPARE-ICE)."
        }
        retrieved["ice_cloud"].attrs = {
            "units": "boolean",
            "name": "Ice Cloud Flag",
            "description": "True if pixel contains an ice cloud (retrieved"
                           " by SPARE-ICE)."
        }
        # Carry the geolocation over from the collocations:
        retrieved["lat"] = collocations["lat"]
        retrieved["lon"] = collocations["lon"]
        retrieved["time"] = collocations["time"]
        retrieved["scnpos"] = collocations["mhs_scnpos"]

        return retrieved

    def retrieve_from_collocations(self, inputs, output, start=None, end=None,
                                   processes=None):
        """Retrieve SPARE-ICE from collocations between MHS and AVHRR

        You can use this either with already collocated MHS and AVHRR data
        (pass the :class:`Collocations` object via `inputs`) or you let MHS
        and AVHRR be collocated on-the-fly by passing the filesets with the
        raw data (pass two filesets as list via `inputs`).

        Args:
            inputs: Can be :class:`Collocations` or a list with
                :class:`~typhon.files.fileset.FileSet` objects. If it is a
                :class:`Collocations` object, all files from them are
                processed and use as input for SPARE-ICE.
            output: Must be a path with placeholders or a :class:`FileSet`
                object where the output files should be stored.
            start: Start date either as datetime object or as string
                ("YYYY-MM-DD hh:mm:ss"). Year, month and day are required.
                Hours, minutes and seconds are optional. If not given, it is
                datetime.min per default.
            end: End date. Same format as "start". If not given, it is
                datetime.max per default.
            processes: Number of processes to parallelize the collocation
                search. If not set, the value from the initialization is
                taken.

        Returns:
            None
        """
        if processes is None:
            processes = self.processes

        # Fail early if the configuration needs auxiliary files that were
        # not passed to __init__:
        if "sea_mask" in self.inputs and self.sea_mask is None:
            raise ValueError("You have to pass a sea_mask file via init!")

        if "elevation" in self.inputs and self.elevation_grid is None:
            raise ValueError("You have to pass a elevation file via init!")

        timer = Timer.start()
        if isinstance(inputs, Collocations):
            # Simply apply a map function to all files from these collocations
            inputs.map(
                SPAREICE._retrieve_from_collocations, kwargs={
                    "spareice": self,
                }, on_content=True, pass_info=True, start=start, end=end,
                max_workers=processes, output=output, worker_type="process")
        elif len(inputs) == 2:
            # Collocate MHS and AVHRR on-the-fly:
            names = set(fileset.name for fileset in inputs)
            if "MHS" not in names or "AVHRR" not in names:
                raise ValueError(
                    "You must name the input filesets MHS and AVHRR! Their "
                    f"current names are: {names}")
            iterator = self.collocator.collocate_filesets(
                inputs, start=start, end=end, processes=processes,
                max_interval="30s", max_distance="7.5 km",
                output=output,
                post_processor=SPAREICE._retrieve_from_collocations,
                post_processor_kwargs={
                    "spareice": self,
                },
            )
            for filename in iterator:
                if filename is not None:
                    # NOTE(review): f-string has no placeholder — the message
                    # always prints "(unknown)"; presumably this should be
                    # f"Stored SPARE-ICE to\n{filename}" — confirm.
                    self._info(f"Stored SPARE-ICE to\n(unknown)")
        else:
            raise ValueError(
                "You need to pass a Collocations object or a list with a MHS "
                "and AVHRR fileset!")

        logger.info(f"Took {timer} hours to retrieve SPARE-ICE")

    def score(self, data):
        """Calculate the score of SPARE-ICE on testing data

        Args:
            data: A pandas.DataFrame object with the required input fields.

        Returns:
            The score for the IWP regressor and the score for the ice cloud
            classifier.
        """
        # The classifier is scored on the full data (it needs the zero-IWP
        # samples), the regressor only on NaN-free rows:
        ice_cloud_score = self.ice_cloud.score(
            data[self.ice_cloud.inputs], data[self.ice_cloud.outputs])
        # We cannot allow NaN or Inf (resulting from transformation to
        # log space)
        data = data.dropna()
        iwp_score = self.iwp.score(
            data[self.iwp.inputs], data[self.iwp.outputs])
        return iwp_score, ice_cloud_score

    def train(self, data, iwp_inputs=None, ice_cloud_inputs=None,
              iwp_model=None, ice_cloud_model=None, processes=None,
              cv_folds=None):
        """Train SPARE-ICE with data

        This trains the IWP regressor and ice cloud classifier.

        Args:
            data: A pandas.DataFrame object with the required input fields.
            iwp_inputs: A list with the input field names for the IWP
                regressor. If this is None, the IWP regressor won't be
                trained.
            ice_cloud_inputs: A list with the input field names for the ice
                cloud classifier. If this is None, the ice cloud classifier
                won't be trained.
            iwp_model: Set this to your own sklearn estimator class.
            ice_cloud_model: Set this to your own sklearn estimator class.
            processes: Number of processes to parallelize the regressor
                training. If not set, the value from the initialization is
                taken.
            cv_folds: Number of folds used for cross-validation. Default is
                5. The higher the number the more data is used for training
                but the runtime increases. Good values are between 3 and 10.

        Returns:
            None
        """
        if iwp_inputs is None and ice_cloud_inputs is None:
            raise ValueError("Either fields for the IWP regressor or ice "
                             "cloud classifier must be given!")

        if ice_cloud_inputs is not None:
            self._info("Train SPARE-ICE - ice cloud classifier")
            if ice_cloud_model is None:
                ice_cloud_model = self._ice_cloud_model()
            score = self.ice_cloud.train(
                ice_cloud_model, data[ice_cloud_inputs], data[["ice_cloud"]],
            )
            self._info(f"Ice cloud classifier training score: {score:.2f}")

        if iwp_inputs is not None:
            self._info("Train SPARE-ICE - IWP regressor")
            # We cannot allow NaN or Inf (resulting from transformation to
            # log space)
            data = data.dropna()
            if processes is None:
                processes = self.processes
            if cv_folds is None:
                cv_folds = 5
            if iwp_model is None:
                iwp_model = self._iwp_model(processes, cv_folds)
            score = self.iwp.train(
                iwp_model, data[iwp_inputs], data[["iwp"]],
            )
            self._info(f"IWP regressor training score: {score:.2f}")
            self._training_report(iwp_model)

    @staticmethod
    def _training_report(trainer):
        # Print the cross-validation results of a GridSearchCV trainer; a
        # plain estimator (no cv_results_) is silently skipped.
        if not hasattr(trainer, "cv_results_"):
            return

        # NOTE(review): the format string has no %s placeholder, so
        # best_params_ is passed as a lazy argument that is never rendered;
        # likely should be "...:\n%s" — confirm intended output.
        logger.info("Best parameters found on training dataset:\n",
                    trainer.best_params_)
        means = trainer.cv_results_['mean_test_score']
        stds = trainer.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, trainer.cv_results_['params']):  # noqa
            logger.info("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    def report(self, output_dir, experiment, data):
        """Test the performance of SPARE-ICE and plot it

        Args:
            output_dir: A path to a directory (does not need to exist). A
                subdirectory named `experiment` will be created there. All
                plots are stored to it.
            experiment: A name for the experiment as a string. Will be
                included in the title of the plots and used as name for the
                subdirectory in `output_dir`.
            data: A pandas.DataFrame object with the required input fields.

        Returns:
            None
        """
        # Create the output directory:
        output_dir = join(output_dir, experiment)
        os.makedirs(output_dir, exist_ok=True)

        # Run SPARE-ICE!
        retrieved = self.retrieve(data, as_log10=True)

        # We are going to plot the performance of the two retrievals:
        self._report_iwp(output_dir, experiment, data, retrieved)
        self._report_ice_cloud(output_dir, experiment, data, retrieved)

    def _report_iwp(self, output_dir, experiment, test, retrieved):
        """Create and store the plots for IWP regressor"""
        # Plot the heatmap with the retrieved IWPs
        fig, ax = plt.subplots(figsize=(10, 8))
        scat = heatmap(
            test.iwp, retrieved.iwp, bins=50, range=[[-1, 4], [-1, 4]],
            cmap="density", vmin=5,
        )
        scat.cmap.set_under("w")
        ax.set_xlabel("log10 IWP (2C-ICE) [g/m^2]")
        ax.set_ylabel("log10 IWP (SPARE-ICE) [g/m^2]")
        ax.set_title(experiment)
        fig.colorbar(scat, label="Number of points")
        fig.savefig(join(output_dir, "2C-ICE-SPAREICE_heatmap.png"))

        self._plot_scatter(
            experiment,
            join(output_dir, "2C-ICE-SPAREICE_scatter_{area}.png"),
            test.iwp, retrieved.iwp, test.sea_mask.values)

        # MFE plot with 2C-ICE on x-axis. Fractional error in percent,
        # computed in linear space (hence the 10** back-transform):
        fe = 100 * (np.exp(
            np.abs(
                np.log(10**retrieved.iwp.values / 10**test.iwp.values))) - 1)
        self._plot_error(
            experiment, join(output_dir, "2C-ICE-SPAREICE_mfe.png"),
            test, fe, test.sea_mask.values,
        )

        # Plot the bias:
        bias = retrieved.iwp.values - test.iwp.values
        self._plot_error(
            experiment, join(output_dir, "2C-ICE-SPAREICE_bias.png"),
            test, bias, test.sea_mask.values, mfe=False,
            yrange=[-0.35, 0.45],
        )

        # self._plot_weights(
        #     experiment, join(output_dir, "SPAREICE_iwp_weights.png"),
        # )

        # Dump the binned median fractional error for later inspection:
        with open(join(output_dir, "mfe.txt"), "w") as file:
            mfe = sci_binned_statistic(
                test.iwp.values, fe, statistic="median", bins=20,
                range=[0, 4],
            )
            file.write(repr(mfe[0]))

    @staticmethod
    def _plot_scatter(experiment, file, xdata, ydata, sea_mask):
        # One scatter plot per surface type; *file* must contain an {area}
        # placeholder.
        for area in ["all", "land", "sea"]:
            if area == "all":
                mask = slice(None, None, None)
            elif area == "land":
                mask = ~sea_mask
            else:
                mask = sea_mask
            fig, ax = plt.subplots(figsize=(10, 8))
            ax.scatter(xdata[mask], ydata[mask], s=1, alpha=0.6)
            ax.grid()
            ax.set_xlabel("log10 IWP (2C-ICE) [g/m^2]")
            ax.set_ylabel("log10 IWP (SPARE-ICE) [g/m^2]")
            ax.set_title(f"{experiment} - {area}")
            fig.savefig(file.format(area=area))

    @staticmethod
    def _plot_error(experiment, file, xdata, error, sea_mask, mfe=True,
                    yrange=None):
        # Plot the binned error (median fractional error or mean bias)
        # against the 2C-ICE IWP, split by surface type.
        fig, ax = plt.subplots(figsize=(10, 8))
        xlabel = "log10 IWP (2C-ICE) [g/m^2]"
        xrange = [0, 4]
        if mfe:
            ax.set_ylabel("Median fractional error [%]")
            ax.set_ylim([0, 200])
            statistic = "median"
        else:
            # NOTE(review): "\D" is an invalid escape in a non-raw string
            # (DeprecationWarning on newer Pythons) — consider r"$\Delta$...".
            ax.set_ylabel("$\Delta$ IWP (SPARE-ICE - 2C-ICE) [log 10 g/m^2]")
            statistic = "mean"

        # Only "global" is active; "north"/"south" branches below are kept
        # for when more hemispheres are added to this list:
        for hemisphere in ["global"]:
            for area in ["all", "land", "sea"]:
                if area == "all":
                    mask = np.repeat(True, xdata.iwp.size)
                elif area == "land":
                    mask = ~sea_mask
                else:
                    mask = sea_mask

                if hemisphere == "north":
                    mask &= xdata.lat.values >= 0
                elif hemisphere == "south":
                    mask &= xdata.lat.values < 0

                binned_statistic(
                    xdata.iwp.values[mask], error[mask],
                    statistic=statistic, bins=20, range=xrange, pargs={
                        "marker": "o",
                        "label": f"{area} - {hemisphere}"
                    })
        ax.set_xlabel(xlabel)
        ax.grid()
        ax.legend(fancybox=True)
        ax.set_title(f"Experiment: {experiment}")
        if yrange is not None:
            ax.set_ylim(yrange)
        fig.tight_layout()
        fig.savefig(file)

    def _plot_weights(self, title, file, layer_index=0, vmin=-5, vmax=5):
        # Heatmap of one weight layer of the trained MLP regressor.
        import seaborn as sns
        sns.set_context("paper")

        layers = self.iwp.estimator.steps[-1][1].coefs_
        layer = layers[layer_index]
        f, ax = plt.subplots(figsize=(18, 12))
        weights = pd.DataFrame(layer)
        weights.index = self.iwp.inputs
        sns.set(font_scale=1.1)

        # Draw a heatmap with the numeric values in each cell
        sns.heatmap(
            weights, annot=True, fmt=".1f", linewidths=.5, ax=ax,
            cmap="difference", center=0, vmin=vmin, vmax=vmax,
            # annot_kws={"size":14},
        )
        ax.tick_params(labelsize=18)
        f.tight_layout()
        f.savefig(file)

    def _report_ice_cloud(self, output_dir, experiment, test, retrieved):
        # Create and store the plots for the ice cloud classifier.

        # Confusion matrix:
        fig, ax = plt.subplots(figsize=(12, 10))
        cm = confusion_matrix(test.ice_cloud, retrieved.ice_cloud)
        img = self._plot_matrix(cm, classes=["Yes", "No"], normalize=True)
        fig.colorbar(img, label="probability")
        ax.set_title("Ice Cloud Classifier - Performance")
        ax.set_ylabel('real ice cloud')
        ax.set_xlabel('predicted ice cloud')
        fig.tight_layout()
        fig.savefig(join(output_dir, "ice-cloud-confusion-matrix.png"))

        # Feature importances of the decision tree:
        fig, ax = plt.subplots(figsize=(12, 10))
        ax.barh(
            np.arange(len(self.ice_cloud.inputs)),
            self.ice_cloud.estimator.feature_importances_)
        ax.set_yticks(np.arange(len(self.ice_cloud.inputs)))
        ax.set_yticklabels(self.ice_cloud.inputs)
        ax.set_xlabel("Feature Importance")
        ax.set_ylabel("Feature")
        ax.set_title("Ice Cloud Classifier - Importance")
        fig.savefig(join(output_dir, "ice-cloud-feature-importance.png"))

    @staticmethod
    def _plot_matrix(matrix, classes, normalize=False, ax=None, **kwargs):
        """Plot a confusion matrix on the current (or given) axes

        Normalization can be applied by setting `normalize=True`.
        """
        if normalize:
            # Normalize each row (true class) to probabilities:
            matrix = \
                matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

        default_kwargs = {"cmap": "Blues", **kwargs}

        if ax is None:
            ax = plt.gca()

        img = ax.imshow(matrix, interpolation='nearest', **default_kwargs)
        tick_marks = np.arange(len(classes))
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(classes, rotation=45)
        ax.set_yticks(tick_marks)
        ax.set_yticklabels(classes)

        fmt = '.2f' if normalize else 'd'
        # Annotate each cell, switching text color at half the maximum so
        # the label stays readable on dark cells:
        thresh = matrix.max() / 2.
        for i, j in itertools.product(range(matrix.shape[0]),
                                      range(matrix.shape[1])):
            ax.text(j, i, format(matrix[i, j], fmt),
                    horizontalalignment="center",
                    color="white" if matrix[i, j] > thresh else "black")

        return img
def __init__(self, file=None, collocator=None, processes=10, verbose=0, sea_mask_file=None, elevation_file=None): """Initialize a SPAREICE object Args: file: A JSON file with the coefficients of SPAREICE. If not given, the standard configuration will be loaded. collocator: SPARE-ICE requires a collocator when it should be generated from filesets. You can pass your own :class:`Collocator` object here if you want. processes: Number of processes to parallelize the training or collocation search. 10 is the default. Best value depends on your machine. verbose (int): Control ``GridSearchCV`` verbosity. The higher the value, the more debug messages are printed. """ self.verbose = verbose self.processes = processes self.name = "SPARE-ICE" if sea_mask_file is None: self.sea_mask = None else: self.sea_mask = np.flip( np.array(imageio.imread(sea_mask_file) == 255), axis=0) if elevation_file is None: self.elevation_grid = None else: ds = xr.open_dataset(elevation_file, decode_times=False) self.elevation_grid = ds.data.squeeze().values if collocator is None: self.collocator = Collocator() else: self.collocator = collocator # SPARE-ICE consists of two retrievals: one neural network for the IWP # and one decision tree classifier for the ice cloud flag self._iwp = None self._ice_cloud = None # The users can load SPARE-ICE from their own training or the standard # parameters: if file is None: try: self.load(STANDARD_FILE) except Exception as e: warnings.warn( "Could not load the standard parameters of SPARE-ICE!\n" "You need to train SPARE-ICE by yourself.") warnings.warn(str(e)) self._iwp = RetrievalProduct() self._ice_cloud = RetrievalProduct() else: self.load(file)