def test_prepare_data_multiprof():
    """Check the shapes returned by `prepare_data` for a multi-profile window.

    Both channels ("rcs_1" and "rcs_2") of the default lidar file are declared
    as predictors for day and night. The window holds `n_profiles` consecutive
    profiles ending at profile index 55.
    """
    n_profiles = 3
    last_profile = 55
    channels = ["rcs_1", "rcs_2"]

    lidar_file = paths.file_defaultlidardata()
    times, heights, signals = utils.extract_data(
        lidar_file, to_extract=list(channels)
    )

    settings = utils.get_default_params()
    settings["predictors"] = {
        period: list(channels) for period in ("day", "night")
    }

    _site, _date, latitude, longitude = utils.where_and_when(lidar_file)
    position = {
        "time": dt.datetime.utcfromtimestamp(times[last_profile]),
        "lat": latitude,
        "lon": longitude,
    }

    # Window of profiles ending at `last_profile`, clipped at the first profile
    first_profile = max(last_profile - n_profiles + 1, 0)
    window = {
        name: signals[name][first_profile:last_profile + 1, :]
        for name in channels
    }

    X, Z = prepare_data(position, heights, rcss=window, params=settings)

    assert X.shape == (438, 2) and Z.shape == (438,)
def quicklook_output(nc_file):
    """Plot the KABL and manufacturer BLH of an output netCDF file over its data.

    Thin wrapper around `blhs_over_data` that reads everything it needs
    ("rcs_0", "blh_kabl", "pbl") from a single file.

    Parameters
    ----------
    nc_file : str
        Path to the netcdf file containing the data

    Returns
    -------
    `matplotlib.pyplot figure`
        Whatever `blhs_over_data` returns
    """
    site, date, _lat, _lon = utils.where_and_when(nc_file)
    times, heights, fields = utils.extract_data(
        nc_file, to_extract=["rcs_0", "blh_kabl", "pbl"]
    )

    backscatter = fields["rcs_0"]
    kabl_blh = fields["blh_kabl"]
    # Only the first column of the manufacturer's "pbl" field is displayed
    manufacturer_blh = fields["pbl"][:, 0]

    title = "Lidar backscatter | " + site + " " + date.strftime("%Y/%m/%d")
    return blhs_over_data(
        times,
        heights,
        backscatter,
        [kabl_blh, manufacturer_blh],
        blhs_names=["BLH KABL", "BLH manufacturer"],
        titre=title,
    )
def quicklook_data(nc_file, max_height=4500, with_pbl=False, with_cbh=False):
    '''Display a quick look of the backscatter data of one file.

    [IN]
        - nc_file (str): path to the netcdf file containing the data
        - max_height (number): forwarded to utils.extract_data
        - with_pbl (bool): if True, overlay the 'pbl' field (black stars)
        - with_cbh (bool): if True, overlay the 'cloud_base_height' field (red dots)

    [OUT]
        Nothing is returned: the figure is shown with plt.show(block=False).'''
    location, day, _lat, _lon = utils.where_and_when(nc_file)

    wanted = ['rcs_0']
    if with_pbl:
        wanted.append('pbl')
    if with_cbh:
        wanted.append('cloud_base_height')

    extracted = utils.extract_data(nc_file,
                                   max_height=max_height,
                                   to_extract=wanted)

    # The extracted tuple holds t, z, rcs and then the optional fields in the
    # order they were requested.
    if with_pbl:
        if with_cbh:
            t, z, rcs, pbl, cbh = extracted
        else:
            t, z, rcs, pbl = extracted
    elif with_cbh:
        t, z, rcs, cbh = extracted
    else:
        t, z, rcs = extracted

    plt.figure(figsize=(14, 7))
    plt.pcolormesh(t, z, rcs.T, alpha=0.8, cmap='rainbow', vmin=-0.1, vmax=0.8)

    if with_pbl:
        # -999 is replaced by NaN so that those points are not drawn
        pbl[pbl == -999] = np.nan
        for column in range(pbl.shape[1]):
            plt.plot(t, pbl[:, column], 'k*')
    if with_cbh:
        cbh[cbh == -999] = np.nan
        for column in range(cbh.shape[1]):
            plt.plot(t, cbh[:, column], 'r.')

    axes = plt.gca()
    title = "Lidar backscatter | " + location + " " + day.strftime('%Y/%m/%d')
    plt.title(title)
    axes.set_xlabel('Hour')
    axes.set_ylabel('Height (m agl)')
    plt.tight_layout()
    plt.grid(color='white', ls='solid')
    plt.colorbar(label="Range corrected signal", alpha=0.8)

    # Replace the numeric tick positions (timestamps) by HH:MM labels
    tick_positions, _ = plt.xticks()
    tick_labels = []
    for position in tick_positions:
        tick_labels.append(
            dt.datetime.utcfromtimestamp(position).strftime('%H:%M'))
    axes.set_xticks(tick_positions)
    axes.set_xticklabels(tick_labels)
    plt.gcf().autofmt_xdate()
    plt.show(block=False)
def quicklook_benchmark(
    data_file,
    blh_file,
    rs_file=None,
):
    """Plot every BLH estimation stored in a benchmark file over the lidar data.

    Same as blhs_over_data, but the inputs are read directly from netcdf
    files (and with less flexibility).

    Parameters
    ----------
    data_file : str
        Path to the netcdf file containing the data
    blh_file : str
        Path to the netcdf file containing the BLH estimation. Every variable
        whose name contains "BLH" is plotted.
    rs_file : str, default=None
        Path to the radiosounding file. If given, it is read with
        `utils.extract_rs` between the first and last time of the data and
        passed on as `blh_rs`.

    Returns
    -------
    `matplotlib.pyplot figure`
        Same as kabl.graphics.blhs_over_data
    """
    location, day, lat, lon = utils.where_and_when(data_file)
    t, z, rcss = utils.extract_data(data_file, to_extract=["rcs_0"])
    rcs = rcss["rcs_0"]

    BLHS = []
    BLH_NAMES = []
    # The context manager closes the dataset even if reading fails. Values are
    # copied into plain arrays while the file is still open.
    with nc.Dataset(blh_file) as ncf:
        for key, variable in ncf.variables.items():
            if "BLH" in key:
                BLHS.append(np.array(variable))
                # Drop the first four characters of the variable name
                # (presumably a "BLH_" prefix) to get the legend label.
                BLH_NAMES.append(key[4:])

    if rs_file is not None:
        blh_rs = utils.extract_rs(rs_file, t[0], t[-1])
    else:
        blh_rs = None

    fig = blhs_over_data(
        t,
        z,
        rcs,
        BLHS,
        blhs_names=BLH_NAMES,
        blh_rs=blh_rs,
        titre="Lidar backscatter | " + location + " " + day.strftime("%Y/%m/%d"),
    )

    return fig
def quicklook_benchmark(data_file,
                        blh_file,
                        rs_file=None,
                        showFigure=True,
                        storeImages=False,
                        fmtImages=".png"):
    '''Plot every BLH estimation stored in a benchmark file over the lidar data.

    Same as blhs_over_data, but the inputs are read directly from netcdf
    files (and with less flexibility).

    [IN]
        - data_file (str): path to the netcdf file containing the data
        - blh_file (str): path to the netcdf file containing the BLH
            estimation. Every variable whose name contains "BLH" is plotted.
        - rs_file (str): path to the radiosounding file, if any. It is read
            with utils.extract_rs between the first and last time of the data.
        - showFigure (bool): forwarded to blhs_over_data
        - storeImages (bool): forwarded to blhs_over_data
        - fmtImages (str): forwarded to blhs_over_data

    [OUT]
        - (matplotlib.pyplot figure): same as blhs_over_data'''
    location, day, lat, lon = utils.where_and_when(data_file)
    t, z, rcs = utils.extract_data(data_file, to_extract=['rcs_0'])

    BLHS = []
    BLH_NAMES = []
    # The context manager closes the dataset even if reading fails. Values are
    # copied into plain arrays while the file is still open.
    with nc.Dataset(blh_file) as ncf:
        for key, variable in ncf.variables.items():
            if "BLH" in key:
                BLHS.append(np.array(variable))
                # Drop the first four characters of the variable name
                # (presumably a "BLH_" prefix) to get the legend label.
                BLH_NAMES.append(key[4:])

    if rs_file is not None:
        blh_rs = utils.extract_rs(rs_file, t[0], t[-1])
    else:
        blh_rs = None

    fig = blhs_over_data(t,
                         z,
                         rcs,
                         BLHS,
                         blhs_names=BLH_NAMES,
                         blh_rs=blh_rs,
                         titre="Lidar backscatter | " + location + " " +
                         day.strftime('%Y/%m/%d'),
                         showFigure=showFigure,
                         storeImages=storeImages,
                         fmtImages=fmtImages)

    return fig
def quicklook_output(nc_file):
    '''Plot the KABL and manufacturer BLH of an output netcdf file over its data.

    Thin wrapper around blhs_over_data reading 'rcs_0', 'blh_kabl' and 'pbl'
    from a single file.

    [IN]
        - nc_file (str): path to the netcdf file containing the data

    [OUT]
        - (matplotlib.pyplot figure): whatever blhs_over_data returns'''
    site, date, _lat, _lon = utils.where_and_when(nc_file)

    extracted = utils.extract_data(nc_file,
                                   to_extract=['rcs_0', 'blh_kabl', 'pbl'])
    times, heights, backscatter, kabl_blh, manufacturer_pbl = extracted

    # Only the first column of the manufacturer's 'pbl' field is displayed
    estimations = [kabl_blh, manufacturer_pbl[:, 0]]
    title = "Lidar backscatter | " + site + " " + date.strftime('%Y/%m/%d')

    return blhs_over_data(times,
                          heights,
                          backscatter,
                          estimations,
                          blhs_names=['BLH KABL', 'BLH manufacturer'],
                          titre=title)
def test_prepare_data_cl31():
    """Check the shapes returned by `prepare_data` on the default CL31 file.

    Only the "rcs_0" channel is used as predictor. The window holds
    `n_profiles` consecutive profiles ending at profile index 55.
    """
    n_profiles = 3
    last_profile = 55

    cl31_file = paths.file_defaultcl31data()
    times, heights, signals = utils.extract_data(cl31_file, to_extract=["rcs_0"])
    backscatter = signals["rcs_0"]

    settings = utils.get_default_params()
    settings["predictors"] = {period: ["rcs_0"] for period in ("day", "night")}

    _site, _date, latitude, longitude = utils.where_and_when(cl31_file)
    position = {
        "time": dt.datetime.utcfromtimestamp(times[last_profile]),
        "lat": latitude,
        "lon": longitude,
    }

    # Window of profiles ending at `last_profile`, clipped at the first profile
    first_profile = max(last_profile - n_profiles + 1, 0)
    window = {"rcs_0": backscatter[first_profile:last_profile + 1, :]}

    X, Z = prepare_data(position, heights, rcss=window, params=settings)

    assert X.shape == (1347, 1) and Z.shape == (1347,)
# Test of prepare_data #---------------------- print("\n --------------- Test of prepare_data") testFile = '../data_samples/lidar/DAILY_MPL_5025_20180802.nc' print(' ** Single profile **') z_values, rcs_1, rcs_2, coords = utils.extract_testprofile( testFile, profile_id=2, return_coords=True) print("z_values.shape", z_values.shape, "rcs_1.shape", rcs_1.shape, "rcs_2.shape", rcs_2.shape) X, Z = prepare_data(coords, z_values, rcs_1, rcs_2) print("X.shape=", X.shape) print("Z.shape=", Z.shape) n_profiles = 3 print(' ** Concatenated profiles ** (', n_profiles, ')') t_values, z_values, rcs_1, rcs_2 = utils.extract_data( testFile, to_extract=['rcs_1', 'rcs_2']) loc, dateofday, lat, lon = utils.where_and_when(testFile) t = 55 coords = { 'time': dt.datetime.utcfromtimestamp(t_values[t]), 'lat': lat, 'lon': lon } t_back = max(t - n_profiles + 1, 0) rcs_1 = rcs_1[t_back:t + 1, :] rcs_2 = rcs_2[t_back:t + 1, :] print("z_values.shape", z_values.shape, "rcs_1.shape", rcs_1.shape, "rcs_2.shape", rcs_2.shape) X, Z = prepare_data(coords, z_values, rcs_1, rcs_2) print("X.shape=", X.shape) print("Z.shape=", Z.shape)
# Cluster the prepared profile and plot the result. X, Z, z_values and rcs_1
# come from script code above this fragment.
# NOTE(review): 3 is presumably the number of clusters -- confirm against
# core.apply_algo.
labels = core.apply_algo(X, 3)
blh = core.blh_from_labels(labels, Z)
blhs_over_profile(z_values, rcs_1, blh, labels=labels)

plt.figure()
plt.hist(rcs_1, 35)
plt.title("Histogram of a single profile of RCS")
plt.show(block=False)

# Test of blhs_over_data
#------------------------
# Estimate the BLH for the sample file, then draw it over the data.
print("\n --------------- Test of blhs_over_data")
testFile = '../data_samples/lidar/DAILY_MPL_5025_20180802.nc'
blh = core.blh_estimation(testFile)
t_values, z_values, rcs_1, rcs_2 = utils.extract_data(testFile)
blhs_over_data(t_values, z_values, rcs_1, blh)

# Test of scatterplot_blhs
#------------------------
# Compare the first column of 'pbl' with 'blh_kabl' from the output file.
print("\n --------------- Test of scatterplot_blhs")
outputFile = '../data_samples/lidar/DAILY_MPL_5025_20180802.out.nc'
t_values, z_values, blh_new, blh_mnf = utils.extract_data(
    outputFile, to_extract=['blh_kabl', 'pbl'])
scatterplot_blhs(t_values, blh_mnf[:, 0], blh_new)

# Test of quicklook_output
#------------------------
print("\n --------------- Test of quicklook_output")
from kabl import adabl
from kabl import paths

# Usual Python packages
import pickle
import numpy as np
import datetime as dt
import pytz
import sys
import time
import netCDF4 as nc

# Read the "rcs_0" channel of the default CL31 file.
# NOTE(review): `utils` and `core` are used below but not imported in this
# fragment -- presumably imported just above; confirm.
lidarFile = paths.file_defaultcl31data()
t_values, z_values, rcss = utils.extract_data(lidarFile,
                                              max_height=4620,
                                              to_extract=["rcs_0"])
rcs_0 = rcss["rcs_0"]

# Estimation with KABL
# ----------------------
# Start from the default settings and override a few of them; the same
# single predictor is used for day and night.
params = utils.get_default_params()
params["n_clusters"] = 3
params["predictors"] = {"day": ["rcs_0"], "night": ["rcs_0"]}
params["n_profiles"] = 1
params["init"] = "advanced"

# storeInNetcdf=False: the estimation is only kept in memory
blh_kabl = core.blh_estimation(lidarFile, storeInNetcdf=False, params=params)

# Plot
# ------
def quicklook_data(nc_file, max_height=4500, with_pbl=False, with_cbh=False):
    """Display a quick look of the backscatter data of one file.

    Parameters
    ----------
    nc_file : str
        Path to the netcdf file containing the data
    max_height : {float, int}, default=4500
        Top height on the graphic (forwarded to `utils.extract_data`)
    with_pbl : bool, default=False
        If True, overlay the "pbl" field of the file (black stars)
    with_cbh : bool, default=False
        If True, overlay the "cloud_base_height" field of the file (red dots)

    Returns
    -------
    None
        The figure is displayed with ``plt.show(block=False)``.
    """
    site, date, _lat, _lon = utils.where_and_when(nc_file)

    wanted = ["rcs_0"]
    if with_pbl:
        wanted.append("pbl")
    if with_cbh:
        wanted.append("cloud_base_height")

    times, heights, fields = utils.extract_data(
        nc_file, max_height=max_height, to_extract=wanted
    )
    backscatter = fields["rcs_0"]

    plt.figure(figsize=(14, 7))
    plt.pcolormesh(
        times, heights, backscatter.T, alpha=0.8, cmap="rainbow", vmin=-0.1, vmax=0.8
    )

    if with_pbl:
        boundary_layer = fields["pbl"]
        # -999 is replaced by NaN so that those points are not drawn
        boundary_layer[boundary_layer == -999] = np.nan
        for column in range(boundary_layer.shape[1]):
            plt.plot(times, boundary_layer[:, column], "k*")

    if with_cbh:
        cloud_base = fields["cloud_base_height"]
        cloud_base[cloud_base == -999] = np.nan
        for column in range(cloud_base.shape[1]):
            plt.plot(times, cloud_base[:, column], "r.")

    axes = plt.gca()
    plt.title("Lidar backscatter | " + site + " " + date.strftime("%Y/%m/%d"))
    axes.set_xlabel("Hour")
    axes.set_ylabel("Height (m agl)")
    plt.tight_layout()
    plt.grid(color="white", ls="solid")
    plt.colorbar(label="Range corrected signal", alpha=0.8)

    # Replace the numeric tick positions (timestamps) by HH:MM labels
    tick_positions, _ = plt.xticks()
    tick_labels = []
    for position in tick_positions:
        tick_labels.append(dt.datetime.utcfromtimestamp(position).strftime("%H:%M"))
    axes.set_xticks(tick_positions)
    axes.set_xticklabels(tick_labels)
    plt.gcf().autofmt_xdate()
    plt.show(block=False)
from kabl import utils
from kabl import graphics
from kabl import adabl

# Usual Python packages
import pickle
import numpy as np
import datetime as dt
import pytz
import sys
import time
import netCDF4 as nc

# Sample input files (paths are relative to the working directory)
lidarFile = '../data_samples/lidar/DAILY_MPL_5025_20180802.nc'
rsFile = '../data_samples/radiosoundings/BLH_RS_liss3_BRNliss10_BREST.nc'

# Read both backscatter channels and the 'pbl' field of the lidar file
t_values, z_values, rcs_1, rcs_2, blh_mnf = utils.extract_data(
    lidarFile, max_height=4620, to_extract=['rcs_1', 'rcs_2', 'pbl'])

# Estimation with KABL
#----------------------
# Every setting is given explicitly here, instead of starting from default
# parameters. The same single predictor is used for day and night.
params = dict()
params['algo'] = 'kmeans'
params['n_clusters'] = 3
params['predictors'] = {'day': ['rcs_1'], 'night': ['rcs_1']}
params['classif_score'] = 'db'
params['n_inits'] = 1
params['n_profiles'] = 1
params['max_k'] = 6
params['init'] = 'given'
params['cov_type'] = 'full'
params['max_height'] = 4500
params['sunrise_shift'] = 1
def adabl_qualitymetrics(
    dataFile: str,
    modelFile: str,
    scalerFile: str,
    refFile: str = "indus",
    outputFile: str = "None",
    addResultsToNetcdf: bool = False,
):
    r"""Estimate the BLH with ADABL for one day and compare it to a reference.

    The estimation is the same as in `adabl_blh_estimation`, but the outputs
    are quality metrics measured against a reference BLH.

    Parameters
    ----------
    dataFile : str
        Path to the input file, as generated by raw2l1
    modelFile : str
        Path to the model file (pickle object)
    scalerFile : str
        Path to the scaler file (pickle object)
    refFile : str
        Path to the reference BLH estimation, readable by `numpy.loadtxt`.
        If it is not an existing file (as with the default value), the first
        column of the "pbl" field of `dataFile` is the reference.
    outputFile : str
        Path to the output file. Must be specified if addResultsToNetcdf=True
    addResultsToNetcdf : bool, default=False
        If True, the ADABL estimation is added to the existing result file
        specified in outputFile, under the name "BLH_ADABL"

    Returns
    -------
    errl2_blh : float
        Root mean squared gap between BLH from ADABL and the reference
        .. math:: \sqrt{1/N \sum_i^N (Z(i)-Zref(i))^2}
    errl1_blh : float
        Mean absolute gap between BLH from ADABL and the reference
        .. math:: 1/N \sum_i^N \vert Z(i)-Zref(i) \vert
    errl0_blh : float
        Maximum absolute gap between BLH from ADABL and the reference
        .. math:: \max_i \vert Z(i)-Zref(i) \vert
    corr_blh : float
        Correlation coefficient between BLH from ADABL and the reference
    chrono : float
        Computation time for the full day (seconds)
    n_invalid : int
        Number of BLH estimation at NaN or Inf
    """
    t0 = time.time()  #::::::::::::::::::::::

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(dataFile)
    t_values, z_values, dat = utils.extract_data(
        dataFile, to_extract=["rcs_1", "rcs_2", "pbl"]
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]

    sec_intheday = np.mod(t_values, 24 * 3600)
    Nt, Nz = rcs_1.shape

    # Load pre-trained model
    # ------------------------
    # NOTE: pickle.load can execute arbitrary code; only load model and scaler
    # files coming from a trusted source.
    with open(modelFile, "rb") as fc:
        model = pickle.load(fc)
    with open(scalerFile, "rb") as fc:
        scaler = pickle.load(fc)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "ADABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profile of the day
    for t in range(Nt):
        # toolbar: '!' flags a NaN among recent estimations, '*' otherwise
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        rcs1loc = rcs_1[t, :]
        rcs2loc = rcs_2[t, :]
        # Non-positive values are floored so that log10 is defined
        rcs1loc[rcs1loc <= 0] = 1e-5
        rcs2loc[rcs2loc <= 0] = 1e-5

        X_new = np.array(
            [
                np.repeat(sec_intheday[t], Nz),
                z_values,
                np.log10(rcs1loc),
                np.log10(rcs2loc),
            ]
        ).T
        X_new = scaler.transform(X_new)

        # 3. Apply the machine learning algorithm
        # ---------------------
        y_new = model.predict(X_new)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(y_new, z_values))

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    blh = np.array(blh)

    if os.path.isfile(refFile):
        blh_ref = np.loadtxt(refFile)
    else:
        blh_ref = blh_mnf[:, 0]

    if addResultsToNetcdf:
        msg = add_blhs_to_netcdf(outputFile, [blh], ["BLH_ADABL"])
        print(msg)

    gap = np.abs(blh - blh_ref)
    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref) ** 2))
    errl1_blh = np.nanmean(gap)
    errl0_blh = np.nanmax(gap)
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return errl2_blh, errl1_blh, errl0_blh, corr_blh, chrono, n_invalid
def prepare_supervised_dataset(
    dataFiles: list,
    refFiles: list,
    saveInCSV: bool = False,
    outputFile: str = None,
    plot_on: bool = False,
):
    """Create a dataframe with appropriate fields from original data format.

    Lidar data is expected to be provided in raw2l1 files and handmade BLH
    estimation is expected in .csv file with a header and a "blh_ref" column.
    Paths are given in lists in order to easily add multiple days: the i-th
    reference file goes with the i-th data file.

    Parameters
    ----------
    dataFiles : list of str
        Paths to the data input files, as generated by raw2l1
    refFiles : list of str
        Paths to the reference files (handmade BLH estimation) in CSV format
    saveInCSV : bool, default=False
        If True, the dataset is saved in a .csv file at the specified location
    outputFile : str, default=None
        Path to the file where the dataset is stored, if saveInCSV=True.
        Default is given by `paths.file_labelleddataset()`.
    plot_on : bool, default=False
        If True, display the handmade BLH over the data.

    Returns
    -------
    df : `pandas.DataFrame`
        Ready-to-use dataframe for ADABL training. Contains 5 columns of input
        data ("sec0", "alti", "rcs0", "rcs1", "rcs2") and one column of output
        binary data ("isBL", 1 where the altitude is above the reference BLH).
    """
    # Value given to non-positive backscatter before taking log10. It is the
    # same for every channel, matching the floor applied at estimation time.
    rcs_floor = 1e-5

    RCS0 = []
    RCS1 = []
    RCS2 = []
    SEC0 = []
    ALTI = []
    y = []
    for i, dataFile in enumerate(dataFiles):
        refFile = refFiles[i]
        print("Reading file ", dataFile, "with reference", refFile)
        t_values, z_values, dat = utils.extract_data(
            dataFile, max_height=4620, to_extract=["rcs_0", "rcs_1", "rcs_2", "pbl"]
        )
        rcs_0 = dat["rcs_0"]
        rcs_1 = dat["rcs_1"]
        rcs_2 = dat["rcs_2"]

        blh_ref = pd.read_csv(refFile, delimiter=",", header=0)
        blh_ref = blh_ref["blh_ref"].values

        if plot_on:
            graphics.blhs_over_data(t_values, z_values, rcs_0, blh_ref)

        # Input data
        # ----------
        sec_intheday = np.mod(t_values, 24 * 3600)
        Nt, Nz = rcs_1.shape

        for channel, store in ((rcs_0, RCS0), (rcs_1, RCS1), (rcs_2, RCS2)):
            flat = channel.ravel()
            flat[flat <= 0] = rcs_floor
            store.append(np.log10(flat))

        # Same (time, altitude) ordering as the flattened backscatter
        SEC0.append(np.repeat(sec_intheday, Nz))
        ALTI.append(np.tile(z_values, Nt))

        # Output data
        # -----------
        yday = []
        for t in range(Nt):
            yloc = np.zeros(Nz)
            yloc[z_values > blh_ref[t]] = 1
            yday.append(yloc)
        y.append(np.array(yday, dtype=int).ravel())

    # Create dataframe
    # ------------------
    df = pd.DataFrame(
        {
            "sec0": np.concatenate(SEC0),
            "alti": np.concatenate(ALTI),
            "rcs0": np.concatenate(RCS0),
            "rcs1": np.concatenate(RCS1),
            "rcs2": np.concatenate(RCS2),
            "isBL": np.concatenate(y),
        }
    )

    if saveInCSV:
        if outputFile is None:
            outputFile = paths.file_labelleddataset()
        df.to_csv(outputFile, index=False)
        print("Dataset for ADABL is saved in", outputFile)

    return df
def adabl_blh_estimation(
    dataFile: str,
    modelFile: str,
    scalerFile: str,
    outputFile: str = None,
    storeInNetcdf: bool = False,
):
    """Perform BLH estimation with ADABL on all profiles of the day.

    Optionally, the estimation is written into a copy of the netcdf file.

    Parameters
    ----------
    dataFile : str
        Path to the input file, as generated by raw2l1
    modelFile : str
        Path to the model file (pickle object)
    scalerFile : str
        Path to the scaler file (pickle object)
    outputFile : str, default=None
        Path to the output file. Default replaces the last three characters
        of `dataFile` (expected to be ".nc") by ".out.nc"
    storeInNetcdf : bool, default=False
        If True, the BLH estimation is stored in the outputFile through
        `utils.add_blh_to_netcdf` with origin="adabl"

    Returns
    -------
    blh : ndarray of shape (Nt,)
        Time series of BLH as estimated by the ADABL algorithm.
    """
    t0 = time.time()  #::::::::::::::::::::::

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(dataFile)
    t_values, z_values, dat = utils.extract_data(
        dataFile, to_extract=["rcs_1", "rcs_2", "pbl"]
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]

    sec_intheday = np.mod(t_values, 24 * 3600)
    Nt, Nz = rcs_1.shape

    # Load pre-trained model
    # ------------------------
    # NOTE: pickle.load can execute arbitrary code; only load model and scaler
    # files coming from a trusted source.
    with open(modelFile, "rb") as fc:
        model = pickle.load(fc)
    with open(scalerFile, "rb") as fc:
        scaler = pickle.load(fc)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "ADABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profile of the day
    for t in range(Nt):
        # toolbar: '!' flags a NaN among recent estimations, '*' otherwise
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        rcs1loc = rcs_1[t, :]
        rcs2loc = rcs_2[t, :]
        # Non-positive values are floored so that log10 is defined
        rcs1loc[rcs1loc <= 0] = 1e-5
        rcs2loc[rcs2loc <= 0] = 1e-5

        X_new = np.array(
            [
                np.repeat(sec_intheday[t], Nz),
                z_values,
                np.log10(rcs1loc),
                np.log10(rcs2loc),
            ]
        ).T
        X_new = scaler.transform(X_new)

        # 3. Apply the machine learning algorithm
        # ---------------------
        y_new = model.predict(X_new)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(y_new, z_values))

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        outputFile = dataFile[:-3] + ".out.nc"

    # 5. Store the new BLH estimation into a copy of the original netCDF
    if storeInNetcdf:
        utils.add_blh_to_netcdf(dataFile, outputFile, blh, origin="adabl")

    return np.array(blh)
def blh_estimation(inputFile,
                   outputFile=None,
                   storeInNetcdf=True,
                   params=None):
    '''Perform BLH estimation on all profiles of the day and write it into a
    copy of the netcdf file.

    [IN]
        - inputFile (str): path to the input file, as generated by raw2l1
        - outputFile (str): path to the output file. Default replaces the last
            three characters of inputFile (expected to be ".nc") by ".out.nc"
        - storeInNetcdf (bool): if True, the BLH estimation is stored in the
            outputFile through utils.add_blh_to_netcdf
        - params (dict): dict of parameters. Default is given by
            utils.get_default_params(). This function reads 'n_clusters' and
            'n_profiles'.

    [OUT]
        - blh (np.array[Nt]): time series of BLH as estimated by the KABL algorithm.
    '''
    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    #---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, rcs_1, rcs_2 = utils.extract_data(inputFile,
                                                          params=params)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("KABL estimation (" + loc +
                     dateofday.strftime(', %Y/%m/%d') + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profile of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        #---------------------
        coords = {
            'time': dt.datetime.utcfromtimestamp(t_values[t]),
            'lat': lat,
            'lon': lon
        }
        # Window of at most params['n_profiles'] profiles ending at profile t
        t_back = max(t - params['n_profiles'] + 1, 0)
        X, Z = prepare_data(coords, z_values, rcs_1[t_back:t + 1, :],
                            rcs_2[t_back:t + 1, :], params)

        # 3. Apply the machine learning algorithm
        #---------------------
        # No classification score is computed in the fixed-K branch: its value
        # was never used and scoring a single-label clustering would raise.
        if isinstance(params['n_clusters'], int):
            labels = apply_algo(X, params['n_clusters'], params=params)
        else:
            labels, _n_clusters, _classif_score = apply_algo_k_auto(
                X, params=params)

        # 4. Derive and store the BLH
        #---------------------
        blh.append(blh_from_labels(labels, Z))

    if outputFile is None:
        outputFile = inputFile[:-3] + ".out.nc"

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    # 5. Store the new BLH estimation into a copy of the original netCDF
    if storeInNetcdf:
        utils.add_blh_to_netcdf(inputFile, outputFile, blh)

    return np.array(blh)
def kabl_qualitymetrics(inputFile,
                        outputFile=None,
                        reference='None',
                        rsFile='None',
                        storeResults=True,
                        params=None):
    '''Copy of blh_estimation including calculus and storage of scores

    [IN]
        - inputFile (str): path to the input file, as generated by raw2l1
        - outputFile (str): path to the output file. Default is
            "DAILY_BENCHMARK_" + <file name of inputFile without its first 10
            characters and its extension> + ".nc", in the working directory.
        - reference (str): path to the reference file (readable by np.loadtxt),
            if any. Otherwise the first column of 'pbl' is the reference.
        - rsFile (str): path to the radiosounding estimations, if any.
            Currently not used by this function; kept for backward compatibility.
        - storeResults (bool): if True, BLH estimations, scores and masks are
            stored in the outputFile through utils.save_qualitymetrics
        - params (dict): dict of parameters. Default is given by
            utils.get_default_params(). This function reads 'n_clusters' and
            'n_profiles'.

    [OUT]
        - errl2_blh (float): root mean squared gap between BLH from KABL and the reference
        - errl1_blh (float): mean absolute gap between BLH from KABL and the reference
        - errl0_blh (float): maximum absolute gap between BLH from KABL and the reference
        - corr_blh (float): correlation coefficient between BLH from KABL and the reference
        - ch_score (float): mean over all day Calinski-Harabasz score (the higher, the better)
        - db_scores (float): mean over all day Davies-Bouldin score (the lower, the better)
        - s_scores (float): mean over all day silhouette score (the higher, the better)
        - chrono (float): computation time for the full day (seconds)
        - n_invalid (int): number of BLH estimation at NaN or Inf
    '''
    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    #---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, rcs_1, rcs_2, blh_mnf, rr, vv, cbh = utils.extract_data(
        inputFile,
        to_extract=['rcs_1', 'rcs_2', 'pbl', 'rr', 'vv', 'b1'],
        params=params)

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("KABL estimation (" + loc +
                     dateofday.strftime(', %Y/%m/%d') + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profile of the day
    for t in range(len(t_values)):
        # toolbar: '!' flags a NaN among recent estimations, '*' otherwise
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        #---------------------
        coords = {
            'time': dt.datetime.utcfromtimestamp(t_values[t]),
            'lat': lat,
            'lon': lon
        }
        # Window of at most params['n_profiles'] profiles ending at profile t
        t_back = max(t - params['n_profiles'] + 1, 0)
        X, Z = prepare_data(coords,
                            z_values,
                            rcs_1[t_back:t + 1, :],
                            rcs_2[t_back:t + 1, :],
                            params=params)

        # 3. Apply the machine learning algorithm
        #---------------------
        if isinstance(params['n_clusters'], int):
            n_clusters = params['n_clusters']
            labels = apply_algo(X, params['n_clusters'], params=params)

            # Compute classification score (undefined with a single cluster)
            if len(np.unique(labels)) > 1:
                # errstate silences the "divide by zero" RuntimeWarning
                with np.errstate(divide='ignore', invalid='ignore'):
                    db_score = davies_bouldin_score(X, labels)
                s_score = silhouette_score(X, labels)
                ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(
                X, params=params)

        # 4. Derive and store the BLH
        #---------------------
        blh.append(blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        # basename handles the path separator of the running platform
        fname = os.path.basename(inputFile)
        outputFile = "DAILY_BENCHMARK_" + fname[10:-3] + ".nc"

    blh = np.array(blh)

    has_reference = os.path.isfile(reference)
    if has_reference:
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [blh, np.array(blh_mnf[:, 0])]
        BLH_NAMES = ['BLH_KABL', 'BLH_INDUS']
        if has_reference:
            BLHS.append(blh_ref)
            BLH_NAMES.append('BLH_REF')

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(outputFile, t_values, BLHS, BLH_NAMES,
                                        [s_scores, db_scores, ch_scores],
                                        ['SILH', 'DB', 'CH'], [rr, vv],
                                        ['MASK_RAIN', 'MASK_FOG'], K_values,
                                        chrono, params)
        # msg only exists when results are stored
        print(msg)

    gap = np.abs(blh - blh_ref)
    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref)**2))
    errl1_blh = np.nanmean(gap)
    errl0_blh = np.nanmax(gap)
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return (errl2_blh, errl1_blh, errl0_blh, corr_blh, np.mean(ch_scores),
            np.mean(db_scores), np.mean(s_scores), chrono, n_invalid)
def kabl_qualitymetrics(
    inputFile,
    outputFile=None,
    reference="None",
    rsFile="None",
    storeResults=True,
    params=None,
):
    r"""Estimate quality metrics of KABL for one day of measurement.

    This function perform the BLH estimation as in kabl.core.blh_estimation
    but its output are the quality metrics, not the BLH estimation. As the
    estimation of quality metrics is greedier this function is noticeably
    longer to execute.

    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1
    outputFile : str, default=None
        Path to the output file. Default is built in `paths.resultrootdir`
        from the file name of `inputFile`.
    reference : str, default="None"
        Path to handmade BLH estimation (readable by `numpy.loadtxt`), if any,
        which will serve as reference. Otherwise the first column of the "pbl"
        field of `inputFile` is the reference.
    rsFile : str, default="None"
        Path to the radiosounding estimations, if any. Currently not used by
        this function; kept for backward compatibility.
    storeResults : bool, default=True
        If True, quality metrics are stored in the `outputFile`
    params : dict, default=None
        Dict with all settings. Default is given by
        `utils.get_default_params()`. This function reads 'n_clusters' and
        'n_profiles'.

    Returns
    -------
    errl2_blh : float
        Root mean squared gap between BLH from KABL and the reference
        .. math:: \sqrt{1/N \sum_i^N (Z(i)-Zref(i))^2}
    errl1_blh : float
        Mean absolute gap between BLH from KABL and the reference
        .. math:: 1/N \sum_i^N \vert Z(i)-Zref(i) \vert
    errl0_blh : float
        Maximum absolute gap between BLH from KABL and the reference
        .. math:: \max_i \vert Z(i)-Zref(i) \vert
    corr_blh : float
        Correlation coefficient between BLH from KABL and the reference
    ch_score : float
        Average Calinski-Harabasz score (the higher, the better) over the full day
    db_scores : float
        Average Davies-Bouldin score (the lower, the better) over the full day
    s_scores : float
        Average silhouette score (the higher, the better) over the full day
    chrono : float
        Computation time for the full day (seconds)
    n_invalid : int
        Number of BLH estimation at NaN or Inf
    """
    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, dat = utils.extract_data(
        inputFile, to_extract=["rcs_1", "rcs_2", "pbl", "rr", "vv", "b1"], params=params
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]
    rr = dat["rr"]
    vv = dat["vv"]
    cbh = dat["b1"]

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "\nKABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profile of the day
    for t in range(len(t_values)):
        # toolbar: '!' flags a NaN among recent estimations, '*' otherwise
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(t_values[t]),
            "lat": lat,
            "lon": lon,
        }
        # Window of at most params["n_profiles"] profiles ending at profile t
        t_back = max(t - params["n_profiles"] + 1, 0)
        X, Z = prepare_data(
            coords,
            z_values,
            rcss={
                "rcs_1": rcs_1[t_back : t + 1, :],
                "rcs_2": rcs_2[t_back : t + 1, :],
            },
            params=params,
        )

        # 3. Apply the machine learning algorithm
        # ---------------------
        if isinstance(params["n_clusters"], int):
            n_clusters = params["n_clusters"]
            labels = apply_algo(X, params["n_clusters"], params=params)

            # Compute classification score (undefined with a single cluster)
            if len(np.unique(labels)) > 1:
                # errstate silences the "divide by zero" RuntimeWarning
                with np.errstate(divide="ignore", invalid="ignore"):
                    db_score = davies_bouldin_score(X, labels)
                s_score = silhouette_score(X, labels)
                ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(
                X, params=params
            )

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        fname = os.path.split(inputFile)[-1]
        outputFile = os.path.join(
            paths.resultrootdir, "DAILY_BENCHMARK_" + fname[10:-3] + ".nc"
        )

    blh = np.array(blh)

    has_reference = os.path.isfile(reference)
    if has_reference:
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [blh, np.array(blh_mnf[:, 0])]
        BLH_NAMES = ["BLH_KABL", "BLH_INDUS"]
        if has_reference:
            BLHS.append(blh_ref)
            BLH_NAMES.append("BLH_REF")

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(
            outputFile,
            t_values,
            BLHS,
            BLH_NAMES,
            [s_scores, db_scores, ch_scores],
            ["SILH", "DB", "CH"],
            [rr, vv],
            ["MASK_RAIN", "MASK_FOG"],
            K_values,
            chrono,
            params,
        )
        # msg only exists when results are stored
        print(msg)

    gap = np.abs(blh - blh_ref)
    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref) ** 2))
    errl1_blh = np.nanmean(gap)
    errl0_blh = np.nanmax(gap)
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return (
        errl2_blh,
        errl1_blh,
        errl0_blh,
        corr_blh,
        np.mean(ch_scores),
        np.mean(db_scores),
        np.mean(s_scores),
        chrono,
        n_invalid,
    )
# Cluster the prepared profile and plot the result. X, Z, z_values and rcs_1
# come from script code above this fragment.
# NOTE(review): 3 is presumably the number of clusters -- confirm against
# core.apply_algo.
labels = core.apply_algo(X, 3)
blh = utils.blh_from_labels(labels, Z)
blhs_over_profile(z_values, rcs_1, blh, labels=labels)

plt.figure()
plt.hist(rcs_1, 35)
plt.title("Histogram of a single profile of RCS")
plt.show(block=False)

# Test of blhs_over_data
# ------------------------
# Estimate the BLH for the default lidar file, then draw it over the data.
print("\n --------------- Test of blhs_over_data")
testFile = paths.file_defaultlidardata()
blh = core.blh_estimation(testFile)
t_values, z_values, rcss = utils.extract_data(testFile)
rcs_1 = rcss["rcs_1"]
rcs_2 = rcss["rcs_2"]
blhs_over_data(t_values, z_values, rcs_1, blh)

# Test of scatterplot_blhs
# ------------------------
# Compare the first column of "pbl" with "blh_kabl" from the default output file.
print("\n --------------- Test of scatterplot_blhs")
outputFile = paths.file_defaultoutput()
t_values, z_values, dat = utils.extract_data(
    outputFile, to_extract=["blh_kabl", "pbl"]
)
blh_new = dat["blh_kabl"]
blh_mnf = dat["pbl"]
scatterplot_blhs(t_values, blh_mnf[:, 0], blh_new)
def blh_estimation_returnlabels(
    inputFile, outputFile=None, storeInNetcdf=False, params=None
):
    """Perform BLH estimation on all profiles of the day and return the labels
    of the classification.

    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1

    outputFile : str, default=None
        Path to the output file. Only used when `storeInNetcdf` is True.
        If None, the path given by `paths.file_defaultoutput()` is used.

    storeInNetcdf : bool, default=False
        If True, the BLH estimation is written to `outputFile` by
        `utils.add_blh_to_netcdf`

    params : dict, default=None
        Dict with all settings. If None, `utils.get_default_params()` is used.
        This function reads 'predictors', 'n_profiles' and 'n_clusters'.


    Returns
    -------
    blh : ndarray
        Time series of BLH as estimated by the KABL algorithm (one value per
        profile)

    zoneID : ndarray
        Cluster labels of every profile (one entry per profile)
    """

    t0 = time.time()

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    needed_data = np.unique(np.concatenate(list(params["predictors"].values())))
    t_values, z_values, extracted = utils.extract_data(
        inputFile, to_extract=needed_data, params=params
    )
    # Keep every requested predictor, whatever its name, instead of a
    # hard-coded list of channels. `np.unique` returns sorted names, so the
    # insertion order is deterministic.
    channels = {str(name): extracted[str(name)] for name in needed_data}

    # Loop-invariant settings
    n_profiles = params["n_profiles"]
    fixed_n_clusters = isinstance(params["n_clusters"], int)

    blh = []
    zoneID = []

    # setup toolbar: one '.' placeholder per 10 profiles
    toolbar_width = len(t_values) // 10 + 1
    sys.stdout.write(
        "\nKABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profiles of the day
    for t, timestamp in enumerate(t_values):
        # toolbar
        if t % 10 == 0:
            sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(timestamp),
            "lat": lat,
            "lon": lon,
        }
        # Window of the last `n_profiles` profiles ending at t, clamped at
        # the start of the day (so the first windows are shorter).
        t_back = max(t - n_profiles + 1, 0)
        window = {
            name: values[t_back : t + 1, :] for name, values in channels.items()
        }

        X, Z = prepare_data(coords, z_values, rcss=window, params=params)

        # 3. Apply the machine learning algorithm
        # ---------------------
        if fixed_n_clusters:
            labels = apply_algo(X, params["n_clusters"], params=params)
        else:
            # Number of clusters chosen automatically; only the labels are
            # needed here.
            labels, _n_clusters, _classif_score = apply_algo_k_auto(
                X, params=params
            )

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))
        zoneID.append(labels)

    # end toolbar
    chrono = time.time() - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    # 5. Store the new BLH estimation into a copy of the original netCDF
    # ---------------------
    if storeInNetcdf:
        if outputFile is None:
            outputFile = paths.file_defaultoutput()
        utils.add_blh_to_netcdf(inputFile, outputFile, blh)

    # NOTE(review): np.array(zoneID) assumes every profile yields the same
    # number of labels -- confirm for n_profiles > 1, where the first windows
    # contain fewer profiles.
    return np.array(blh), np.array(zoneID)