def blh_estimation(inputFile, outputFile=None, storeInNetcdf=True, params=None):
    """Perform BLH estimation on all profiles of the day and write it into
    a copy of the netcdf file.

    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1
    outputFile : str, default=None
        Path to the output file. Default adds ".out" before ".nc"
    storeInNetcdf : bool, default=True
        If True, the field 'blh_kabl', containing the BLH estimation, is
        stored in the outputFile
    params : dict, default=None
        Dict with all settings. This function depends on 'n_clusters'

    Returns
    -------
    blh : ndarray of shape (Nt,)
        Time series of BLH as estimated by the KABL algorithm.

    Example
    -------
    >>> from kabl import paths
    >>> from kabl import core
    >>> testFile = paths.file_defaultlidardata()
    >>> blh = core.blh_estimation(testFile)
    """

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    needed_data = np.unique(np.concatenate(list(params["predictors"].values())))
    t_values, z_values, rcss = utils.extract_data(
        inputFile, to_extract=needed_data, params=params
    )
    if "rcs_0" in needed_data:
        rcs_0 = rcss["rcs_0"]
    if "rcs_1" in needed_data:
        rcs_1 = rcss["rcs_1"]
    if "rcs_2" in needed_data:
        rcs_2 = rcss["rcs_2"]

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "\nKABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profiles of the day
    for t in range(len(t_values)):

        # toolbar
        if np.mod(t, 10) == 0:
            sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(t_values[t]),
            "lat": lat,
            "lon": lon,
        }
        t_back = max(t - params["n_profiles"] + 1, 0)
        rcss = {}
        if "rcs_0" in needed_data:
            rcss["rcs_0"] = rcs_0[t_back : t + 1, :]
        if "rcs_1" in needed_data:
            rcss["rcs_1"] = rcs_1[t_back : t + 1, :]
        if "rcs_2" in needed_data:
            rcss["rcs_2"] = rcs_2[t_back : t + 1, :]

        X, Z = prepare_data(coords, z_values, rcss=rcss, params=params)

        # 3. Apply the machine learning algorithm
        # ---------------------
        if isinstance(params["n_clusters"], int):
            labels = apply_algo(X, params["n_clusters"], params=params)
        else:
            labels, n_clusters, classif_score = apply_algo_k_auto(X, params=params)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))

    if outputFile is None:
        outputFile = paths.file_defaultoutput()

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    # 5. Store the new BLH estimation into a copy of the original netCDF
    # ---------------------
    if storeInNetcdf:
        utils.add_blh_to_netcdf(inputFile, outputFile, blh)

    return np.array(blh)
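
# Usage sketch (illustrative, not part of the library API): runs the KABL
# estimation with a fixed number of clusters instead of the automatic
# selection. Setting params["n_clusters"] to an int forces the apply_algo()
# branch of the loop above; the key is assumed to exist in the dict returned
# by utils.get_default_params(), as the docstring states.
def _example_blh_estimation_fixed_k():
    from kabl import paths, utils

    params = utils.get_default_params()
    params["n_clusters"] = 3  # int => apply_algo() instead of apply_algo_k_auto()
    testFile = paths.file_defaultlidardata()
    # Skip writing the netCDF copy; just return the time series
    blh = blh_estimation(testFile, storeInNetcdf=False, params=params)
    print("Nt =", blh.size, "| first BLH values (m agl):", blh[:5])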
def adabl_qualitymetrics(
    dataFile: str,
    modelFile: str,
    scalerFile: str,
    refFile: str = "indus",
    outputFile: str = None,
    addResultsToNetcdf: bool = False,
):
    r"""Perform BLH estimation with ADABL on all profiles of the day and
    compare it to a reference.

    Parameters
    ----------
    dataFile : str
        Path to the input file, as generated by raw2l1
    modelFile : str
        Path to the model file (pickle object)
    scalerFile : str
        Path to the scaler file (pickle object)
    refFile : str
        Path to the reference BLH estimation (handmade or manufacturer's).
        Default is the manufacturer's.
    outputFile : str, default=None
        Path to the output file. Must be specified if addResultsToNetcdf=True
    addResultsToNetcdf : bool, default=False
        If True, adds the quality metrics to the existing result file
        specified in outputFile

    Returns
    -------
    errl2_blh : float
        Root mean squared gap between BLH from ADABL and the reference

        .. math:: \sqrt{\frac{1}{N} \sum_{i=1}^{N} (Z(i)-Z_{ref}(i))^2}
    errl1_blh : float
        Mean absolute gap between BLH from ADABL and the reference

        .. math:: \frac{1}{N} \sum_{i=1}^{N} \vert Z(i)-Z_{ref}(i) \vert
    errl0_blh : float
        Maximum absolute gap between BLH from ADABL and the reference

        .. math:: \max_i \vert Z(i)-Z_{ref}(i) \vert
    corr_blh : float
        Correlation coefficient between BLH from ADABL and the reference
    chrono : float
        Computation time for the full day (seconds)
    n_invalid : int
        Number of BLH estimates equal to NaN or Inf
    """

    t0 = time.time()  #::::::::::::::::::::::

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(dataFile)
    t_values, z_values, dat = utils.extract_data(
        dataFile, to_extract=["rcs_1", "rcs_2", "pbl"]
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]
    sec_intheday = np.mod(t_values, 24 * 3600)
    Nt, Nz = rcs_1.shape

    # Load pre-trained model and scaler
    # ------------------------
    with open(modelFile, "rb") as fc:
        model = pickle.load(fc)
    with open(scalerFile, "rb") as fc:
        scaler = pickle.load(fc)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "ADABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profiles of the day
    for t in range(Nt):

        # toolbar: '!' flags NaN among the last estimates, '*' means all valid
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        rcs1loc = rcs_1[t, :]
        rcs2loc = rcs_2[t, :]
        # clip non-positive values so that log10 is defined
        rcs1loc[rcs1loc <= 0] = 1e-5
        rcs2loc[rcs2loc <= 0] = 1e-5
        X_new = np.array(
            [
                np.repeat(sec_intheday[t], Nz),
                z_values,
                np.log10(rcs1loc),
                np.log10(rcs2loc),
            ]
        ).T
        X_new = scaler.transform(X_new)

        # 3. Apply the machine learning algorithm
        # ---------------------
        y_new = model.predict(X_new)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(y_new, z_values))

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    blh = np.array(blh)

    if os.path.isfile(refFile):
        blh_ref = np.loadtxt(refFile)
    else:
        blh_ref = blh_mnf[:, 0]

    if addResultsToNetcdf:
        BLHS = [blh]
        BLH_NAMES = ["BLH_ADABL"]
        msg = utils.add_blhs_to_netcdf(outputFile, BLHS, BLH_NAMES)
        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref) ** 2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return errl2_blh, errl1_blh, errl0_blh, corr_blh, chrono, n_invalid
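
# Usage sketch (file paths are hypothetical): scores a pre-trained ADABL model
# against the manufacturer's BLH, which is the default reference when refFile
# does not point to an existing file. The pickles must match the 4-feature
# layout built above: (seconds in day, altitude, log10(rcs_1), log10(rcs_2)).
def _example_adabl_qualitymetrics():
    errl2, errl1, errl0, corr, chrono, n_invalid = adabl_qualitymetrics(
        "lidar_day.nc",      # hypothetical raw2l1 output
        "adabl_model.pkl",   # hypothetical pickled classifier
        "adabl_scaler.pkl",  # hypothetical pickled scaler
    )
    print(
        "RMSE=%.1f m | MAE=%.1f m | max=%.1f m | corr=%.2f | %.2f s | %d invalid"
        % (errl2, errl1, errl0, corr, chrono, n_invalid)
    )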
def quicklook_data(nc_file, max_height=4500, with_pbl=False, with_cbh=False):
    """Give a quick look at the data, only the data.

    Parameters
    ----------
    nc_file : str
        Path to the netcdf file containing the data
    max_height : {float, int}, default=4500
        Top height on the graphic
    with_pbl : bool, default=False
        If True, overlay the boundary layer heights calculated by the
        manufacturer
    with_cbh : bool, default=False
        If True, overlay the first cloud base height calculated by the
        manufacturer

    Returns
    -------
    None
    """

    location, day, lat, lon = utils.where_and_when(nc_file)

    to_be_extracted = ["rcs_0"]
    if with_pbl:
        to_be_extracted.append("pbl")
    if with_cbh:
        to_be_extracted.append("cloud_base_height")

    t, z, dat = utils.extract_data(
        nc_file, max_height=max_height, to_extract=to_be_extracted
    )

    rcs = dat["rcs_0"]
    if "pbl" in to_be_extracted:
        pbl = dat["pbl"]
    if "cloud_base_height" in to_be_extracted:
        cbh = dat["cloud_base_height"]

    plt.figure(figsize=(14, 7))
    plt.pcolormesh(t, z, rcs.T, alpha=0.8, cmap="rainbow", vmin=-0.1, vmax=0.8)
    if with_pbl:
        pbl[pbl == -999] = np.nan  # -999 is the missing-value flag
        for layer in range(pbl.shape[1]):
            plt.plot(t, pbl[:, layer], "k*")
    if with_cbh:
        cbh[cbh == -999] = np.nan
        for layer in range(cbh.shape[1]):
            plt.plot(t, cbh[:, layer], "r.")
    axes = plt.gca()
    plt.title("Lidar backscatter | " + location + " " + day.strftime("%Y/%m/%d"))
    axes.set_xlabel("Hour")
    axes.set_ylabel("Height (m agl)")
    plt.tight_layout()
    plt.grid(color="white", ls="solid")
    plt.colorbar(label="Range corrected signal", alpha=0.8)

    # convert UNIX timestamps on the x-axis into HH:MM labels
    locs, labels = plt.xticks()
    labels = [dt.datetime.utcfromtimestamp(loc).strftime("%H:%M") for loc in locs]
    axes.set_xticks(locs)
    axes.set_xticklabels(labels)
    plt.gcf().autofmt_xdate()

    plt.show(block=False)
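
# Usage sketch: plot the default lidar file with the manufacturer's PBL and
# cloud base overlays. paths.file_defaultlidardata() is reused from the
# blh_estimation docstring example above.
def _example_quicklook():
    from kabl import paths

    testFile = paths.file_defaultlidardata()
    quicklook_data(testFile, max_height=3000, with_pbl=True, with_cbh=True)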
def adabl_blh_estimation(
    dataFile: str,
    modelFile: str,
    scalerFile: str,
    outputFile: str = None,
    storeInNetcdf: bool = False,
):
    """Perform BLH estimation with ADABL on all profiles of the day and
    write it into a copy of the netcdf file.

    Parameters
    ----------
    dataFile : str
        Path to the input file, as generated by raw2l1
    modelFile : str
        Path to the model file (pickle object)
    scalerFile : str
        Path to the scaler file (pickle object)
    outputFile : str, default=None
        Path to the output file. Default adds ".out" before ".nc"
    storeInNetcdf : bool, default=False
        If True, the field 'blh_adabl', containing the BLH estimation, is
        stored in the outputFile

    Returns
    -------
    blh : ndarray of shape (Nt,)
        Time series of BLH as estimated by the ADABL algorithm.
    """

    t0 = time.time()  #::::::::::::::::::::::

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(dataFile)
    t_values, z_values, dat = utils.extract_data(
        dataFile, to_extract=["rcs_1", "rcs_2", "pbl"]
    )
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]
    sec_intheday = np.mod(t_values, 24 * 3600)
    Nt, Nz = rcs_1.shape

    # Load pre-trained model and scaler
    # ------------------------
    with open(modelFile, "rb") as fc:
        model = pickle.load(fc)
    with open(scalerFile, "rb") as fc:
        scaler = pickle.load(fc)

    blh = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write(
        "ADABL estimation ("
        + loc
        + dateofday.strftime(", %Y/%m/%d")
        + "): [%s]" % ("." * toolbar_width)
    )
    sys.stdout.flush()
    sys.stdout.write("\b" * (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profiles of the day
    for t in range(Nt):

        # toolbar: '!' flags NaN among the last estimates, '*' means all valid
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        rcs1loc = rcs_1[t, :]
        rcs2loc = rcs_2[t, :]
        # clip non-positive values so that log10 is defined
        rcs1loc[rcs1loc <= 0] = 1e-5
        rcs2loc[rcs2loc <= 0] = 1e-5
        X_new = np.array(
            [
                np.repeat(sec_intheday[t], Nz),
                z_values,
                np.log10(rcs1loc),
                np.log10(rcs2loc),
            ]
        ).T
        X_new = scaler.transform(X_new)

        # 3. Apply the machine learning algorithm
        # ---------------------
        y_new = model.predict(X_new)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(y_new, z_values))

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        outputFile = dataFile[:-3] + ".out.nc"

    # 5. Store the new BLH estimation into a copy of the original netCDF
    # ---------------------
    if storeInNetcdf:
        utils.add_blh_to_netcdf(dataFile, outputFile, blh, origin="adabl")

    return np.array(blh)
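
# Usage sketch (file paths are hypothetical): same inputs as
# adabl_qualitymetrics, but returns the BLH time series and can store it in a
# copy of the netCDF under the field 'blh_adabl'.
def _example_adabl_blh_estimation():
    blh = adabl_blh_estimation(
        "lidar_day.nc",      # hypothetical raw2l1 output
        "adabl_model.pkl",   # hypothetical pickled classifier
        "adabl_scaler.pkl",  # hypothetical pickled scaler
        storeInNetcdf=True,  # writes "lidar_day.out.nc" next to the input
    )
    print("First BLH values (m agl):", blh[:5])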