def quicklook_dataset(datasetpath, altmax=4000):
    """Quick look at the original data.

    One figure is drawn per column of the `X_raw` variable stored in the
    file.

    Parameters
    ----------
    datasetpath: str
        Path to the data file. Must follow the convention adopted in
        the BLUSC program.
        Example: "DATASET_2015_0210.PASSY2015_BT-T_linear_dz40_dt30_zmax2000.nc"

    altmax: {int, float}, default=4000
        Top altitude of the graph (meter above ground level). The plot is
        cropped at this altitude when the data extend above it; when the
        data stop below `altmax` the automatic axis limit is kept.
        Pass None to never crop.

    Returns
    -------
    Display the default variable of the given file against time and altitude.
        In X-axis is the time
        In Y-axis is the height (m agl)
        Variable values are in shades of colors.
        Figures are saved to `figureDir` when the module-level flag
        `storeImages` is set, otherwise they are shown without blocking.
    """
    X_raw, t, z = utils.load_dataset(
        datasetpath, variables_to_load=["X_raw", "time", "altitude"])

    TZ = utils.grid_to_scatter(utils.dtlist2slist(t), z)

    # Only crop when the data actually reach above `altmax`, so that the
    # default rendering of shallower datasets is left untouched.
    crop_top = altmax is not None and altmax < np.max(z)

    for p in range(X_raw.shape[1]):
        _, _, V = utils.scatter_to_grid(TZ, X_raw[:, p])

        plt.figure()
        plt.pcolormesh(t, z, V.T, shading="auto")
        plt.colorbar()
        if crop_top:
            plt.ylim(top=altmax)
        plt.gcf().autofmt_xdate()
        plt.xlabel("Time (UTC)")
        plt.ylabel("Alt (m agl)")

        if storeImages:
            fileName = "QL_Xraw" + str(p)
            plt.savefig(figureDir + fileName + fmtImages)
            plt.close()
        else:
            plt.show(block=False)
def comparisonSupervisedAlgo(X_raw, classifiers, resolution=50):
    """Compare several supervised algorithms by displaying the border of the
    class attribution behind the actual data points.

    Parameters
    ----------
    X_raw: ndarray of shape (N,p)
        Matrix of data points (one column for each predictor, only the two
        first are used)

    classifiers: list of `sklearn` object with `predict` method
        Trained classifiers to test

    resolution: int, default=50
        Number of points in each coordinates for the evaluation grid.
        The higher, the more precise is the border.

    Returns
    -------
    Tile of plot similar to cluster2Dview with the classification border
    in color shades in background. One panel is drawn per classifier.
    The figure is saved to `figureDir` when the module-level flag
    `storeImages` is set, otherwise it is shown without blocking.
    """
    print("Prepare comparison graphics...")

    # Panel titles: the class name, i.e. the repr up to the first "("
    classifiers_keys = [str(clf).split("(")[0] for clf in classifiers]

    # Evaluation grid: data range enlarged by a fixed margin on both axes
    BTminbound = X_raw[:, 0].min() - 1
    BTmaxbound = X_raw[:, 0].max() + 1
    Tminbound = X_raw[:, 1].min() - 2
    Tmaxbound = X_raw[:, 1].max() + 2

    T_values = np.linspace(Tminbound, Tmaxbound, resolution)
    BT_values = np.linspace(BTminbound, BTmaxbound, resolution)
    X_pred = utils.grid_to_scatter(BT_values, T_values)

    # squeeze=False keeps `axs` two-dimensional, so indexing also works
    # when a single classifier is given.
    fig, axs = plt.subplots(1, len(classifiers), figsize=(18, 6),
                            squeeze=False)

    for icl, (clf, clf_name) in enumerate(zip(classifiers, classifiers_keys)):
        ax = axs[0, icl]
        print("Classifier", icl, "/", len(classifiers), clf_name)

        y_pred = clf.predict(X_pred)
        _, _, y = utils.scatter_to_grid(X_pred, y_pred)

        ax.set_title(clf_name)
        ax.pcolormesh(BT_values, T_values, y.T, vmin=-0.5,
                      cmap="nipy_spectral", shading="auto")
        ax.plot(X_raw[:, 0], X_raw[:, 1], "k.")
        ax.set_xlabel(DicLeg["BT"])
        ax.set_ylabel(DicLeg["T"])

    # Layout is computed once titles and labels exist, so that they are
    # taken into account.
    plt.tight_layout()

    if storeImages:
        fileName = "compSupervisedAlgo"
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
def clusterZTview_manyclusters(t_values, z_values, zoneIDs, delete_mask=None,
                               titl=None, fileName=None):
    """Plots cluster labels in the same time and altitude grid where
    measurements have been done (boundary layer classification).
    Repeat it for several numbers of clusters (one panel per entry of
    `zoneIDs`).

    Parameters
    ----------
    t_values: array-like of shape (nt,)
        Vector of time within the day

    z_values: array-like of shape (nalt,)
        Vector of altitude (m agl). Converted to kilometers for display.

    zoneIDs: list of array-like of shape (N,)
        Cluster labels for each point and for each number of clusters

    delete_mask: array-like of shape (nt*nalt,)
        Mask at True when observation has been removed by the
        `utils.deletelines` function (to avoid NaNs). When given, label
        vectors shorter than the mask are expanded back to the full grid,
        with NaN at removed observations. Label vectors that already have
        the size of the mask are used as they are.

    titl: str, optional
        Customised title for the figure

    fileName: str, optional
        Customised file name for saving the figure

    Returns
    -------
    Tile of clusters labels on a time-altitude grid
        In X-axis is the time
        In Y-axis is the height (km agl)
        Clusters are shown with differents colors.
        The figure is saved to `figureDir` when the module-level flag
        `storeImages` is set, otherwise it is shown without blocking.

    Raises
    ------
    Exception
        If the grid rebuilt by `utils.scatter_to_grid` does not match the
        input time and altitude vectors.
    """
    if titl is None:
        titl = ""

    z_values = z_values / 1000  # convert meters to kilometers

    # 1. Conversion datetime -> seconds
    st_values = utils.dtlist2slist(t_values)

    # 2. Format from grid(z,t) to scatter
    TZ = utils.grid_to_scatter(st_values, z_values)

    # Panel layout: as close to a square as possible
    n_kvalues = len(zoneIDs)
    nl = int(np.sqrt(n_kvalues))
    nc = int(np.ceil(n_kvalues / nl))

    fig, axes = plt.subplots(nrows=nl, ncols=nc, figsize=(12, 8),
                             sharex=True, sharey=True)
    plt.suptitle(titl)

    for ink, zoneID in enumerate(zoneIDs):
        K = np.max(zoneID) + 1
        clustersIDs = np.arange(K)

        # One colour and one colorbar tick per label actually present
        clist = []
        cticks = []
        cticklabels = []
        for k in np.unique(zoneID):
            cticks.append(k + 0.5)
            cticklabels.append(clustersIDs[k])
            # The last character of the mark is dropped, the rest is
            # used as the colour.
            clist.append(clusterMarks[clustersIDs[k]][:-1])
        colormap = ListedColormap(clist)

        # 3. Deleted labels completion (when missing data), as done in
        # `clusterZTview`. Skipped when the labels already cover the
        # whole grid.
        if (delete_mask is not None
                and np.size(zoneID) != np.size(delete_mask)):
            fullzoneID = np.full(np.size(delete_mask), np.nan)
            fullzoneID[~delete_mask] = zoneID
        else:
            fullzoneID = zoneID

        # 4. Set labels at grid(z,t) format
        t_trash, z_trash, labels = utils.scatter_to_grid(TZ, fullzoneID)
        if (np.max(np.abs(z_values - z_trash))
                + np.max(np.abs(st_values - t_trash)) > 1e-13):
            raise Exception(
                "Error in z,t retrieval : max(|z_values-z_trash|)=",
                np.max(np.abs(z_values - z_trash)),
                "max(|t_values-t_trash|)=",
                np.max(np.abs(st_values - t_trash)),
            )
        labels = np.ma.array(labels, mask=np.isnan(labels))

        # 5. Graphic
        plt.subplot(nl, nc, ink + 1)
        plt.pcolormesh(t_values, z_values, labels.T, vmin=0, vmax=K,
                       cmap=colormap, shading="auto")
        # Panel letter: 'a)', 'b)', ... generated so that any number of
        # panels is supported.
        panel_letter = chr(ord("a") + ink) + ")"
        plt.text(t_values[-7], z_values[-4], panel_letter,
                 fontweight='bold', fontsize=16)
        plt.gcf().autofmt_xdate()

        # Colorbar
        cbar = plt.colorbar()
        cbar.set_ticks(cticks)
        cbar.set_ticklabels(cticklabels)

        # Colorbar label on the last column only, altitude label on the
        # first column only, time label on the last row only.
        if np.mod(ink, nc) == nc - 1:
            cbar.set_label("Cluster labels")
        if np.mod(ink, nc) == 0:
            plt.ylabel("Alt (km agl)")
        if ink >= (nl - 1) * nc:
            plt.xlabel("Time (UTC)")

    fig.subplots_adjust(wspace=0, hspace=0.1)
    plt.tight_layout()

    if storeImages:
        if fileName is None:
            fileName = "multi_clusterZTview"
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
def clusterZTview(
    t_values,
    z_values,
    zoneID,
    delete_mask=None,
    fileName=None,
    clustersIDs=None,
    displayClustersIDs=False,
    titl=None,
):
    """Plots cluster labels in the same time and altitude grid where
    measurements have been done (boundary layer classification).

    Parameters
    ----------
    t_values: array-like of shape (nt,)
        Vector of time within the day

    z_values: array-like of shape (nalt,)
        Vector of altitude

    zoneID: array-like of shape (N,)
        Cluster labels of each point

    delete_mask: array-like of shape (nt*nalt,)
        Mask at True when observation has been removed by the
        `utils.deletelines` function (to avoid NaNs)

    fileName: str, optional
        Customised file name for saving the figure

    clustersIDs: dict, optional
        Connection between cluster numbers and boundary layer types
        Example: {0:"CL",1:"SBL",2:"FA",3:"ML"}. Default is {0:0,1:1,...}.
        When given, the module-level `clusterMarks` table is updated so
        that each new identifier gets the mark of the cluster number it
        replaces.

    displayClustersIDs: bool
        If True, displays the clusterIDs over the graph, at the center
        of the cluster.

    titl: str, optional
        Customised title for the figure. Drawn only when explicitly given.

    Returns
    -------
    Clusters labels on a time-altitude grid
        In X-axis is the time
        In Y-axis is the height (m agl)
        Clusters are shown with differents colors.
        The figure is saved to `figureDir` when the module-level flag
        `storeImages` is set, otherwise it is shown without blocking.

    Raises
    ------
    Exception
        If the grid rebuilt by `utils.scatter_to_grid` does not match the
        input time and altitude vectors.
    """
    if clustersIDs is None:
        K = np.max(zoneID) + 1
        clustersIDs = np.arange(K)
    else:
        K = len(clustersIDs)
        # Read the marks from a snapshot: when identifiers are themselves
        # cluster numbers (e.g. the permutation {0: 1, 1: 0}), updating
        # the table while reading it would give several clusters the
        # same mark.
        marks_before = dict(clusterMarks)
        for key, val in clustersIDs.items():
            clusterMarks[val] = marks_before[key]

    clist = []
    cticks = []
    cticklabels = []
    for k in range(K):
        cticks.append(k + 0.5)
        cticklabels.append(clustersIDs[k])
        # The last character of the mark is dropped, the rest is used
        # as the colour.
        clist.append(clusterMarks[clustersIDs[k]][:-1])
    colormap = ListedColormap(clist)

    # 1. Deleted labels completion (when missing data)
    if delete_mask is not None:
        fullzoneID = np.full(np.size(delete_mask), np.nan)
        fullzoneID[~delete_mask] = zoneID
    else:
        fullzoneID = zoneID

    # 2. Conversion datetime -> seconds
    t0 = t_values[0]
    st_values = utils.dtlist2slist(t_values)

    # 3. Format from grid(z,t) to scatter
    TZ = utils.grid_to_scatter(st_values, z_values)

    # 4. Set labels at grid(z,t) format
    t_trash, z_trash, labels = utils.scatter_to_grid(TZ, fullzoneID)
    if (np.max(np.abs(z_values - z_trash))
            + np.max(np.abs(st_values - t_trash)) > 1e-13):
        raise Exception(
            "Error in z,t retrieval : max(|z_values-z_trash|)=",
            np.max(np.abs(z_values - z_trash)),
            "max(|t_values-t_trash|)=",
            np.max(np.abs(st_values - t_trash)),
        )
    labels = np.ma.array(labels, mask=np.isnan(labels))

    # 5. Graphic
    plt.figure()
    if titl is not None:
        plt.title(titl)
    plt.pcolormesh(t_values, z_values, labels.T, vmin=0, vmax=K,
                   cmap=colormap, shading="auto")

    if displayClustersIDs:
        # Positions are looked up in the full-grid labels because `TZ`
        # has one row per grid point, whereas `zoneID` is shorter when
        # observations have been removed.
        full_labels = np.asarray(fullzoneID)
        for k in np.unique(zoneID):
            idxk = np.where(full_labels == k)[0]
            x0text = t0 + dt.timedelta(seconds=np.mean(TZ[idxk, 0], axis=0))
            x1text = np.mean(TZ[idxk, 1], axis=0)
            plt.text(x0text, x1text, clustersIDs[k],
                     fontweight="bold", fontsize=18)

    cbar = plt.colorbar(label="Cluster label")
    cbar.set_ticks(cticks)
    cbar.set_ticklabels(cticklabels)
    plt.gcf().autofmt_xdate()
    plt.xlabel("Time (UTC)")
    plt.ylabel("Alt (m agl)")

    if storeImages:
        if fileName is None:
            fileName = "clusterZTview_K" + str(K)
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
def estimateInterpolationError(z_target, t_target, z_known, t_known, V_known,
                               n_randoms=10, plot_on=True):
    """Estimate the error and the computing time for several interpolation
    methods.

    Errors are estimated by cross-validation. The function repeats the
    interpolation with all methods for several train/test splits
    (80% / 20%, the split index being used as random seed).

    The list of tested methods as well as their parameters must be changed
    inside the function.
    Default list: '4NearestNeighbors','8NearestNeighbors',
    'Linear2DInterp','Cubic2DInterp'

    Parameters
    ----------
    z_target: ndarray of shape (n1_z,)
        Altitude vector of the target grid (m agl). Not used by the
        cross-validation, kept for interface compatibility.

    t_target: array-like of shape (n1_t,) with dtype=datetime
        Time vector of the target grid. Not used by the cross-validation,
        kept for interface compatibility.

    z_known: ndarray of shape (n0_z,)
        Altitude vector of the known grid (m agl)

    t_known: array-like of shape (n0_t,) with dtype=datetime
        Time vector of the known grid

    V_known: ndarray of shape (n0_t,n0_z)
        Data values on the known grid

    n_randoms: int, default=10
        Number of repeated random split between training and testing sets

    plot_on: bool, default=True
        If True, the graphics showing computing time versus accuracy
        is drawn

    Returns
    -------
    accuracies: ndarray of shape (n_regressors,n_randoms)
        R2 score of each regressor (one per line) for each random split
        (one per column).

    chronos: ndarray of shape (n_regressors,n_randoms)
        Computing time of each regressor (one per line) for each random
        split (one per column).

    reg_names: list of shape (n_regressors,)
        Names of regressions methods performed
    """
    from sklearn.neighbors import KNeighborsRegressor
    from scipy.interpolate import griddata
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    # Switch from format "data=f(coordinates)" to format "obs=f(predictors)"
    st_known = utils.dtlist2slist(t_known)
    X_known, Y_known = utils.grid_to_scatter(st_known, z_known, V_known)

    # NaN are removed
    valid = ~np.isnan(Y_known)
    X_known = X_known[valid, :]
    Y_known = Y_known[valid]

    #### ========= Estimators with a fit/score interface
    regressors = [
        KNeighborsRegressor(n_neighbors=4),
        KNeighborsRegressor(n_neighbors=8),
    ]
    reg_names = ["4NearestNeighbors", "8NearestNeighbors"]

    #### ========= Estimators based on scipy 2D interpolation
    # (reported name, `method` argument of griddata)
    griddata_methods = [
        ("Linear2DInterp", "linear"),
        ("Cubic2DInterp", "cubic"),
    ]

    n_methods = len(regressors) + len(griddata_methods)
    chronos = np.zeros((n_methods, n_randoms))
    accuracies = np.zeros((n_methods, n_randoms))

    for icl, reg in enumerate(regressors):
        print("Testing ", str(reg).split("(")[0])
        for ird in range(n_randoms):
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, Y_known, test_size=0.2, random_state=ird)
            t0 = time.time()  #::::::
            reg.fit(X_train, y_train)
            accuracies[icl, ird] = reg.score(X_test, y_test)
            t1 = time.time()  #::::::
            chronos[icl, ird] = t1 - t0

    for offset, (name, interp_method) in enumerate(griddata_methods):
        irow = len(regressors) + offset
        reg_names.append(name)
        print("Testing", name)
        for ird in range(n_randoms):
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, Y_known, test_size=0.2, random_state=ird)
            # The timer is restarted for every split so that only this
            # interpolation and its scoring are measured.
            t0 = time.time()  #::::::
            y_pred = griddata(X_train, y_train, X_test, method=interp_method)

            # Some data can still be missing even after the interpolation
            #   * Radiometer : resolution coarsens with altitude => last gates missing
            #   * Ceilometer : high lowest range => first gates missing
            predicted = ~np.isnan(y_pred)
            accuracies[irow, ird] = r2_score(y_test[predicted],
                                             y_pred[predicted])
            t1 = time.time()  #::::::
            chronos[irow, ird] = t1 - t0

    if plot_on:
        graphics.estimator_quality(accuracies, chronos, reg_names)

    return accuracies, chronos, reg_names
def estimateongrid(z_target, t_target, z_known, t_known, V_known,
                   method="linear"):
    """Interpolate the data on a target grid knowing it on another grid.
    Grids are time-altitude.

    Supported interpolation methods: 'linear','cubic','nearestneighbors'
    For nearest neighbors, the number of neighbors must be passed as a
    numeric prefix (one or several digits).
    For example: method='4nearestneighbors' or method='12nearestneighbors'
    Any other value is forwarded, lower-cased, to
    `scipy.interpolate.griddata` as its `method` argument.
    For more insights about how to choose the good methods (error,
    computing time...) please refer to the notebook
    `tuto-0to1-prepdataset.ipynb`

    Parameters
    ----------
    z_target: ndarray of shape (n1_z,)
        Altitude vector of the target grid (m agl)

    t_target: array-like of shape (n1_t,) with dtype=datetime
        Time vector of the target grid

    z_known: ndarray of shape (n0_z,)
        Altitude vector of the known grid (m agl)

    t_known: array-like of shape (n0_t,) with dtype=datetime
        Time vector of the known grid

    V_known: ndarray of shape (n0_t,n0_z)
        Data values on the known grid

    method: {'linear','cubic','<k>nearestneighbors'}, default='linear'
        Interpolation method (case-insensitive).

    Returns
    -------
    V_target: ndarray of shape (n1_t,n1_z)
        Values on the target grid

    Raises
    ------
    Exception
        If the output does not have the shape of the target grid, or if
        the time or altitude vectors rebuilt from the output differ from
        the target ones.
    """
    # Switch from format "data=f(coordinates)" to format "obs=f(predictors)"
    st_known = utils.dtlist2slist(t_known)
    st_target = utils.dtlist2slist(t_target)
    X_known, Y_known = utils.grid_to_scatter(st_known, z_known, V_known)
    X_target = utils.grid_to_scatter(st_target, z_target)

    # NaN are removed
    valid = ~np.isnan(Y_known)
    X_known = X_known[valid, :]
    Y_known = Y_known[valid]

    # Everything in front of the "nearestneighbors" suffix is the number
    # of neighbors, so that counts above 9 are supported too.
    method_key = method.lower()
    knn_suffix = "nearestneighbors"
    if method_key.endswith(knn_suffix):
        n_neighbors_txt = method_key[:-len(knn_suffix)]
    else:
        n_neighbors_txt = ""

    if n_neighbors_txt.isdecimal():
        #### ========= Estimation with K-nearest neighbors
        from sklearn.neighbors import KNeighborsRegressor

        KNN = KNeighborsRegressor(n_neighbors=int(n_neighbors_txt))
        KNN.fit(X_known, Y_known)
        Y_target = KNN.predict(X_target)
    else:
        #### ========= Estimation with 2D interpolation
        from scipy.interpolate import griddata

        Y_target = griddata(X_known, Y_known, X_target, method=method_key)

    # Shape the output
    t1, z1, V_target = utils.scatter_to_grid(X_target, Y_target)

    # Sanity checks
    if np.shape(V_target) != (np.size(st_target), np.size(z_target)):
        raise Exception(
            "Output has not expected shape : shape(st_target)",
            np.shape(st_target),
            "shape(z_target)",
            np.shape(z_target),
            "shape(V_target)",
            np.shape(V_target),
        )
    if (np.abs(t1 - st_target) > 10**(-10)).any():
        raise Exception(
            "Time vector has been altered : max(|t1-t_target|)=",
            np.max(np.abs(t1 - st_target)),
        )
    if (np.abs(z1 - z_target) > 10**(-10)).any():
        raise Exception(
            "Altitude vector has been altered : max(|z1-z_target|)=",
            np.max(np.abs(z1 - z_target)),
        )

    return V_target