def quicklook_dataset(datasetpath, altmax=4000):
    """Quick look at the original data.

    Loads the raw predictors stored in a dataset file and draws one
    time-altitude colour plot per predictor (one per column of `X_raw`).


    Parameters
    ----------
    datasetpath: str
        Path to the data file. Must follow the convention adopted in
        the BLUSC program.
        Example: "DATASET_2015_0210.PASSY2015_BT-T_linear_dz40_dt30_zmax2000.nc"

    altmax: {int, float, None}, default=4000
        Top altitude of the graph (meter above ground level). The Y-axis
        is cut at this altitude when the altitude vector goes above it.
        With `None`, or when the data stay below `altmax`, the axis is
        left untouched.


    Returns
    -------
    None. Displays each variable of the given file against time and
    altitude.
        In X-axis is the time
        In Y-axis is the height (m agl)
        Variable values are in shades of colors.
    When the module-level flag `storeImages` is set, each figure is saved
    as `figureDir + "QL_Xraw<p>" + fmtImages` and closed; otherwise it is
    shown without blocking.
    """
    X_raw, t, z = utils.load_dataset(
        datasetpath, variables_to_load=["X_raw", "time", "altitude"])
    TZ = utils.grid_to_scatter(utils.dtlist2slist(t), z)

    # The axis is only clipped when the data really extend above `altmax`,
    # so that a low-topped dataset is not drawn with an empty upper band.
    clip_top = altmax is not None and max(z) > altmax

    for p in range(X_raw.shape[1]):
        # Only the gridded values are needed; the returned axes are ignored.
        _, _, V = utils.scatter_to_grid(TZ, X_raw[:, p])

        plt.figure()
        # plt.title("Variable "+str(p)+" of dataset")
        plt.pcolormesh(t, z, V.T, shading="auto")
        plt.colorbar()
        if clip_top:
            plt.ylim(top=altmax)
        plt.gcf().autofmt_xdate()
        plt.xlabel("Time (UTC)")
        plt.ylabel("Alt (m agl)")
        if storeImages:
            fileName = "QL_Xraw" + str(p)
            plt.savefig(figureDir + fileName + fmtImages)
            plt.close()
        else:
            plt.show(block=False)
def comparisonSupervisedAlgo(X_raw, classifiers, resolution=50):
    """Compare several supervised algorithms by displaying the border of
    the class attribution behind the actual data points.

    Each classifier is evaluated on a regular grid covering the range of
    the two first predictors (with a margin of 1 on the first and 2 on the
    second), and the predicted classes are drawn as a coloured background
    under the data points.


    Parameters
    ----------
    X_raw: ndarray of shape (N,p)
        Matrix of data points (one column for each predictor, only the
        two first are used)

    classifiers: list of `sklearn` object with `predict` method
        Trained classifiers to test

    resolution: int, default=50
        Number of points in each coordinates for the evaluation grid.
        The higher, the more precise is the border.


    Returns
    -------
    None. Tile of plot similar to cluster2Dview with the classification
        border in color shades in background (one panel per classifier).
    When the module-level flag `storeImages` is set, the figure is saved
    as `figureDir + "compSupervisedAlgo" + fmtImages` and closed;
    otherwise it is shown without blocking.
    """

    print("Prepare comparison graphics...")
    classifiers_keys = [str(clf).split("(")[0] for clf in classifiers]

    BTminbound = X_raw[:, 0].min() - 1
    BTmaxbound = X_raw[:, 0].max() + 1
    Tminbound = X_raw[:, 1].min() - 2
    Tmaxbound = X_raw[:, 1].max() + 2

    T_values = np.linspace(Tminbound, Tmaxbound, resolution)
    BT_values = np.linspace(BTminbound, BTmaxbound, resolution)

    X_pred = utils.grid_to_scatter(BT_values, T_values)

    # squeeze=False keeps `axs` two-dimensional whatever the number of
    # classifiers, so a single classifier can be indexed like several.
    fig, axs = plt.subplots(1,
                            len(classifiers),
                            figsize=(18, 6),
                            squeeze=False)
    for icl, (clf, key) in enumerate(zip(classifiers, classifiers_keys)):
        ax = axs[0, icl]
        print("Classifier", icl, "/", len(classifiers), key)
        y_pred = clf.predict(X_pred)
        # Only the gridded predictions are needed; returned axes are ignored.
        _, _, y = utils.scatter_to_grid(X_pred, y_pred)

        ax.set_title(key)
        ax.pcolormesh(BT_values,
                      T_values,
                      y.T,
                      vmin=-0.5,
                      cmap="nipy_spectral",
                      shading="auto")

        ax.plot(X_raw[:, 0], X_raw[:, 1], "k.")
        ax.set_xlabel(DicLeg["BT"])
        ax.set_ylabel(DicLeg["T"])

    # Layout is adjusted once titles and axis labels exist, so that they
    # are taken into account.
    plt.tight_layout()

    if storeImages:
        fileName = "compSupervisedAlgo"
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
def clusterZTview_manyclusters(t_values,
                               z_values,
                               zoneIDs,
                               delete_mask=None,
                               titl=None,
                               fileName=None):
    """Plots cluster labels in the same time and altitude grid where
    measurements have been done (boundary layer classification).
    Repeats it for several numbers of clusters, one panel per entry of
    `zoneIDs`.


    Parameters
    ----------
    t_values: array-like of shape (nt,)
        Vector of time within the day

    z_values: ndarray of shape (nalt,)
        Vector of altitude (m agl). It is converted to kilometers for
        the display.

    zoneIDs: list of array-like of shape (N,)
        Cluster labels for each point and for each number of clusters

    delete_mask: array-like of shape (nt*nalt,), optional
        Mask at True when observation has been removed by the
        `utils.deletelines` function (to avoid NaNs). When given, every
        label vector shorter than the mask is put back on the full grid,
        the removed observations being left blank. Label vectors that
        already have the size of the mask are used as they are.

    titl: str, optional
        Customised title for the figure

    fileName: str, optional
        Customised file name for saving the figure


    Returns
    -------
    None. Tile of clusters labels on a time-altitude grid, with
    int(sqrt(n)) rows for n label vectors (2 rows x 3 columns for 6).
        In X-axis is the time
        In Y-axis is the height (km agl)
        Clusters are shown with differents colors.
    When the module-level flag `storeImages` is set, the figure is saved
    as `figureDir + fileName + fmtImages` (default file name
    "multi_clusterZTview") and closed; otherwise it is shown without
    blocking.


    Raises
    ------
    Exception
        If the time/altitude axes returned by `utils.scatter_to_grid`
        differ from the input ones.
    """

    if titl is None:
        titl = ""

    z_values = z_values / 1000  # convert meters to kilometers

    # 1. Conversion datetime -> seconds
    st_values = utils.dtlist2slist(t_values)

    # 2. Format from grid(z,t) to scatter
    TZ = utils.grid_to_scatter(st_values, z_values)

    n_kvalues = len(zoneIDs)
    nl = int(np.sqrt(n_kvalues))
    nc = int(np.ceil(n_kvalues / nl))

    # Anchor of the panel letter: 7th time step and 4th altitude from the
    # end, falling back on the first point for smaller grids.
    t_text = t_values[max(len(t_values) - 7, 0)]
    z_text = z_values[max(len(z_values) - 4, 0)]

    if delete_mask is not None:
        kept = ~np.asarray(delete_mask, dtype=bool)

    # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    fig, _ = plt.subplots(nrows=nl,
                          ncols=nc,
                          figsize=(12, 8),
                          sharex=True,
                          sharey=True)
    plt.suptitle(titl)
    for ink, zoneID in enumerate(zoneIDs):
        K = np.max(zoneID) + 1
        clustersIDs = np.arange(K)

        # One colour and one colorbar tick per label actually present
        present = np.unique(zoneID)
        cticks = [k + 0.5 for k in present]
        cticklabels = [clustersIDs[k] for k in present]
        # The last character of the mark is dropped to build the colour
        clist = [clusterMarks[clustersIDs[k]][:-1] for k in present]
        colormap = ListedColormap(clist)

        # 3. Deleted labels completion (when missing data)
        if delete_mask is not None and np.size(zoneID) != kept.size:
            fullzoneID = np.full(kept.size, np.nan)
            fullzoneID[kept] = zoneID
        else:
            fullzoneID = zoneID

        # 4. Set labels at grid(z,t) format
        t_trash, z_trash, labels = utils.scatter_to_grid(TZ, fullzoneID)
        if (np.max(np.abs(z_values - z_trash)) +
                np.max(np.abs(st_values - t_trash)) > 1e-13):
            raise Exception(
                "Error in z,t retrieval : max(|z_values-z_trash|)=",
                np.max(np.abs(z_values - z_trash)),
                "max(|t_values-t_trash|)=",
                np.max(np.abs(st_values - t_trash)),
            )

        # NaN labels are masked
        labels = np.ma.array(labels, mask=np.isnan(labels))

        # 5. Graphic
        plt.subplot(nl, nc, ink + 1)
        plt.pcolormesh(t_values,
                       z_values,
                       labels.T,
                       vmin=0,
                       vmax=K,
                       cmap=colormap,
                       shading="auto")
        # Panel letter: 'a)', 'b)', ... without limit on the panel count
        plt.text(t_text,
                 z_text,
                 chr(ord("a") + ink) + ")",
                 fontweight='bold',
                 fontsize=16)
        plt.gcf().autofmt_xdate()

        # Colorbar
        cbar = plt.colorbar()
        cbar.set_ticks(cticks)
        cbar.set_ticklabels(cticklabels)

        if ink % nc == nc - 1:  # last column
            cbar.set_label("Cluster labels")
        if ink % nc == 0:  # first column
            plt.ylabel("Alt (km agl)")
        if ink >= (nl - 1) * nc:  # last row
            plt.xlabel("Time (UTC)")

    fig.subplots_adjust(wspace=0, hspace=0.1)
    plt.tight_layout()
    if storeImages:
        if fileName is None:
            fileName = "multi_clusterZTview"
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
def clusterZTview(
    t_values,
    z_values,
    zoneID,
    delete_mask=None,
    fileName=None,
    clustersIDs=None,
    displayClustersIDs=False,
    titl=None,
):
    """Plots cluster labels in the same time and altitude grid where
    measurements have been done (boundary layer classification).


    Parameters
    ----------
    t_values: array-like of shape (nt,)
        Vector of time within the day

    z_values: array-like of shape (nalt,)
        Vector of altitude

    zoneID: array-like of shape (N,)
        Cluster labels of each point

    delete_mask: array-like of shape (nt*nalt,), optional
        Mask at True when observation has been removed by the
        `utils.deletelines` function (to avoid NaNs). When given, the
        labels are put back on the full grid, the removed observations
        being left blank.

    fileName: str, optional
        Customised file name for saving the figure

    clustersIDs:  dict, optional
        Connection between cluster numbers and boundary layer types
        Example: {0:"CL",1:"SBL",2:"FA",3:"ML"}. Default is {0:0,1:1,...}.
        Each name is drawn with the colour of the cluster number it is
        attached to.

    displayClustersIDs: bool
        If True, displays the clusterIDs over the graph, at the center
        of the cluster.

    titl: str, optional
        Customised title for the figure.
        NOTE: the title is built but currently not drawn (the `plt.title`
        call is disabled in the code).


    Returns
    -------
    None. Clusters labels on a time-altitude grid
        In X-axis is the time
        In Y-axis is the height (m agl)
        Clusters are shown with differents colors.
    When the module-level flag `storeImages` is set, the figure is saved
    as `figureDir + fileName + fmtImages` (default file name
    "clusterZTview_K<K>") and closed; otherwise it is shown without
    blocking.


    Raises
    ------
    Exception
        If the time/altitude axes returned by `utils.scatter_to_grid`
        differ from the input ones.
    """

    if clustersIDs is None:
        K = np.max(zoneID) + 1
        clustersIDs = np.arange(K)
        marks = clusterMarks
    else:
        K = len(clustersIDs)
        # Local name -> mark table. The module-level `clusterMarks` is
        # only read, so that a relabeling (e.g. {0: 1, 1: 0}) can neither
        # overwrite an entry before it is read nor leak into later calls.
        marks = {val: clusterMarks[key] for key, val in clustersIDs.items()}

    if titl is None:
        titl = "Cluster in time-altitude grid | " + str(K) + " clusters"

    clist = []
    cticks = []
    cticklabels = []
    for k in range(K):
        cticks.append(k + 0.5)
        cticklabels.append(clustersIDs[k])
        # The last character of the mark is dropped to build the colour
        clist.append(marks[clustersIDs[k]][:-1])
    colormap = ListedColormap(clist)

    # 1. Deleted labels completion (when missing data)
    if delete_mask is not None:
        kept = ~np.asarray(delete_mask, dtype=bool)
        fullzoneID = np.full(kept.size, np.nan)
        fullzoneID[kept] = zoneID
    else:
        fullzoneID = zoneID

    # 2. Conversion datetime -> seconds
    t0 = t_values[0]
    st_values = utils.dtlist2slist(t_values)

    # 3. Format from grid(z,t) to scatter
    TZ = utils.grid_to_scatter(st_values, z_values)

    # 4. Set labels at grid(z,t) format
    t_trash, z_trash, labels = utils.scatter_to_grid(TZ, fullzoneID)
    if np.max(np.abs(z_values - z_trash)) + np.max(
            np.abs(st_values - t_trash)) > 1e-13:
        raise Exception(
            "Error in z,t retrieval : max(|z_values-z_trash|)=",
            np.max(np.abs(z_values - z_trash)),
            "max(|t_values-t_trash|)=",
            np.max(np.abs(st_values - t_trash)),
        )

    # NaN labels (removed observations) are masked
    labels = np.ma.array(labels, mask=np.isnan(labels))

    # 5. Graphic
    # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    fig = plt.figure()
    # plt.title(titl)
    plt.pcolormesh(t_values,
                   z_values,
                   labels.T,
                   vmin=0,
                   vmax=K,
                   cmap=colormap,
                   shading="auto")
    if displayClustersIDs:
        for k in np.unique(zoneID):
            # Rows are searched in the full-grid labels because `TZ` is
            # the full grid: indices taken from the reduced `zoneID` would
            # not point at the right rows when a `delete_mask` is given.
            idxk = np.where(fullzoneID == k)[0]
            x0text = t0 + dt.timedelta(seconds=np.mean(TZ[idxk, 0], axis=0))
            x1text = np.mean(TZ[idxk, 1], axis=0)
            plt.text(x0text,
                     x1text,
                     clustersIDs[k],
                     fontweight="bold",
                     fontsize=18)
    cbar = plt.colorbar(label="Cluster label")
    cbar.set_ticks(cticks)
    cbar.set_ticklabels(cticklabels)
    plt.gcf().autofmt_xdate()
    plt.xlabel("Time (UTC)")
    plt.ylabel("Alt (m agl)")
    if storeImages:
        if fileName is None:
            fileName = "clusterZTview_K" + str(K)
        plt.savefig(figureDir + fileName + fmtImages)
        plt.close()
        print("Figure saved:", figureDir + fileName + fmtImages)
    else:
        plt.show(block=False)
# Example #5
# 0
def estimateInterpolationError(z_target,
                               t_target,
                               z_known,
                               t_known,
                               V_known,
                               n_randoms=10,
                               plot_on=True):
    """Estimate the error and the computing time for several interpolation
    method.

    Errors are estimated by cross-validation. The function repeats the
    interpolation with all methods for severals train/test splits
    (80% / 20% of the known points, one split per random seed).
    The list of tested methods as well as their parameters must be
    changed inside the function.

    Default list: '4NearestNeighbors','8NearestNeighbors','linear','cubic'


    Parameters
    ----------
    z_target: ndarray of shape (n1_z,)
        Altitude vector of the target grid (m agl). Currently not used:
        the cross-validation only involves the known points.

    t_target: array-like of shape (n1_t,) with dtype=datetime
        Time vector of the target grid. Currently not used: the
        cross-validation only involves the known points.

    z_known: ndarray of shape (n0_z,)
        Altitude vector of the known grid (m agl)

    t_known: array-like of shape (n0_t,) with dtype=datetime
        Time vector of the known grid

    V_known: ndarray of shape (n0_t,n0_z)
        Data values on the known grid

    n_randoms: int, default=10
        Number of repeated random split between training and testing sets

    plot_on: bool, default=True
        If True, the graphics showing computing time versus accuracy is drawn


    Returns
    -------
    accuracies: ndarray of shape (n_regressors,n_randoms)
        R2 score of each regressor (one per line) for each random split (one per
        column).

    chronos: ndarray of shape (n_regressors,n_randoms)
        Computing time of each regressor (one per line) for each random split
        (one per column), in seconds. It covers fitting/interpolating and
        scoring.

    reg_names: list of shape (n_regressors,)
        Names of regressions methods performed
    """

    from sklearn.neighbors import KNeighborsRegressor
    from scipy.interpolate import griddata
    from sklearn.metrics import r2_score
    from sklearn.model_selection import train_test_split

    # Switch from format "data=f(coordinates)" to format "obs=f(predictors)"
    st_known = utils.dtlist2slist(t_known)
    X_known, Y_known = utils.grid_to_scatter(st_known, z_known, V_known)

    # NaN are removed
    valid = ~np.isnan(Y_known)
    X_known = X_known[valid, :]
    Y_known = Y_known[valid]

    #### ========= Estimators with fit/score interface (nearest neighbors)
    regressors = [
        KNeighborsRegressor(n_neighbors=4),
        KNeighborsRegressor(n_neighbors=8),
    ]
    reg_names = ["4NearestNeighbors", "8NearestNeighbors"]

    #### ========= 2D interpolations: (name, `griddata` method)
    griddata_methods = [("Linear2DInterp", "linear"),
                        ("Cubic2DInterp", "cubic")]

    n_methods = len(regressors) + len(griddata_methods)
    chronos = np.zeros((n_methods, n_randoms))
    accuracies = np.zeros((n_methods, n_randoms))

    for icl, reg in enumerate(regressors):
        print("Testing ", str(reg).split("(")[0])
        for ird in range(n_randoms):
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, Y_known, test_size=0.2, random_state=ird)
            t0 = time.time()  #::::::
            reg.fit(X_train, y_train)
            accuracies[icl, ird] = reg.score(X_test, y_test)
            t1 = time.time()  #::::::
            chronos[icl, ird] = t1 - t0

    for irow, (name, method) in enumerate(griddata_methods,
                                          start=len(regressors)):
        reg_names.append(name)
        print("Testing", name)
        for ird in range(n_randoms):
            X_train, X_test, y_train, y_test = train_test_split(
                X_known, Y_known, test_size=0.2, random_state=ird)
            # The chronometer is restarted for every split, so the measure
            # only covers this interpolation and its scoring.
            t0 = time.time()  #::::::
            y_pred = griddata(X_train, y_train, X_test, method=method)
            # Some data can still be missing even after the interpolation
            #   * Radiometer : resolution coarsens with altitude => last gates missing
            #   * Ceilometer : high lowest range => first gates missing
            found = ~np.isnan(y_pred)
            accuracies[irow, ird] = r2_score(y_test[found], y_pred[found])
            t1 = time.time()  #::::::
            chronos[irow, ird] = t1 - t0

    if plot_on:
        graphics.estimator_quality(accuracies, chronos, reg_names)

    return accuracies, chronos, reg_names
# Example #6
# 0
def estimateongrid(z_target,
                   t_target,
                   z_known,
                   t_known,
                   V_known,
                   method="linear"):
    """Interpolate the data on a target grid knowing it on another grid.
    Grids are time-altitude.

    Supported interpolation methods: 'linear','cubic','nearestneighbors'

    For nearest neighbors, the number of neighbors must be passed in
    front of the name. For example: method='4nearestneighbors' or
    method='10nearestneighbors'
    For more insights about how to choose the good methods (error, computing time...)
    please refer to the notebook `tuto-0to1-prepdataset.ipynb`


    Parameters
    ----------
    z_target: ndarray of shape (n1_z,)
        Altitude vector of the target grid (m agl)

    t_target: array-like of shape (n1_t,) with dtype=datetime
        Time vector of the target grid

    z_known: ndarray of shape (n0_z,)
        Altitude vector of the known grid (m agl)

    t_known: array-like of shape (n0_t,) with dtype=datetime
        Time vector of the known grid

    V_known: ndarray of shape (n0_t,n0_z)
        Data values on the known grid

    method: {'linear','cubic','<k>nearestneighbors'}, default='linear'
        Interpolation method (case-insensitive). Any name that does not
        end with 'nearestneighbors' is passed, in lower case, to
        `scipy.interpolate.griddata`.


    Returns
    -------
    V_target: ndarray of shape (n1_t,n1_z)
        Values on the target grid


    Raises
    ------
    ValueError
        If a nearest-neighbors method is requested without a valid
        number of neighbors in front of it.
    Exception
        If the output grid has not the shape or the axes of the target
        grid.
    """

    # Switch from format "data=f(coordinates)" to format "obs=f(predictors)"
    st_known = utils.dtlist2slist(t_known)
    st_target = utils.dtlist2slist(t_target)
    X_known, Y_known = utils.grid_to_scatter(st_known, z_known, V_known)
    X_target = utils.grid_to_scatter(st_target, z_target)

    # NaN are removed
    valid = ~np.isnan(Y_known)
    X_known = X_known[valid, :]
    Y_known = Y_known[valid]

    knn_suffix = "nearestneighbors"
    method_lc = method.lower()

    if method_lc.endswith(knn_suffix):
        #### ========= Estimation with K-nearest neighbors
        # Everything in front of the suffix is the number of neighbors,
        # so that counts of several digits are supported.
        n_neighbors_txt = method_lc[:-len(knn_suffix)]
        if not n_neighbors_txt.isdecimal():
            raise ValueError(
                "The number of neighbors must be given in front of "
                "'nearestneighbors' (e.g. '4nearestneighbors'). Got: " +
                repr(method))

        from sklearn.neighbors import KNeighborsRegressor

        KNN = KNeighborsRegressor(n_neighbors=int(n_neighbors_txt))

        KNN.fit(X_known, Y_known)
        Y_target = KNN.predict(X_target)

    else:
        #### ========= Estimation with 2D interpolation
        from scipy.interpolate import griddata

        Y_target = griddata(X_known, Y_known, X_target, method=method_lc)

    # Shape the output
    t1, z1, V_target = utils.scatter_to_grid(X_target, Y_target)

    # Sanity checks
    if np.shape(V_target) != (np.size(st_target), np.size(z_target)):
        raise Exception(
            "Output has not expected shape : shape(st_target)",
            np.shape(st_target),
            "shape(z_target)",
            np.shape(z_target),
            "shape(V_target)",
            np.shape(V_target),
        )
    if (np.abs(t1 - st_target) > 10**(-10)).any():
        raise Exception(
            "Time vector has been altered : max(|t1-t_target|)=",
            np.max(np.abs(t1 - st_target)),
        )
    if (np.abs(z1 - z_target) > 10**(-10)).any():
        raise Exception(
            "Altitude vector has been altered : max(|z1-z_target|)=",
            np.max(np.abs(z1 - z_target)),
        )

    return V_target