Example #1
def get_full_data():
    """
    This endpoint returns all the data queried by the cron job and stored in the database, and returns 
    it according to the format specified by the user.
    INPUT: data from API.
    OUTPUT: All data from database to date.

    NOTE: 
        - Parameter format: {"output":"OUTPUT_TYPE", "page":"PAGE"}    
    """

    # Parse the parameters from the GET request and fill in defaults
    output, page = ut.get_default_params(dict(request.args))

    # DB query for all data available
    try:
        data = db.get_data_db(page)
        return ut.assemble_response(data_returned=data, output=output)
    except Exception as err:
        print("[!!!] Error retrieving the data from the database:", err)
        return {"data": "Internal error", "error": True}

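The ut.get_default_params helper is not shown in this example; below is a hypothetical minimal sketch, assuming it simply reads "output" and "page" from the query-string dict and falls back to defaults when they are missing.

# Hypothetical sketch of ut.get_default_params (not part of the original code).
# Assumes JSON output and page "1" as fallbacks; the real defaults are unknown.
def get_default_params(args):
    output = args.get("output", "json")
    page = args.get("page", "1")
    return output, page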
###################################################################################################
Example #2
    def __init__(self, is_small=False, is_mini=False):

        lookup_tables_data_dir_path = os.path.join(dir_path, "data")

        if not os.path.isdir(lookup_tables_data_dir_path):
            self._download_lookup_tables(lookup_tables_data_dir_path)

        data_dir = os.path.join(
            lookup_tables_data_dir_path, "samples/sample1/")

        test_files = ["heldout_inputs", "heldout_compositions", "heldout_tables",
                      "new_compositions", "longer_compositions_seen",
                      "longer_compositions_incremental", "longer_compositions_new"]

        valid_file = "validation"
        train_file = "train"

        # Get default params from the yml file
        # - these are not required but offer recommended defaults
        default_params = get_default_params(dir_path)

        # Update the default params if the task is small/mini
        if is_small:
            default_params["task_defaults"]["k"] = 1
        if is_mini:
            default_params["task_defaults"]["k"] = 1
            default_params["task_defaults"]["batch_size"] = 128
            default_params["task_defaults"]["patience"] = 2
            default_params["task_defaults"]["epochs"] = 3
            default_params["task_defaults"]["n_attn_plots"] = 1

        super().__init__("lookup", data_dir,
                         train_file, valid_file, test_files, default_params)
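The contents of the yml defaults are not shown; judging from the keys overridden above, the returned structure presumably resembles the following sketch (all values here are placeholders, not the real defaults).

# Hypothetical shape of default_params as loaded from the yml file;
# the numbers below are placeholders inferred from the overrides above.
default_params = {
    "task_defaults": {
        "k": 5,
        "batch_size": 32,
        "patience": 10,
        "epochs": 50,
        "n_attn_plots": 3,
    }
}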
Example #3
    def __init__(self, is_small=False, is_mini=False):

        data_dir = os.path.join(dir_path, "data")

        if not os.path.isdir(data_dir):
            self._download_symbol_rewriting_data(data_dir)

        train_file = "grammar_std.train.full"
        test_files = ["grammar_long.tst.full", "grammar_repeat.tst.full",
                      "grammar_short.tst.full", "grammar_std.tst.full"]
        valid_file = "grammar.val"

        # Get default params from the yml file
        # - these are not required but offer recommended defaults
        default_params = get_default_params(dir_path)

        if is_small:
            train_file = "grammar_std.train.small"
            default_params["task_defaults"]["k"] = 1
        if is_mini:
            train_file = "grammar_std.train.small"
            default_params["task_defaults"]["k"] = 1
            default_params["task_defaults"]["batch_size"] = 128
            default_params["task_defaults"]["patience"] = 2
            default_params["task_defaults"]["epochs"] = 3
            default_params["task_defaults"]["n_attn_plots"] = 1

        super().__init__("Symbol Rewriting", data_dir, train_file,
                         valid_file, test_files, default_params)
Example #4
    def __init__(self,
                 name,
                 is_small=False,
                 is_mini=False,
                 longer_repeat=5,
                 logger=None):
        logger = logger or logging.getLogger(__name__)

        name = name.lower()
        lookup_tables_dir_path = os.path.join(dir_path, name2dir[name])
        if not os.path.isdir(lookup_tables_dir_path):
            raise NotImplementedError(
                "Folder at {} does not exist".format(lookup_tables_dir_path))

        generation_arguments_path = os.path.join(lookup_tables_dir_path,
                                                 'generation_arguments.txt')
        if not os.path.isfile(generation_arguments_path):
            raise NotImplementedError(
                "Generation arguments .txt missing in table lookup folder"
                " - cannot generate table")

        lookup_tables_data_dir_path = os.path.join(lookup_tables_dir_path,
                                                   "data")

        if not os.path.isdir(lookup_tables_data_dir_path):
            logger.info(
                "Data not present for {} \n Generating Dataset".format(name))
            make_long_lookup_tables(lookup_tables_data_dir_path,
                                    generation_arguments_path)

        # Get default params from json
        # - these are not required but offer recommended defaults
        default_params = get_default_params(lookup_tables_dir_path)

        # Update the default params if the task is small/mini
        if default_params is not None:
            if is_small:
                default_params["task_defaults"]["k"] = 1
            if is_mini:
                default_params["task_defaults"]["k"] = 1
                default_params["task_defaults"]["batch_size"] = 128
                default_params["task_defaults"]["patience"] = 2
                default_params["task_defaults"]["epochs"] = 3
                default_params["task_defaults"]["n_attn_plots"] = 1

        train_file = "train"
        valid_file = "validation"
        test_files = flatten([
            "heldout_inputs", "heldout_compositions", "heldout_tables",
            "new_compositions",
            repeat("longer_seen", longer_repeat),
            repeat("longer_incremental", longer_repeat),
            repeat("longer_new", longer_repeat)
        ])

        super().__init__(name, lookup_tables_data_dir_path, train_file,
                         valid_file, test_files, default_params)
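The flatten and repeat helpers used to build test_files are not defined in the snippet; under the assumption that repeat simply yields the same base name several times and flatten removes one level of nesting, minimal stand-ins might look like this.

from itertools import chain

def repeat(name, n):
    # Assumed behaviour: the same test-file base name, n times.
    return [name] * n

def flatten(items):
    # Assumed behaviour: flatten one level of nesting, leaving strings intact.
    return list(chain.from_iterable(
        x if isinstance(x, (list, tuple)) else [x] for x in items))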
Example #5
def get_likes():
    """
    This endpoint just returns the current number of likes of the selected page if queried and sent a 
    dictionnary of parameters.
    INPUT: data from API.
    OUTPUT: Current count of users.

    Parameter format: {"output":"OUTPUT_TYPE", "page":"PAGE"}
    """

    # Instantiate the main class for the Facebook call
    fbs = FacebookScrapper()

    # Get the parameters from the GET request
    output, page = ut.get_default_params(dict(request.args))

    # Getting the data from the FB scraper
    data_fb = fbs.get_page_fan_count(page=page)
        
    return json.dumps(data_fb)
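Given the parameter format documented in the docstring, the endpoint would be queried with a simple GET request; a hypothetical call (host and route are placeholders) could look like this.

import requests

# Hypothetical client call matching {"output":"OUTPUT_TYPE", "page":"PAGE"};
# the URL below is a placeholder, not the real route.
resp = requests.get("http://localhost:5000/get_likes",
                    params={"output": "json", "page": "some_facebook_page"})
print(resp.json())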
Example #6
        'width': 50,
        'search_w': 50
    }

    # ============ DATA AND SAVING DIRS SETUP ========== #
    data_dir = os.getenv('DATA_PATH')
    exp_dir = os.getenv('EXP_PATH')
    checkpoint_dir = os.path.join(exp_dir, exp_name)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    # ============= LOGGER SETUP ================= #
    # create logger
    logger = get_logger(checkpoint_dir)

    # Set the default parameters
    params_dict = get_default_params(params_dict)

    # ========= PRINT CONFIG TO LOG ======== #
    logger.info('Running %s experiment ...' % exp_name)
    logger.info('\n Settings for this experiment are: \n')
    for key in params_dict.keys():
        logger.info('  {}: {}'.format(key.upper(), params_dict[key]))
    logger.info('Saving checkpoint to {}'.format(checkpoint_dir))

    # KFold iterator
    kf = MyKFold(data_dir, n_splits=5)
    fold_iterator = kf.getFolderIterator()
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.666)

    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    tf.keras.backend.set_session(sess)
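The fragment above reads its directories from environment variables; these need to be set before running, for instance as in this sketch (paths are placeholders).

import os

# DATA_PATH and EXP_PATH must point to existing directories;
# the values below are placeholders.
os.environ.setdefault("DATA_PATH", "/path/to/data")
os.environ.setdefault("EXP_PATH", "/path/to/experiments")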
Example #7
import utils as ut
import healpy as hp
import numpy as np
import matplotlib.pyplot as plt

nside = 256

# First, let's generate a set of parameters
mean_p, moment_p = ut.get_default_params()
# Note that you can remove individual components by doing e.g.:
mean_p['include_CMB'] = False
mean_p['include_sync'] = False
mean_p['include_dust'] = False
# But let's not do that now
mean_p['include_CMB'] = True
mean_p['include_sync'] = True
mean_p['include_dust'] = True
# By default, these won't contain any spectral index variations:
print(moment_p['amp_beta_dust'], moment_p['amp_beta_sync'])

# OK, let's look at data with no spectral index variations.
#
# First let's generate a simulation with these parameters
sim_db0p0 = ut.get_sky_realization(nside, seed=1000, mean_pars=mean_p,
                                   moment_pars=moment_p, compute_cls=True)
# And now, let's compute their associated theory prediction
thr_db0p0 = ut.get_theory_spectra(nside, mean_pars=mean_p,
                                  moment_pars=moment_p)

# Now, let's introduce spectral index variations for dust
# with a standard deviation of 0.2.
Example #8
def blh_estimation_returnlabels(inputFile,
                                outputFile=None,
                                storeInNetcdf=False,
                                params=None):
    """Perform BLH estimation on all profiles of the day and return the labels
    of the classification.
    
    
    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1
    
    outputFile : str, default=None
        Path to the output file. Default adds ".out" before ".nc"
    
    storeInNetcdf : bool, default=False
        If True, the field 'blh_kabl', containing the BLH estimation, is
        stored in the outputFile

    params : dict, default=None
        Dict with all settings. This function depends on 'n_clusters'
    
    
    Returns
    -------
    blh : ndarray of shape (Nt,)
        Time series of BLH as estimated by the KABL algorithm
    
    zoneID : ndarray of shape (Nt,Nz)
        Cluster labels of every profile
    """

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    needed_data = np.unique(np.concatenate(list(
        params["predictors"].values())))
    t_values, z_values, rcss = utils.extract_data(inputFile,
                                                  to_extract=needed_data,
                                                  params=params)

    if "rcs_0" in needed_data:
        rcs_0 = rcss["rcs_0"]
    if "rcs_1" in needed_data:
        rcs_1 = rcss["rcs_1"]
    if "rcs_2" in needed_data:
        rcs_2 = rcss["rcs_2"]

    blh = []
    zoneID = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("\nKABL estimation (" + loc +
                     dateofday.strftime(", %Y/%m/%d") + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" *
                     (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(t_values[t]),
            "lat": lat,
            "lon": lon,
        }
        t_back = max(t - params["n_profiles"] + 1, 0)

        rcss = {}
        if "rcs_0" in needed_data:
            rcss["rcs_0"] = rcs_0[t_back:t + 1, :]
        if "rcs_1" in needed_data:
            rcss["rcs_1"] = rcs_1[t_back:t + 1, :]
        if "rcs_2" in needed_data:
            rcss["rcs_2"] = rcs_2[t_back:t + 1, :]

        X, Z = prepare_data(coords, z_values, rcss=rcss, params=params)

        # 3. Apply the machine learning algorithm
        # ---------------------
        if isinstance(params["n_clusters"], int):
            labels = apply_algo(X, params["n_clusters"], params=params)
        else:
            labels, n_clusters, classif_score = apply_algo_k_auto(
                X, params=params)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))
        zoneID.append(labels)

    if outputFile is None:
        outputFile = paths.file_defaultoutput()

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    # 5. Store the new BLH estimation into a copy of the original netCDF
    # ---------------------
    if storeInNetcdf:
        utils.add_blh_to_netcdf(inputFile, outputFile, blh)

    return np.array(blh), np.array(zoneID)
Example #9
def kabl_qualitymetrics(
    inputFile,
    outputFile=None,
    reference="None",
    rsFile="None",
    storeResults=True,
    params=None,
):
    """Estimate quality metrics of KABL for one day of measurement.
    
    This function performs the BLH estimation as in
    kabl.core.blh_estimation, but its outputs are the quality metrics,
    not the BLH estimation. As estimating the quality metrics is more
    computationally demanding, this function takes noticeably longer to
    execute.
    
    Parameters
    ----------
    inputFile : str
        Path to the input file, as generated by raw2l1
    
    outputFile : str, default=None
        Path to the output file
    
    reference : str, default="None"
        Path to a handmade BLH estimation, if any, which will serve
        as reference.
    
    rsFile : str, default="None"
        Path to the radiosounding estimations, if any. Gives the
        possibility to store them in the same netcdf
    
    storeResults : bool, default=True
        If True, quality metrics are stored in the `outputFile`
    
    params : dict, default=None
        Dict with all settings. This function depends on 'n_clusters'
    
    
    Returns
    -------
    errl2_blh : float
        Root mean squared gap between BLH from KABL and the reference
        .. math:: \sqrt{\frac{1}{N}\sum_{i=1}^N (Z(i)-Z_{ref}(i))^2}

    errl1_blh : float
        Mean absolute gap between BLH from KABL and the reference
        .. math:: \frac{1}{N}\sum_{i=1}^N \vert Z(i)-Z_{ref}(i) \vert

    errl0_blh : float
        Maximum absolute gap between BLH from KABL and the reference
        .. math:: \max_i \vert Z(i)-Z_{ref}(i) \vert

    corr_blh : float
        Correlation coefficient between BLH from KABL and the reference
    
    ch_score : float
        Average Calinski-Harabasz score (the higher, the better) over
        the full day
        
    db_scores : float
        Average Davies-Bouldin score (the lower, the better) over
        the full day
    
    s_scores : float
        Average silhouette score (the higher, the better) over
        the full day
    
    chrono : float
        Computation time for the full day (seconds)
    
    n_invalid : int
        Number of BLH estimation at NaN or Inf
    """

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    # ---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, dat = utils.extract_data(
        inputFile,
        to_extract=["rcs_1", "rcs_2", "pbl", "rr", "vv", "b1"],
        params=params)
    rcs_1 = dat["rcs_1"]
    rcs_2 = dat["rcs_2"]
    blh_mnf = dat["pbl"]
    rr = dat["rr"]
    vv = dat["vv"]
    cbh = dat["b1"]

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("\nKABL estimation (" + loc +
                     dateofday.strftime(", %Y/%m/%d") + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" *
                     (toolbar_width + 1))  # return to start of line, after '['

    # Loop over all profiles of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        # ---------------------
        coords = {
            "time": dt.datetime.utcfromtimestamp(t_values[t]),
            "lat": lat,
            "lon": lon,
        }
        t_back = max(t - params["n_profiles"] + 1, 0)
        X, Z = prepare_data(
            coords,
            z_values,
            rcss={
                "rcs_1": rcs_1[t_back:t + 1, :],
                "rcs_2": rcs_2[t_back:t + 1, :]
            },
            params=params,
        )

        # 3. Apply the machine learning algorithm
        # ---------------------

        if isinstance(params["n_clusters"], int):
            n_clusters = params["n_clusters"]
            labels = apply_algo(X, params["n_clusters"], params=params)

            # Compute classification score
            if len(np.unique(labels)) > 1:
                # Avoid spurious "RuntimeWarning: divide by zero encountered in true_divide"
                with np.errstate(divide="ignore", invalid="ignore"):
                    db_score = davies_bouldin_score(X, labels)
                s_score = silhouette_score(X, labels)
                ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(
                X, params=params)

        # 4. Derive and store the BLH
        # ---------------------
        blh.append(utils.blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        fname = os.path.split(inputFile)[-1]
        outputFile = os.path.join(paths.resultrootdir,
                                  "DAILY_BENCHMARK_" + fname[10:-3] + ".nc")

    mask_cloud = cbh[:] <= 3000

    if os.path.isfile(reference):
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [np.array(blh), np.array(blh_mnf[:, 0])]
        BLH_NAMES = ["BLH_KABL", "BLH_INDUS"]
        if os.path.isfile(reference):
            BLHS.append(blh_ref)
            BLH_NAMES.append("BLH_REF")

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(
            outputFile,
            t_values,
            BLHS,
            BLH_NAMES,
            [s_scores, db_scores, ch_scores],
            ["SILH", "DB", "CH"],
            [rr, vv],
            ["MASK_RAIN", "MASK_FOG"],
            K_values,
            chrono,
            params,
        )

        if os.path.isfile(rsFile):
            blh_rs = utils.extract_rs(rsFile, t_values[0], t_values[-1])
        else:
            blh_rs = None

        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref)**2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return (
        errl2_blh,
        errl1_blh,
        errl0_blh,
        corr_blh,
        np.mean(ch_scores),
        np.mean(db_scores),
        np.mean(s_scores),
        chrono,
        n_invalid,
    )
Example #10
def apply_algo_k_3scores(X, quiet=True, params=None):
    """Adapation of kabl.core.apply_algo_k_auto in benchmark context.
    
    Parameters
    ----------
    X : ndarray of shape (N,p)
        Design matrix to put in input of the algorithm. Each line is an
        observation, each column is a predictor.
    
    quiet : bool, default=True
        If True, cut down all prints
        
    params : dict, default=None
        Dict with all settings. This function depends on 'max_k' and 'classif_score'
        
    Returns
    -------
    labels : ndarray of shape (N,)
        Vector of cluster number attribution
        BEWARE: the cluster identification numbers are arbitrary. Only
        the borders matter.

    n_clusters_opt : int
        Optimal number of clusters found in the data

    s_score, db_score, ch_score : float
        Silhouette, Davies-Bouldin and Calinski-Harabasz scores of the
        returned classification.
    """

    if params is None:
        params = utils.get_default_params()

    # Apply the algorithm and compute scores for several numbers of clusters
    all_labels = []
    s_scores = []
    db_scores = []
    ch_scores = []
    for n_clusters in range(2, params["max_k"] + 1):

        labels = apply_algo(X, n_clusters, params=params)
        all_labels.append(labels)

        if len(np.unique(labels)) > 1:
            # Avoid spurious "RuntimeWarning: divide by zero encountered in true_divide"
            with np.errstate(divide="ignore", invalid="ignore"):
                db_scores.append(davies_bouldin_score(X, labels))
            s_scores.append(silhouette_score(X, labels))
            ch_scores.append(calinski_harabaz_score(X, labels))
        else:
            db_scores.append(np.nan)
            s_scores.append(np.nan)
            ch_scores.append(np.nan)

    # Choose the best number of clusters
    valid = True
    if params["classif_score"] in ["silhouette", "silh"]:
        k_best = np.nanargmax(s_scores)
        if s_scores[k_best] < 0.6:
            if not quiet:
                print(
                    "Bad classification according to silhouette score (",
                    s_scores[k_best],
                    "). BLH is thus NaN",
                )
            valid = False
    elif params["classif_score"] in ["davies_bouldin", "db"]:
        k_best = np.nanargmin(db_scores)
        if db_scores[k_best] > 0.4:
            if not quiet:
                print(
                    "Bad classification according to Davies-Bouldin score (",
                    db_scores[k_best],
                    "). BLH is thus NaN",
                )
            valid = False
    else:
        k_best = np.nanargmax(ch_scores)
        if ch_scores[k_best] < 200:
            if not quiet:
                print(
                    "Bad classification according to Calinski-Harabasz score (",
                    ch_scores[k_best],
                    "). BLH is thus NaN",
                )
            valid = False

    if all(np.isnan(db_scores)):
        valid = False

    # Return the results
    if valid:
        result = (
            all_labels[k_best],
            k_best + 2,
            s_scores[k_best],
            db_scores[k_best],
            ch_scores[k_best],
        )
    else:
        result = None, np.nan, s_scores[k_best], db_scores[k_best], ch_scores[
            k_best]

    return result
Example #11
def prepare_data(coords, z_values, rcss, params=None):
    """Put the data in form to fulfil algorithm requirements.
    
    Five operations are carried out in this function:
      0. Check and reshape inputs
      1. Distinguish night and day for predictors
      2. Concatenate the profiles
      3. Take the logarithm of range-corrected signal
      4. Apply also a standard normalisation (remove mean and divide by
    standard deviation).
    
    
    Parameters
    ----------
    coords : dict
        Time and space coordinates. The dict must have 3 keys:
        'time' (datetime): time of the profile
        'lat' (float): latitude of the measurement site
        'lon' (float): longitude of the measurement site
    
    z_values : array-like of shape (nZ,)
        Vector of altitude values
    
    rcss : dict
        Input data, in the form of a dict of named matrices.
        Example: rcss={"rcs_0":rcs_0, "rcs_1":rcs_1} where rcs_0 and
        rcs_1 are ndarray of shape (nT,nZ)
    
    params : dict
        Dict with all settings. This function depends on 'n_profiles',
        'predictors', 'sunrise_shift', 'sunset_shift'.
      
    Returns
    -------
    X : ndarray of shape (N,p)
        Design matrix to put in input of the algorithm.
        Each line is an observation, each column is a predictor.
    
    Z : ndarray of shape (N,)
        Vector of altitudes for each observation.
    """

    if params is None:
        params = utils.get_default_params()

    # 0. Check and reshape inputs
    # ---------------------------
    needed_data = np.unique(np.concatenate(list(
        params["predictors"].values())))

    if set(rcss.keys()) != set(needed_data):
        raise Exception("Wrong input data provided.")

    if "rcs_0" in needed_data:
        rcs_0 = rcss["rcs_0"]
        try:
            Nt, Nz = rcs_0.shape
        except ValueError:
            Nz = rcs_0.size
            Nt = 1
    if "rcs_1" in needed_data:
        rcs_1 = rcss["rcs_1"]
        try:
            Nt, Nz = rcs_1.shape
        except ValueError:
            Nz = rcs_1.size
            Nt = 1
    if "rcs_2" in needed_data:
        rcs_2 = rcss["rcs_2"]
        try:
            Nt, Nz = rcs_2.shape
        except ValueError:
            Nz = rcs_2.size
            Nt = 1

    # 1. Distinguish night and day for predictors
    # --------------------------------------------
    t = coords["time"]
    timeofday = t.strftime("%H:%M")
    dateofday = t.strftime("%Y%m%d")

    s = Sun(lat=coords["lat"], long=coords["lon"])
    sunrise = s.sunrise(t)
    sunset = s.sunset(t)

    sunrise = dt.datetime(
        t.year, t.month, t.day, sunrise.hour, sunrise.minute,
        sunrise.second) + dt.timedelta(hours=params["sunrise_shift"])
    sunset = dt.datetime(
        t.year, t.month, t.day, sunset.hour, sunset.minute,
        sunset.second) + dt.timedelta(hours=params["sunset_shift"])

    if t >= sunrise and t <= sunset:
        nightorday = "day"
    else:
        nightorday = "night"

    predictors = params["predictors"][nightorday]

    # 2. Concatenate the profiles
    # ----------------------------
    if Nt > 1:
        Z = np.tile(z_values, Nt)
    else:
        Z = z_values

    X = []
    if "rcs_0" in predictors:
        if rcs_0 is None:
            raise ValueError(
                "Missing argument rcs_0 in kabl.core.prepare_data")
        X.append(rcs_0.ravel())
    if "rcs_1" in predictors:
        if rcs_1 is None:
            raise ValueError(
                "Missing argument rcs_1 in kabl.core.prepare_data")
        X.append(rcs_1.ravel())
    if "rcs_2" in predictors:
        if rcs_2 is None:
            raise ValueError(
                "Missing argument rcs_2 in kabl.core.prepare_data")
        X.append(rcs_2.ravel())

    # 3. Take the logarithm of range-corrected signal
    # ------------------------------------------------
    X = np.array(X).T
    X[X <= 0] = 1e-5
    X = np.log10(X)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # 4. Normalisation: remove mean and divide by standard deviation
    # ---------------------------------------------------------------
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)

    return X, Z
Example #12
def apply_algo(X, n_clusters, init_codification=None, params=None):
    """Apply the machine learning algorithm on the prepared data.
    
    Parameters
    ----------
    X : ndarray of shape (N,p)
        Design matrix to put in input of the algorithm. Each line is an
        observation, each column is a predictor.
    
    n_clusters : int
        Number of clusters to be found in the data
    
    init_codification : dict, default=None
        Links the initialisation strategy to the actual algorithm inputs.
        Keys are the three available strategies:
            'random': pick an individual at random as starting point (both Kmeans and GMM)
            'advanced': more sophisticated way to initialize
            'given': start at explicitly passed point coordinates,
            plus the special key 'token', which holds the explicit point coordinates to use when the strategy is 'given'.
        Values are dictionaries with, as key, the algorithm name and, as value, the corresponding scikit-learn input.
        For 'token', the value is a list of np.arrays (explicit point coordinates).
    
    params : dict, default=None
        Dict with all settings. This function depends on 'algo',
        'n_inits', 'init', 'cov_type'
        
    Returns
    -------
    labels : ndarray of shape (N,)
        Vector of cluster number attribution
        BEWARE: the cluster identification numbers are arbitrary.
        Only the borders matter.
    """

    if params is None:
        params = utils.get_default_params()

    if init_codification is None:
        init_codification = {
            "random": {"kmeans": "random", "gmm": "random"},
            "advanced": {"kmeans": "k-means++", "gmm": "kmeans"},
            "given": {  # When initialization is 'given', the values are given in the 'token' field
                "kmeans": "token",
                "gmm": "kmeans",
            },
            "token": [  # trick to specify centroids in one object
                np.array([-2.7, -0.7]),  # 2 clusters
                np.array([-2.7, -0.7, 1]),  # 3 clusters
                np.array([-3.9, -2.7, -0.7, 1]),  # 4 clusters
                np.array([-3.9, -2.7, -1.9, -0.7, 1]),  # 5 clusters
                np.array([-3.9, -2.7, -1.9, -0.7, 0, 1]),  # 6 clusters
            ],
        }
    initialization = init_codification[params["init"]][params["algo"]]

    # When initialization is 'given', the values are given in the 'token' field
    # The values are accessed afterwards so that the init_codification dict stays readable
    if initialization == "token":
        # Given values are repeated in all predictors
        n_predictors = X.shape[1]
        initialization = np.repeat(init_codification["token"][n_clusters - 2],
                                   n_predictors).reshape(
                                       (n_clusters, n_predictors))

    if params["algo"] == "kmeans":
        kmeans = KMeans(n_clusters=n_clusters,
                        n_init=params["n_inits"],
                        init=initialization)
        kmeans.fit(X)
        labels = kmeans.predict(X)
    elif params["algo"] == "gmm":
        gmm = GaussianMixture(
            n_components=n_clusters,
            covariance_type=params["cov_type"],
            n_init=params["n_inits"],
            init_params=initialization,
        )
        gmm.fit(X)
        labels = gmm.predict(X)

    return labels
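The 'token' branch above turns a one-dimensional list of centroid values into a (n_clusters, n_predictors) initialisation matrix by repeating each value across predictors; a small standalone illustration using the 3-cluster entry of the 'token' list:

import numpy as np

# Reproduce the 'token' expansion from apply_algo for n_clusters=3 and
# two predictors: each centroid value is repeated for every predictor.
token = np.array([-2.7, -0.7, 1])  # 3-cluster entry of the 'token' list
n_clusters, n_predictors = 3, 2
init = np.repeat(token, n_predictors).reshape((n_clusters, n_predictors))
print(init)
# [[-2.7 -2.7]
#  [-0.7 -0.7]
#  [ 1.   1. ]]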
Example #13
def apply_algo_k_auto(X, init_codification=None, quiet=True, params=None):
    """Apply the machine learning algorithm for various number of
    clusters and choose the best according the specified score.
    
    Parameters
    ----------
    X : ndarray of shape (N,p)
        Design matrix to put in input of the algorithm. Each line is an
        observation, each column is a predictor.
    
    init_codification : dict, default=None
        Link initialisation strategy with actual algorithm inputs. 
        See kabl.core.apply_algo
    
    quiet : bool, default=True
        If True, cut down all prints
        
    params : dict, default=None
        Dict with all settings. This function depends on 'max_k' and 'classif_score'
    
    
    Returns
    -------
    labels : ndarray of shape (N,)
        Vector of cluster number attribution
        BEWARE: the cluster identification numbers are arbitrary. Only
        the borders matter.
    
    n_clusters_opt : int
        Optimal number of clusters to be found in the data
    
    classif_scores : float
        Value of the classification score (chosen via
        params['classif_score']) for the returned classification.
    """

    if params is None:
        params = utils.get_default_params()

    # 1. Apply the algorithm and compute scores for several numbers of clusters
    # --------------------------------------------------------------------
    all_labels = []
    classif_scores = []
    for n_clusters in range(2, params["max_k"]):

        labels = apply_algo(X,
                            n_clusters,
                            init_codification=init_codification,
                            params=params)
        all_labels.append(labels)

        if params["classif_score"] in ["silhouette", "silh"]:
            classif_scores.append(silhouette_score(X, labels))
        elif params["classif_score"] in ["davies_bouldin", "db"]:
            # Avoid spurious "RuntimeWarning: divide by zero encountered in true_divide"
            with np.errstate(divide="ignore", invalid="ignore"):
                classif_scores.append(davies_bouldin_score(X, labels))
        else:  # Default because fastest
            classif_scores.append(calinski_harabaz_score(X, labels))

    # 2. Choose the best number of clusters
    # -------------------------------------
    if params["classif_score"] in ["silhouette", "silh"]:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 0.5:
            if not quiet:
                print(
                    "Bad classification according to silhouette score (",
                    classif_scores[k_best],
                    "). BLH is thus NaN",
                )
            k_best = None
    elif params["classif_score"] in ["davies_bouldin", "db"]:
        k_best = np.argmin(classif_scores)
        if classif_scores[k_best] > 0.36:
            if not quiet:
                print(
                    "Bad classification according to Davies-Bouldin score (",
                    classif_scores[k_best],
                    "). BLH is thus NaN",
                )
            k_best = None
    else:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 200:
            if not quiet:
                print(
                    "Bad classification according to Calinski-Harabasz score (",
                    classif_scores[k_best],
                    "). BLH is thus NaN",
                )
            k_best = None

    # 3. Return the results
    # ---------------------
    if k_best is not None:
        result = all_labels[k_best], k_best + 2, classif_scores[k_best]
    else:
        result = None, None, None

    return result