def test_missing_hdbscan(in_data, col_name, model="-"):
    """Re-run HDBSCAN on reduced feature subsets and save each result."""
    vp_idx = mh.search_list(col_name, 'vp')
    vs_idx = mh.search_list(col_name, 'vs')
    dn_idx = mh.search_list(col_name, 'dn')
    vpvs_idx = mh.search_list(col_name, 'vp/vs')
    qp_idx = mh.search_list(col_name, 'qp')
    # qs_idx = mh.search_list(col_name, 'qs')

    # when the only available data are: Vp, Vs, Vp/Vs
    test_data_1 = np.squeeze(in_data[:, [vp_idx, vs_idx, vpvs_idx]])

    test1_hdb = test_perform_HDBSCAN(test_data_1)
    sdir = model + '_test1_rehdb.npy'
    np.save(sdir, test1_hdb)
    logging.info('Data saved at: %s' % sdir)

    # when the only available data are: Vp, Qp, Density
    test_data_2 = np.squeeze(in_data[:, [vp_idx, qp_idx, dn_idx]])

    test2_hdb = test_perform_HDBSCAN(test_data_2)
    sdir = model + '_test2_rehdb.npy'
    np.save(sdir, test2_hdb)
    logging.info('Data saved at: %s' % sdir)

    # when the only available data are: Vp, Vs
    test_data_3 = np.squeeze(in_data[:, [vp_idx, vs_idx]])

    test3_hdb = test_perform_HDBSCAN(test_data_3)
    sdir = model + '_test3_rehdb.npy'
    np.save(sdir, test3_hdb)
    logging.info('Data saved at: %s' % sdir)
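# A minimal usage sketch for the harness above. The data loading is an
# assumption: 'model_data.npy' and the exact column ordering are
# hypothetical; in practice both come from the project's model files.
def demo_missing_hdbscan():
    in_data = np.load('model_data.npy')  # (n_samples, n_features) matrix
    col_name = ['vp', 'vs', 'dn', 'vp/vs', 'qp', 'qs']  # assumed ordering
    # writes marmousi_test1_rehdb.npy ... marmousi_test3_rehdb.npy
    test_missing_hdbscan(in_data, col_name, model='marmousi')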
def test_ent_pur(self):
    a = [1, 1, 1, 1]
    b = [1, 2, 1, 2]
    c = [1, 1, 2, 2]
    assert mh.ext_eval_entropy(a, a) == (0.0, 1.0)
    assert mh.ext_eval_entropy(a, b) == (0.0, 1.0)
    assert_almost_equal(mh.ext_eval_entropy(b, c), (0.693147180, 0.5))
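    # Why the last assertion holds: with true labels b = [1, 2, 1, 2] and
    # predicted clusters c = [1, 1, 2, 2], each predicted cluster holds one
    # point from each true class, so the per-cluster entropy is
    # -(0.5 * ln 0.5 + 0.5 * ln 0.5) = ln 2 ≈ 0.693147 and the purity is 0.5.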
def eval_ws(in_data, ws_labels, n_map, label=None, re_all=False):
    """Evaluate and return the best watershed prediction result

    Parameters
    ----------
    in_data : np.array or list
        data matrix
    ws_labels : np.array
        predicted cluster labels from watershed segmentation
    n_map : np.array
        array of the winner neuron
    label : np.array or list, optional
        the true label of each data point, by default None
    re_all : bool, optional
        if True, return all labels and scores plus the indices of the
        best runs, by default False

    Returns
    -------
    tuple
        the best watershed cluster labels with their silhouette and
        Calinski-Harabasz scores; more than one set may tie for best
    """
    len_watershed = ws_labels.shape[0]
    cluster_labels = np.zeros((len_watershed, len(in_data)))
    avg_sils = np.full(len_watershed, np.nan)
    ch_scs = np.full(len_watershed, np.nan)

    if (label is not None):
        avg_ents = np.full(len_watershed, np.nan)
        avg_purs = np.full(len_watershed, np.nan)

    for i in range(len_watershed):
        param = {'watershed idx': i}
        if (len(np.unique(ws_labels[i])) > 1):
            cluster_labels[i] = gen_e_model(n_map, ws_labels[i])
            avg_sils[i] = mh.int_eval_silhouette(in_data,
                                                 cluster_labels[i],
                                                 method='som_watershed',
                                                 param=param)
            try:
                ch_scs[i] = mh.cal_har_sc(in_data, cluster_labels[i])
            except Exception:
                ch_scs[i] = -1  # CH score is undefined, e.g. for a single cluster
            if (label is not None):
                avg_ents[i], avg_purs[i] = mh.ext_eval_entropy(
                    label, cluster_labels[i])
    best_idx = []
    best_idx.append(np.nanargmax(np.array(avg_sils)))  # closest to 1
    best_idx.append(np.nanargmax(ch_scs))  # higher = better
    if (label is not None):
        best_idx.append(np.nanargmin(np.array(avg_ents)))  # closest to 0
        best_idx.append(np.nanargmax(np.array(avg_purs)))  # closest to 1
    best_idx = np.unique(best_idx)
    if (re_all):
        return (cluster_labels, avg_sils, ch_scs, best_idx)
    else:
        return (cluster_labels[best_idx], avg_sils[best_idx], ch_scs[best_idx])
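# A hedged sketch of how eval_ws slots into the SOM + watershed pipeline;
# it mirrors the calls inside random_search_som further below, so only the
# concrete parameter values (dim=10, lr=0.5, sigma=1.0, 1000 iterations)
# are assumptions.
def demo_eval_ws(in_data, seed=10):
    som = som_assemble(in_data, seed, 10, lr=0.5, sigma=1.0)
    som.train_random(in_data, 1000, verbose=False)
    u_matrix = som.distance_map().T
    ws_labels = watershed_level(u_matrix, histedges_equalN(u_matrix.flatten()))
    n_map = som.neuron_map(in_data)
    # returns the best watershed labels with their silhouette and CH scores
    return eval_ws(in_data, ws_labels, n_map)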
def plot_best_fuzz(predicted_mem, x, z, save_name=""):
    """Plot the best resulf from fuzzy c mean"""
    y_pred = get_best_fuzz(predicted_mem)

    if (save_name != ""):
        save_path = save_name + '_bestFuzz_plot.png'
        mh.plot_e_model(y_pred, x, z, save_path=save_path, sep_label=True)
    else:
        mh.plot_e_model(y_pred, x, z, sep_label=True)

    return y_pred
def plot_fcm(predicted_mem, x, z, save_name=""):
    """Plot all the fuzzy result from fuzzy c mean"""
    for i in range(predicted_mem.shape[1]):
        if (save_name != ""):
            save_path = save_name + '_pclass_' + str(i) + '.png'
            mh.plot_e_model(predicted_mem[:, i],
                            x,
                            z,
                            cmap='Blues',
                            save_path=save_path)
        else:
            mh.plot_e_model(predicted_mem[:, i], x, z, cmap='Blues')
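# A short usage sketch for the two plotting helpers above; `x` and `z` are
# assumed to be the spatial coordinates of each data point, as expected by
# mh.plot_e_model, and n_clusters=10 and the save names are illustrative.
def demo_fcm_plots(in_data, x, z):
    _, predicted_mem = fcm_compute(in_data, n_clusters=10)
    plot_fcm(predicted_mem, x, z, save_name='marmousi')  # one map per class
    # hard cluster assignment from the highest membership, plus its plot
    return plot_best_fuzz(predicted_mem, x, z, save_name='marmousi')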
def fcm_compute(in_data, n_clusters, save_name="", save=False):
    """Computes the fuzzy c mean prediction

    Parameters
    ----------
    in_data : np.array or list
        data matrix
    n_clusters : int
        number of clusters
    save_name : str, optional
        base name used when pickling the model, by default ""
    save : bool, optional
        flag whether to save the model, by default False

    Returns
    -------
    fuzzy_clustering.FCM
        the fitted FCM object, see fuzzy_clustering.py for further details
    np.array
        predicted membership matrix of shape (n_samples, n_clusters)
    """
    start = timer()
    fcm = FCM(n_clusters=n_clusters)
    fcm.fit(in_data)
    predicted_mem = fcm.predict(in_data)
    stop = timer()

    logging.info("FCM elapsed time: %.6f", stop - start)

    if (save):
        # pickle the model
        if (save_name == ""):
            timestr = tm.strftime("%Y%m%d-%H%M%S")
            save_name = timestr
        fdir = save_name + '_model.p'
        mh.save_model(fcm, fdir)

    return fcm, predicted_mem
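# mh.save_model pickles the FCM object (see the comment above), so a saved
# model should round-trip with the standard library; the file name here is
# illustrative only.
def demo_load_fcm(path='20200101-120000_model.p'):
    import pickle
    with open(path, 'rb') as f:
        return pickle.load(f)  # a fuzzy_clustering.FCM instance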
def random_search_som(in_data,
                      init_guess,
                      max_eval=20,
                      label=None,
                      seed=10,
                      re_all=False):
    """perform random search for SOMs best parameters.
    
    Parameters
    ----------
    in_data : np.array or list
        data matrix
    init_guess : tuple
        list of initial guess of the parameters, in order of dimension,
        number of iterations, learning rate, and sigma
    max_eval : int, optional
        number of max iterartion to perform the search, by default 20
    label : np.array or list, optional
        the true label of each data point, by default None
    seed : integer, optional
        random seed for reproducibility, by default 10
    
    Returns
    -------
    All cluster label and its counterpart parameters.
    """
    random.seed(seed)

    param_grid = gen_param_grid(init_guess)

    dims = np.zeros(max_eval)
    iters = np.zeros(max_eval)
    lrs = np.zeros(max_eval)
    sigmas = np.zeros(max_eval)

    avg_sils = np.full(max_eval, np.nan)
    ch_scs = np.full(max_eval, np.nan)
    cluster_labels = np.zeros((max_eval, len(in_data)))

    if (label is not None):
        avg_ents = np.full(max_eval, np.nan)
        avg_purs = np.full(max_eval, np.nan)

    i = 0
    while i < max_eval:
        random_params = {
            k: random.sample(v, 1)[0]
            for k, v in param_grid.items()
        }

        dims[i], iters[i], lrs[i], sigmas[i] = list(random_params.values())

        som = som_assemble(in_data,
                           seed,
                           int(dims[i]),
                           lr=lrs[i],
                           sigma=sigmas[i])
        som.train_random(in_data, int(iters[i]), verbose=False)
        u_matrix = som.distance_map().T
        watershed_bins = histedges_equalN(u_matrix.flatten())
        ws_labels = watershed_level(u_matrix, watershed_bins)
        n_map = som.neuron_map(in_data)

        _c, _as, _ch = eval_ws(in_data, ws_labels, n_map)
        cluster_labels[i], avg_sils[i], ch_scs[i] = _c[0], _as[0], _ch[0]

        n_clusters = len(np.unique(cluster_labels[i]))
        if (n_clusters < 5 or n_clusters > 30):
            logging.info(
                "Random search using dim=%d, iter=%d, lr=%.6f, sigma=%.6f "
                "results in too few / too many clusters (n_clusters = %d)" %
                (dims[i], iters[i], lrs[i], sigmas[i], n_clusters))
            continue

        logging.info(
            "dim=%d, iter=%d, lr=%.6f, sigma=%.6f, sil=%.6f, ch=%.6f" %
            (dims[i], iters[i], lrs[i], sigmas[i], avg_sils[i], ch_scs[i]))

        if (label is not None):
            avg_ents[i], avg_purs[i] = mh.ext_eval_entropy(label,
                                                           cluster_labels[i],
                                                           init_clus=-1)
            logging.info("ent=%.6f, pur=%.6f" % (avg_ents[i], avg_purs[i]))

        i += 1

    best_idx = []
    best_idx.append(np.nanargmax(np.array(avg_sils)))  # closest to 1
    best_idx.append(np.nanargmax(ch_scs))  # higher = better
    if (label is not None):
        best_idx.append(np.nanargmin(np.array(avg_ents)))  # closest to 0
        best_idx.append(np.nanargmax(np.array(avg_purs)))  # closest to 1
    best_idx = np.unique(best_idx)
    if (re_all):
        return (cluster_labels, avg_sils, ch_scs, dims, iters, lrs, sigmas,
                best_idx)
    else:
        return (cluster_labels[best_idx], avg_sils[best_idx], ch_scs[best_idx],
                dims[best_idx], iters[best_idx], lrs[best_idx],
                sigmas[best_idx])
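# Hedged usage sketch: init_guess follows the documented ordering
# (dimension, iterations, learning rate, sigma); the concrete values are
# illustrative only.
def demo_random_search_som(in_data):
    init_guess = (10, 1000, 0.5, 1.0)
    labels, sils, chs, dims, iters, lrs, sigmas = random_search_som(
        in_data, init_guess, max_eval=20)
    return labels[0]  # cluster labels of the best-scoring run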
def test_missing_cols(in_data, col_name, model="-"):
    """Run HDBSCAN, FCM, and SOM on reduced feature subsets and save each result."""
    vp_idx = mh.search_list(col_name, 'vp')
    vs_idx = mh.search_list(col_name, 'vs')
    dn_idx = mh.search_list(col_name, 'dn')
    vpvs_idx = mh.search_list(col_name, 'vp/vs')
    qp_idx = mh.search_list(col_name, 'qp')
    # qs_idx = mh.search_list(col_name, 'qs')

    # when the only available data are: Vp, Vs, Vp/Vs
    test_data_1 = np.squeeze(in_data[:, [vp_idx, vs_idx, vpvs_idx]])

    test1_hdb = test_perform_HDBSCAN(test_data_1)
    sdir = model + '_test1_hdb.npy'
    np.save(sdir, test1_hdb)
    logging.info('Data saved at: %s' % sdir)

    _, test1_fcm = fh.fcm_compute(test_data_1, 10)
    sdir = model + '_test1_fcm.npy'
    np.save(sdir, test1_fcm)
    logging.info('Data saved at: %s' % sdir)

    test1_som, test1_som_fcm, test1_som_hdb = test_perform_SOM(test_data_1)
    sdir = model + '_test1_som.npy'
    np.save(sdir, test1_som)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test1_somfcm.npy'
    np.save(sdir, test1_som_fcm)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test1_somhdb.npy'
    np.save(sdir, test1_som_hdb)
    logging.info('Data saved at: %s' % sdir)

    # when the only available data are: Vp, Qp, Density
    test_data_2 = np.squeeze(in_data[:, [vp_idx, qp_idx, dn_idx]])

    test2_hdb = test_perform_HDBSCAN(test_data_2)
    sdir = model + '_test2_hdb.npy'
    np.save(sdir, test2_hdb)
    logging.info('Data saved at: %s' % sdir)

    _, test2_fcm = fh.fcm_compute(test_data_2, 10)
    sdir = model + '_test2_fcm.npy'
    np.save(sdir, test2_fcm)
    logging.info('Data saved at: %s' % sdir)

    test2_som, test2_som_fcm, test2_som_hdb = test_perform_SOM(test_data_2)
    sdir = model + '_test2_som.npy'
    np.save(sdir, test2_som)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test2_somfcm.npy'
    np.save(sdir, test2_som_fcm)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test2_somhdb.npy'
    np.save(sdir, test2_som_hdb)
    logging.info('Data saved at: %s' % sdir)

    # when the only available data are: Vp, Vs
    test_data_3 = np.squeeze(in_data[:, [vp_idx, vs_idx]])

    test3_hdb = test_perform_HDBSCAN(test_data_3)
    sdir = model + '_test3_hdb.npy'
    np.save(sdir, test3_hdb)
    logging.info('Data saved at: %s' % sdir)

    _, test3_fcm = fh.fcm_compute(test_data_3, 10)
    sdir = model + '_test3_fcm.npy'
    np.save(sdir, test3_fcm)
    logging.info('Data saved at: %s' % sdir)

    test3_som, test3_som_fcm, test3_som_hdb = test_perform_SOM(test_data_3)
    sdir = model + '_test3_som.npy'
    np.save(sdir, test3_som)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test3_somfcm.npy'
    np.save(sdir, test3_som_fcm)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test3_somhdb.npy'
    np.save(sdir, test3_som_hdb)
    logging.info('Data saved at: %s' % sdir)
def random_search_hdb(in_data,
                      init_guess,
                      max_eval=20,
                      label=None,
                      seed=10,
                      rand_range=(10, 10)):
    """Perform a random search for HDBSCAN's best min_cluster_size and
    min_samples around init_guess."""
    random.seed(seed)

    g_min_size, g_min_sam = init_guess

    # keep both lower bounds at a minimum of 5
    g_min_size = max(g_min_size - rand_range[0], 5)
    g_min_sam = max(g_min_sam - rand_range[1], 5)

    param_grid = {
        'g_min_size': list(range(g_min_size, g_min_size + rand_range[0])),
        'g_min_sam': list(range(g_min_sam, g_min_sam + rand_range[1]))
    }

    min_size = np.zeros(max_eval)
    min_sam = np.zeros(max_eval)
    avg_sils = np.full(max_eval, np.nan)
    ch_scs = np.full(max_eval, np.nan)
    cluster_labels = np.zeros((max_eval, len(in_data)))

    if (label is not None):
        avg_ents = np.full(max_eval, np.nan)
        avg_purs = np.full(max_eval, np.nan)

    i = 0
    while i < max_eval:
        random_params = {
            k: random.sample(v, 1)[0]
            for k, v in param_grid.items()
        }
        min_size[i], min_sam[i] = list(random_params.values())
        clusterer = hdbscan.HDBSCAN(min_cluster_size=int(min_size[i]),
                                    min_samples=int(min_sam[i]),
                                    memory='cache')
        cluster_labels[i] = clusterer.fit_predict(in_data)
        n_clusters = len(np.unique(cluster_labels[i]))
        if (n_clusters < 5 or n_clusters > 30):
            logging.info(
                "Random search using min_size = %d, min_sam = %d results in "
                "too few / too many clusters (n_clusters = %d)" %
                (int(min_size[i]), int(min_sam[i]), n_clusters))
            continue

        avg_sils[i] = mh.int_eval_silhouette(in_data, cluster_labels[i])
        ch_scs[i] = mh.cal_har_sc(in_data, cluster_labels[i])
        logging.info(
            "min_size=%d, min_sam=%d, sil=%.6f, ch=%.6f" %
            (int(min_size[i]), int(min_sam[i]), avg_sils[i], ch_scs[i]))

        if (label is not None):
            avg_ents[i], avg_purs[i] = mh.ext_eval_entropy(label,
                                                           cluster_labels[i],
                                                           init_clus=-1)
            logging.info("ent=%.6f, pur=%.6f" % (avg_ents[i], avg_purs[i]))

        i += 1

    best_idx = []
    best_idx.append(np.nanargmax(np.array(avg_sils)))  # closest to 1
    best_idx.append(np.nanargmax(ch_scs))  # higher = better
    if (label is not None):
        best_idx.append(np.nanargmin(np.array(avg_ents)))  # closest to 0
        best_idx.append(np.nanargmax(np.array(avg_purs)))  # closest to 1
    best_idx = np.unique(best_idx)
    return (cluster_labels[best_idx], avg_sils[best_idx], ch_scs[best_idx],
            min_size[best_idx], min_sam[best_idx])
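# Usage sketch for the HDBSCAN search; init_guess=(30, 20) is an
# illustrative starting point for min_cluster_size and min_samples.
def demo_random_search_hdb(in_data):
    labels, sils, chs, sizes, sams = random_search_hdb(
        in_data, init_guess=(30, 20), max_eval=20)
    return labels[0], int(sizes[0]), int(sams[0])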
def iter_n_class(in_data, in_range, save_name="", save=False, label=None):
    """Iterates number of cluster in FCM to plot the elbow

    Parameters
    ----------
    in_data : np.array or list
        data matrix
    in_range : class 'range'
        the range of number of clusters to iterate through
    save_name : str, optional
        base name used when saving each model, by default ""
    save : bool, optional
        flag whether to save the model, by default False
    label : np.array or list, optional
        the true label of each data point, by default None

    Returns
    -------
    list
        list of all fcm objects
    list
        list of predicted membership matrices
    np.array
        sum of squared errors for each cluster count
    np.array
        average silhouette score for each cluster count
    np.array
        Calinski-Harabasz score for each cluster count
    np.array
        indices of the best-scoring cluster counts
    """
    fcms = []
    pred_mems = []

    it = len(in_range)

    SSE = np.zeros(it)
    avg_sils = np.full(it, np.nan)
    ch_scs = np.full(it, np.nan)
    if (label is not None):
        avg_ents = np.full(it, np.nan)
        avg_purs = np.full(it, np.nan)

    for i, c in enumerate(in_range):
        # use a per-run name so the suffix does not accumulate across runs
        run_name = save_name + '_nclass_' + str(c)
        fcm, pred_mem = fcm_compute(in_data, c, run_name, save=save)
        fcms.append(fcm)
        pred_mems.append(pred_mem)
        SSE[i] = fcm.SSE(in_data)
        pred = get_best_fuzz(pred_mem)
        avg_sils[i] = mh.int_eval_silhouette(in_data, pred)
        ch_scs[i] = mh.cal_har_sc(in_data, pred)
        logging.info("sil=%.6f, chs=%.6f" % (avg_sils[i], ch_scs[i]))

        if (label is not None):
            avg_ents[i], avg_purs[i] = mh.ext_eval_entropy(label,
                                                           pred,
                                                           init_clus=-1)
            logging.info("ent=%.6f, pur=%.6f" % (avg_ents[i], avg_purs[i]))

    mh.elbowplot(in_range, SSE)

    best_idx = []
    best_idx.append(np.nanargmax(np.array(avg_sils)))  # closest to 1
    best_idx.append(np.nanargmax(ch_scs))  # higher = better
    if (label is not None):
        best_idx.append(np.nanargmin(np.array(avg_ents)))  # closest to 0
        best_idx.append(np.nanargmax(np.array(avg_purs)))  # closest to 1
    best_idx = np.unique(best_idx)

    return fcms, pred_mems, SSE, avg_sils, ch_scs, best_idx
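# Usage sketch: sweep the cluster count to read the elbow off the SSE plot;
# range(2, 16) is an arbitrary choice for illustration.
def demo_iter_n_class(in_data):
    fcms, pred_mems, SSE, sils, chs, best_idx = iter_n_class(
        in_data, range(2, 16))
    return best_idx  # indices into the range that scored best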