def test_missing_hdbscan(in_data, col_name, model="-"):
    """Re-run HDBSCAN on column subsets that mimic missing data."""
    vp_idx = mh.search_list(col_name, 'vp')
    vs_idx = mh.search_list(col_name, 'vs')
    dn_idx = mh.search_list(col_name, 'dn')
    vpvs_idx = mh.search_list(col_name, 'vp/vs')
    qp_idx = mh.search_list(col_name, 'qp')
    # qs_idx = mh.search_list(col_name, 'qs')

    # when the only available data are: Vp, Vs, Vp/Vs
    test_data_1 = np.squeeze(in_data[:, [vp_idx, vs_idx, vpvs_idx]])
    test1_hdb = test_perform_HDBSCAN(test_data_1)
    sdir = model + '_test1_rehdb.npy'
    np.save(sdir, test1_hdb)
    logging.info('Data saved at: %s' % sdir)

    # when the only available data are: Vp, Qp, Density
    test_data_2 = np.squeeze(in_data[:, [vp_idx, qp_idx, dn_idx]])
    test2_hdb = test_perform_HDBSCAN(test_data_2)
    sdir = model + '_test2_rehdb.npy'
    np.save(sdir, test2_hdb)
    logging.info('Data saved at: %s' % sdir)

    # when the only available data are: Vp, Vs
    test_data_3 = np.squeeze(in_data[:, [vp_idx, vs_idx]])
    test3_hdb = test_perform_HDBSCAN(test_data_3)
    sdir = model + '_test3_rehdb.npy'
    np.save(sdir, test3_hdb)
    logging.info('Data saved at: %s' % sdir)
def test_ent_pur(self):
    a = [1, 1, 1, 1]
    b = [1, 2, 1, 2]
    c = [1, 1, 2, 2]
    assert mh.ext_eval_entropy(a, a) == (0.0, 1.0)
    assert mh.ext_eval_entropy(a, b) == (0.0, 1.0)
    assert_almost_equal(mh.ext_eval_entropy(b, c), (0.693147180, 0.5))
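# A worked check of the ln(2) value asserted above, kept as its own sketch:
# with true labels b = [1, 2, 1, 2] and predictions c = [1, 1, 2, 2], each
# predicted cluster holds one point from each true class, so its entropy is
# -(0.5*ln(0.5) + 0.5*ln(0.5)) = ln(2) ~ 0.693147 and its purity is
# max(0.5, 0.5) = 0.5.
def test_entropy_by_hand(self):
    import math
    ent = -2 * 0.5 * math.log(0.5)
    assert_almost_equal(ent, math.log(2))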
def eval_ws(in_data, ws_labels, n_map, label=None, re_all=False):
    """Evaluate and return the best watershed prediction result

    Parameters
    ----------
    in_data : np.array or list
        data matrix
    ws_labels : np.array
        predicted cluster labels from watershed segmentation
    n_map : np.array
        array of the winner neuron
    label : np.array or list, optional
        the true label of each data point
    re_all : bool, optional
        if True, return the results of every watershed level instead of
        only the best ones, by default False

    Returns
    -------
    tuple
        the best watershed labels (possibly more than one set) together
        with their silhouette and Calinski-Harabasz scores; when re_all
        is True, the results of every watershed level plus the indices
        of the best ones
    """
    len_watershed = ws_labels.shape[0]
    cluster_labels = np.zeros((len_watershed, len(in_data)))
    avg_sils = np.full(len_watershed, np.nan)
    ch_scs = np.full(len_watershed, np.nan)
    if (label is not None):
        avg_ents = np.full(len_watershed, np.nan)
        avg_purs = np.full(len_watershed, np.nan)

    for i in range(len_watershed):
        param = {'watershed idx': i}
        if (len(np.unique(ws_labels[i])) > 1):
            cluster_labels[i] = gen_e_model(n_map, ws_labels[i])
            avg_sils[i] = mh.int_eval_silhouette(in_data, cluster_labels[i],
                                                 method='som_watershed',
                                                 param=param)
            try:
                ch_scs[i] = mh.cal_har_sc(in_data, cluster_labels[i])
            except Exception:
                ch_scs[i] = -1
            if (label is not None):
                avg_ents[i], avg_purs[i] = mh.ext_eval_entropy(
                    label, cluster_labels[i])

    best_idx = []
    best_idx.append(np.nanargmax(np.array(avg_sils)))  # closest to 1
    best_idx.append(np.nanargmax(ch_scs))  # higher = better
    if (label is not None):
        best_idx.append(np.nanargmin(np.array(avg_ents)))  # closest to 0
        best_idx.append(np.nanargmax(np.array(avg_purs)))  # closest to 1
    best_idx = np.unique(best_idx)

    if (re_all):
        return (cluster_labels, avg_sils, ch_scs, best_idx)
    else:
        return (cluster_labels[best_idx], avg_sils[best_idx],
                ch_scs[best_idx])
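# A minimal usage sketch for eval_ws, wrapped in a helper so nothing runs at
# import time. It mirrors the SOM + watershed pipeline used in
# random_search_som below; the helper name and the dim/lr/sigma values are
# illustrative only.
def _demo_eval_ws(in_data, seed=10):
    som = som_assemble(in_data, seed, 20, lr=0.5, sigma=2.0)
    som.train_random(in_data, 5000, verbose=False)
    u_matrix = som.distance_map().T
    ws_labels = watershed_level(u_matrix, histedges_equalN(u_matrix.flatten()))
    n_map = som.neuron_map(in_data)
    labels, sils, chs = eval_ws(in_data, ws_labels, n_map)
    return labels[0]  # first of the best candidate label sets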
def plot_best_fuzz(predicted_mem, x, z, save_name=""):
    """Plot the best result from fuzzy c-means"""
    y_pred = get_best_fuzz(predicted_mem)
    if (save_name != ""):
        save_path = save_name + '_bestFuzz_plot.png'
        mh.plot_e_model(y_pred, x, z, save_path=save_path, sep_label=True)
    else:
        mh.plot_e_model(y_pred, x, z, sep_label=True)
    return y_pred
def plot_fcm(predicted_mem, x, z, save_name=""):
    """Plot every fuzzy membership map from fuzzy c-means"""
    for i in range(predicted_mem.shape[1]):
        if (save_name != ""):
            save_path = save_name + '_pclass_' + str(i) + '.png'
            mh.plot_e_model(predicted_mem[:, i], x, z, cmap='Blues',
                            save_path=save_path)
        else:
            mh.plot_e_model(predicted_mem[:, i], x, z, cmap='Blues')
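# A small sketch tying the two plotting helpers together (x and z follow the
# functions above: the spatial coordinates of each data point).
def _demo_plot_fuzzy(predicted_mem, x, z):
    plot_fcm(predicted_mem, x, z)  # one membership map per class
    return plot_best_fuzz(predicted_mem, x, z)  # hardened best-guess model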
def fcm_compute(in_data, n_clusters, save_name="", save=False):
    """Compute the fuzzy c-means prediction

    Parameters
    ----------
    in_data : np.array or list
        data matrix
    n_clusters : int
        number of clusters
    save_name : str, optional
        the name used when pickling the model, by default a timestamp
    save : bool, optional
        flag whether to save the model, by default False

    Returns
    -------
    fuzzy_clustering.FCM
        an object of the FCM class, see fuzzy_clustering.py for further
        details
    np.array
        the predicted membership matrix
    """
    start = timer()
    fcm = FCM(n_clusters=n_clusters)
    fcm.fit(in_data)
    predicted_mem = fcm.predict(in_data)
    stop = timer()
    logging.info("FCM elapsed time: %.6f", stop - start)

    if (save):
        # pickle the model
        if (save_name == ""):
            timestr = tm.strftime("%Y%m%d-%H%M%S")
            save_name = timestr
        fdir = save_name + '_model.p'
        mh.save_model(fcm, fdir)

    return fcm, predicted_mem
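# A minimal usage sketch for fcm_compute; the random matrix stands in for a
# real (n_samples, n_features) data matrix such as (Vp, Vs, density).
def _demo_fcm_compute():
    rng = np.random.default_rng(0)
    data = rng.random((100, 3))
    fcm, pred_mem = fcm_compute(data, n_clusters=5)
    return get_best_fuzz(pred_mem)  # collapse memberships to hard labels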
def random_search_som(in_data, init_guess, max_eval=20, label=None, seed=10,
                      re_all=False):
    """Perform a random search for the SOM's best parameters.

    Parameters
    ----------
    in_data : np.array or list
        data matrix
    init_guess : tuple
        initial guess of the parameters, in order of dimension,
        number of iterations, learning rate, and sigma
    max_eval : int, optional
        maximum number of search iterations, by default 20
    label : np.array or list, optional
        the true label of each data point, by default None
    seed : int, optional
        random seed for reproducibility, by default 10
    re_all : bool, optional
        if True, return the results of every evaluation instead of only
        the best ones, by default False

    Returns
    -------
    tuple
        all cluster labels and their counterpart parameters.
    """
    random.seed(seed)
    param_grid = gen_param_grid(init_guess)
    dims = np.zeros(max_eval)
    iters = np.zeros(max_eval)
    lrs = np.zeros(max_eval)
    sigmas = np.zeros(max_eval)
    avg_sils = np.full(max_eval, np.nan)
    ch_scs = np.full(max_eval, np.nan)
    cluster_labels = np.zeros((max_eval, len(in_data)))
    if (label is not None):
        avg_ents = np.full(max_eval, np.nan)
        avg_purs = np.full(max_eval, np.nan)

    i = 0
    while i < max_eval:
        random_params = {
            k: random.sample(v, 1)[0]
            for k, v in param_grid.items()
        }
        dims[i], iters[i], lrs[i], sigmas[i] = list(random_params.values())
        som = som_assemble(in_data, seed, int(dims[i]), lr=lrs[i],
                           sigma=sigmas[i])
        som.train_random(in_data, int(iters[i]), verbose=False)
        u_matrix = som.distance_map().T
        watershed_bins = histedges_equalN(u_matrix.flatten())
        ws_labels = watershed_level(u_matrix, watershed_bins)
        n_map = som.neuron_map(in_data)
        _c, _as, _ch = eval_ws(in_data, ws_labels, n_map)
        cluster_labels[i], avg_sils[i], ch_scs[i] = _c[0], _as[0], _ch[0]

        n_clusters = len(np.unique(cluster_labels[i]))
        if (n_clusters < 5 or n_clusters > 30):
            logging.info(
                "Random search using dim=%d, iter=%d, lr=%.6f, sigma=%.6f "
                "results in a very small / large number of clusters "
                "(n_clusters = %d)"
                % (dims[i], iters[i], lrs[i], sigmas[i], n_clusters))
            continue

        logging.info(
            "dim=%d, iter=%d, lr=%.6f, sigma=%.6f, sil=%.6f, ch=%.6f"
            % (dims[i], iters[i], lrs[i], sigmas[i], avg_sils[i], ch_scs[i]))
        if (label is not None):
            avg_ents[i], avg_purs[i] = mh.ext_eval_entropy(
                label, cluster_labels[i], init_clus=-1)
            logging.info("ent=%.6f, pur=%.6f" % (avg_ents[i], avg_purs[i]))
        i += 1

    best_idx = []
    best_idx.append(np.nanargmax(np.array(avg_sils)))  # closest to 1
    best_idx.append(np.nanargmax(ch_scs))  # higher = better
    if (label is not None):
        best_idx.append(np.nanargmin(np.array(avg_ents)))  # closest to 0
        best_idx.append(np.nanargmax(np.array(avg_purs)))  # closest to 1
    best_idx = np.unique(best_idx)

    if (re_all):
        return (cluster_labels, avg_sils, ch_scs, dims, iters, lrs, sigmas,
                best_idx)
    else:
        return (cluster_labels[best_idx], avg_sils[best_idx],
                ch_scs[best_idx], dims[best_idx], iters[best_idx],
                lrs[best_idx], sigmas[best_idx])
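# A minimal sketch of calling random_search_som; init_guess is ordered as
# (dimension, n_iterations, learning_rate, sigma), matching the docstring
# above. The guess values are illustrative only.
def _demo_random_search_som(in_data):
    init_guess = (20, 5000, 0.5, 2.0)
    labels, sils, chs, dims, iters, lrs, sigmas = random_search_som(
        in_data, init_guess, max_eval=10)
    return labels[0]  # best candidate labelling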
def test_missing_cols(in_data, col_name, model="-"):
    """Run HDBSCAN, FCM, and SOM on column subsets that mimic missing data."""
    vp_idx = mh.search_list(col_name, 'vp')
    vs_idx = mh.search_list(col_name, 'vs')
    dn_idx = mh.search_list(col_name, 'dn')
    vpvs_idx = mh.search_list(col_name, 'vp/vs')
    qp_idx = mh.search_list(col_name, 'qp')
    # qs_idx = mh.search_list(col_name, 'qs')

    # when the only available data are: Vp, Vs, Vp/Vs
    test_data_1 = np.squeeze(in_data[:, [vp_idx, vs_idx, vpvs_idx]])
    test1_hdb = test_perform_HDBSCAN(test_data_1)
    sdir = model + '_test1_hdb.npy'
    np.save(sdir, test1_hdb)
    logging.info('Data saved at: %s' % sdir)

    _, test1_fcm = fh.fcm_compute(test_data_1, 10)
    sdir = model + '_test1_fcm.npy'
    np.save(sdir, test1_fcm)
    logging.info('Data saved at: %s' % sdir)

    test1_som, test1_som_fcm, test1_som_hdb = test_perform_SOM(test_data_1)
    sdir = model + '_test1_som.npy'
    np.save(sdir, test1_som)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test1_somfcm.npy'
    np.save(sdir, test1_som_fcm)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test1_somhdb.npy'
    np.save(sdir, test1_som_hdb)
    logging.info('Data saved at: %s' % sdir)

    # when the only available data are: Vp, Qp, Density
    test_data_2 = np.squeeze(in_data[:, [vp_idx, qp_idx, dn_idx]])
    test2_hdb = test_perform_HDBSCAN(test_data_2)
    sdir = model + '_test2_hdb.npy'
    np.save(sdir, test2_hdb)
    logging.info('Data saved at: %s' % sdir)

    _, test2_fcm = fh.fcm_compute(test_data_2, 10)
    sdir = model + '_test2_fcm.npy'
    np.save(sdir, test2_fcm)
    logging.info('Data saved at: %s' % sdir)

    test2_som, test2_som_fcm, test2_som_hdb = test_perform_SOM(test_data_2)
    sdir = model + '_test2_som.npy'
    np.save(sdir, test2_som)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test2_somfcm.npy'
    np.save(sdir, test2_som_fcm)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test2_somhdb.npy'
    np.save(sdir, test2_som_hdb)
    logging.info('Data saved at: %s' % sdir)

    # when the only available data are: Vp, Vs
    test_data_3 = np.squeeze(in_data[:, [vp_idx, vs_idx]])
    test3_hdb = test_perform_HDBSCAN(test_data_3)
    sdir = model + '_test3_hdb.npy'
    np.save(sdir, test3_hdb)
    logging.info('Data saved at: %s' % sdir)

    _, test3_fcm = fh.fcm_compute(test_data_3, 10)
    sdir = model + '_test3_fcm.npy'
    np.save(sdir, test3_fcm)
    logging.info('Data saved at: %s' % sdir)

    test3_som, test3_som_fcm, test3_som_hdb = test_perform_SOM(test_data_3)
    sdir = model + '_test3_som.npy'
    np.save(sdir, test3_som)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test3_somfcm.npy'
    np.save(sdir, test3_som_fcm)
    logging.info('Data saved at: %s' % sdir)
    sdir = model + '_test3_somhdb.npy'
    np.save(sdir, test3_som_hdb)
    logging.info('Data saved at: %s' % sdir)
def random_search_hdb(in_data, init_guess, max_eval=20, label=None, seed=10,
                      rand_range=(10, 10)):
    """Perform a random search for HDBSCAN's best parameters.

    init_guess is a (min_cluster_size, min_samples) pair; rand_range sets
    the width of the search window built around each guess, with the lower
    bound clamped to 5.
    """
    random.seed(seed)
    g_min_size, g_min_sam = init_guess
    # clamp the lower end of each search window to a minimum of 5
    g_min_size = max(g_min_size - rand_range[0], 5)
    g_min_sam = max(g_min_sam - rand_range[1], 5)
    param_grid = {
        'g_min_size': list(range(g_min_size, g_min_size + rand_range[0])),
        'g_min_sam': list(range(g_min_sam, g_min_sam + rand_range[1]))
    }
    min_size = np.zeros(max_eval)
    min_sam = np.zeros(max_eval)
    avg_sils = np.full(max_eval, np.nan)
    ch_scs = np.full(max_eval, np.nan)
    cluster_labels = np.zeros((max_eval, len(in_data)))
    if (label is not None):
        avg_ents = np.full(max_eval, np.nan)
        avg_purs = np.full(max_eval, np.nan)

    i = 0
    while i < max_eval:
        random_params = {
            k: random.sample(v, 1)[0]
            for k, v in param_grid.items()
        }
        min_size[i], min_sam[i] = list(random_params.values())
        clusterer = hdbscan.HDBSCAN(min_cluster_size=int(min_size[i]),
                                    min_samples=int(min_sam[i]),
                                    memory='cache')
        cluster_labels[i] = clusterer.fit_predict(in_data)

        n_clusters = len(np.unique(cluster_labels[i]))
        if (n_clusters < 5 or n_clusters > 30):
            logging.info(
                "Random search using min_size = %d, min_sam = %d results in "
                "a very small / large number of clusters (n_clusters = %d)"
                % (int(min_size[i]), int(min_sam[i]), n_clusters))
            continue

        avg_sils[i] = mh.int_eval_silhouette(in_data, cluster_labels[i])
        ch_scs[i] = mh.cal_har_sc(in_data, cluster_labels[i])
        logging.info(
            "min_size=%d, min_sam=%d, sil=%.6f, ch=%.6f"
            % (int(min_size[i]), int(min_sam[i]), avg_sils[i], ch_scs[i]))
        if (label is not None):
            avg_ents[i], avg_purs[i] = mh.ext_eval_entropy(
                label, cluster_labels[i], init_clus=-1)
            logging.info("ent=%.6f, pur=%.6f" % (avg_ents[i], avg_purs[i]))
        i += 1

    best_idx = []
    best_idx.append(np.nanargmax(np.array(avg_sils)))  # closest to 1
    best_idx.append(np.nanargmax(ch_scs))  # higher = better
    if (label is not None):
        best_idx.append(np.nanargmin(np.array(avg_ents)))  # closest to 0
        best_idx.append(np.nanargmax(np.array(avg_purs)))  # closest to 1
    best_idx = np.unique(best_idx)

    return (cluster_labels[best_idx], avg_sils[best_idx], ch_scs[best_idx],
            min_size[best_idx], min_sam[best_idx])
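# A minimal sketch of calling random_search_hdb; init_guess is a
# (min_cluster_size, min_samples) pair around which the search window is
# built. The guess values are illustrative only.
def _demo_random_search_hdb(in_data):
    labels, sils, chs, min_size, min_sam = random_search_hdb(
        in_data, init_guess=(30, 15), max_eval=10)
    return labels[0]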
def iter_n_class(in_data, in_range, save_name="", save=False, label=None):
    """Iterate the number of clusters in FCM to plot the elbow curve

    Parameters
    ----------
    in_data : np.array or list
        data matrix
    in_range : range
        the range of numbers of clusters to iterate through
    save_name : str, optional
        the name used when saving, by default ""
    save : bool, optional
        flag whether to save the model, by default False
    label : np.array or list, optional
        the true label of each data point

    Returns
    -------
    list
        list of all fcm objects
    list
        list of prediction members
    np.array
        sum square error results
    np.array
        average silhouette scores
    np.array
        Calinski-Harabasz scores
    np.array
        indices of the best runs
    """
    fcms = []
    pred_mems = []
    it = len(in_range)
    SSE = np.zeros(it)
    avg_sils = np.full(it, np.nan)
    ch_scs = np.full(it, np.nan)
    if (label is not None):
        avg_ents = np.full(it, np.nan)
        avg_purs = np.full(it, np.nan)

    for i, c in enumerate(in_range):
        # build the run name from the original save_name each time, so the
        # '_nclass_' suffixes do not accumulate across iterations
        run_name = save_name + '_nclass_' + str(c)
        fcm, pred_mem = fcm_compute(in_data, c, run_name, save=save)
        fcms.append(fcm)
        pred_mems.append(pred_mem)
        SSE[i] = fcm.SSE(in_data)
        pred = get_best_fuzz(pred_mem)
        avg_sils[i] = mh.int_eval_silhouette(in_data, pred)
        ch_scs[i] = mh.cal_har_sc(in_data, pred)
        logging.info("sil=%.6f, chs=%.6f" % (avg_sils[i], ch_scs[i]))
        if (label is not None):
            avg_ents[i], avg_purs[i] = mh.ext_eval_entropy(label, pred,
                                                           init_clus=-1)
            logging.info("ent=%.6f, pur=%.6f" % (avg_ents[i], avg_purs[i]))

    mh.elbowplot(in_range, SSE)
    best_idx = []
    best_idx.append(np.nanargmax(np.array(avg_sils)))  # closest to 1
    best_idx.append(np.nanargmax(ch_scs))  # higher = better
    if (label is not None):
        best_idx.append(np.nanargmin(np.array(avg_ents)))  # closest to 0
        best_idx.append(np.nanargmax(np.array(avg_purs)))  # closest to 1
    best_idx = np.unique(best_idx)

    return fcms, pred_mems, SSE, avg_sils, ch_scs, best_idx
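# A minimal sketch of the elbow workflow with iter_n_class; the range of
# cluster counts is illustrative only.
def _demo_iter_n_class(in_data):
    fcms, pred_mems, SSE, sils, chs, best_idx = iter_n_class(
        in_data, range(2, 12))
    # SSE feeds the elbow plot; best_idx flags the score-based candidates
    return best_idx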