def test_simpleeq(self):
    """The rewritten S_Dbw must reproduce the legacy implementation's value.

    Both implementations are expected to yield 0.2886751345948128 on the
    simple fixture (simple_data / simple_data_cluster).
    """
    from original_sdbw import S_Dbw
    a = S_Dbw(simple_data, simple_data_cluster, simple_centers_id)
    value_old = a.S_Dbw_result()  # 0.2886751345948128
    from s_dbw import S_Dbw
    value = S_Dbw(simple_data, simple_data_cluster,
                  method='Halkidi', centr='mean', nearest_centr=True)
    # Exact float equality between two independent implementations is
    # brittle (last-bit rounding differences); compare up to rounding error,
    # consistent with the epsilon-based checks in the sibling tests.
    self.assertAlmostEqual(
        value_old, value, places=12,
        msg='test 2 = {:.16f}, must be 0.2886751345948128'.format(value))
def test_anisodbnewversioncombnoise(self):
    """Halkidi S_Dbw with default noise handling on the anisotropic data."""
    from s_dbw import S_Dbw
    expected = 1.4045566925764599
    score = S_Dbw(X, labels, method='Halkidi', centr='mean',
                  nearest_centr=True)
    self.assertTrue(
        abs(score - expected) < epsilon,
        msg='test 3 = {:.16f}, must be 1.4045566925764599'.format(score))
def s_dbw(self):
    """Compute the S_Dbw validity index for the current labeling.

    The score is cached on ``self.validation`` before being returned.
    """
    features = np.asarray(self.data_raw)
    self.validation = S_Dbw(
        features,
        self.class_label,
        centers_id=None,
        method='Halkidi',
        alg_noise='bind',
        centr='mean',
        nearest_centr=True,
        metric='euclidean',
    )
    return self.validation
def test_simplenewversion(self):
    """Halkidi S_Dbw on the simple fixture matches the reference value."""
    from s_dbw import S_Dbw
    expected = 0.2886751345948128
    score = S_Dbw(simple_data, simple_data_cluster,
                  method='Halkidi', centr='mean', nearest_centr=True)
    self.assertTrue(
        abs(score - expected) < epsilon,
        msg='test 1 = {:.16f}, must be 0.2886751345948128'.format(score))
def evaluate(label, pred, extracted_features, dataset):
    """Score a clustering against ground truth and by internal validity.

    Computes NMI, ARI, Fowlkes-Mallows, accuracy (after label alignment via
    ``get_y_preds``), Davies-Bouldin, silhouette, and S_Dbw; also writes a
    t-SNE visualization as a side effect.

    Returns:
        (nmi, ari, f, acc, ds, s, s_dbw) tuple of scores.
    """
    nmi = metrics.normalized_mutual_info_score(label, pred)
    ari = metrics.adjusted_rand_score(label, pred)
    f = metrics.fowlkes_mallows_score(label, pred)

    # Accuracy needs predicted cluster ids remapped onto the true labels.
    pred_adjusted = get_y_preds(label, pred, len(set(label)))
    acc = metrics.accuracy_score(pred_adjusted, label)

    # Internal (label-free) validity indices.
    ds = metrics.davies_bouldin_score(extracted_features, pred)
    s = metrics.silhouette_score(extracted_features, pred, metric='euclidean')
    from s_dbw import S_Dbw
    s_dbw = S_Dbw(extracted_features, pred)

    compute_tsne(features=extracted_features, label=pred, dataset=dataset)
    return nmi, ari, f, acc, ds, s, s_dbw
def test_anisodbnewversionsepnoise(self):
    """Halkidi S_Dbw with 'sep' noise handling on the anisotropic data."""
    from s_dbw import S_Dbw
    expected = 0.3844372683801507
    score = S_Dbw(X, labels, alg_noise='sep', method='Halkidi',
                  centr='mean', nearest_centr=True)
    self.assertTrue(
        abs(score - expected) < epsilon,
        msg='test 5 = {:.16f}, must be 0.3844372683801507'.format(score))
def test_anisodbnewversionbindnoise(self):
    """Halkidi S_Dbw with 'bind' noise handling on the anisotropic data."""
    from s_dbw import S_Dbw
    expected = 1.2233006502166595
    score = S_Dbw(X, labels, alg_noise='bind', method='Halkidi',
                  centr='mean', nearest_centr=True)
    self.assertTrue(
        abs(score - expected) < epsilon,
        msg='test 4 = {:.16f}, must be 1.2233006502166595'.format(score))
def multi_validity(X, labels_dict, validity_metrics, print_rslt=False):
    """Score several clusterings with several cluster-validity metrics.

    Args:
        X: feature matrix the clusterings were computed on.
        labels_dict: mapping of parameter value -> cluster labels array.
        validity_metrics: iterable of metric names ('s_dbw' or 'silhouette').
        print_rslt: if True, print each (parameter, metric, score) triple.

    Returns:
        dict mapping metric name -> list of scores, one per labels_dict
        entry, in iteration order.

    Raises:
        ValueError: if an unknown metric name is supplied.
    """
    metric_scores_dict = {}
    for metric in validity_metrics:
        scores = []
        for parameter, labels in labels_dict.items():
            if metric == 's_dbw':
                score = S_Dbw(X, labels)
            elif metric == 'silhouette':
                score = metrics.silhouette_score(
                    X, labels, metric='euclidean',
                    sample_size=METRIC_SAMPLE_SIZE)
            else:
                # The original only printed an error here and then appended a
                # stale (or undefined) `score`, silently corrupting results.
                # Fail fast instead.
                raise ValueError("no valid metric supplied: %r" % (metric,))
            scores.append(score)
            if print_rslt:
                # Fixed malformed format "%f.2" (printed the full float plus
                # a literal ".2"); "%.2f" was clearly intended.
                print("parameter = %.2f yields a %s score of %f"
                      % (parameter, metric, score))
        metric_scores_dict[metric] = scores
    return metric_scores_dict
def best_clustering_by_score(features_matrix, optics_min_samples_list):
    """Find the best clustering per S_Dbw with respect to the given parameter list.

    :param features_matrix: feature matrix based upon which the clustering is
        performed
    :param optics_min_samples_list: list of parameters to be tested (for the
        OPTICS clustering)
    :return: (best_labels, best_parameter) — labels of the lowest-scoring
        clustering and the min_samples value that produced it
    """
    # S_Dbw: lower is better, so start from +inf and keep the minimum.
    best_score = math.inf
    best_parameter = 0
    best_labels = []

    for min_samples in optics_min_samples_list:
        candidate_labels = clustering(features_matrix,
                                      optics_min_samples=min_samples)
        candidate_score = S_Dbw(features_matrix, candidate_labels,
                                metric='correlation')
        if candidate_score < best_score:
            best_score = candidate_score
            best_parameter = min_samples
            best_labels = candidate_labels

    return best_labels, best_parameter
# --- Clustering quality metrics -------------------------------------------
t0 = time()
silhouette_list = []

logging.info(" Silhouette Coefficient: %0.3f"
             % metrics.silhouette_score(X, km.labels_, sample_size=1000))

# Calinski-Harabasz gets X as-is when SVD reduction was applied,
# otherwise the (sparse) matrix is densified first.
X_to_CH = X if opts.n_components else X.toarray()
logging.info(" Calinski-Harabasz Index: %0.3f"
             % metrics.calinski_harabasz_score(X_to_CH, km.labels_))

# note S_Dbw increases metrics calculation time by 150 %
logging.info(" S_Dbw validity index: %0.3f"
             % S_Dbw(X, km.labels_, alg_noise='bind', centr='mean',
                     metric='euclidean'))

if opts.baseline:
    # Compare against the hand-labelled ground truth when available.
    truth_file = '../models/groundtruth_labels_final.csv'
    labels = pd.read_csv(truth_file, index_col=0).values[:, 0].tolist()
    logging.info(" Adjusted Rand-Index: %0.3f"
                 % metrics.adjusted_rand_score(labels, km.labels_))

logging.info(" Metrics calculated in %fs" % (time() - t0))

logging.info("Top terms per cluster:")
if opts.n_components:
    # Map cluster centers back into the original term space before ranking.
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
else:
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
def get_sdbw_score(xy_list, labels):
    """Return the S_Dbw validity index for a clustering (smaller is better)."""
    # TODO: add cosine metric and compare the Halkidi vs Tong variants.
    return S_Dbw(xy_list, labels)
def s_dbw(self):
    """S_Dbw validity index of the current labeling, or NaN with no clusters."""
    # Guard clause: the index is undefined without at least one cluster.
    if self.num_clusters() <= 0:
        return np.nan
    return S_Dbw(self.data, self.labels())