コード例 #1
0
 def test_simpleeq(self):
     """Legacy S_Dbw class and the new s_dbw package must agree on simple data.

     Computes the index with the old class-based implementation and with the
     functional API of the ``s_dbw`` package (expected ~0.2886751345948128)
     and checks that they match.
     """
     from original_sdbw import S_Dbw
     legacy = S_Dbw(simple_data, simple_data_cluster, simple_centers_id)
     value_old = legacy.S_Dbw_result()  # 0.2886751345948128
     from s_dbw import S_Dbw
     value = S_Dbw(simple_data,
                   simple_data_cluster,
                   method='Halkidi',
                   centr='mean',
                   nearest_centr=True)
     # 0.2886751345948128
     # Fix: the two values come from independent float computations, so exact
     # equality (assertEqual) is fragile; compare with a tolerance instead,
     # consistent with the epsilon-based checks in the sibling tests.
     self.assertAlmostEqual(
         value_old,
         value,
         msg='test 2 = {:.16f}, must be 0.2886751345948128'.format(value))
コード例 #2
0
 def test_anisodbnewversioncombnoise(self):
     """S_Dbw on anisotropic data with noise, default noise handling."""
     from s_dbw import S_Dbw
     expected = 1.4045566925764599
     value = S_Dbw(X,
                   labels,
                   method='Halkidi',
                   centr='mean',
                   nearest_centr=True)  # 1.4045566925764599
     # |value - expected| < epsilon is the same band as the two-sided check.
     self.assertTrue(
         abs(value - expected) < epsilon,
         msg='test 3 = {:.16f}, must be 1.4045566925764599'.format(value))
コード例 #3
0
File: cvi.py — Project: Lekunze/autoclus
 def s_dbw(self):
     """Compute and cache the S_Dbw validity index for the current clustering.

     Uses the Halkidi variant with mean centroids, Euclidean distances, and
     noise points bound to their nearest cluster.  The score is stored on
     ``self.validation`` and returned.
     """
     self.validation = S_Dbw(np.asarray(self.data_raw),
                             self.class_label,
                             centers_id=None,
                             method='Halkidi',
                             alg_noise='bind',
                             centr='mean',
                             nearest_centr=True,
                             metric='euclidean')
     return self.validation
コード例 #4
0
 def test_simplenewversion(self):
     """New s_dbw API on the simple fixture must match the reference value."""
     from s_dbw import S_Dbw
     target = 0.2886751345948128
     value = S_Dbw(simple_data,
                   simple_data_cluster,
                   method='Halkidi',
                   centr='mean',
                   nearest_centr=True)
     # 0.2886751345948128
     # Equivalent to target - epsilon < value < target + epsilon.
     self.assertTrue(
         abs(value - target) < epsilon,
         msg='test 1 = {:.16f}, must be 0.2886751345948128'.format(value))
コード例 #5
0
def evaluate(label, pred, extracted_features, dataset):
    """Score a predicted clustering and render a t-SNE visualization.

    Returns the tuple ``(nmi, ari, f, acc, ds, s, s_dbw)`` of external
    (label-based) and internal (geometry-based) validation metrics.
    """
    from s_dbw import S_Dbw

    # External metrics: agreement between predictions and ground truth.
    nmi = metrics.normalized_mutual_info_score(label, pred)
    ari = metrics.adjusted_rand_score(label, pred)
    f = metrics.fowlkes_mallows_score(label, pred)
    pred_adjusted = get_y_preds(label, pred, len(set(label)))
    acc = metrics.accuracy_score(pred_adjusted, label)

    # Internal metrics: cluster geometry in the extracted feature space.
    ds = metrics.davies_bouldin_score(extracted_features, pred)
    s = metrics.silhouette_score(extracted_features, pred, metric='euclidean')
    s_dbw = S_Dbw(extracted_features, pred)

    compute_tsne(features=extracted_features, label=pred, dataset=dataset)
    return nmi, ari, f, acc, ds, s, s_dbw
コード例 #6
0
 def test_anisodbnewversionsepnoise(self):
     """S_Dbw on anisotropic noisy data with noise separated ('sep')."""
     from s_dbw import S_Dbw
     expected = 0.3844372683801507
     value = S_Dbw(X,
                   labels,
                   alg_noise='sep',
                   method='Halkidi',
                   centr='mean',
                   nearest_centr=True)
     # 0.3844372683801507
     # Same tolerance band as the explicit two-sided comparison.
     self.assertTrue(
         abs(value - expected) < epsilon,
         msg='test 5 = {:.16f}, must be 0.3844372683801507'.format(value))
コード例 #7
0
 def test_anisodbnewversionbindnoise(self):
     """S_Dbw on anisotropic noisy data with noise bound to clusters ('bind')."""
     from s_dbw import S_Dbw
     expected = 1.2233006502166595
     value = S_Dbw(X,
                   labels,
                   alg_noise='bind',
                   method='Halkidi',
                   centr='mean',
                   nearest_centr=True)
     # 1.2233006502166595
     # Same tolerance band as the explicit two-sided comparison.
     self.assertTrue(
         abs(value - expected) < epsilon,
         msg='test 4 = {:.16f}, must be 1.2233006502166595'.format(value))
コード例 #8
0
def multi_validity(X, labels_dict, validity_metrics, print_rslt=False):
    """Score every clustering in ``labels_dict`` under each validity metric.

    Parameters
    ----------
    X : array-like
        Data matrix the clusterings were computed on.
    labels_dict : dict
        Maps a clustering parameter value to its label assignment.
    validity_metrics : iterable of str
        Metric names; supported values are ``'s_dbw'`` and ``'silhouette'``.
    print_rslt : bool, optional
        When True, print each (parameter, metric, score) triple.

    Returns
    -------
    dict
        Maps each metric name to its list of scores, in ``labels_dict`` order.

    Raises
    ------
    ValueError
        If an unsupported metric name is supplied.
    """
    metric_scores_dict = {}
    for metric in validity_metrics:

        scores = []
        for parameter, labels in labels_dict.items():
            if metric == 's_dbw':
                score = S_Dbw(X, labels)

            elif metric == 'silhouette':
                score = metrics.silhouette_score(X, labels,
                                                 metric='euclidean',
                                                 sample_size=METRIC_SAMPLE_SIZE)
            else:
                # Fix: the original only printed a message here and then
                # appended `score`, which was either unbound (NameError) or a
                # stale value from a previous metric.  Fail fast instead.
                raise ValueError("no valid metric supplied: %r" % (metric,))
            scores.append(score)
            if print_rslt:
                # Fix: the original format string was the malformed '%f.2'
                # (prints the full float followed by a literal '.2').
                print("parameter = %.2f yields a %s score of %f" % (parameter, metric, score))
        metric_scores_dict[metric] = scores

    return metric_scores_dict
コード例 #9
0
def best_clustering_by_score(features_matrix, optics_min_samples_list):
    """
    Find the best clustering per S_Dbw with respect to the given parameter list.

    :param features_matrix          : feature matrix based upon which the clustering is performed
    :param optics_min_samples_list  : list of parameters to be tested (for the OPTICS clustering)

    :return: tuple of (labels of the best clustering, parameter that produced it)
    """
    # Lower S_Dbw is better, so start from +inf and keep the minimum.
    best_score, best_parameter, best_labels = math.inf, 0, []

    for candidate in optics_min_samples_list:
        candidate_labels = clustering(
            features_matrix, optics_min_samples=candidate)
        candidate_score = S_Dbw(
            features_matrix, candidate_labels, metric='correlation')

        if candidate_score < best_score:
            best_score = candidate_score
            best_parameter = candidate
            best_labels = candidate_labels

    return best_labels, best_parameter
コード例 #10
0
# Time the clustering-quality metric computations below.
t0 = time()
silhouette_list = []

# NOTE(review): the supervised metrics below are disabled; they need ground
# truth `labels`, which this script only loads when opts.baseline is set.
#logging.info("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
#logging.info("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
#logging.info("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
#logging.info("Adjusted Rand-Index: %.3f"
#      % metrics.adjusted_rand_score(labels, km.labels_))
# Silhouette on a 1000-point sample to keep the computation affordable.
logging.info("  Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))
# Calinski-Harabasz needs a dense array; X.toarray() suggests X is sparse
# unless dimensionality reduction (opts.n_components) was applied.
X_to_CH = X if opts.n_components else X.toarray()
logging.info("  Calinski-Harabasz Index: %0.3f"
      % metrics.calinski_harabasz_score(X_to_CH, km.labels_))
# note S_Dbw increases metrics calculation time by 150 %
logging.info("  S_Dbw validity index: %0.3f"
             % S_Dbw(X, km.labels_, alg_noise='bind', centr='mean',
                     metric='euclidean'))
if opts.baseline:
    # Load ground-truth labels and report the supervised ARI score.
    truth_file = '../models/groundtruth_labels_final.csv'
    labels = pd.read_csv(truth_file, index_col=0).values[:,0].tolist()
    logging.info("  Adjusted Rand-Index: %0.3f"
                 % metrics.adjusted_rand_score(labels, km.labels_))
logging.info("  Metrics calculated in %fs" % (time() - t0))

logging.info("Top terms per cluster:")
if opts.n_components:
    # Map reduced-space centroids back to the original feature space before
    # ranking features per cluster.
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
#        logging.info("original_space_centroids: \n{0}".format(original_space_centroids[:, ::600]))
#        logging.info("order_centroids: \n{0}".format(order_centroids[:, ::600]))
else:
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
コード例 #11
0
def get_sdbw_score(xy_list, labels):
    """Return the S_Dbw validity index (the smaller, the better the clustering)."""
    # TODO: add cosine metric and see Halkidi vs Tong
    score = S_Dbw(xy_list, labels)
    return score
コード例 #12
0
 def s_dbw(self):
     """Return the S_Dbw index for the current labels, or NaN when there
     are no clusters to score."""
     # Guard clause: nothing to evaluate without at least one cluster.
     if self.num_clusters() <= 0:
         return np.nan
     return S_Dbw(self.data, self.labels())