# Example no. 1
class Clusterer():
    """Unified front-end for several sklearn clustering algorithms.

    Supported models (aliases are normalized in ``__init__``):
      - "KMeans"   : sklearn ``KMeans``
      - "GMM_full" : ``GaussianMixture`` with ``covariance_type='full'``
      - "GMM_diag" : ``GaussianMixture`` with ``covariance_type='diag'``
      - "Spectral" : ``SpectralClustering`` (affinity from ``spectral_affinity``)

    ``fit()`` can re-run the model (up to ``max_try_cnt`` times) with a
    relaxed tolerance / higher iteration cap whenever the result is
    degenerate: a single unique cluster, or too many one-sample clusters.
    """
    #  TODO HDBSCAN should be added

    # Accepted spellings -> canonical model identifier.
    _MODEL_ALIASES = {
        'KMeans': ('KMeans', 'KMEANS', 'km', 'kmeans', 'KM'),
        'GMM_full': ('GMM', 'GMM_Full', 'GMM_full'),
        'GMM_diag': ('GMMdiag', 'GMM_Diag', 'GMM_diag'),
        'Spectral': ('Spectral', 'SC', 'SpectralClustering'),
    }

    def __init__(self,
                 cluster_model,
                 n_clusters,
                 spectral_affinity='nearest_neighbors',
                 max_try_cnt=5):
        """Normalize the model name and store the clustering parameters.

        Args:
            cluster_model: canonical model name or one of its aliases.
            n_clusters: number of clusters/components to fit.
            spectral_affinity: 'nearest_neighbors' or 'rbf'; only consumed
                by the Spectral model.
            max_try_cnt: maximum number of degenerate-result retries in fit().

        Raises:
            ValueError: if ``cluster_model`` matches none of the known aliases.
        """
        self.n_clusters = n_clusters
        self.max_try_cnt = max_try_cnt
        self.trained_model = None
        # Initialize all result attributes up front so predict() can never
        # hit an AttributeError on an attribute fit() did not happen to set.
        self.kluster_centers = None
        self.kluster_centroids = None
        self.predictedKlusters = None
        # Stored unconditionally: the original set it only in the Spectral
        # branch, leaving the attribute missing for every other model.
        self.spectral_affinity = spectral_affinity

        canonical = next((name
                          for name, aliases in self._MODEL_ALIASES.items()
                          if cluster_model in aliases), None)
        if canonical is None:
            # Was a call to the undefined helper `os_error`; an unknown
            # model must fail loudly instead of leaving behind a
            # half-configured object.
            raise ValueError(cluster_model + " is not applicable in this Class")
        self.cluster_model = canonical

    def fit(self,
            X,
            post_analyze_distribution=False,
            verbose=1,
            random_state=0):
        """Fit the selected clustering model on ``X``.

        When ``post_analyze_distribution`` is True the label histogram is
        analyzed and the model re-run (up to ``max_try_cnt`` times) with a
        10x looser tolerance and +50 iterations whenever only one unique
        cluster was found or too many clusters contain a single sample.

        Returns:
            self, with ``predictedKlusters``, ``kluster_centers`` (KMeans
            only) and ``kluster_centroids`` populated.
        """
        df = pd_df(np_array(X))

        # sklearn defaults are tol=1e-4 / max_iter=300 for KMeans and
        # tol=1e-3 / max_iter=100 for GaussianMixture; start looser and
        # longer for the mixture/spectral models.
        curTol = 0.0001 if self.cluster_model == 'KMeans' else 0.01
        max_iter = 300 if self.cluster_model == 'KMeans' else 200

        numOf_1_sample_bins = 1
        unique_clust_cnt = 1
        expCnt = 0
        while (unique_clust_cnt == 1 or
               numOf_1_sample_bins - expCnt > 0) and expCnt < self.max_try_cnt:
            t = time()
            if expCnt > 0:
                if numOf_1_sample_bins > 0:
                    print("running ", self.cluster_model, " for the ",
                          str(expCnt), " time due to numOf_1_sample_bins(",
                          str(numOf_1_sample_bins), ")")
                if unique_clust_cnt == 1:
                    print("running ", self.cluster_model, " for the ",
                          str(expCnt), " time due to unique_clust_cnt==1")
            if verbose > 0:
                print('Clustering the featVec(', X.shape, ') with n_clusters(',
                      str(self.n_clusters),
                      ') and model = ', self.cluster_model, ", curTol(",
                      str(curTol), "), max_iter(", str(max_iter), "), at ",
                      datetime.now().strftime("%H:%M:%S"))
            self.kluster_centers = None
            self.predictedKlusters = None

            self._fit_once(df, X, curTol, max_iter, random_state)

            self.kluster_centroids = get_cluster_centroids(
                X,
                self.predictedKlusters,
                kluster_centers=self.kluster_centers,
                verbose=0)

            if post_analyze_distribution:
                numOf_1_sample_bins, histSortedInv = analyzeClusterDistribution(
                    self.predictedKlusters, self.n_clusters, verbose=verbose)
                unique_clust_cnt = len(np_unique(self.predictedKlusters))
                # Relax the stopping criteria before a possible retry.
                curTol = curTol * 10
                max_iter = max_iter + 50
                expCnt = expCnt + 1
            else:
                # No analysis requested: force the loop to stop after one run.
                expCnt = self.max_try_cnt

            elapsed = time() - t
            if verbose > 0:
                print('Clustering done in (', getElapsedTimeFormatted(elapsed),
                      '), ended at ',
                      datetime.now().strftime("%H:%M:%S"))
        removeLastLine()
        if verbose > 0:
            print('Clustering completed with (',
                  np_unique(self.predictedKlusters).shape,
                  ') clusters,  expCnt(', str(expCnt), ')')
        return self

    def _fit_once(self, df, X, curTol, max_iter, random_state):
        """Run one round of the selected model.

        Sets ``trained_model``, ``predictedKlusters`` and (KMeans only)
        ``kluster_centers``.  ``df`` is the DataFrame view of ``X``.
        """
        if self.cluster_model == 'KMeans':
            # default vals for kmeans --> max_iter=300, 1e-4
            self.trained_model = KMeans(init='k-means++',
                                        n_clusters=self.n_clusters,
                                        n_init=20,
                                        tol=curTol,
                                        max_iter=max_iter,
                                        random_state=random_state).fit(df)
            self.predictedKlusters = self.trained_model.labels_.astype(float)
            self.kluster_centers = self.trained_model.cluster_centers_.astype(
                float)
        elif self.cluster_model in ('GMM_full', 'GMM_diag'):
            # default vals for gmm --> max_iter=100, 1e-3
            cov_type = 'full' if self.cluster_model == 'GMM_full' else 'diag'
            self.trained_model = GaussianMixture(
                n_components=self.n_clusters,
                covariance_type=cov_type,
                tol=curTol,
                random_state=random_state,
                max_iter=max_iter,
                reg_covar=1e-4).fit(df)
            # NOTE(review): relies on sklearn's private _e_step API;
            # trained_model.predict(X) would be the public equivalent.
            _, log_resp = self.trained_model._e_step(X)
            self.predictedKlusters = log_resp.argmax(axis=1)
        elif self.cluster_model == 'Spectral':
            sc = SpectralClustering(n_clusters=self.n_clusters,
                                    affinity=self.spectral_affinity,
                                    random_state=random_state)
            self.trained_model = sc.fit(X)
            self.predictedKlusters = self.trained_model.labels_

    def fit_predict(self, X, post_analyze_distribution=False, verbose=1):
        """fit() on X, then return (int label array, kluster_centroids)."""
        self.fit(X,
                 post_analyze_distribution=post_analyze_distribution,
                 verbose=verbose)
        return np_asarray(self.predictedKlusters,
                          dtype=int), self.kluster_centroids

    def predict(self, X, post_analyze_distribution=False, verbose=1):
        """Assign the samples in ``X`` to clusters of an already-fitted model.

        ``fit()`` must have been called first.  Returns
        (int label array, kluster_centroids).
        """
        df = pd_df(X)
        print("started prediction for ", self.cluster_model, " X(", X.shape,
              ")")

        if self.cluster_model == 'KMeans':
            # default vals for kmeans --> max_iter=300, 1e-4
            self.predictedKlusters = self.trained_model.predict(df).astype(
                float)
            self.kluster_centers = self.trained_model.cluster_centers_.astype(
                float)
        elif self.cluster_model in ('GMM_full', 'GMM_diag'):
            # default vals for gmm --> max_iter=100, 1e-3
            _, log_resp = self.trained_model._e_step(df)
            self.predictedKlusters = log_resp.argmax(axis=1)
        elif self.cluster_model == 'Spectral':
            # Fix: SpectralClustering has no predict(); the original call
            # `self.trained_model.predict(X).labels_` raised AttributeError.
            # Spectral clustering is transductive, so re-cluster the data.
            self.predictedKlusters = self.trained_model.fit_predict(X)

        self.kluster_centroids = get_cluster_centroids(
            X,
            self.predictedKlusters,
            kluster_centers=self.kluster_centers,
            verbose=0)

        if post_analyze_distribution:
            numOf_1_sample_bins, histSortedInv = analyzeClusterDistribution(
                self.predictedKlusters, self.n_clusters, verbose=1)
            unique_clust_cnt = len(np_unique(self.predictedKlusters))
            print("prediction completed for ",
                  self.cluster_model, " - unique_clust_cnt(",
                  str(unique_clust_cnt), "), numOf_1_sample_bins(",
                  str(numOf_1_sample_bins), ")")
        return np_asarray(self.predictedKlusters,
                          dtype=int), self.kluster_centroids
# Example no. 2
    # Dump the ROC curve points (tpr, fpr columns) for offline use.
    np.savetxt('../result/arc_roc.txt', np.column_stack((tpr, fpr)), fmt='%f')

    plt.figure()
    plt.plot(fpr,
             tpr,
             color='b',
             label='ROC (AUC = %0.2f)' % auc,
             lw=2,
             alpha=.8)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

    # online learning
    # Refine the already-fitted mixture model `clf` on the rows where
    # label == 0 by manually alternating E and M steps for 100 iterations.
    # NOTE(review): _e_step/_m_step are sklearn-private APIs — confirm they
    # exist in the installed sklearn version.
    for n_iter in range(100):
        log_prob_norm, log_resp = clf._e_step(test_data[label == 0, :-1])
        clf._m_step(test_data[label == 0, :-1], log_resp)

    # Min-max normalize the per-sample scores to [0, 1] and save them.
    scores = clf.score_samples(test_data[:, :-1])
    scores = (scores - min(scores)) / (max(scores) - min(scores))
    np.savetxt('../result/scores.txt', scores, fmt='%.6f')
    # NOTE(review): this exit(0) makes everything below unreachable —
    # presumably a debugging leftover; remove it to re-enable the ROC/AUC
    # evaluation that follows.
    exit(0)

    # Sweep thresholds across the observed score range to rebuild the ROC
    # curve against the inverted labels (label == 0 treated as positive).
    thresholds = np.arange(scores.min(), scores.max(), 0.001)
    tpr, fpr, F1, accuracy, best_threshold_index = calc_roc(
        thresholds, scores, np.logical_not(label))

    auc = metrics.auc(fpr, tpr)
    print('auc: %s' % auc)
    print('accuracy: %s' % np.mean(accuracy))
    print('best_threshold: %s' % thresholds[best_threshold_index])