Example #1
def f1():
    model = AgglomerativeClustering(n_clusters=cluster_number,
                                    affinity='euclidean',
                                    linkage='complete')


    preds = model.fit_predict(matrix.toarray())

    res = dict()
    for i, p in enumerate(preds):
        res[basename(dataset['filenames'][i])] = dataset['target_names'][p]
    prev = None
    for k, v in sorted(res.items(), key=operator.itemgetter(1)):
        if prev != v:
            print(v, ':')
            prev = v
        print('\t\t', k)

    dist = 1 - cosine_similarity(matrix.toarray())
    row_sums = dist.sum(axis=1)
    new_matrix = dist / row_sums[:, np.newaxis]
    plt.figure(figsize=(20, 20), dpi=300)
    sb.heatmap(new_matrix)
    lbls = list()
    for fn in dataset['filenames']:
        lbls.append(basename(fn)[:-4])
    plt.xticks(np.arange(0, article_number), lbls, rotation='vertical')
    plt.yticks(np.arange(0, article_number), lbls, rotation='horizontal')
    plt.show()
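
# A minimal, hypothetical setup for the names this example leaves undefined
# (`matrix`, `dataset`, `cluster_number`, `article_number`). The corpus choice
# is an assumption for illustration, not part of the original snippet.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

dataset = fetch_20newsgroups(subset='test',
                             categories=['sci.space', 'rec.autos'])
matrix = TfidfVectorizer(max_features=500).fit_transform(dataset['data'])
cluster_number = 2                  # desired number of clusters
article_number = matrix.shape[0]    # documents shown on the heatmap axes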
Example #2
    def _global_clustering(self, X=None):

        # Global clustering for the subclusters obtained after fitting.
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering.
        not_enough_centroids = False
        if isinstance(clusterer, int):
            clusterer = AgglomerativeClustering(n_clusters=self.n_clusters)
            if len(centroids) < self.n_clusters:
                not_enough_centroids = True
        elif (clusterer is not None and not
              hasattr(clusterer, 'fit_predict')):
            raise ValueError("n_clusters should be an instance of "
                             "ClusterMixin or an int")

        # To use in predict to avoid recalculation.
        self._subcluster_norms = row_norms(
            self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by Birch is less "
                    "than (%d). Decrease the threshold."
                    % (len(centroids), self.n_clusters))
        else:
            # The global clustering step that clusters the subclusters of
            # the leaves. It assumes the centroids of the subclusters as
            # samples and finds the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(
                self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self.predict(X)
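
# Usage sketch for the internal step above: Birch accepts an
# AgglomerativeClustering instance as `n_clusters` and runs it as the global
# clustering step over the subcluster centroids. Data here is illustrative.
import numpy as np
from sklearn.cluster import AgglomerativeClustering, Birch
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=5, random_state=0)
brc = Birch(threshold=1.0, n_clusters=AgglomerativeClustering(n_clusters=5))
labels = brc.fit_predict(X)
print(np.unique(labels))  # the 5 final labels derived from the subclusters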
Example #3
def agglomerative_clustering(matrix):
    print("====== Agglomerative Clustering ===============")

    model = AgglomerativeClustering(n_clusters=3, affinity='cosine', linkage='complete')
    preds = model.fit_predict(matrix)
    clusters = model.labels_.tolist()

    return (preds, clusters)
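
# Quick usage sketch with hypothetical data (assumes a scikit-learn version
# that still accepts the `affinity` parameter). Note that `preds` and
# `clusters` carry the same assignments: fit_predict simply returns labels_.
import numpy as np

rng = np.random.RandomState(0)
matrix = rng.rand(12, 4) + 0.1   # keep rows away from zero for cosine affinity
preds, clusters = agglomerative_clustering(matrix)
assert list(preds) == clusters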
Example #4
def test_n_clusters():
    # Test that n_clusters param works properly
    X, y = make_blobs(n_samples=100, centers=10)
    brc1 = Birch(n_clusters=10)
    brc1.fit(X)
    assert len(brc1.subcluster_centers_) > 10
    assert len(np.unique(brc1.labels_)) == 10

    # Test that n_clusters = Agglomerative Clustering gives
    # the same results.
    gc = AgglomerativeClustering(n_clusters=10)
    brc2 = Birch(n_clusters=gc)
    brc2.fit(X)
    assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_)
    assert_array_equal(brc1.labels_, brc2.labels_)

    # Test that the wrong global clustering step raises an Error.
    clf = ElasticNet()
    brc3 = Birch(n_clusters=clf)
    with pytest.raises(ValueError):
        brc3.fit(X)

    # Test that a small number of clusters raises a warning.
    brc4 = Birch(threshold=10000.)
    assert_warns(ConvergenceWarning, brc4.fit, X)
Example #5
def classification_clustering(X, n_clusters=1):
    X = tfidf(X)
    clustering = AgglomerativeClustering(n_clusters=n_clusters,
                                         compute_full_tree=True).fit(X)
    print(X.shape)
    print(clustering.labels_)
    print(clustering.children_)
    print(len(clustering.children_))
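
# `tfidf` is not defined in the snippet above; this stand-in is an assumption.
# It returns a dense matrix because AgglomerativeClustering rejects sparse
# input.
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(docs):
    # Dense TF-IDF features for a list of raw text documents.
    return TfidfVectorizer().fit_transform(docs).toarray()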
Example #6
    def _global_clustering(self, X=None):
        """
        Global clustering for the subclusters obtained after fitting
        """
        clusterer = self.n_clusters
        centroids = self.subcluster_centers_
        compute_labels = (X is not None) and self.compute_labels

        # Preprocessing for the global clustering.
        not_enough_centroids = False
        if isinstance(clusterer, int):
            clusterer = AgglomerativeClustering(
                n_clusters=self.n_clusters)
            # There is no need to perform the global clustering step.
            if len(centroids) < self.n_clusters:
                not_enough_centroids = True
        elif (clusterer is not None and not
              hasattr(clusterer, 'fit_predict')):
            raise ValueError("n_clusters should be an instance of "
                             "ClusterMixin or an int")

        # To use in predict to avoid recalculation.
        self._subcluster_norms = row_norms(
            self.subcluster_centers_, squared=True)

        if clusterer is None or not_enough_centroids:
            self.subcluster_labels_ = np.arange(len(centroids))
            if not_enough_centroids:
                warnings.warn(
                    "Number of subclusters found (%d) by Birch is less "
                    "than (%d). Decrease the threshold."
                    % (len(centroids), self.n_clusters))
        else:
            # The global clustering step that clusters the subclusters of
            # the leaves. It assumes the centroids of the subclusters as
            # samples and finds the final centroids.
            self.subcluster_labels_ = clusterer.fit_predict(
                self.subcluster_centers_)

        if compute_labels:
            self.labels_ = self.predict(X)
Example #7
    def cluster_faces(self, eucl_dist_vecs, n_clusters=30):
        """
        
        :param eucl_dist_vecs: A list of euclidian distances as returned from FaceRecognitionModel.get_vector() 
        :param cluster_range_min: Minimal number of clusters to produce
        :param cluster_range_max: Maximal number of clusters to produce
        :return: A dict(key:n_clusters, val=list[label_idx][data_idx])
        """

        X = np.array(eucl_dist_vecs)
        clustering = AgglomerativeClustering(n_clusters=n_clusters).fit(X)

        return clustering.labels_
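
# Hypothetical call: the method only needs vectors that stack into a 2-D
# array. The 128-dimensional embeddings and the `recognizer` instance are
# assumptions for illustration.
import numpy as np

rng = np.random.RandomState(0)
vecs = [rng.rand(128) for _ in range(100)]
labels = recognizer.cluster_faces(vecs, n_clusters=30)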
Example #8
    X=np.array([[0.697,0.460],[0.774,0.376],[0.634,0.264],[0.608,0.318],[0.556,0.215],
                [0.403,0.237],[0.481,0.149],[0.437,0.211],[0.666,0.091],[0.243,0.267],
                [0.245,0.057],[0.343,0.099],[0.639,0.161],[0.657,0.198],[0.360,0.370],
                [0.593,0.042],[0.719,0.103],[0.359,0.188],[0.339,0.241],[0.282,0.257],
                [0.748,0.232],[0.714,0.346],[0.483,0.312],[0.478,0.437],[0.525,0.369],
                [0.751,0.489],[0.532,0.472],[0.473,0.376],[0.725,0.445],[0.446,0.459]])

    X_test = X
    agnes = AGNES()
    agnes.fit(X)
    print('C:', agnes.C)
    print(agnes.labels_)
    plt.figure(12)
    plt.subplot(121)
    plt.scatter(X[:, 0], X[:, 1], c=agnes.labels_)
    plt.title('tinyml')

    from sklearn.cluster import AgglomerativeClustering
    sklearn_agnes = AgglomerativeClustering(n_clusters=7, affinity='l2', linkage='average')
    sklearn_agnes.fit(X)
    print(sklearn_agnes.labels_)
    plt.subplot(122)
    plt.scatter(X[:, 0], X[:, 1], c=sklearn_agnes.labels_)
    plt.title('sklearn')
    plt.show()

Example #9
    def run_concurrent(self, args, sign_progress):
        movie_path = args[0]
        resolution = args[1]
        in_hists = args[3]
        fps = args[2]
        indices = args[4]
        frame_resolution = args[5]
        n_cluster_range = args[6]
        alt_resolution = args[7]
        cluster_sizes = range(n_cluster_range[0], n_cluster_range[1], 1)
        histograms = []
        frames = []

        cap = cv2.VideoCapture(movie_path)
        length = cap.get(cv2.CAP_PROP_FRAME_COUNT)

        resize_f = 192.0 / cap.get(cv2.CAP_PROP_FRAME_WIDTH)

        data_idx = 0
        read_img = -1  # counter so we only keep every frame_resolution-th sampled frame
        if indices is None:
            indices = []
            resolution = alt_resolution
        for i in range(int(length)):
            if self.aborted:
                return None
            if i % resolution == 0:
                read_img += 1
                if in_hists is not None and data_idx >= len(in_hists):
                    break
                if read_img % frame_resolution == 0:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, i)
                    ret, frame = cap.read()
                    frames.append(
                        cv2.resize(frame, None, None, resize_f, resize_f,
                                   cv2.INTER_CUBIC))
                    read_img = 0

                sign_progress(i / length)
                if in_hists is not None:
                    histograms.append(
                        np.resize(in_hists[data_idx], new_shape=16**3))
                else:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, i)
                    ret, frame = cap.read()
                    if frame is None:
                        break

                    frame = cv2.cvtColor(floatify_img(frame),
                                         cv2.COLOR_BGR2LAB)
                    data = np.resize(frame,
                                     (frame.shape[0] * frame.shape[1], 3))
                    hist = cv2.calcHist([data[:, 0], data[:, 1], data[:, 2]],
                                        [0, 1, 2], None, [16, 16, 16],
                                        [0, 100, -128, 128, -128, 128])
                    indices.append(i)
                    histograms.append(np.resize(hist, new_shape=16**3))
                data_idx += 1

        connectivity = np.zeros(shape=(len(histograms), len(histograms)),
                                dtype=np.uint8)
        for i in range(1, len(histograms) - 1, 1):
            connectivity[i][i - 1] = 1
            connectivity[i][i] = 1
            connectivity[i][i + 1] = 1

        clusterings = []
        for i, n_cluster in enumerate(cluster_sizes):
            sign_progress(i / len(cluster_sizes))

            if len(histograms) > n_cluster:
                model = AgglomerativeClustering(linkage="ward",
                                                connectivity=connectivity,
                                                n_clusters=n_cluster,
                                                compute_full_tree=True)
                model.fit(histograms)
                clusterings.append(model.labels_)

        return [
            clusterings, frames, indices, fps, frame_resolution,
            n_cluster_range
        ]
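
# The tridiagonal connectivity matrix above restricts merges to temporally
# adjacent frames, which turns agglomerative clustering into a segmentation.
# A minimal sketch of the same idea on a toy 1-D signal (illustrative only):
import numpy as np
from sklearn.cluster import AgglomerativeClustering

signal = np.concatenate([np.zeros(10), np.ones(10), 2 * np.ones(10)])
X = signal.reshape(-1, 1)

n = len(X)
connectivity = np.zeros((n, n), dtype=np.uint8)
for i in range(n):  # each sample may only merge with itself and its neighbours
    connectivity[i, max(i - 1, 0)] = 1
    connectivity[i, i] = 1
    connectivity[i, min(i + 1, n - 1)] = 1

model = AgglomerativeClustering(linkage="ward", connectivity=connectivity,
                                n_clusters=3)
print(model.fit_predict(X))  # three contiguous segments (label order may vary)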
Example #10
def cluster(diss_matrix, n_clusters):

    agglo_clusterer = AgglomerativeClustering(affinity='precomputed',
                                              linkage='average',
                                              n_clusters=n_clusters)
    return agglo_clusterer.fit(diss_matrix).labels_
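
# Usage sketch with a hypothetical precomputed distance matrix;
# affinity='precomputed' expects pairwise dissimilarities, not raw features.
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

rng = np.random.RandomState(0)
data = rng.rand(20, 5)
labels = cluster(cosine_distances(data), n_clusters=3)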
Example #11
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.linear_model import TheilSenRegressor
from sklearn.mixture import VBGMM
from sklearn.feature_selection import VarianceThreshold

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


clf_dict = {'ARDRegression':ARDRegression(),
			'AdaBoostClassifier':AdaBoostClassifier(),
			'AdaBoostRegressor':AdaBoostRegressor(),
			'AdditiveChi2Sampler':AdditiveChi2Sampler(),
			'AffinityPropagation':AffinityPropagation(),
			'AgglomerativeClustering':AgglomerativeClustering(),
			'BaggingClassifier':BaggingClassifier(),
			'BaggingRegressor':BaggingRegressor(),
			'BayesianGaussianMixture':BayesianGaussianMixture(),
			'BayesianRidge':BayesianRidge(),
			'BernoulliNB':BernoulliNB(),
			'BernoulliRBM':BernoulliRBM(),
			'Binarizer':Binarizer(),
			'Birch':Birch(),
			'CCA':CCA(),
			'CalibratedClassifierCV':CalibratedClassifierCV(),
			'DBSCAN':DBSCAN(),
			'DPGMM':DPGMM(),
			'DecisionTreeClassifier':DecisionTreeClassifier(),
			'DecisionTreeRegressor':DecisionTreeRegressor(),
			'DictionaryLearning':DictionaryLearning(),
Example #12
def run_experiment(ae_model_path):
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)

    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data,
                                                  batch_size=256,
                                                  shuffle=False):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np],
                                           0)
    del ae_module

    sl_cl = AgglomerativeClustering(compute_full_tree=True,
                                    n_clusters=n_clusters,
                                    linkage="single").fit(embedded_data)
    sl_labels = sl_cl.labels_
    sl_purity_tree = prune_dendrogram_purity_tree(
        to_dendrogram_purity_tree(sl_cl.children_), n_leaf_nodes_final)
    sl_nmi = nmi(gold_labels, sl_labels, average_method='arithmetic')
    sl_acc = cluster_acc(sl_labels, gold_labels)[0]
    sl_purity = dendrogram_purity(sl_purity_tree, gold_labels)
    sl_lp = leaf_purity(sl_purity_tree, gold_labels)
    sl_leaf_purity_value = f"{sl_lp[0]:1.3}\t({sl_lp[1]:1.3})"

    result_file_sl = Path(
        f"{result_dir}/results_ae_agglo_single_{dataset_name}.txt")
    result_file_sl_exists = result_file_sl.exists()
    with open(result_file_sl, "a+") as f:
        if not result_file_sl_exists:
            f.write(
                "#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t\"Leaf_Purity\t(Std)\"\n"
            )
        f.write(
            f"{ae_model_path.name}\t{sl_nmi}\t{sl_acc}\t{sl_purity}\t{sl_leaf_purity_value}\n"
        )
    del sl_cl, sl_labels, sl_purity_tree

    cl_cl = AgglomerativeClustering(compute_full_tree=True,
                                    n_clusters=n_clusters,
                                    linkage="complete").fit(embedded_data)
    cl_labels = cl_cl.labels_
    cl_purity_tree = prune_dendrogram_purity_tree(
        to_dendrogram_purity_tree(cl_cl.children_), n_leaf_nodes_final)
    cl_nmi = nmi(gold_labels, cl_labels, average_method='arithmetic')
    cl_acc = cluster_acc(cl_labels, gold_labels)[0]
    cl_purity = dendrogram_purity(cl_purity_tree, gold_labels)
    cl_lp = leaf_purity(cl_purity_tree, gold_labels)
    cl_leaf_purity_value = f"{cl_lp[0]:1.3}\t({cl_lp[1]:1.3})"

    result_file_cl = Path(
        f"{result_dir}/results_ae_agglo_complete_{dataset_name}.txt")
    result_file_cl_exists = result_file_cl.exists()
    with open(result_file_cl, "a+") as f:
        if not result_file_cl_exists:
            f.write(
                "#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\t\"Dendrogram_Purity\"\t\"Leaf_Purity\t(Std)\"\n"
            )
        f.write(
            f"{ae_model_path.name}\t{cl_nmi}\t{cl_acc}\t{cl_purity}\t{cl_leaf_purity_value}\n"
        )
    del cl_cl, cl_labels, cl_purity_tree
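
# `nmi`, `cluster_acc`, and the purity helpers are project-specific. As an
# assumption, `nmi` behaves like scikit-learn's normalized mutual information:
from sklearn.metrics import normalized_mutual_info_score as nmi

# Identical partitions up to relabeling score 1.0.
print(nmi([0, 0, 1, 1], [1, 1, 0, 0], average_method='arithmetic'))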
Example #13
from scipy.cluster.hierarchy import linkage
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

z = linkage(df_norm1, method='complete', metric='euclidean')

plt.figure(figsize=(15, 5))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Index')
plt.ylabel('Distance')
sch.dendrogram(z, leaf_rotation=0, leaf_font_size=8)

plt.show()

h_clustering = AgglomerativeClustering(n_clusters=6,
                                       affinity="euclidean",
                                       linkage="complete").fit(df_norm1)
h_clustering

h = pd.Series(h_clustering.labels_)
h

crime2['clust'] = h
crime2 = crime2.iloc[:, [5, 0, 1, 2, 3, 4]]

crime2.iloc[:, 2:].groupby(crime2.clust).median()

crime2.to_csv("crime2.csv", encoding="utf-8")

import os
os.getcwd()
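
# `df_norm1` and `crime2` are not defined above; a hypothetical stand-in with
# five numeric columns and min-max normalization, matching how they are used.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
crime2 = pd.DataFrame(rng.rand(50, 5), columns=list('ABCDE'))
df_norm1 = (crime2 - crime2.min()) / (crime2.max() - crime2.min())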
Example #14
plt.scatter(X[:, 0], X[:, 1])
plt.show()

#kMeans clustering
from sklearn.cluster import KMeans
km = KMeans(init='random', max_iter=150, n_clusters=2, random_state=0)
y_km = km.fit_predict(X)

plt.scatter(X[y_km == 0, 0], X[y_km == 0, 1], c='green')
plt.scatter(X[y_km == 1, 0], X[y_km == 1, 1], c='red')
plt.title("KMeans")
plt.show()

#Agglomerative Clustering with complete linkage
from sklearn.cluster import AgglomerativeClustering
aggcl = AgglomerativeClustering(n_clusters=2, linkage='complete')
y_agcl = aggcl.fit_predict(X)

plt.scatter(X[y_agcl == 0, 0], X[y_agcl == 0, 1], c='green')
plt.scatter(X[y_agcl == 1, 0], X[y_agcl == 1, 1], c='red')
plt.title("Aggolomerative Clustering")
plt.show()

#Demonstrating clustering using a density-based approach
from sklearn.cluster import DBSCAN
dbs = DBSCAN(eps=0.2, min_samples=5)
y_dbs = dbs.fit_predict(X)

plt.scatter(X[y_dbs == 0, 0], X[y_dbs == 0, 1], c='green')
plt.scatter(X[y_dbs == 1, 0], X[y_dbs == 1, 1], c='red')
plt.title("Density-based (DBSCAN) Clustering")
plt.show()

Example #15

for i in dbscan_clusters:
    print(len(i))


# In[24]:


for cluster in dbscan_clusters:
    print(data.loc[cluster].mean())


# In[11]:


from sklearn.cluster import AgglomerativeClustering

model = AgglomerativeClustering(n_clusters=4, linkage='average', affinity='manhattan')
aggl_preds = model.fit_predict(scaled_features)


# In[12]:


clusters_aggl = []
for lbl in np.unique(aggl_preds):
    indices = [i for i, x in enumerate(aggl_preds) if x == lbl]
    clusters_aggl.append(indices)


Example #16

def cluster_and_dump(model, label):
    vectors = np.loadtxt(config("path.article-embeds"))
    # checking vectors length: mean ~0.46, std ~ 0.05, max = 1, min ~ 0.29
    # stats = np.array([np.linalg.norm(vectors[i, :]) for i in range(vectors.shape[0])])

    pick_recent = 3 * 25 * 365
    clusters = model.fit_predict(vectors[-pick_recent:])

    clust_series = pd.Series(clusters, name='Cluster')

    tagged = pd.concat(
        [clust_series, df.iloc[-pick_recent:].reset_index(drop=True)],
        axis=1,
        sort=False)
    tagged[["News",
            "Cluster"]].iloc[-pick_recent:].to_csv("for_visualization.tsv",
                                                   sep="\t",
                                                   index=False,
                                                   header=False)
    tagged.sort_values(by='Cluster').to_csv(
        "explore_topics({}).tsv".format(label), sep="\t", index=False)
    print(tagged.Cluster.value_counts())


if __name__ == '__main__':
    cluster_and_dump(
        AgglomerativeClustering(n_clusters=100,
                                affinity="cosine",
                                linkage="complete"), "100,cos,complete")
    # cluster_and_dump(AgglomerativeClustering(n_clusters=100), "ward")
    # cluster_and_dump(DBSCAN(eps=0.19), "dbscan")

Example #17
    def process(self, args, sign_progress):
        """
        This is the actual analysis, which takes place in a WorkerThread.
        Do NOT and NEVER modify the project within this function.

        We want to read though the movie and get the Average Colors from each Segment.

        Once done, we create an Analysis Object from it.
        """
        args, sign_progress = super(ShotSegmentationAnalysis,
                                    self).process(args, sign_progress)
        # Signal the Progress
        sign_progress(0.0)

        video_capture = cv2.VideoCapture(args['movie_path'])

        duration = video_capture.get(cv2.CAP_PROP_FRAME_COUNT)
        resize_f = 192.0 / video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)
        resize_clamp = self.frame_width_clamp / video_capture.get(
            cv2.CAP_PROP_FRAME_WIDTH)
        width = video_capture.get(cv2.CAP_PROP_FRAME_WIDTH)
        height = video_capture.get(cv2.CAP_PROP_FRAME_HEIGHT)

        n = int(np.floor(duration / self.resolution))
        X = np.zeros(shape=(n, 16**3), dtype=np.float16)
        frame_pos = np.zeros(shape=n, dtype=np.int32)

        frames = []
        for i in range(int(n)):
            if self.aborted:
                return None

            idx = int(i * self.resolution)
            video_capture.set(cv2.CAP_PROP_POS_FRAMES, idx)

            ret, frame = video_capture.read()
            if frame is None:
                continue

            if self.return_frames and i % self.frame_resolution == 0:
                frames.append(
                    cv2.resize(frame, None, None, resize_f, resize_f,
                               cv2.INTER_CUBIC))

            if resize_clamp < 1.0:
                frame = cv2.resize(frame, None, None, resize_clamp,
                                   resize_clamp, cv2.INTER_CUBIC)
            frame = cv2.cvtColor(floatify_img(frame), cv2.COLOR_BGR2LAB)
            X[i] = np.resize(calculate_histogram(frame, lab_mode=True),
                             new_shape=16**3) / (width * height)
            frame_pos[i] = idx
            sign_progress(round(i / n, 4))

        connectivity = np.zeros(shape=(n, n), dtype=np.uint8)
        for i in range(1, n - 1, 1):
            connectivity[i][i - 1] = 1
            connectivity[i][i] = 1
            connectivity[i][i + 1] = 1
        clusterings = []

        cluster_sizes = range(self.cluster_range[0], self.cluster_range[1], 1)
        for i, n_cluster in enumerate(cluster_sizes):
            sign_progress(i / len(cluster_sizes))
            if X.shape[0] > n_cluster:
                model = AgglomerativeClustering(linkage="ward",
                                                connectivity=connectivity,
                                                n_clusters=n_cluster,
                                                compute_full_tree=True)
                model.fit(X)
                timestamps = self._generate_segments(
                    model.labels_, frame_pos,
                    video_capture.get(cv2.CAP_PROP_FPS))
                clusterings.append(timestamps)

        if self.return_hdf5_compatible:
            result = np.zeros(shape=self.dataset_shape,
                              dtype=self.dataset_dtype)
            for i in range(len(clusterings)):
                result[i][0:len(clusterings[i])] = [[
                    c['label'], c['f_start'], c['f_stop']
                ] for c in clusterings[i]]
            # Creating an IAnalysisJobAnalysis Object that will be handed back to the Main-Thread
            analysis = IAnalysisJobAnalysis(
                name="My Analysis",
                results=result,
                analysis_job_class=self.__class__,
                parameters=dict(resolution=self.resolution),
                container=args['movie_descriptor'])
        else:
            analysis = AnalysisContainer(name=self.name, data=clusterings)
        sign_progress(1.0)
        return analysis
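
# `floatify_img` and `calculate_histogram` are project helpers not shown in
# these examples; the stand-ins below are assumptions that match how the code
# above calls them.
import cv2
import numpy as np

def floatify_img(img):
    # uint8 BGR -> float32 in [0, 1], the range cv2.COLOR_BGR2LAB expects
    # for floating-point input.
    return img.astype(np.float32) / 255.0

def calculate_histogram(frame_lab, lab_mode=True):
    # 16x16x16 histogram over the L*a*b* cube, later flattened to 16**3 bins.
    data = np.resize(frame_lab, (frame_lab.shape[0] * frame_lab.shape[1], 3))
    ranges = [0, 100, -128, 128, -128, 128] if lab_mode else [0, 256] * 3
    return cv2.calcHist([data[:, 0], data[:, 1], data[:, 2]], [0, 1, 2],
                        None, [16, 16, 16], ranges)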