Example #1
import time

import umap
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import KElbowVisualizer
from sompy.sompy import SOMFactory

def cluster_category_data(df,
                          scale_data='minmax',
                          dim_red_method='som',
                          use_elbow_method=True,
                          cluster_method='hierarchical',
                          n_clusters=None,
                          verbose=1,
                          perplexity=None):
    """
    :param df: dataframe containing all the columns belonging to a category to be used in clustering
    :param scale_data: scaling method for the dataset; options are 'minmax' and 'standard'
    :param dim_red_method: options are 'som', 'umap', 'tsne', None. If None, cluster directly on the scaled data.
    :param use_elbow_method: if True, the elbow method is used to find the optimum number of clusters. If False, n_clusters must be specified.
    :param cluster_method: options are 'kmeans' and 'hierarchical'. In either case k-means is used for the elbow method (because of the time required).
    :param n_clusters: number of clusters to use when use_elbow_method is False
    :param verbose: if truthy, print progress during the clustering process
    :param perplexity: if dim_red_method is 'tsne', the perplexity needs to be specified
    """
    t = time.time()

    if scale_data == 'minmax':
        X = MinMaxScaler().fit_transform(df)
    elif scale_data == 'standard':
        X = StandardScaler().fit_transform(df)
    else:
        X = df.values

    if verbose:
        print(f'number of features = {df.shape[1]}')

    if dim_red_method == 'som':
        if verbose:
            print(
                'Self Organising Maps is being used for dimensionality reduction...'
            )
        opt_k = 2
        max_s = -1
        for mapsize in [(30, 30)]:
            if verbose:
                print(f'map size = {mapsize}')
            sm = SOMFactory().build(X,
                                    normalization='var',
                                    initialization='pca',
                                    mapsize=mapsize)
            sm.train(n_job=1,
                     verbose=False,
                     train_rough_len=100,
                     train_finetune_len=500)
            if use_elbow_method:
                model = KElbowVisualizer(KMeans(), k=20, timings=False)
                elbow = model.fit(sm.codebook.matrix).elbow_value_
                if elbow and verbose:
                    print(f'elbow value = {elbow}')
                if not elbow:
                    if verbose:
                        print('elbow not found')
                    ms = -1
                    for k in range(2, 20):
                        km_labels = KMeans(n_clusters=k).fit_predict(sm.codebook.matrix)
                        s = silhouette_score(sm.codebook.matrix, km_labels)
                        if s > ms:
                            # remember the best-scoring k as the fallback elbow
                            ms = s
                            elbow = k
            else:
                elbow = n_clusters
            x = sm.project_data(X)
            labels, _, _ = sm.cluster(opt=elbow, cl_type=cluster_method)
            # Map each data point's BMU index to that BMU's cluster label
            clabels = [labels[x[i]] for i in range(X.shape[0])]
            s_score = silhouette_score(X, clabels)
            if verbose:
                print(f'silhouette score = {round(s_score, 3)}')
            # Keep the best-scoring configuration; stop once scores decline
            if s_score >= max_s:
                max_s = s_score
                opt_k = elbow
                opt_labels = clabels
                opt_size = mapsize
            else:
                break
        if verbose:
            print(f'optimum mapsize = {opt_size}')
            print(
                f'optimum number of clusters = {opt_k} & silhouette score = {round(max_s,3)}'
            )
            print(f'time taken = {round(time.time()-t,1)}s')
        return opt_labels, opt_k

    elif dim_red_method:
        if dim_red_method == 'umap':
            if verbose:
                print('UMAP is being used for dimensionality reduction...')
            embedding = umap.UMAP(n_components=2,
                                  n_neighbors=5,
                                  min_dist=0.0001,
                                  metric='euclidean',
                                  random_state=1,
                                  spread=0.5,
                                  n_epochs=1000).fit_transform(X)
            if verbose:
                print('UMAP embedding done...')
        elif dim_red_method == 'tsne':
            if verbose:
                print('t-SNE is being used for dimensionality reduction...')
            embedding = TSNE(perplexity=perplexity).fit_transform(X)
            if verbose:
                print('t-SNE embedding done...')
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(embedding).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(n_clusters=elbow).fit_predict(embedding)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(n_clusters=elbow).fit_predict(embedding)
        if verbose:
            s_score = silhouette_score(X, opt_labels)
            print(
                f'number of clusters = {elbow} and silhouette score = {round(s_score, 3)}'
            )
        return opt_labels, elbow

    else:
        if use_elbow_method:
            model = KElbowVisualizer(KMeans(), k=20, timings=False)
            elbow = model.fit(X).elbow_value_
        else:
            elbow = n_clusters
        if cluster_method == 'kmeans':
            opt_labels = KMeans(n_clusters=elbow).fit_predict(X)
        elif cluster_method == 'hierarchical':
            opt_labels = AgglomerativeClustering(n_clusters=elbow).fit_predict(X)
        if verbose:
            print(f'silhouette score = {round(silhouette_score(X, opt_labels), 3)}')
        return opt_labels, elbow
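
A minimal usage sketch for the function above, assuming numeric_df is a hypothetical all-numeric DataFrame for one category (the names and data here are illustrative, not from the original example):

import numpy as np
import pandas as pd

# Hypothetical all-numeric category data; any numeric-only frame works.
numeric_df = pd.DataFrame(np.random.rand(200, 6),
                          columns=[f'feat_{i}' for i in range(6)])

# Let the elbow method pick k, then cluster the SOM codebook hierarchically.
labels, k = cluster_category_data(numeric_df,
                                  scale_data='minmax',
                                  dim_red_method='som',
                                  cluster_method='hierarchical')
print(f'{k} clusters assigned to {len(labels)} rows')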
Example #2
import os

import pandas as pd
from sompy.visualization.bmuhits import BmuHitsView

# Visualize how many data points hit each SOM neuron (BMU hits map).
# Note: 'anotate' and 'logaritmic' are sompy's own parameter spellings.
vhts = BmuHitsView(5, 5, "Hits Map", text_size=11)
vhts.show(sm,
          anotate=True,
          onlyzeros=False,
          labelsize=9,
          cmap="plasma",
          logaritmic=False)

# Get the labels for each BMU
# in the SOM (15 * 10 neurons)
clabs = sm.cluster_labels

# Project the data on to the SOM
# so that we get the BMU for each
# of the original data points
bmus = sm.project_data(data)

# Turn the BMUs into cluster labels
# and append to the data frame
data_std[s_var] = pd.Series(clabs[bmus], index=data_std.index)

# s_var is assumed to be 'SOM' here
print(data_std[s_var].value_counts())

sdf = gdf.join(data_std, how='inner')

sdf.sample(5)[[k_var, d_var, s_var]]

# PySAL 1.x API; contrib.viz was removed in PySAL 2.0
from pysal.contrib.viz import mapping as maps

# Where our shapefile will be stored
shp_link = os.path.join('outputs', 'lsoas_som.shp')
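
To actually create that file, the joined frame has to be written out first; a minimal sketch, assuming sdf above is a geopandas GeoDataFrame (geopandas is an assumption, not part of the original snippet):

# assumption: sdf is a geopandas GeoDataFrame, so to_file() is available.
# Persist the joined geometries and SOM cluster labels to the path above
# so the mapping helpers can read them back from disk.
sdf.to_file(shp_link)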