def test_kmodes_init_soybean(self):
    """Fit KModes on the soybean data with user-supplied initial centroids.

    Covers three cases: a valid 4x35 centroid array (must reproduce the
    known cluster split), five initial centroids for n_clusters=4, and
    centroids with the wrong number of attributes (the latter two must
    raise AssertionError).
    """
    # Valid: 4 centroids (== n_clusters), 35 attributes each.
    init_vals = np.array(
        [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0,
          1, 2, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
         [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1,
          0, 3, 0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0,
          3, 0, 1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1,
          1, 1, 0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
    kmodes_init = kmodes.KModes(n_clusters=4, init=init_vals, verbose=2)
    result = kmodes_init.fit_predict(SOYBEAN)
    # Known soybean partition: 10/10/10/17 samples per cluster.
    expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                         0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                         0, 0, 0, 0, 0, 0, 0])
    assert_cluster_splits_equal(result, expected)

    # 5 initial centroids, 4 n_clusters
    init_vals = np.array(
        [[0, 1],
         [4, 0],
         [4, 0],
         [3, 0],
         [3, 0]])
    kmodes_init = kmodes.KModes(n_clusters=4, init=init_vals, verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_init.fit(SOYBEAN)

    # wrong number of attributes
    init_vals = np.array(
        [0, 1, 2, 3])
    kmodes_init = kmodes.KModes(n_clusters=4, init=init_vals, verbose=2)
    with self.assertRaises(AssertionError):
        kmodes_init.fit(SOYBEAN)
def get_cluster_num_for_levels(data_df, target_col_name):
    """Cluster each non-target column (paired with the target) via k-modes.

    For every column other than ``target_col_name``, runs a 4-cluster
    k-modes fit on the (column, target) pair, stores the labels in a new
    ``<column>_cluster_num`` categorical column, and saves a bar chart of
    the clustering cost.

    :param data_df: input dataframe; mutated in place with the new columns
    :param target_col_name: column paired with each feature for clustering
    :returns: the augmented dataframe
    """
    for column in data_df.columns:
        if column != target_col_name:
            cluster_num_cost = {}
            feature_frame_train = data_df[[column, target_col_name]]
            # The concatenation is already a str; no extra str() needed.
            logging.info('feature_frame_train is :: '
                         + str(feature_frame_train.describe()))
            km = kmodes.KModes(n_clusters=4, init='Huang', n_init=5, verbose=1)
            clusters = km.fit_predict(feature_frame_train)
            cluster_num_cost[4] = km.cost_
            cluster_col = column + '_cluster_num'
            data_df[cluster_col] = clusters
            data_df[cluster_col] = data_df[cluster_col].astype('category')
            logging.info(cluster_num_cost)
            # Cost chart (a single bar since only n_clusters=4 is tried).
            plt.bar(range(len(cluster_num_cost)), cluster_num_cost.values(),
                    align='center')
            plt.xticks(range(len(cluster_num_cost)), cluster_num_cost.keys())
            plt.ylabel('Cost')
            plt.xlabel('Number of clusters')
            plt.savefig('../plots/preprocess2/' + cluster_col + "_" + '20' + ".png")
            plt.gcf().clear()
    return data_df
def test_kmodes_predict_soybean(self):
    """Predicting on SOYBEAN2 after fitting on SOYBEAN gives known labels."""
    model = kmodes.KModes(n_clusters=4, init='Cao', verbose=2)
    model.fit(SOYBEAN)
    predicted = model.predict(SOYBEAN2)
    assert_cluster_splits_equal(predicted, np.array([2, 1, 3, 0]))
    self.assertTrue(predicted.dtype == np.dtype(np.uint8))
def kmode(filename, col_name):
    """Cluster a single CSV column into 30 k-modes clusters and save a
    scatter plot of value vs. assigned cluster.

    :param filename: path of the CSV file to read
    :param col_name: name of the (categorical) column to cluster
    """
    # Read sample for clustering from the given file.
    df = pd.read_csv(filename, usecols=[col_name])
    data = df[col_name]
    total_rows = len(data)
    # KModes expects a 2-D (n_samples, n_features) array; reshape the
    # converted ndarray directly (the original converted it and then
    # reshaped the pandas Series instead, leaving the ndarray unused).
    data = np.asarray(data).reshape(total_rows, 1)
    km = kmodes.KModes(n_clusters=30, n_init=5, verbose=2)
    clusters = km.fit_predict(data)
    # Print the cluster centroids and per-sample assignments.
    print(km.cluster_centroids_)
    print(clusters)
    # Unrelated micro-benchmark retained from the original (prints a timing).
    print(timeit.timeit('"-".join(str(n) for n in range(100))', number=10000))
    plt.figure(figsize=(7, 4))
    plt.scatter(data, clusters, alpha=1, edgecolor='black')
    plt.savefig("C:/Users/Nupura Hajare/Desktop/flask_app/web/static/img/kmode.png")
def get_silhouette_score(df, X, n_clusters, model='KM'):
    '''
    Calculate silhouette score for clustered dataframe.

    :param df: dataframe to cluster (used only by the k-modes branch)
    :param X: dense binary array for silhouette scoring
    :param n_clusters: number of clusters for model to cluster data into
    :param model: clustering algorithm: 'AG' (agglomerative), 'KM'
        (k-modes, default) or 'GM' (Gaussian mixture)
    :returns: average silhouette score (Hamming metric)
    :raises ValueError: if ``model`` is not one of 'AG', 'KM', 'GM'
    '''
    # Initialize clusterer and set random state, if possible
    if model == 'AG':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                                            affinity='cosine',
                                            linkage='average').fit(X)
        labels = clusterer.labels_
    elif model == 'KM':
        clusterer = kmodes.KModes(n_clusters=n_clusters, n_init=5,
                                  init='Huang', verbose=1)
        labels = clusterer.fit_predict(df)
    elif model == 'GM':
        clusterer = GaussianMixture(n_components=n_clusters,
                                    covariance_type='tied', max_iter=20,
                                    n_init=50, random_state=42,
                                    verbose=1).fit(X)
        labels = clusterer.predict(X)
    else:
        # Previously an unknown model fell through to an UnboundLocalError
        # on sil_avg; fail with a clear message instead.
        raise ValueError("Unknown model %r; expected 'AG', 'KM' or 'GM'" % model)
    # Every branch scores the same way; compute it once here.
    sil_avg = silhouette_score(X, labels, metric='hamming')
    return sil_avg
def test_kmodes_cao_soybean(self):
    """Cao initialization reproduces the known soybean partition exactly."""
    model = kmodes.KModes(n_clusters=4, init='Cao', verbose=2)
    labels = model.fit_predict(SOYBEAN)
    # 10 samples each in clusters 2, 1, 3, then 17 in cluster 0.
    expected = np.array([2] * 10 + [1] * 10 + [3] * 10 + [0] * 17)
    np.testing.assert_array_equal(labels, expected)
    self.assertTrue(labels.dtype == np.dtype(np.uint8))
def test_kmodes_nunique_nclusters(self):
    """Asking for more clusters than unique rows collapses to the number
    of distinct rows (here 2)."""
    data = np.array([[0, 1]] * 3 + [[0, 2]] * 3)
    np.random.seed(42)
    model = kmodes.KModes(n_clusters=6, init='Cao', verbose=2)
    labels = model.fit_predict(data, categorical=[1])
    np.testing.assert_array_equal(labels, np.array([0, 0, 0, 1, 1, 1]))
    np.testing.assert_array_equal(model.cluster_centroids_,
                                  np.array([[0, 1], [0, 2]]))
def kmode(y, x):
    """Fit a 10-cluster Huang-initialized k-modes model on x and print
    training diagnostics (the y parameter is accepted but not used)."""
    model = kmodes.KModes(n_clusters=10, init='Huang', verbose=1)
    model.fit(x)
    # Print cluster centroids of the trained model.
    print('k-modes (Huang) centroids:')
    print(model.cluster_centroids_)
    # Print training statistics
    print('Final training cost: {}'.format(model.cost_))
    print('Training iterations: {}'.format(model.n_iter_))
def test_kmodes_huang_soybean(self):
    """Huang init with a fixed seed reproduces the known soybean split."""
    np.random.seed(42)
    model = kmodes.KModes(n_clusters=4, n_init=2, init='Huang', verbose=2)
    labels = model.fit_predict(SOYBEAN)
    expected = np.array([0] * 20 + [3] * 10 +
                        [2, 1, 2, 1, 2, 2, 1, 1, 2,
                         2, 1, 1, 2, 2, 1, 2, 1])
    assert_cluster_splits_equal(labels, expected)
    self.assertTrue(labels.dtype == np.dtype(np.uint8))
def return_classifier(self, k):
    """Build the clustering estimator configured for this instance.

    Returns a k-modes model for "kmodes"; for "kmeans", picks
    MiniBatchKMeans when the dataframe exceeds max_lines, else KMeans.
    Any other algorithm name yields None (implicit return).
    """
    if self.algorithm == "kmodes":
        return kmodes.KModes(n_clusters=k, init='Huang', n_init=1, verbose=1)
    if self.algorithm == "kmeans":
        if len(self.df) > self.max_lines:
            return MiniBatchKMeans(n_clusters=k, random_state=1)
        return KMeans(n_clusters=k, random_state=1)
def get_subcluster_list(cluster, data, norefresh=True):
    """
    @cluster Cluster model
    @data pandas.DataFrame object

    Return per-serial subcluster assignments for `cluster`. If cached
    Subcluster rows exist (and norefresh), rebuild the result from the
    database; otherwise run k-modes, persist the assignments, and return
    the labeled training frame.

    adapted from https://github.com/nicodv/kmodes/blob/master/examples/soybean.py
    """
    num_of_clusters = cluster.num_of_clusters
    clus_train, serials = _get_clus_train(data)
    if Subcluster.objects.filter(group=cluster).exists() and norefresh:
        # Make dataframe from db; materialize the queryset once so we do
        # not hit the database twice while building the two columns.
        query_results = list(Subcluster.objects.filter(group=cluster))
        df = pd.DataFrame({
            'SERIAL': [float(row.serial) for row in query_results],
            'cluster': [row.subcluster for row in query_results],
        })
        data = data[data['SERIAL'].isin(df.SERIAL.tolist())]
        merged_db = pd.merge(df, data, on='SERIAL')
        return merged_db
    Subcluster.objects.filter(group=cluster).delete()
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # drop-in equivalent.
    x = clus_train.values
    kmodes_huang = kmodes.KModes(n_clusters=num_of_clusters, init='Huang',
                                 verbose=1)
    kmodes_huang.fit(x)
    clus_train['cluster'] = kmodes_huang.labels_
    clus_train['SERIAL'] = serials
    # Persist one Subcluster row per sample.
    for i in range(clus_train.shape[0]):
        row = clus_train.iloc[i]
        s = Subcluster(serial=row.SERIAL, group=cluster, subcluster=row.cluster)
        s.save()
    return clus_train
def test_kmodes_cao_soybean_ng(self):
    """Cao init with the Ng dissimilarity reproduces the known split."""
    model = kmodes.KModes(n_clusters=4, init='Cao', verbose=2,
                          cat_dissim=ng_dissim)
    labels = model.fit_predict(SOYBEAN)
    expected = np.array([2] * 10 + [1] * 10 + [3] * 10 + [0] * 17)
    assert_cluster_splits_equal(labels, expected)
    self.assertTrue(labels.dtype == np.dtype(np.uint8))
def Cluster(self, feature_matrix, CR):
    """Cluster each predicate's feature vectors with k-modes.

    :param feature_matrix: mapping pred -> {atom_value: feature_vector}
    :param CR: compression ratio; clusters requested = floor(n_vectors * CR)
    """
    for pred in feature_matrix:
        features = [feature_matrix[pred][atom_value]
                    for atom_value in feature_matrix[pred]]
        print(pred, len(features[0]))
        # floor(len * CR) can be 0 for small inputs / small CR, which
        # would make KModes fail; always request at least one cluster.
        n_of_clusters = max(1, math.floor(len(features) * CR))
        km = kmodes.KModes(n_clusters=n_of_clusters, init='Huang',
                           n_init=1, verbose=1)
        clusters = km.fit_predict(features)
        print(clusters)
def test_kmodes_empty_init_cluster_soybean(self):
    # Check if the clustering does not crash in case of an empty cluster.
    # Four hand-picked 35-attribute centroids; fit must complete and
    # return the estimator even if an initial cluster ends up empty.
    init_vals = np.array(
        [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0,
          1, 2, 0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
         [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1,
          0, 3, 0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0,
          3, 0, 1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
         [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1,
          1, 1, 0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
    kmodes_init = kmodes.KModes(n_clusters=4, init=init_vals, verbose=2)
    result = kmodes_init.fit(SOYBEAN)
    self.assertIsInstance(result, kmodes.KModes)
def run_kmodes(syms, X, n, alpha):
    """Cluster X into n clusters with k-modes, caching the encoded data,
    labels and centroids in '<n>_CLUSTERS.pkl'.

    :param syms: record identifiers (kept for the disabled outlier analysis)
    :param X: categorical samples to cluster
    :param n: number of clusters to request
    :param alpha: threshold kept for the (disabled) outlier analysis
    :returns: (number of clusters found, average silhouette score)
    """
    cache_file = "%d_CLUSTERS.pkl" % (n, )
    if os.path.isfile(cache_file):
        # Bug fix: pickles must be read in binary mode ("rb", not "r"),
        # and the handle is now closed deterministically.
        with open(cache_file, "rb") as finput:
            X_ENC, clusters, centroids = pickle.load(finput)
        sil_avg = silhouette_score(X_ENC, clusters, metric=simple_compare)
        return (np.amax(clusters) + 1, sil_avg)
    # NOTE(review): the kmodes package's KModes constructor takes
    # `cat_dissim=`, not `kmodes_cat_dissim=` — confirm against the
    # installed version before relying on this keyword.
    kproto = kmodes.KModes(n_clusters=n, init='Cao',
                           kmodes_cat_dissim=dissim_meas, verbose=2)
    clusters = kproto.fit_predict(X)
    centroids = kproto.cluster_centroids_
    X_ENC = kmodes.encode_features(X, enc_map=kproto.enc_map_)[0]
    with open(cache_file, "wb") as foutput:
        pickle.dump((X_ENC, clusters, centroids), foutput)
    # Silhouette ranges from -1 (very poor) to 1 (excellent); > 0.5
    # suggests a reasonable partition and < 0.2 little cluster structure
    # (Kaufman & Rousseeuw, 1990).
    sil_avg = silhouette_score(X_ENC, clusters, metric=simple_compare)
    return (np.amax(clusters) + 1, sil_avg)
def k_modes(self, clust_num):
    """Fit a Cao-initialized k-modes model on the categorical columns of
    self.data and return the per-row cluster labels."""
    model = kmodes.KModes(n_clusters=clust_num, init='Cao', verbose=1)
    numeric_indices = [4, 21]  # age, renta
    cat_indices = self.get_cat_cols(self.data, numeric_indices)
    self.data = self.convert_col_type(self.data, cat_indices)
    categorical_data = self.data[cat_indices]  # get category cols
    print(categorical_data.dtypes)
    model.fit(categorical_data)
    # Print cluster centroids of the trained model.
    print('k-modes (Cao) centroids:')
    print(model.cluster_centroids_)
    # Print training statistics
    print('Final training cost: {}'.format(model.cost_))
    print('Training iterations: {}'.format(model.n_iter_))
    return model.labels_
def run_kmodes(X, init_method='Huang', n_clusters=4):
    '''
    Perform k-modes clustering.

    :param X: prepared array for clustering
    :param init_method: centroid initialization method for the k-modes
        model, default = 'Huang'
    :param n_clusters: number of clusters to segment the data into,
        default = 4
    :returns: fitted k-modes model, array of cluster labels
    '''
    model = kmodes.KModes(n_clusters=n_clusters, n_init=10,
                          init=init_method, verbose=1)
    labels = model.fit_predict(X)
    return model, labels
def main():
    """Cluster movies into 10 groups by genre membership.

    Reads merged_data.csv, keeps genres appearing more than 5 times,
    builds a binary movie x genre matrix, runs k-modes, and writes one
    label per line to clusters.txt.
    """
    # Fix: both files were opened without ever being closed; use `with`.
    with open("merged_data.csv", "r") as movie_file:
        movies = [row for row in csv.reader(movie_file.read().splitlines())]

    # Count genre occurrences, skipping the header row.
    genre_map = {}
    for movie in movies[1:]:
        for genre in movie[4].split():
            genre_map[genre] = genre_map.get(genre, 0) + 1

    # Keep only genres seen more than 5 times.
    genre_list = [genre for genre in genre_map if genre_map[genre] > 5]
    print(genre_list)

    # Binary membership matrix; precompute genre -> column index to avoid
    # the O(n) list.index lookup inside the loop.
    genre_index = {genre: idx for idx, genre in enumerate(genre_list)}
    mat = np.zeros((5043, len(genre_list)))
    for i, movie in enumerate(movies[1:]):
        for genre in movie[4].split():
            if genre in genre_index:
                mat[i][genre_index[genre]] = 1

    km = kmodes.KModes(n_clusters=10, init='Huang', n_init=10, verbose=1)
    km.fit_predict(mat)

    with open("clusters.txt", "w") as cluster_labels:
        cluster_labels.write("\n")
        for label in km.labels_:
            cluster_labels.write(str(label))
            cluster_labels.write("\n")
    print(km.labels_)
def cluster_asmt(A, nclus, ntries):
    '''cluster_asmt clusters an assessment using the k-modes algorithm.

    Parameters:
        A, assessment matrix (# students x # questions)
        nclus, number of clusters
        ntries, number of k-modes restarts (n_init)

    Returns:
        c_indx, cluster index assigned to each student
        c_cent, centroid of each cluster
        c_distn, total distortion (scalar) of the clustering
    '''
    model = kmodes.KModes(n_clusters=nclus, init='Huang',
                          n_init=ntries, verbose=0)
    c_indx = model.fit_predict(A)
    c_cent = model.cluster_centroids_
    c_distn = distn_kmode(A, c_indx, c_cent)
    return c_indx, c_cent, c_distn
def huang():
    # Benchmark helper: one Huang-initialized k-modes run on the shared
    # module-level data; the labels are discarded.
    model = kmodes.KModes(n_clusters=K, init='Huang', n_init=1)
    model.fit_predict(data)
def test_pickle(self):
    """A default KModes instance round-trips through pickle."""
    original = kmodes.KModes()
    restored = pickle.loads(pickle.dumps(original))
    assert_equal(type(restored), original.__class__)
def cao():
    # Benchmark helper: one Cao-initialized k-modes run on the shared
    # module-level data; the labels are discarded.
    model = kmodes.KModes(n_clusters=K, init='Cao')
    model.fit_predict(data)
def test_kmodes_unknowninit_soybean(self):
    """An unrecognized init strategy must raise NotImplementedError."""
    model = kmodes.KModes(n_clusters=4, init='nonsense', verbose=2)
    with self.assertRaises(NotImplementedError):
        model.fit(SOYBEAN)
import numpy as np
from kmodes import kmodes

'''生成互相无交集的离散属性样本集'''
# Generate two categorical sample sets with disjoint value ranges
# (values 1-5 vs 7-11), 10000 rows x 10 attributes each.
data1 = np.random.randint(1, 6, (10000, 10))
data2 = np.random.randint(7, 12, (10000, 10))
print(data1.shape)
print(data2.shape)
# data2 first, then data1: first half of `data` comes from data2.
data = np.concatenate((data2, data1))
print(data.shape)
'''进行K-modes聚类'''
# Run 2-cluster k-modes on the combined data.
km = kmodes.KModes(n_clusters=2)
clusters = km.fit_predict(data)
# Print the cluster centroids.
print(km.cluster_centroids_)
'''计算正确归类率'''
# Accuracy proxy: assumes the first half is labeled 1 and the second
# half 0; the reversed labeling case is handled by the >= 0.5 check.
score = np.sum(clusters[:int(len(clusters) / 2)]) + (
    len(clusters) / 2 - np.sum(clusters[int(len(clusters) / 2):]))
print("np.sum(clusters[:int(len(clusters)/2)]):{}".format(
    np.sum(clusters[:int(len(clusters) / 2)])))
print(clusters[:int(len(clusters) / 2)])
print("np.sum(clusters[int(len(clusters)/2):]):{}".format(
    np.sum(clusters[int(len(clusters) / 2):])))
print(clusters[int(len(clusters) / 2):])
score = score / len(clusters)
# NOTE(review): score is divided by len(clusters) twice — the second
# division looks like an accidental duplicate; confirm before trusting
# the printed accuracy.
score = score / len(clusters)
if score >= 0.5:
    print('正确率:' + str(score))
else:
def test_kmodes_random_soybean(self):
    """Random initialization fits without error and returns the estimator."""
    model = kmodes.KModes(n_clusters=4, init='random', verbose=2)
    fitted = model.fit(SOYBEAN)
    self.assertIsInstance(fitted, kmodes.KModes)
def test_kmodes_predict_unfitted(self):
    """Predicting (or reading centroids) before fit must fail."""
    unfitted = kmodes.KModes(n_clusters=4, init='Cao', verbose=2)
    with self.assertRaises(AssertionError):
        unfitted.predict(SOYBEAN)
    with self.assertRaises(AttributeError):
        unfitted.cluster_centroids_
import pandas as pd
from kmodes import kmodes

# Load the data and one-hot encode the categorical columns.
df = pd.read_csv("C:/Users/milen/Desktop/Case_-_Cred.csv", sep=';',
                 decimal=',')
df.drop(columns=["Atualizado em", "StoneCode", "Descredenciado"],
        inplace=True)
df_dummy = pd.get_dummies(df)
x = df_dummy.reset_index().values
# Cluster the encoded rows into 2 groups (Huang init, 5 restarts).
km = kmodes.KModes(n_clusters=2, init='Huang', n_init=5, verbose=0)
clusters = km.fit_predict(x)
df_dummy['clusters'] = clusters

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

pca = PCA(2)
# Project the dummy matrix onto its first two principal components.
plot_columns = pca.fit_transform(df_dummy)
# Scatter-plot the projection colored by cluster assignment.
plt.scatter(x=plot_columns[:, 1], y=plot_columns[:, 0],
            c=df_dummy["clusters"], s=30)
plt.show()
    # Fragment: tail of a one-hot lookup table started above this chunk.
    'C': [0, 0, 1, 0],
    'D': [0, 0, 0, 1]
}
# Parse 'key:answers' rows; the letters after the colon are one-hot
# encoded via alpha_dic. Presumably row[0] is an email address (it is
# collected into emldata) — verify against the CSV schema.
ansdata = []
emldata = []
with open('MultiPInfodb.csv', 'r') as f:
    reader = csv.reader(f, delimiter=':')
    for row in reader:
        tem = []
        emldata.append(row[0])
        for x in row[1].split(' '):
            tem += alpha_dic[x]
        ansdata.append(tem)
# Redundant: the with-block above already closed the file.
f.close()
km = kmodes.KModes(n_clusters=4, init="Huang", n_init=5, verbose=1)
clusters = km.fit_predict(ansdata)
# Group the identifiers by assigned cluster.
grp = {0: [], 1: [], 2: [], 3: []}
ctr = 0
for index in clusters:
    grp[index].append(emldata[ctr])
    ctr += 1
file_lines = []
# Python 2 print statement — this file targets Python 2.
print grp
for x in grp.values():
    if x is None:
        file_lines.append("")
    else:
        txt = ""
# Fragment: continuation of a CSV-loading script started above this chunk.
rows = rows[1:]  # drop the header row
for row in rows:
    # Strip the trailing character (presumably a newline) from the last
    # field — TODO confirm against how `rows` was read.
    row[-1] = row[-1][:-1]
#rows = rows[1:]
print(rows[0][-1])
rows = np.array(rows)
syms = rows[:, 0]   # first column: record identifiers
print(len(syms))
X = rows[:, 1:-1]   # feature columns (drops the id and last column)
#print(syms[0:5])
#print(X[0:5])
#print(syms)
#print(X)
kproto = kmodes.KModes(n_clusters=6, init='Cao', verbose=2)
# NOTE(review): `categorical` is a k-prototypes concept; KModes appears
# to accept and ignore it here — confirm the intended estimator.
clusters = kproto.fit_predict(X, categorical=[0, 1, 2, 3])
newData = ["57139", "835106", "2", "Air Travel#Business Travel"]
# NOTE(review): predict typically expects a 2-D samples array; passing a
# flat list may be treated as multiple one-feature rows — verify.
cluster = kproto.predict(newData)
print(cluster[0])
#~ for s, c in zip(syms, clusters):
#~     print("Symbol: {}, cluster:{}".format(s, c))
bids = [i for i in range(0, len(syms))]
mod = list(list())
# Append each row's cluster label (fragment continues past this chunk).
for i in range(0, len(clusters)):
    old = rows[i]
    old = np.append(old, clusters[i])
#!/usr/bin/env python
import numpy as np
from kmodes import kmodes

# Reproduce results on the small soybean data set: features are every
# column but the last; labels come from column 35.
x = np.genfromtxt('soybean.csv', dtype=int, delimiter=',')[:, :-1]
y = np.genfromtxt('soybean.csv', dtype=str, delimiter=',', usecols=(35, ))

kmodes_huang = kmodes.KModes(n_clusters=4, init='Huang', verbose=1)
kmodes_huang.fit(x)
kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=1)
kmodes_cao.fit(x)

# Print a true-class x cluster contingency table for each model.
for model in (kmodes_huang, kmodes_cao):
    classtable = np.zeros((4, 4), dtype=int)
    for true_label, assigned in zip(y, model.labels_):
        # True class is encoded in the label's last character (D1..D4).
        classtable[int(true_label[-1]) - 1, assigned] += 1
    print("\n")
    print(" | Cl. 1 | Cl. 2 | Cl. 3 | Cl. 4 |")
    print("----|-------|-------|-------|-------|")
    for row_idx in range(4):
        prargs = tuple([row_idx + 1] + list(classtable[row_idx, :]))
        print(" D{0} | {1:>2} | {2:>2} | {3:>2} | {4:>2} |".format(*prargs))