Code example #1
    def test_kmodes_init_soybean(self):
        init_vals = np.array(
            [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
              0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
             [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
              0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
              1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
              0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
        kmodes_init = kmodes.KModes(n_clusters=4, init=init_vals, verbose=2)
        result = kmodes_init.fit_predict(SOYBEAN)
        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                             1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        assert_cluster_splits_equal(result, expected)

        # 5 initial centroids, 4 n_clusters
        init_vals = np.array(
            [[0, 1],
             [4, 0],
             [4, 0],
             [3, 0],
             [3, 0]])
        kmodes_init = kmodes.KModes(n_clusters=4, init=init_vals, verbose=2)
        with self.assertRaises(AssertionError):
            kmodes_init.fit(SOYBEAN)

        # wrong number of attributes
        init_vals = np.array(
            [0, 1, 2, 3])
        kmodes_init = kmodes.KModes(n_clusters=4, init=init_vals, verbose=2)
        with self.assertRaises(AssertionError):
            kmodes_init.fit(SOYBEAN)
Code example #2
def get_cluster_num_for_levels(data_df, target_col_name):
    for column in data_df.columns:
        if column != target_col_name:
            cluster_num_cost = {}
            feature_frame_train = data_df[[column, target_col_name]]
            logging.info('feature_frame_train is :: ' +
                         str(feature_frame_train.describe()))
            # for num_of_clusters in range(1, 5):
            #     logging.info('Number of clusters :: ' + str(num_of_clusters))
            #     km = kmodes.KModes(n_clusters=num_of_clusters, init='Huang', n_init=1, verbose=1)
            #     clusters = km.fit_predict(feature_frame_train)
            #     cluster_num_cost.update({num_of_clusters: km.cost_})
            #     logging.info('Cluster cost is :: ' + str(km.cost_))
            km = kmodes.KModes(n_clusters=4, init='Huang', n_init=5, verbose=1)
            clusters = km.fit_predict(feature_frame_train)
            cluster_num_cost.update({4: km.cost_})
            cluster_col = column + '_cluster_num'
            data_df[cluster_col] = clusters
            data_df[cluster_col] = data_df[cluster_col].astype('category')
            logging.info(cluster_num_cost)
            plt.bar(range(len(cluster_num_cost)),
                    list(cluster_num_cost.values()),
                    align='center')
            plt.xticks(range(len(cluster_num_cost)),
                       list(cluster_num_cost.keys()))
            plt.ylabel('Cost')
            plt.xlabel('Number of clusters')
            plt.savefig('../plots/preprocess2/' + cluster_col + "_" + '20' +
                        ".png")
            plt.gcf().clear()
    # plt.show()
    return data_df
Code example #3
 def test_kmodes_predict_soybean(self):
     kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=2)
     kmodes_cao = kmodes_cao.fit(SOYBEAN)
     result = kmodes_cao.predict(SOYBEAN2)
     expected = np.array([2, 1, 3, 0])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Code example #4
def kmode(filename,col_name):
    # Read sample for clustering from some file
    df=pd.read_csv(filename, usecols=[col_name])

    data = df[col_name]
    total_rows=len(data)


    # convert the pandas Series into a 2-D ndarray with a single column
    input_data = np.asarray(data)
    data = np.reshape(input_data, (total_rows, 1))

    # random categorical data

    km = kmodes.KModes(n_clusters=30,  n_init=5, verbose=2)
    clusters = km.fit_predict(data)

    # Print the cluster centroids
    print(km.cluster_centroids_)
    print(clusters)
    print(timeit.timeit('"-".join(str(n) for n in range(100))',number=10000))

#    datalist=[]
#    datalist.append()

    #dataframe = pd.DataFrame(np.random.randn(10, 2),
     #                        columns=['clusters', 'data1'])

    #fig = ff.create_scatterplotmatrix(dataframe, height=800, width=800)
    #py.iplot(fig, filename='Basic Scatterplot Matrix')
    #fig.show()

    fig = plt.figure(figsize=(7,4))
    plt.scatter(data, clusters, alpha=1, edgecolor='black')
    plt.savefig("C:/Users/Nupura Hajare/Desktop/flask_app/web/static/img/kmode.png")
Code example #5
def get_silhouette_score(df, X, n_clusters, model='KM'):
    '''
    Calculate silhouette score for clustered dataframe.

    :param df: dataframe to cluster
    :param X: dense binary array for silhouette scoring
    :param n_clusters: number of clusters for model to cluster data into
    :param model: the clustering algorithm to be applied to the data, default = 'KM' (k-modes)
    :returns: silhouette score
    '''
    # Initialize clusterer and set random state, if possible
    if model == 'AG':
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, affinity='cosine', linkage='average').fit(X)
        labels = clusterer.labels_
        sil_avg = silhouette_score(X, labels, metric='hamming')

    elif model == 'KM':
        clusterer = kmodes.KModes(n_clusters=n_clusters, n_init=5, init='Huang', verbose=1)
        labels = clusterer.fit_predict(df)
        sil_avg = silhouette_score(X, labels, metric='hamming')

    elif model == 'GM':
        clusterer = GaussianMixture(n_components=n_clusters, covariance_type='tied', max_iter=20, n_init=50, random_state=42, verbose=1).fit(X)
        labels = clusterer.predict(X)
        sil_avg = silhouette_score(X, labels, metric='hamming')

    return sil_avg
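
A minimal usage sketch for the helper above, under assumed data: the toy dataframe, its pd.get_dummies encoding, and the loop over cluster counts are illustrations, not part of the original project (the helper's own imports, kmodes and silhouette_score, are assumed to already be in scope).

import numpy as np
import pandas as pd

# Assumed toy data: 100 rows with two categorical columns.
rng = np.random.RandomState(0)
toy_df = pd.DataFrame({'color': rng.choice(['red', 'green', 'blue'], 100),
                       'size': rng.choice(['S', 'M', 'L'], 100)})
# Dense binary encoding used only for silhouette scoring.
toy_X = pd.get_dummies(toy_df).to_numpy().astype(int)

for k in (2, 3, 4):
    print(k, get_silhouette_score(toy_df, toy_X, n_clusters=k, model='KM'))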
Code example #6
 def test_kmodes_cao_soybean(self):
     kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=2)
     result = kmodes_cao.fit_predict(SOYBEAN)
     expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                          1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
     np.testing.assert_array_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Code example #7
 def test_kmodes_nunique_nclusters(self):
     data = np.array([[0, 1], [0, 1], [0, 1], [0, 2], [0, 2], [0, 2]])
     np.random.seed(42)
     kmodes_cao = kmodes.KModes(n_clusters=6, init='Cao', verbose=2)
     result = kmodes_cao.fit_predict(data, categorical=[1])
     expected = np.array([0, 0, 0, 1, 1, 1])
     np.testing.assert_array_equal(result, expected)
     np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
                                   np.array([[0, 1], [0, 2]]))
Code example #8
def kmode(y, x):
    kmodes_huang = kmodes.KModes(n_clusters=10, init='Huang', verbose=1)
    kmodes_huang.fit(x)
    # Print cluster centroids of the trained model.
    print('k-modes (Huang) centroids:')
    print(kmodes_huang.cluster_centroids_)
    # Print training statistics
    print('Final training cost: {}'.format(kmodes_huang.cost_))
    print('Training iterations: {}'.format(kmodes_huang.n_iter_))
Code example #9
 def test_kmodes_huang_soybean(self):
     np.random.seed(42)
     kmodes_huang = kmodes.KModes(n_clusters=4, n_init=2, init='Huang', verbose=2)
     result = kmodes_huang.fit_predict(SOYBEAN)
     expected = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1,
                          2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Code example #10
 def return_classifier(self, k):
     if self.algorithm == "kmodes":
         return kmodes.KModes(n_clusters=k,
                              init='Huang',
                              n_init=1,
                              verbose=1)
     elif self.algorithm == "kmeans":
         if len(self.df) > self.max_lines:
             return MiniBatchKMeans(n_clusters=k, random_state=1)
         else:
             return KMeans(n_clusters=k, random_state=1)
Code example #11
File: clustering.py  Project: janaldous/fypreport
def get_subcluster_list(cluster, data, norefresh=True):
    """
        @cluster Cluster model
        @data pandas.DataFrame object

        adapted from https://github.com/nicodv/kmodes/blob/master/examples/soybean.py
    """

    num_of_clusters = cluster.num_of_clusters

    clus_train, serials = _get_clus_train(data)

    if Subcluster.objects.filter(group=cluster).exists() and norefresh:
        #make dataframe from db
        query_results = Subcluster.objects.filter(group=cluster)
        serials = []
        subclusters = []
        for row in query_results:
            serials.append(float(row.serial))
            subclusters.append(row.subcluster)
        df = pd.DataFrame({
            'SERIAL': serials,
            'cluster': subclusters,
        })

        data = data[data['SERIAL'].isin(df.SERIAL.tolist())]
        merged_db = pd.merge(df, data, on='SERIAL')

        return merged_db

    Subcluster.objects.filter(group=cluster).delete()

    x = clus_train.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0

    kmodes_huang = kmodes.KModes(n_clusters=num_of_clusters,
                                 init='Huang',
                                 verbose=1)
    kmodes_huang.fit(x)

    labels = kmodes_huang.labels_

    clus_train['cluster'] = labels

    clus_train['SERIAL'] = serials

    for i in range(clus_train.shape[0]):
        row = clus_train.iloc[i]
        s = Subcluster(serial=row.SERIAL,
                       group=cluster,
                       subcluster=row.cluster)
        s.save()

    return clus_train
Code example #12
 def test_kmodes_cao_soybean_ng(self):
     kmodes_cao = kmodes.KModes(n_clusters=4,
                                init='Cao',
                                verbose=2,
                                cat_dissim=ng_dissim)
     result = kmodes_cao.fit_predict(SOYBEAN)
     expected = np.array([
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0
     ])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Code example #13
    def Cluster(self, feature_matrix, CR):
        for pred in feature_matrix:
            features = []
            for atom_value in feature_matrix[pred]:
                features += [feature_matrix[pred][atom_value]]
            print(pred, len(features[0]))

            n_of_clusters = math.floor(len(features) * CR)
            km = kmodes.KModes(n_clusters=n_of_clusters,
                               init='Huang',
                               n_init=1,
                               verbose=1)
            clusters = km.fit_predict(features)
            print(clusters)
Code example #14
 def test_kmodes_empty_init_cluster_soybean(self):
     # Check if the clustering does not crash in case of an empty cluster.
     init_vals = np.array(
         [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
           0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
          [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
           0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
          [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
           1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
          [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
           0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
     kmodes_init = kmodes.KModes(n_clusters=4, init=init_vals, verbose=2)
     result = kmodes_init.fit(SOYBEAN)
     self.assertIsInstance(result, kmodes.KModes)
Code example #15
File: cluster.py  Project: terry2012/UiRef
def run_kmodes(syms, X, n, alpha):
    if os.path.isfile("%d_CLUSTERS.pkl" % (n, )):
        X_ENC, clusters, centroids = pickle.load(
            open("%d_CLUSTERS.pkl" % (n, ), "r"))
        sil_avg = silhouette_score(X_ENC, clusters, metric=simple_compare)
        return (np.amax(clusters) + 1, sil_avg)

    kproto = kmodes.KModes(n_clusters=n,
                           init='Cao',
                           cat_dissim=dissim_meas,  # custom categorical dissimilarity
                           verbose=2)
    clusters = kproto.fit_predict(X)
    centroids = kproto.cluster_centroids_

    X_ENC = kmodes.encode_features(X, enc_map=kproto.enc_map_)[0]
    #    densities = calc_densities(X_ENC)

    #    bigClusters = getBigClusters(clusters, alpha)
    #    while len(bigClusters) <= 0:
    #        print "Error no big clusters, decreasing alpha to", (alpha-1)
    #        alpha -= 1
    #        if alpha <= 0:
    #            sys.exit(1)
    #        bigClusters = getBigClusters(clusters, alpha)

    #    cdists = cdist(bigClusters, clusters, syms, X_ENC, centroids)

    #    ranked_outliers = sorted([(c, -densities[idc]/float(len(X_ENC)), syms[idc], clusters[idc]) for idc,c in enumerate(cdists)], reverse=True)

    #    for ido,o in enumerate(ranked_outliers):
    #        index = [ idr for idr,rv in enumerate(syms) if rv == o[2] ][0]
    #        vals = [ v for v in X[index] if not v.startswith("N_") ]
    #        print o, vals

    with open("%d_CLUSTERS.pkl" % (n, ), "wb") as foutput:
        pickle.dump((X_ENC, clusters, centroids), foutput)

    # Distances may be calculated using Euclidean distance. The silhouette
    # coefficient and its average range from -1, indicating a very poor model,
    # to 1, indicating an excellent model. As found by Kaufman and Rousseeuw (1990),
    # an average silhouette greater than 0.5 indicates reasonable partitioning of
    # the data; less than 0.2 means that the data do not exhibit cluster structure.
    #print "Precomputing Matrix"
    #X_PCMP =precomputMatrix(X_ENC, clusters)
    #sil_avg = silhouette_score(X_PCMP, clusters, metric='precomputed')

    sil_avg = silhouette_score(X_ENC, clusters, metric=simple_compare)

    return (np.amax(clusters) + 1, sil_avg)
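
The comment block above gives the usual rule of thumb for reading the average silhouette. As a small illustration of those thresholds (the helper name is made up; the cut-offs are the ones quoted in the comment):

def describe_silhouette(sil_avg):
    # Rule-of-thumb interpretation from Kaufman and Rousseeuw (1990),
    # as quoted in the comment block above.
    if sil_avg > 0.5:
        return 'reasonable partitioning'
    if sil_avg < 0.2:
        return 'no real cluster structure'
    return 'weak or ambiguous structure'

print(describe_silhouette(0.62))  # -> reasonable partitioning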
Code example #16
File: dataset.py  Project: cwoidyla/kaggle
 def k_modes(self, clust_num):
     kmodes_cao = kmodes.KModes(n_clusters=clust_num, init='Cao', verbose=1)
     num_cols = [4, 21]  # age, renta
     cat_data_indices = self.get_cat_cols(self.data, num_cols)
     self.data = self.convert_col_type(self.data, cat_data_indices)
     categorical_data = self.data[cat_data_indices]  # get category cols
     print(categorical_data.dtypes)
     kmodes_cao.fit(categorical_data)
     # Print cluster centroids of the trained model.
     print('k-modes (Cao) centroids:')
     print(kmodes_cao.cluster_centroids_)
     # Print training statistics
     print('Final training cost: {}'.format(kmodes_cao.cost_))
     print('Training iterations: {}'.format(kmodes_cao.n_iter_))
     return kmodes_cao.labels_
Code example #17
def run_kmodes(X, init_method='Huang', n_clusters=4):
    '''
    Perform k-modes clustering.

    :param X: prepared array for clustering
    :param init_method: initiation method for k-prototypes clustering, default = 'Huang'
    :param n_clusters: number of clusters for model to segment data, default = 4
    :returns: k-modes models, array of labels
    '''
    km = kmodes.KModes(n_clusters=n_clusters,
                       n_init=10,
                       init=init_method,
                       verbose=1)
    labels = km.fit_predict(X)
    return km, labels
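
An illustrative call to the wrapper above; the random integer matrix stands in for a prepared categorical array and is an assumption of this sketch:

import numpy as np

X_demo = np.random.randint(0, 4, size=(60, 6))  # 60 rows, 6 categorical attributes
km_model, km_labels = run_kmodes(X_demo, init_method='Cao', n_clusters=3)
print('cost:', km_model.cost_)
print('cluster sizes:', np.bincount(km_labels))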
Code example #18
File: kmodes_movie.py  Project: yoshnee/BDS-Proj
def main():
    movie_file = open("merged_data.csv", "r")
    movies = [row for row in csv.reader(movie_file.read().splitlines())]
    genre_map = {}
    i = 0
    for movie in movies:
        if i > 0:
            genres = movie[4].split()
            for genre in genres:
                if genre not in genre_map:
                    genre_map[genre] = 1
                else:
                    genre_map[genre] += 1
        i += 1

    genre_list = []
    for genre in genre_map.keys():
        if genre_map[genre] > 5:
            genre_list.append(genre)

    print(genre_list)
    mat = np.zeros((5043, len(genre_list)))

    i = 0
    for movie in movies:
        if i > 0:
            genres = movie[4].split()
            for genre in genres:
                if genre in genre_list:
                    mat[i - 1][genre_list.index(genre)] = 1

        i += 1

    km = kmodes.KModes(n_clusters=10, init='Huang', n_init=10, verbose=1)
    clusters = km.fit_predict(mat)

    cluster_labels = open("clusters.txt", "w")
    cluster_labels.write("\n")
    for label in km.labels_:
        cluster_labels.write(str(label))
        cluster_labels.write("\n")
    print(km.labels_)
Code example #19
def cluster_asmt(A, nclus, ntries):
    '''cluster_asmt clusters an assessment using kmodes algorithm

    Parameters:
    A, assessment (# students x # questions)
    nclus, number of clusters
    ntries, number of times to try kmodes algorithm

    Returns:
    c_indx, the index of the cluster to which the student is assigned
    c_cent, the centroid of each cluster
    c_distn, the total distortion (scalar) of the clustering
    '''

    km = kmodes.KModes(n_clusters=nclus,
                       init='Huang',
                       n_init=ntries,
                       verbose=0)
    c_indx = km.fit_predict(A)
    c_cent = km.cluster_centroids_
    c_distn = distn_kmode(A, c_indx, c_cent)

    return c_indx, c_cent, c_distn
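
A hypothetical call to cluster_asmt, assuming a binary right/wrong assessment matrix; distn_kmode comes from elsewhere in the original project, so this sketch only illustrates the intended shapes:

import numpy as np

# Assumed data: 80 students answering 15 questions (1 = correct, 0 = incorrect).
A_demo = np.random.randint(0, 2, size=(80, 15))
c_indx, c_cent, c_distn = cluster_asmt(A_demo, nclus=3, ntries=5)
print(c_cent.shape)  # (3, 15): one modal answer pattern per cluster
print(c_distn)       # total distortion as computed by distn_kmode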
Code example #20
File: benchmark.py  Project: robin-lai/kmodes
def huang():
    kmodes.KModes(n_clusters=K, init='Huang', n_init=1).fit_predict(data)
Code example #21
 def test_pickle(self):
     obj = kmodes.KModes()
     s = pickle.dumps(obj)
     assert_equal(type(pickle.loads(s)), obj.__class__)
Code example #22
File: benchmark.py  Project: robin-lai/kmodes
def cao():
    kmodes.KModes(n_clusters=K, init='Cao').fit_predict(data)
Code example #23
 def test_kmodes_unknowninit_soybean(self):
     with self.assertRaises(NotImplementedError):
         kmodes.KModes(n_clusters=4, init='nonsense', verbose=2).fit(SOYBEAN)
Code example #24
import numpy as np
from kmodes import kmodes
'''Generate two sets of categorical samples whose value ranges do not overlap'''
data1 = np.random.randint(1, 6, (10000, 10))
data2 = np.random.randint(7, 12, (10000, 10))

print(data1.shape)
print(data2.shape)
data = np.concatenate((data2, data1))

print(data.shape)
'''Run k-modes clustering'''
km = kmodes.KModes(n_clusters=2)
clusters = km.fit_predict(data)

# Print the cluster centroids
print(km.cluster_centroids_)
'''Compute the proportion of correctly clustered samples'''
score = np.sum(clusters[:int(len(clusters) / 2)]) + (
    len(clusters) / 2 - np.sum(clusters[int(len(clusters) / 2):]))
print("np.sum(clusters[:int(len(clusters)/2)]):{}".format(
    np.sum(clusters[:int(len(clusters) / 2)])))
print(clusters[:int(len(clusters) / 2)])
print("np.sum(clusters[int(len(clusters)/2):]):{}".format(
    np.sum(clusters[int(len(clusters) / 2):])))
print(clusters[int(len(clusters) / 2):])
score = score / len(clusters)
if score >= 0.5:
    print('Accuracy: ' + str(score))
else:
    print('Accuracy: ' + str(1 - score))
Code example #25
 def test_kmodes_random_soybean(self):
     kmodes_random = kmodes.KModes(n_clusters=4, init='random', verbose=2)
     result = kmodes_random.fit(SOYBEAN)
     self.assertIsInstance(result, kmodes.KModes)
Code example #26
 def test_kmodes_predict_unfitted(self):
     kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=2)
     with self.assertRaises(AssertionError):
         kmodes_cao.predict(SOYBEAN)
     with self.assertRaises(AttributeError):
         kmodes_cao.cluster_centroids_
Code example #27
import pandas as pd
from kmodes import kmodes

# Load the data and convert it to dummy variables
df = pd.read_csv("C:/Users/milen/Desktop/Case_-_Cred.csv",
                 sep=';',
                 decimal=',')
df.drop(columns=["Atualizado em", "StoneCode", "Descredenciado"], inplace=True)
df_dummy = pd.get_dummies(df)

x = df_dummy.reset_index().values

km = kmodes.KModes(n_clusters=2, init='Huang', n_init=5, verbose=0)
clusters = km.fit_predict(x)
df_dummy['clusters'] = clusters

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

pca = PCA(2)

# Project the dummy matrix onto two PCA components
plot_columns = pca.fit_transform(df_dummy)

# Plot the groups
plt.scatter(x=plot_columns[:, 1],
            y=plot_columns[:, 0],
            c=df_dummy["clusters"],
            s=30)
plt.show()
Code example #28
import csv

from kmodes import kmodes

# One-hot encoding of the multiple-choice answers
alpha_dic = {
    'A': [1, 0, 0, 0],
    'B': [0, 1, 0, 0],
    'C': [0, 0, 1, 0],
    'D': [0, 0, 0, 1]
}
ansdata = []
emldata = []
with open('MultiPInfodb.csv', 'r') as f:
    reader = csv.reader(f, delimiter=':')
    for row in reader:
        tem = []
        emldata.append(row[0])
        for x in row[1].split(' '):
            tem += alpha_dic[x]
        ansdata.append(tem)
f.close()

km = kmodes.KModes(n_clusters=4, init="Huang", n_init=5, verbose=1)
clusters = km.fit_predict(ansdata)

grp = {0: [], 1: [], 2: [], 3: []}
ctr = 0
for index in clusters:
    grp[index].append(emldata[ctr])

    ctr += 1
file_lines = []
print(grp)
for x in grp.values():
    if x is None:
        file_lines.append("")
    else:
        txt = ""
Code example #29
rows = rows[1:]
for row in rows:
    row[-1] = row[-1][:-1]
#rows = rows[1:]
print(rows[0][-1])
rows = np.array(rows)
syms = rows[:, 0]
print(len(syms))
X = rows[:, 1:-1]

#print(syms[0:5])
#print(X[0:5])
#print(syms)
#print(X)

kproto = kmodes.KModes(n_clusters=6, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[0, 1, 2, 3])

newData = ["57139", "835106", "2", "Air Travel#Business Travel"]
cluster = kproto.predict([newData])  # predict expects a 2-D array, so wrap the single row
print(cluster[0])

#~ for s, c in zip(syms, clusters):
#~ print("Symbol: {}, cluster:{}".format(s, c))

bids = [i for i in range(0, len(syms))]

mod = list(list())
for i in range(0, len(clusters)):
    old = rows[i]
    old = np.append(old, clusters[i])
Code example #30
File: soybean.py  Project: robin-lai/kmodes
#!/usr/bin/env python

import numpy as np
from kmodes import kmodes

# reproduce results on small soybean data set
x = np.genfromtxt('soybean.csv', dtype=int, delimiter=',')[:, :-1]
y = np.genfromtxt('soybean.csv', dtype=str, delimiter=',', usecols=(35, ))

kmodes_huang = kmodes.KModes(n_clusters=4, init='Huang', verbose=1)
kmodes_huang.fit(x)
kmodes_cao = kmodes.KModes(n_clusters=4, init='Cao', verbose=1)
kmodes_cao.fit(x)

for result in (kmodes_huang, kmodes_cao):
    classtable = np.zeros((4, 4), dtype=int)
    for ii, _ in enumerate(y):
        classtable[int(y[ii][-1]) - 1, result.labels_[ii]] += 1

    print("\n")
    print("    | Cl. 1 | Cl. 2 | Cl. 3 | Cl. 4 |")
    print("----|-------|-------|-------|-------|")
    for ii in range(4):
        prargs = tuple([ii + 1] + list(classtable[ii, :]))
        print(" D{0} |    {1:>2} |    {2:>2} |    {3:>2} |    {4:>2} |".format(
            *prargs))