Example #1
0
 def test_kprotoypes_huang_stocks(self):
     """Fit k-prototypes with ``init='Huang'`` on the STOCKS fixture.

     Checks that ``predict`` on the unfitted model raises
     ``AssertionError``, that the labels from ``fit_predict`` pass
     ``assert_cluster_splits_equal`` against the expected labels, and that
     the labels have dtype ``uint8``.
     """
     # Seeds the global NumPy RNG; no ``random_state`` is given to the model.
     np.random.seed(42)
     kproto_huang = kprototypes.KPrototypes(n_clusters=4,
                                            n_init=1,
                                            init='Huang',
                                            verbose=2)
     # Untrained model
     with self.assertRaises(AssertionError):
         kproto_huang.predict(STOCKS, categorical=[1, 2])
     result = kproto_huang.fit_predict(STOCKS, categorical=[1, 2])
     expected = np.array([0, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1])
     assert_cluster_splits_equal(result, expected)
     # assertEqual shows both dtypes on failure; assertTrue(a == b) only
     # reports "False is not true".
     self.assertEqual(result.dtype, np.dtype(np.uint8))
Example #2
0
 def test_kprotoypes_impossible_init(self):
     """Expect ``ValueError`` from a 6-cluster Cao fit on this fixture.

     The fixture has six rows: the first column is always ``0.`` and the
     second column is 'Regular' three times and 'Slim' three times.
     """
     rows = [[0., 'Regular']] * 3 + [[0., 'Slim']] * 3
     data = np.array(rows)
     np.random.seed(42)
     model = kprototypes.KPrototypes(n_clusters=6, init='Cao', verbose=2)
     self.assertRaises(ValueError, model.fit_predict, data, categorical=[1])
 def test_kmodes_fit_predict_equality(self):
     """Check that ``fit_predict`` agrees with ``fit`` followed by ``predict``."""
     estimator = kprototypes.KPrototypes(n_clusters=3,
                                         init='Cao',
                                         random_state=42)
     # One weight of 0.5 per row of STOCKS, shared by both fits.
     weights = [0.5] * STOCKS.shape[0]

     # Two-step path: fit, then predict on the same data.
     fitted = estimator.fit(STOCKS, categorical=[1, 2], sample_weight=weights)
     labels_two_step = fitted.predict(STOCKS, categorical=[1, 2])

     # One-step path on the same estimator object.
     labels_one_step = estimator.fit_predict(STOCKS,
                                             categorical=[1, 2],
                                             sample_weight=weights)

     assert_cluster_splits_equal(labels_two_step, labels_one_step)
 def test_k_prototypes_sample_weight_all_but_one_zero(self):
     """Check the single centroid equals the only row with non-zero weight."""
     n_samples = 2
     estimator = kprototypes.KPrototypes(n_clusters=1,
                                         init='Cao',
                                         random_state=42)
     for weighted_row in range(n_samples):
         # Weight vector that is 1 at ``weighted_row`` and 0 elsewhere.
         weights = np.zeros(n_samples)
         weights[weighted_row] = 1
         fitted = estimator.fit(STOCKS[:n_samples, :],
                                categorical=[1, 2],
                                sample_weight=weights)
         centroid = fitted.cluster_centroids_[0, :]
         np.testing.assert_array_equal(centroid, STOCKS[weighted_row, :])
Example #5
0
 def test_kprotoypes_init_stocks_ng(self):
     """Fit with explicit initial centroids and ``ng_dissim`` on STOCKS.

     ``init`` is a two-element list: a (4, 1) array followed by a (4, 2)
     array. Checks the resulting labels against the expected labels with
     ``assert_cluster_splits_equal`` and that their dtype is ``uint16``.
     """
     init_vals = [
         np.array([[356.975], [275.35], [738.5], [197.667]]),
         np.array([[3, 2], [0, 2], [3, 2], [2, 2]])
     ]
     kproto_init = kprototypes.KPrototypes(n_clusters=4,
                                           init=init_vals,
                                           verbose=2,
                                           cat_dissim=ng_dissim,
                                           random_state=42)
     result = kproto_init.fit_predict(STOCKS, categorical=[1, 2])
     expected = np.array([2, 0, 0, 0, 0, 1, 1, 1, 1, 3, 3, 3])
     assert_cluster_splits_equal(result, expected)
     # assertEqual shows both dtypes on failure; assertTrue(a == b) does not.
     self.assertEqual(result.dtype, np.dtype(np.uint16))
Example #6
0
 def test_kprotoypes_missings(self):
     """Expect ``ValueError`` when the first ``init`` array contains NaN."""
     init_vals = [
         # ``np.nan`` rather than ``np.NaN``: the capitalised alias was
         # removed in NumPy 2.0.
         np.array([[356.975],
                   [275.35],
                   [738.5],
                   [np.nan]]),
         np.array([[3, 2],
                   [0, 2],
                   [3, 2],
                   [2, 2]])
     ]
     kproto_init = kprototypes.KPrototypes(n_clusters=4, init=init_vals, verbose=2)
     with self.assertRaises(ValueError):
         kproto_init.fit_predict(STOCKS, categorical=[1, 2])
Example #7
0
 def test_kprotoypes_nunique_nclusters(self):
     """Request 6 clusters on data with two distinct rows; expect 2 centroids."""
     rows = [[0., 'Regular']] * 3 + [[1., 'Slim']] * 3
     data = np.array(rows)
     model = kprototypes.KPrototypes(n_clusters=6, init='Cao',
                                     verbose=2, random_state=42)
     model.fit_predict(data, categorical=[1])
     # Check if there are only 2 clusters: both centroid parts (indices 0
     # and 1 of cluster_centroids_) must have shape (2, 1).
     for part in (0, 1):
         self.assertEqual(model.cluster_centroids_[part].shape, (2, 1))
Example #8
0
    def test_kprotoypes_init_stocks(self):
        """Fit on STOCKS with explicit initial centroids.

        First passes the two ``init`` arrays in swapped order and expects
        ``AssertionError``; then passes them as (4, 1) array followed by
        (4, 2) array and checks the exact labels and the ``uint8`` dtype.
        """
        # Wrong order
        init_vals = [
            np.array([[3, 2], [0, 2], [3, 2], [2, 2]]),
            np.array([[356.975], [275.35], [738.5], [197.667]])
        ]
        kproto_init = kprototypes.KPrototypes(n_clusters=4,
                                              init=init_vals,
                                              verbose=2)
        with self.assertRaises(AssertionError):
            kproto_init.fit_predict(STOCKS, categorical=[1, 2])

        init_vals = [
            np.array([[356.975], [275.35], [738.5], [197.667]]),
            np.array([[3, 2], [0, 2], [3, 2], [2, 2]])
        ]
        # Seeds the global NumPy RNG; no ``random_state`` is given to the model.
        np.random.seed(42)
        kproto_init = kprototypes.KPrototypes(n_clusters=4,
                                              init=init_vals,
                                              verbose=2)
        result = kproto_init.fit_predict(STOCKS, categorical=[1, 2])
        expected = np.array([2, 0, 0, 0, 0, 1, 1, 1, 1, 3, 3, 3])
        np.testing.assert_array_equal(result, expected)
        # assertEqual shows both dtypes on failure; assertTrue(a == b) does not.
        self.assertEqual(result.dtype, np.dtype(np.uint8))
def run_kproto(X, cat_cols, init_method='Cao', n_clusters=4,
               n_init=10, max_iter=5, verbose=2):
    '''
    Perform k-prototypes clustering.

    :param X: prepared array for clustering
    :param cat_cols: list of index positions for categorical variables
    :param init_method: initiation method for k-prototypes clustering, default = 'Cao'
    :param n_clusters: number of clusters for model to segment data, default = 4
    :param n_init: forwarded to KPrototypes as ``n_init``, default = 10
    :param max_iter: forwarded to KPrototypes as ``max_iter``, default = 5
    :param verbose: forwarded to KPrototypes as ``verbose``, default = 2
    :returns: tuple of (fitted k-prototypes model, array of labels)
    '''
    # The three trailing parameters used to be hard-coded; their defaults
    # reproduce the previous behaviour for existing callers.
    kp = kprototypes.KPrototypes(n_clusters=n_clusters,
                                 init=init_method,
                                 n_init=n_init,
                                 max_iter=max_iter,
                                 verbose=verbose)
    labels = kp.fit_predict(X, categorical=cat_cols)
    return kp, labels
Example #10
0
def users_clustering(users_ids, users_bio, users_tweet, my_data):
    """Cluster users with k-prototypes and write the results to disk.

    For every user a feature row is built from ten profile/cascade values
    followed by the user's bio vector and tweet vector. The rows are
    clustered into 3600 clusters ('Huang' initialisation, ``n_init=1``)
    with columns 0, 1, 3 and 6 treated as categorical.

    Outputs (all in the current working directory):
      * ``results1_text.p``   - pickled label array
      * ``results11_text.p``  - pickled dict mapping row index -> user id
      * ``results1_text.txt`` - one label per line

    :param users_ids: sequence of keys into ``my_data``
    :param users_bio: indexable of objects with ``.tolist()``, aligned with
        ``users_ids``
    :param users_tweet: indexable of objects with ``.tolist()``, aligned
        with ``users_ids``
    :param my_data: mapping user id -> dict with ``'profile_features'`` and
        ``'cascades_feature'`` entries
    :returns: None
    """
    users_dataset = []
    data_set = {}
    for i, user in enumerate(users_ids):
        profile = my_data[user]['profile_features']
        row = [
            1 if profile['profile_background_tile'] else 0,
            1 if profile['profile_use_background_image'] else 0,
            # Only the length of the screen name is used as a feature.
            len(profile['screen_name']),
            1 if profile['verified'] else 0,
            profile['statuses_count'],
            profile['favourites_count'],
            1 if profile['has_extended_profile'] else 0,
            profile['friends_count'],
            profile['followers_count'],
            len(my_data[user]['cascades_feature']),
        ]
        users_dataset.append(row + users_bio[i].tolist() +
                             users_tweet[i].tolist())

        # Remember which user each row index belongs to.
        data_set[i] = user

    logging.info("making data matrix finished.")

    users_dataset = np.array(users_dataset)
    logging.info('data set created')
    kproto_init = kprototypes.KPrototypes(n_clusters=3600,
                                          init="Huang",
                                          verbose=2,
                                          n_init=1)
    logging.info('go for learning clusters')
    # Categorical columns: background tile, background image, verified,
    # extended profile (positions 0, 1, 3, 6 of the row built above).
    result = kproto_init.fit_predict(users_dataset, categorical=[0, 1, 3, 6])
    logging.info("model fit-predict result:%s", result)

    # Context managers make sure every output file is flushed and closed.
    with open('results1_text.p', 'wb') as labels_file:
        pickle.dump(result, labels_file)
    with open('results11_text.p', 'wb') as index_file:
        pickle.dump(data_set, index_file)
    with open('results1_text.txt', 'w') as f:
        # Join the individual labels; joining str(result) would put every
        # character of the array's repr on its own line.
        f.write("\n".join(str(label) for label in result))
Example #11
0
 def test_kprotoypes_not_stuck_initialization(self):
     """Check a 6-cluster Cao fit on this fixture sets ``cluster_centroids_``.

     The fixture has 25 rows: a numeric column with values in
     {-0.5, 0, 0.5} and a second column holding 'Regular', 'Slim' or NaN.
     """
     # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed
     # in NumPy 2.0.
     init_problem = np.array([[0, 'Regular'],
                              [0, 'Regular'], [0, 'Regular'], [0, np.nan],
                              [-0.5, 'Regular'], [-0.5, 'Regular'],
                              [0, np.nan], [0, 'Regular'], [0, 'Regular'],
                              [0, 'Slim'], [0, 'Regular'], [0, 'Regular'],
                              [0.5, 'Regular'], [-0.5, 'Regular'],
                              [0.5, 'Regular'], [0.5, 'Slim'],
                              [0, 'Regular'],
                              [0.5, 'Regular'], [0, 'Regular'],
                              [-0.5, 'Regular'], [0, np.nan], [0, np.nan],
                              [0, 'Regular'], [0, 'Regular'],
                              [0, 'Regular']])
     kproto_cao = kprototypes.KPrototypes(n_clusters=6,
                                          init='Cao',
                                          verbose=2,
                                          random_state=42)
     kproto_cao = kproto_cao.fit(init_problem, categorical=[1])
     self.assertTrue(hasattr(kproto_cao, 'cluster_centroids_'))
Example #12
0
    def k_prototype(self, clust_num, clustees):
        """Cluster ``self.data`` with k-prototypes and return the labels.

        Columns 4 and 21 are passed to ``self.get_cat_cols`` as the numeric
        columns; the indices it returns are used as the categorical columns
        and ``self.data`` is replaced by the output of
        ``self.convert_col_type``. Centroids, cost and iteration count are
        printed after fitting.

        :param clust_num: number of clusters to fit
        :param clustees: not used by the active code (only referenced by
            the commented-out debugging loop below)
        :returns: the labels returned by ``fit_predict``
        """
        print("Starting k-prototypes clustering...")
        model = kprototypes.KPrototypes(n_clusters=clust_num,
                                        init='Cao',
                                        verbose=2)

        numeric_columns = [4, 21]  # age, renta
        categorical_columns = self.get_cat_cols(self.data, numeric_columns)
        self.data = self.convert_col_type(self.data, categorical_columns)
        #print(self.data.dtypes)

        labels = model.fit_predict(self.data.values,
                                   categorical=categorical_columns)

        print("cluster centroids of the trained model.")
        print(model.cluster_centroids_)
        print("training statistics")
        print(model.cost_)
        print(model.n_iter_)

        #for s, c in zip(clustees, clusters):
        #    print("CustID: {}, cluster:{}".format(s, c))
        return labels
Example #13
0
def cluster_from_pickle(number_of_clusters=3600):
    """Cluster pickled user feature vectors and pickle the grouped result.

    Loads a dict from ``users_feature.p`` under ``ROOT_DIR``, clusters its
    values with k-prototypes ('Huang' initialisation, ``n_init=1``, columns
    0, 1, 3 and 6 categorical) and writes a dict mapping each cluster label
    to the list of feature vectors assigned to it into
    ``users_vectprs_clustering.p`` in the current working directory.

    :param number_of_clusters: number of clusters to fit, default 3600
    :returns: None
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with a 'users_feature.p' that comes from a trusted source.
    features_path = os.path.join(ROOT_DIR, 'users_feature.p')
    with open(features_path, 'rb') as features_file:
        user_features = pickle.load(features_file)

    users_features_vectors = list(user_features.values())
    users_dataset = np.array(users_features_vectors)
    print(users_dataset[1])
    kproto_init = kprototypes.KPrototypes(n_clusters=number_of_clusters,
                                          init="Huang",
                                          verbose=2,
                                          n_init=1)
    result = kproto_init.fit_predict(users_dataset, categorical=[0, 1, 3, 6])

    # Group the original (un-converted) feature vectors by cluster label.
    clustering_result = {}
    for label, vector in zip(result, users_features_vectors):
        clustering_result.setdefault(label, []).append(vector)

    # The context manager guarantees the pickle is flushed and closed.
    with open('users_vectprs_clustering.p', 'wb') as file_to_write:
        pickle.dump(clustering_result, file_to_write)
Example #14
0
 def test_kprototypes_sample_weights_validation(self):
     """Check that invalid ``sample_weight`` inputs raise ``ValueError``.

     Each case pairs a weight list with the message pattern the raised
     ``ValueError`` must match.
     """
     kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
     invalid_cases = (
         # 11 weights
         ([1] * 11,
          "sample_weight should be of equal size as samples."),
         # first weight negative
         ([-1] + [1] * 11,
          "sample_weight elements should be positive."),
         # first weight not a number
         ([None] + [1] * 11,
          "sample_weight elements should either be int or floats."),
     )
     for weights, message in invalid_cases:
         with self.assertRaisesRegex(ValueError, message):
             kproto.fit_predict(STOCKS,
                                categorical=[1, 2],
                                sample_weight=weights)
Example #15
0
def kprototype(filename, num_clusters):
    """Run k-prototypes on a CSV file and print a summary of the result.

    The first ten columns (0-9) are used as categorical features and the
    columns from index 11 onwards as numerical features, which are
    L2-normalised with ``normalize``. Labels, per-cluster sample counts,
    centroids and cost are printed.

    :param filename: path of a comma-separated file; its first row is
        skipped
    :param num_clusters: number of clusters to fit
    :returns: None
    """
    # Input data.
    # Feeding the complete database of 30000 entries could take too long,
    # so 3000 data points are used as an example first.
    # NOTE(review): the slice 1:num_data yields at most num_data - 1 rows.
    num_data = 3000
    X_original = np.genfromtxt(filename, dtype=object,
                               delimiter=',')[1:num_data, :]

    # Normalise the continuous variables.
    # NOTE(review): column 10 is in neither slice - confirm this is intended.
    X_categorical = X_original[:, 0:10]
    X_numerical = normalize(X_original[:, 11:], norm='l2')
    # With many continuous variables, PCA could be used to reduce the
    # dimensionality:
    #X_numerical = PCA(n_components=1).fit_transform(X_numerical)
    X = np.concatenate((X_categorical, X_numerical), axis=1)

    # Start training; per the original author, the default weight is
    # 0.5 * the standard deviation of the continuous variables.
    kproto = kprototypes.KPrototypes(n_clusters=num_clusters,
                                     init='Cao',
                                     verbose=2)
    kproto.fit_predict(X,
                       categorical=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    print('\n')

    # Print the cluster label of every data point.
    labels = kproto.labels_
    print('Labels of each data point: \n', labels, '\n')

    # Print the number of samples in each cluster.
    for i in range(num_clusters):
        num_sample = np.count_nonzero(labels == i)
        print('numbers of samples in the', i, 'cluster: ', num_sample)
    print('\n')

    # Print the cluster centroids.
    print('Clusters: \n', kproto.cluster_centroids_)

    # Print the cost of the objective function.
    print('Cost: ', kproto.cost_)
Example #16
0
    def cluster_data_before_classify(self, data):
        """Cluster ``data`` and hand shuffled per-cluster row indices on.

        Drops the ``'target'`` column, fits an 8-cluster k-prototypes model
        with every column from index 3 onwards treated as categorical,
        shuffles the row indices of each cluster in place and passes the
        list of index arrays to
        ``self.merge_train_test_data_from_each_cluster``.

        :param data: object with ``drop(columns, axis=1)`` whose result has
            ``.columns`` and ``.values`` (presumably a pandas DataFrame -
            confirm against the caller); must contain a ``'target'`` column
        :returns: None
        """
        model = kprototypes.KPrototypes(n_clusters=8)
        cluster_data = data.drop(['target'], axis=1)
        categoricals = list(range(3, len(cluster_data.columns)))

        # visualizer for find best cluster number
        # visualizer = KElbowVisualizer(model, k=(2, 8), metric='silhouette', timings=False)
        # # Fit the data and visualize
        # visualizer.fit(data)
        # visualizer.poof()
        model.fit(cluster_data.values, categorical=categoricals)

        # One array of row positions per cluster label 0..n_clusters-1.
        indices = [
            np.where(model.labels_ == i)[0] for i in range(model.n_clusters)
        ]
        for cluster_indices in indices:
            random.shuffle(cluster_indices)
        self.merge_train_test_data_from_each_cluster(indices)
Example #17
0
                                 0],
               worker_pca_result[np.where(worker_df['agglo_label'] == j)[0],
                                 1],
               label=str(j) + ": " +
               str(len(np.where(worker_df['agglo_label'] == j)[0])))
pl.title("Workers Agglomerative Clustering")
pl.legend()
pl.show()

# k-prototypes clustering: column 1 of each source frame is stacked in
# front of the corresponding normalised feature matrix.
requester_stacked = np.hstack(
    (requester_df.iloc[:, 1:2].values, requester_norm_features))
requester_norm_df = pd.DataFrame(requester_stacked)
worker_stacked = np.hstack(
    (worker_df.iloc[:, 1:2].values, worker_norm_features))
worker_norm_df = pd.DataFrame(worker_stacked)

# Fit one model per cluster count in 2..5 and scatter-plot each result.
for i in range(2, 6):
    kproto = kp.KPrototypes(n_clusters=i)
    # cluster requesters data
    label_column = 'kmeans_' + str(i)
    requester_label = kproto.fit_predict(requester_norm_df.iloc[:, 1:].values,
                                         categorical=[0])
    requester_df[label_column] = requester_label

    pl.figure()
    for j in range(i):
        # Row positions of the requesters assigned to cluster j.
        members = np.where(requester_df[label_column] == j)[0]
        pl.scatter(requester_pca_result[members, 0],
                   requester_pca_result[members, 1],
                   label=str(j) + ": " + str(len(members)))
    pl.title("Requesters Clustering result k=" + str(i))
Example #18
0
#!/usr/bin/env python

import numpy as np
from kmodes import kprototypes

# stocks with their market caps, sectors and countries
# Parse the string view of the CSV once; symbols are column 0 and the
# categorical features are every column from index 2 on.
stocks_text = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')
syms = stocks_text[:, 0]
xnum = np.genfromtxt('stocks.csv', dtype=float, delimiter=',')[:, 1]
xnum = np.atleast_2d(xnum).T
xcat = stocks_text[:, 2:]

# Single feature matrix: numeric column first, categorical columns after.
# Object dtype keeps the floats and the strings side by side.
X = np.hstack([xnum.astype(object), xcat.astype(object)])

kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
# Pass one matrix plus the categorical column indices, matching every
# other KPrototypes call; a [numeric, categorical] list supplies no
# ``categorical`` argument.
clusters = kproto.fit_predict(X, categorical=list(range(1, X.shape[1])))

for s, c in zip(syms, clusters):
    print("Symbol: {}, cluster:{}".format(s, c))
Example #19
0
import numpy as np
from kmodes import kprototypes

# Fit a 7-cluster model on df_small and report centroids and statistics.
kp = kprototypes.KPrototypes(n_clusters=7, init='Cao', verbose=2)

X = df_small.values
clusters = kp.fit_predict(X, categorical=[1, 2, 3, 7, 8])

print("Cluster centroids")
print(kp.cluster_centroids_)

# The fitted model is ``kp``; the name ``kproto`` is never defined here.
print("Training stats")
print(kp.cost_)
print(kp.n_iter_)

# Collect the clustering cost for 1..10 clusters.
# NOTE(review): this loop uses a different categorical column list than
# the fit above - confirm which one matches df_small.
cost = []
for i in range(1, 11):
    kp = kprototypes.KPrototypes(n_clusters=i, init='Cao', verbose=2)
    clusters = kp.fit_predict(X, categorical=[0, 1, 2, 3, 4, 5, 6, 8, 9])
    cost.append(kp.cost_)
Example #20
0
def get_silhouette_score(nclust):
    """Return ``silhouette_score`` for a k-prototypes fit with ``nclust``.

    Reads the module-level ``scaled`` data and ``categoricals_indicies``
    column list; ``nclust`` is passed as the first positional argument of
    ``KPrototypes``.
    """
    model = kprototypes.KPrototypes(nclust)
    cluster_labels = model.fit_predict(scaled,
                                       categorical=categoricals_indicies)
    return silhouette_score(scaled, cluster_labels)
Example #21
0
#np.save('X',X)
#np.save('Y',Y)
#%%
# Make training matrix.
#training_matrix = []
#for user in train_data:
#    for artist in train_data[user]:
#        training_matrix.append([user, artist] + train_data[user][artist])

#reg = KMeans(n_clusters = 10, n_init = 3, n_jobs = -1)
#reg.fit(training_matrix[:-1],training_matrix[-1])
#reg.fit(training_matrix)
#reg.fit(training_matrix[:-1],training_matrix[-1])
#%%
# Split X / Y into a training part and a validation part.
# NOTE(review): test_size=0.9 presumably holds out 90% of the rows for
# validation (assuming sklearn's train_test_split) - confirm this is intended.
X_train,X_val,Y_train,Y_val = train_test_split(X, Y,test_size=0.9)
# Fit an 8-cluster k-prototypes model ('Cao' initialisation) on the training
# rows, treating columns 0, 1, 2, 4 and 5 as categorical.
reg = kprototypes.KPrototypes(n_clusters = 8, init='Cao')
reg.fit(X_train,y=Y_train,categorical =[0,1,2,4,5] )
#%% Test out vs user mean
# NOTE(review): the loop below is incomplete (no colon, no body) and is a
# syntax error as written; its intended body is not visible here.
for i in X_val
# Write out test solutions.
# Open the test CSV for reading and skip its first row.
with open(test_file, 'r') as test_fh:
    test_csv = csv.reader(test_fh, delimiter=',', quotechar='"')
    next(test_csv, None)

    # Open the solution CSV for writing and emit the header row.
    with open(soln_file, 'w') as soln_fh:
        soln_csv = csv.writer(soln_fh,
                              delimiter=',',
                              quotechar='"',
                              quoting=csv.QUOTE_MINIMAL)
        soln_csv.writerow(['Id', 'plays'])
Example #22
0
def _host_to_feature(host):
    """Convert a dotted host string into the float used as numeric feature.

    Each of the first four dot-separated parts is left-padded with zeros to
    three characters; the parts are concatenated, parsed as a float and
    divided by 1000 (``"10.0.0.1"`` -> ``10000000.001``).
    """
    parts = host.split(".")
    digits = "".join(parts[z].rjust(3, "0") for z in range(4))
    return float(float(digits) / 1000)


def _log_rows(emit, X, row_indices):
    """Pass each row of ``X`` listed in ``row_indices`` to ``emit``."""
    for row_index in row_indices:
        emit(X[row_index])


def analyze(data):
    """Cluster hosts by address and most frequent status; log the clusters.

    ``data`` is parsed as JSON into an iterable of records with ``'HOST'``
    and ``'STATUS'`` keys. For every host, the most frequent status is
    paired with a numeric encoding of the host (see ``_host_to_feature``).
    The resulting rows are clustered into 4 clusters with k-prototypes
    ('Cao' initialisation, column 1 categorical). The smallest and largest
    clusters are logged at INFO level and all clusters at DEBUG level.

    :param data: JSON text
    :returns: None
    """
    # Convert this to python data for us to be able to run ML algorithms
    json_to_python = json.loads(data)

    # HOST -> list of STATUS values, in order of first appearance of HOST.
    per_user = dict()

    # Data pre-processing here:
    for record in json_to_python:
        per_user.setdefault(record['HOST'], []).append(record['STATUS'])

    log.debug("***  Printing input contents to the algorithm: ***")

    ###Analysis 1 : (ML): Run K-prototypes algorithm on IP-Response_status feature-set here:
    # NOTE(review): this seed row is not a real host but is clustered and
    # logged together with the real rows - confirm whether that is intended.
    blocks = [np.array([[0.00, '0']])]

    for host, statuses in per_user.items():
        status_counts = {}
        for status in statuses:
            status_counts[status] = status_counts.get(status, 0) + 1
        # Most frequent status; ties go to the status seen first.
        max_status = max(status_counts, key=status_counts.get)

        feature = _host_to_feature(host)
        log.debug(str(feature) + ": " + max_status)
        blocks.append([feature, max_status])

    # Stack once instead of re-copying the whole array for every host.
    X = np.vstack(blocks)

    log.info(
        "######******* Analysis #1: K-prototype for IP address-Response status: ******#######"
    )

    ##For k-proto analysis:

    ##Adjust number of clusters here
    kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)

    result = kproto.fit_predict(X, categorical=1)

    # cluster by status: label -> row indices of X (ascending), and label ->
    # number of rows.
    clust_content = dict()
    for X_index, label in enumerate(result):
        clust_content.setdefault(label, []).append(X_index)
    num_clust = {label: len(rows) for label, rows in clust_content.items()}

    min_index = min(num_clust, key=num_clust.get)

    max_index = max(num_clust, key=num_clust.get)

    log.info("Cluster no. " + str(min_index) + " has the least elements: " +
             str(num_clust[min_index]))
    log.info("Check INFO.log to view its contents!")

    log.info(
        "****  Contents of the cluster with minimum number of elements!  *****"
    )

    # Prints contents of min cluster
    _log_rows(log.info, X, clust_content[min_index])

    log.info("Cluster no. " + str(max_index) + " has the maximum elements: " +
             str(num_clust[max_index]))
    log.info("Check INFO.log to view its contents!")
    log.info(
        "Check DEBUG.log to view contents of all clusters along with the main input X!"
    )

    log.info(
        "***** Contents of the cluster with maximum number of elements! *****")
    # Prints contents of max cluster
    _log_rows(log.info, X, clust_content[max_index])

    log.debug("***** Contents of all clusters! *****")
    # Prints contents of all clusters

    for k, content_arr in clust_content.items():
        log.debug("***** Contents of cluster #" + str(k) + ":  *****")
        log.debug("***** This cluster has " + str(num_clust[k]) +
                  " elements!  *****")

        _log_rows(log.debug, X, content_arr)
Example #23
0
scaled = scaler.fit_transform(dfSessions)

# k-prototypes clustering (mixed numeric and categorical features)

# Model settings.
init = 'Huang'  # can be 'Cao', 'Huang' or 'random'
n_clusters = 12
max_iter = 100

# Numbers noted by the original author (presumably a cluster count followed
# by the resulting cost - confirm):
#   15 -> 1556.6108261222275
#   16 -> 1435.3049147588504

kproto = kprototypes.KPrototypes(
    n_clusters=n_clusters,
    init=init,
    max_iter=max_iter,
)
# k_prototypes(X, categorical, n_clusters, max_iter, num_dissim,
# cat_dissim, gamma, init, n_init, verbose, random_state, n_jobs)

# cluster_centroids_ : array, [n_clusters, n_features]
#     Categories of cluster centroids
# labels_ :
#     Labels of each point
# cost_ : float
#     Clustering cost, defined as the sum distance of all points to
#     their respective cluster centroids.
# n_iter_ : int
#     The number of iterations the algorithm ran for.
# gamma : float
#     The (potentially calculated) weighing factor.
Example #24
0
 def test_kprotoypes_no_categoricals(self):
     """Expect ``NotImplementedError`` when ``categorical`` is an empty list."""
     model = kprototypes.KPrototypes(n_clusters=6,
                                     init='Cao',
                                     verbose=2,
                                     random_state=42)
     self.assertRaises(NotImplementedError, model.fit, STOCKS, categorical=[])
Example #25
0
 def test_pickle(self):
     """Round-trip a default KPrototypes through pickle and compare types."""
     original = kprototypes.KPrototypes()
     restored = pickle.loads(pickle.dumps(original))
     assert_equal(type(restored), original.__class__)
Example #26
0
 def test_pickle(self):
     """Round-trip a default KPrototypes through pickle and check its type."""
     obj = kprototypes.KPrototypes()
     serialized = pickle.dumps(obj)
     # assertIsInstance names the offending object and class on failure;
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(pickle.loads(serialized), obj.__class__)
Example #27
0
 def test_kprototypes_unknowninit_soybean(self):
     """Expect ``NotImplementedError`` for the unknown ``init='nonsense'``."""
     model = kprototypes.KPrototypes(n_clusters=4,
                                     init='nonsense',
                                     verbose=2)
     self.assertRaises(NotImplementedError,
                       model.fit, STOCKS, categorical=[1, 2])
Example #28
0
 def test_kprotoypes_random_stocks(self):
     """Check ``fit`` with ``init='random'`` returns a KPrototypes instance."""
     estimator = kprototypes.KPrototypes(n_clusters=4,
                                         init='random',
                                         verbose=2)
     fitted = estimator.fit(STOCKS, categorical=[1, 2])
     self.assertIsInstance(fitted, kprototypes.KPrototypes)
Example #29
0
 def test_kprotoypes_wrong_categorical_type(self):
     """Expect ``AssertionError`` when ``categorical`` is passed as a set."""
     model = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
     self.assertRaises(AssertionError,
                       model.fit_predict, STOCKS, categorical={1, 2})
Example #30
0
 def test_pickle_fitted(self):
     """Round-trip a fitted KPrototypes through pickle and check its type.

     The model is fitted on the first two columns of STOCKS with column 1
     given as a single integer ``categorical`` index.
     """
     kproto = kprototypes.KPrototypes(n_clusters=4, init='Cao', verbose=2)
     model = kproto.fit(STOCKS[:, :2], categorical=1)
     serialized = pickle.dumps(model)
     # assertIsInstance names the offending object and class on failure;
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(pickle.loads(serialized), model.__class__)