Example #1
def k_elbow_plot(fpath, max_k=10):
    """
    Funzione per plottare il grafico "a gomito" che mostra il rapporto tra SSE del modello di clustering
    e il numero di cluster scelti. L'utilità sta nel poter selezionare il numero di k più appropriato
    in base all'ultimo valore k che comporta una buona diminuzione dell'errore ("punta del gomito")
    :param fpath: percorso del dataset processato (vedasi descrizione di argv[1] in cima allo script)
    :param max_k: numero massimo di k che si vuole utilizzare per produrre il grafo
    """
    if not path.isfile(fpath):
        print("Error: could not find specified CSV dataset.")
        return
    if max_k <= 0:
        print("Error: k must be a positive integer.")
        return

    data = refactor_data_frame(pd.read_csv(fpath))
    errors = []
    for k in range(1, max_k + 1):
        kmodes = KModes(n_clusters=k, random_state=42, n_init=1, init="random")
        kmodes.fit(data)
        errors.append(kmodes.cost_)
        print("DONE WITH K=" + str(k))
    plt.figure(figsize=(16, 8))
    plt.plot(range(1, max_k + 1), errors, 'bo-')
    plt.xlabel('#Clusters (K)')
    plt.ylabel('Error (0/1)')
    plt.title("K parameter vs. error for dataset " +
              path.basename(fpath))
    plt.show()
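
A minimal usage sketch (an assumption, since the call site is not shown: the surrounding script is presumed to import pandas as pd, matplotlib.pyplot as plt, os.path as path, KModes from kmodes.kmodes, and to define refactor_data_frame):

# Hypothetical invocation: inspect the elbow for up to eight clusters.
k_elbow_plot("processed_dataset.csv", max_k=8)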
Example #2
    def test_kmodes_init_soybean(self):
        init_vals = np.array(
            [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
              0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
             [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
              0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
              1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
              0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
        kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
        result = kmodes_init.fit_predict(SOYBEAN)
        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                             1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        assert_cluster_splits_equal(result, expected)

        # 5 initial centroids, 4 n_clusters
        init_vals = np.array(
            [[0, 1],
             [4, 0],
             [4, 0],
             [3, 0],
             [3, 0]])
        kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
        with self.assertRaises(AssertionError):
            kmodes_init.fit(SOYBEAN)

        # wrong number of attributes
        init_vals = np.array(
            [0, 1, 2, 3])
        kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
        with self.assertRaises(AssertionError):
            kmodes_init.fit(SOYBEAN)
Example #3
def clustering(user_session_subset, chicago_clustering,
               chicago_clustering_labels):
    #extract the CHICAGO_ZERO_AND_ONE restaurants that match the ones found in the initial subset (required for clustering)
    #get counts for the user session subset
    user_session_subset_count = pd.crosstab(
        index=user_session_subset['Restaurant_ID'], columns="count")

    mask = np.zeros(len(chicago_clustering), dtype=bool)
    mask[user_session_subset_count['count'].index.values.astype(int)] = True

    chicago_clustering_labels = chicago_clustering_labels[mask]
    chicago_clustering = chicago_clustering[mask]

    #method - Huang, number of clusters - 3, verbose=1 means textual output (0 is no output)
    kmodes_huang = KModes(n_clusters=3, init='Huang', verbose=0, n_init=20)
    kmodes_huang.fit(chicago_clustering)

    #this joins the restaurant name
    cluster_results = np.column_stack(
        (chicago_clustering_labels, kmodes_huang.labels_))

    #convert numpy matrix to pandas dataframe
    cluster_result_df = pd.DataFrame(cluster_results)
    cluster_result_df.columns = ['Restaurant', 'Cluster']

    #JOIN THE CLUSTERING RESULTS WITH user_session_subset_count TO GET OUR FINAL RESULTS
    #remove existing indices so the new ones line up and the DataFrames can be joined
    cluster_result_df.reset_index(drop=True, inplace=True)
    user_session_subset_count.reset_index(drop=True, inplace=True)

    #join the cluster results with the restaurant counts
    clusters_with_counts = pd.concat(
        [cluster_result_df, user_session_subset_count], axis=1)

    return clusters_with_counts
Example #5
def cluster_and_output(k, matrix, clust_type, inputpath, outdir):
    # n_init comes from the global argparse namespace `args`
    km = KModes(n_clusters=k,
                cat_dissim=conflict_dissim,
                init='huang',
                n_init=args.n,
                verbose=1)
    km.fit(matrix)

    from collections import defaultdict
    cluster_groups = defaultdict(list)

    for j in range(matrix.shape[0]):
        cluster_groups[km.labels_[j]].append(j)

    tot_rows = 0
    for cluster in cluster_groups:
        tot_rows += len(cluster_groups[cluster])

    filename = os.path.splitext(os.path.basename(inputpath))[0]
    outfile = os.path.join(outdir, filename)

    centroids = km.cluster_centroids_
    out_matrix = list()
    for ix_c, c in enumerate(centroids):
        if ix_c in cluster_groups:
            x = list(map(int, list(map(round, c))))
            out_matrix.append(x)

    out_matrix = np.transpose(np.array(out_matrix))

    print(out_matrix.shape)
    print(len(cluster_groups))
    np.savetxt('{}_celluloid.matrix'.format(outfile),
               out_matrix,
               fmt='%d',
               delimiter=' ')

    with open('{}_celluloid_clusters.txt'.format(outfile), 'w+') as file_out:
        for cluster in sorted(cluster_groups):
            file_out.write('{0}\t"{1}"\n'.format(
                cluster,
                ','.join([str(x + 1) for x in cluster_groups[cluster]])))

    with open('{}_celluloid.mutations'.format(outfile), 'w+') as file_out:
        for cluster in sorted(cluster_groups):
            file_out.write('{0}\n'.format(','.join(
                [str(x + 1) for x in cluster_groups[cluster]])))

    print('Done.')
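
conflict_dissim above is project-specific and not included in the snippet. As a sketch of the interface it must satisfy: a custom cat_dissim is any function shaped like kmodes' built-in matching dissimilarity, taking the centroid matrix and a point and returning one distance per centroid:

import numpy as np

def my_matching_dissim(a, b, **_):
    # Simple matching dissimilarity: count of mismatching attributes,
    # computed row-wise between centroid matrix `a` and point(s) `b`.
    return np.sum(a != b, axis=1)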
Example #6
def exec_kmodes(df, choices_obj):
    # cluster the categorical columns selected by the user
    cats_not_scaled = [header for header in choices_obj['categorical']]

    X = df[cats_not_scaled].astype(str)
    k = int(input("Number of clusters:\n > "))

    kmodes_cao = KModes(n_clusters=k, init='Cao', verbose=1)
    kmodes_cao.fit(X.values)

    # Print cluster centroids of the trained model.
    print('k-modes (Cao) centroids:')
    print(kmodes_cao.cluster_centroids_)
    # Print training statistics
    print('Final training cost: {}'.format(kmodes_cao.cost_))
    print('Training iterations: {}'.format(kmodes_cao.n_iter_))
Example #7
 def test_kmodes_predict_soybean(self):
     kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
     kmodes_cao = kmodes_cao.fit(SOYBEAN)
     result = kmodes_cao.predict(SOYBEAN2)
     expected = np.array([2, 1, 3, 0])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #8
 def test_kmodes_random_soybean(self):
     kmodes_random = KModes(n_clusters=4,
                            init='random',
                            verbose=2,
                            random_state=42)
     result = kmodes_random.fit(SOYBEAN)
     self.assertIsInstance(result, KModes)
Example #9
 def test_kmodes_predict_soybean_ng(self):
     kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
     kmodes_cao = kmodes_cao.fit(SOYBEAN)
     result = kmodes_cao.predict(SOYBEAN2)
     expected = np.array([2, 1, 3, 0])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Example #10
 def test_kmodes_predict_soybean_jaccard_dissim_label(self):
     kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                           cat_dissim=jaccard_dissim_label, random_state=42)
     kmodes_huang = kmodes_huang.fit(TEST_DATA)
     result = kmodes_huang.fit_predict(TEST_DATA_PREDICT)
     expected = np.array([1, 0, 1, 2])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #11
 def test_kmodes_fit_predict(self):
     """Test whether fit_predict interface works the same as fit and predict."""
     kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
     sample_weight = [0.5] * TEST_DATA.shape[0]
     data1 = kmodes.fit_predict(TEST_DATA, sample_weight=sample_weight)
     data2 = kmodes.fit(TEST_DATA,
                        sample_weight=sample_weight).predict(TEST_DATA)
     assert_cluster_splits_equal(data1, data2)
Example #12
 def test_k_modes_sample_weight_unchanged(self):
     """Test whether centroid definition remains unchanged when scaling uniformly."""
     kmodes_baseline = KModes(n_clusters=4, init='Cao', random_state=42)
     model_baseline = kmodes_baseline.fit(SOYBEAN)
     expected = set(tuple(row) for row in model_baseline.cluster_centroids_)
     for weight in [.5, 1, 1., 2]:
         sample_weight = [weight] * SOYBEAN.shape[0]
         kmodes_weighted = KModes(n_clusters=4, init='Cao', random_state=42)
         model_weighted = kmodes_weighted.fit(SOYBEAN,
                                              sample_weight=sample_weight)
         factual = set(
             tuple(row) for row in model_weighted.cluster_centroids_)
         # Centroids might be ordered differently. To compare the centroids, we first
         # sort them.
         tuple_pairs = zip(sorted(expected), sorted(factual))
         for tuple_expected, tuple_factual in tuple_pairs:
             self.assertAlmostEqual(tuple_expected, tuple_factual)
Example #13
 def test_pickle_fitted(self):
     kmodes_huang = KModes(n_clusters=4,
                           n_init=2,
                           init='Huang',
                           verbose=2,
                           random_state=42)
     model = kmodes_huang.fit(SOYBEAN)
     serialized = pickle.dumps(model)
     self.assertTrue(isinstance(pickle.loads(serialized), model.__class__))
Example #14
 def test_kmodes_predict_soybean_jaccard_dissim_binary(self):
     kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                           cat_dissim=jaccard_dissim_binary, random_state=42)
     # binary encoded variables are required
     bin_variables = SOYBEAN.astype(bool).astype(int)
     kmodes_huang = kmodes_huang.fit(bin_variables)
     # binary encoded variables required for prediction as well
     bin_variables_pred = SOYBEAN2.astype(bool).astype(int)
     result = kmodes_huang.fit_predict(bin_variables_pred)
     expected = np.array([0, 1, 2, 3])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #15
 def test_kmodes_sample_weights_all_but_one_zero(self):
     """Test whether centroid collapses to single datapoint with non-zero weight."""
     kmodes = KModes(n_clusters=1, init='Cao', random_state=42)
     n_samples = 10
     for indicator in range(n_samples):
         sample_weight = np.zeros(n_samples)
         sample_weight[indicator] = 1
         model = kmodes.fit(TEST_DATA[:n_samples, :],
                            sample_weight=sample_weight)
         self.assertTrue(
             (model.cluster_centroids_[0, :] == TEST_DATA[indicator, :]
              ).all())
Example #16
def f(game, modes, K, N, in_colour, seed):
    print("Running clustering...")
    with NumpySeed(seed):
        dset = StaticAtariDataset(game=game, after_warp=not in_colour)

        X = dset.x

        if N:
            X = X[:N, ...]
        else:
            N = X.shape[0]

        if not in_colour:
            X = X[..., 0]
        image_shape = X.shape[1:]
        X = X.reshape(N, -1)

        if modes:
            km = KModes(n_clusters=K, init='Huang', n_init=1, verbose=1)
            km.fit(X)

            centroids = km.cluster_centroids_
            centroids = centroids.reshape(K, *image_shape)
            discrete_centroids = centroids
            centroids = centroids / 255.

            labels = km.labels_
        else:
            result = k_means(X / 255., K)
            centroids = result[0]
            labels = result[1]
            discrete_centroids = np.uint8(np.floor(centroids * 255))

        centroids = np.maximum(centroids, 1e-6)
        centroids = np.minimum(centroids, 1 - 1e-6)
        centroids = centroids.reshape(K, *image_shape)

        labels = np.array(labels)
        X = X.reshape(N, *image_shape)
        print("Done.")
        return centroids, discrete_centroids, labels, X
Example #17
 def test_kmodes_empty_init_cluster_soybean(self):
     # Check if the clustering does not crash in case of an empty cluster.
     init_vals = np.array(
         [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
           0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
          [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
           0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
          [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
           1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
          [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
           0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
     kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
     result = kmodes_init.fit(SOYBEAN)
     self.assertIsInstance(result, KModes)
Example #19
xls_file = pd.ExcelFile(
    "..\\source\\traffic_violations_selected_features_delete_missing.xlsx")

# get excel sheet - object type: pandas dataframe
pd_traffic_violations = xls_file.parse('Hoja1')

# select features to model
pd_traffic_violations_to_model = pd_traffic_violations[[
    'CODIGO INFRACCION', 'TIPO DE VIA', 'LUGAR DE INTERVENCION',
    'EMPRESA DE TRANSPORTE'
]]

# instantiate k-modes object - 180 clusters
kmodes = KModes(n_clusters=180, init='Cao', verbose=1)
# fit the model
kmodes.fit(pd_traffic_violations_to_model)

# cluster centroids of the model
print(kmodes.cluster_centroids_)
# statistics of modeling
print(kmodes.cost_)
print(kmodes.n_iter_)

# create new cluster column in pandas dataframe
pd_traffic_violations['CLUSTER'] = kmodes.labels_

# save labeled dataframe to .csv
pd_traffic_violations.to_csv(
    '..\\clustering\\kmodes_clustering_traffic_violations.csv',
    index=False,
    header=True)
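
A quick sanity check one might add before saving (a sketch; it only assumes the fitted kmodes object and the labeled DataFrame from above):

# Show how many violations fall into each of the ten largest clusters.
print(pd_traffic_violations['CLUSTER'].value_counts().head(10))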
Example #20
#wcss = []
#for i in range(1,30):
#    kmodes = KModes(n_clusters=i, init='Huang', n_init=5, verbose=1)
#    kmodes.fit(data1)
#    wcss.append(kmodes.cost_)

#plt.plot(range(1,30), wcss)
#plt.title("The elbow method")
#plt.xlabel("The number of clusters")
#plt.ylabel("WCSS")
#plt.show()

"""**Kmode Model Creation and prediction**"""

km = KModes(n_clusters=23, init='Huang', n_init=5, verbose=1)

km = km.fit(data1)

clusters = km.predict(data1)
# Print the cluster centroids
print(km.cluster_centroids_)
"""**Storing My Prediction to CSV file**"""

k = pd.DataFrame()

k['output'] = clusters

k.to_csv("outpt.csv")
Example #21
#print(housing_binary)
#print(len(housing_binary))
#print(u_housing[i_housing])
#print(len(u_housing[i_housing]))


# LIFT

km = KModes(n_clusters=5)
#kmeans = KMeans(n_clusters = 5)
X = np.vstack((i_plaintiff, i_judgment_type, i_judgment_method, d_a, g_num))
X = np.transpose(X)
X = np.hstack((X, low))
#print(X)
km.fit(X)
y_km = km.predict(X)
#print(X[0,:])
#print(X[1,:])
plt.scatter(latitude, longitude, c=y_km, s=50, cmap='winter')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.show()

#print(y_km)


units = np.array(df['units'])
number = np.nonzero(units)
print(number)
number = np.array(number)
Example #22
    # label-encode the requested columns so KModes sees integer codes
    le = {}
    df_enc = df.copy(deep=True)
    if to_encode is not None:
        for fname in to_encode:
            le[fname] = preprocessing.LabelEncoder()
            df_enc[fname] = le[fname].fit_transform(df_enc[fname])

    # run KModes for each candidate number of clusters
    k_values = range(n_min_clusters, n_max_clusters + 1)
    cost = np.zeros(len(k_values))
    for i, k in enumerate(k_values):
        km = KModes(n_clusters=k, n_init=1, verbose=0)
        km.fit(df_enc)
        cost[i] = km.cost_

    # locate the elbow
    kl = KneeLocator(k_values, cost, curve="convex", direction="decreasing")
    n_clusters = kl.elbow

    # generate the final kmodes fit
    km = KModes(n_clusters=n_clusters, n_init=1, verbose=0)
    clusters = km.fit_predict(df_enc)

    if to_encode is not None:
        df_renc = df_enc.copy()
        for fname in to_encode:
            df_renc[fname] = le[fname].inverse_transform(df_renc[fname])
        df_ind_res = df_renc.reset_index()
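
This fragment is evidently a function body; a sketch of the module-level imports it assumes (kneed and scikit-learn are inferred from the KneeLocator and preprocessing calls):

import numpy as np
from sklearn import preprocessing
from kneed import KneeLocator
from kmodes.kmodes import KModes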
Example #23
#extract the CHICAGO_ZERO_AND_ONE restaurants that match the ones found in the initial subset (required for clustering)
#get counts for the user session subset
user_session_subset_count = pd.crosstab(
    index=user_session_subset['Restaurant_ID'], columns="count")

mask = np.zeros(len(chicago_clustering), dtype=bool)
mask[user_session_subset_count['count'].index.values.astype(int)] = True

chicago_clustering_labels = chicago_clustering_labels[mask]
chicago_clustering = chicago_clustering[mask]

#CLUSTERING
#method - Huang, number of clusters - 3, verbose=1 means textual output (0 is no output)
kmodes_huang = KModes(n_clusters=3, init='Huang', verbose=0)
kmodes_huang.fit(chicago_clustering)

#this joins the restaurant name
cluster_results = np.column_stack(
    (chicago_clustering_labels, kmodes_huang.labels_))

#convert numpy matrix to pandas dataframe
cluster_result_df = pd.DataFrame(cluster_results)
cluster_result_df.columns = ['Restaurant', 'Cluster']

#JOIN THE CLUSTERING RESULTS WITH user_session_subset_count TO GET OUR FINAL RESULTS
#remove existing indices so the new ones line up and the DataFrames can be joined
cluster_result_df.reset_index(drop=True, inplace=True)
user_session_subset_count.reset_index(drop=True, inplace=True)

#join the cluster results with the restaurant counts
clusters_with_counts = pd.concat(
    [cluster_result_df, user_session_subset_count], axis=1)
Example #24
def fit(qual_id, count):
    # More than 500 documents results in slow training
    if count >= 500:
        count = 500
    # Query for passed qual_id with incorrect answers
    # May take a lengthy amount of time. Recommend optimizing the query.
    # print("Querying for", qual_id)
    data = collection.find({"qual_id": qual_id, "correct": False})[:count]
    # print("Query complete.")

    # Compile dictionary of all possible features in given list of records
    # print("Compiling dictionary of features.")
    features = {}
    for doc in data:
        doc_features = {}
        if doc['response'] is None:
            continue
        doc_features = retrieveKeys(doc['response'], doc_features)
        features = mergeFeatures(doc_features, features)
    # print("Feature compilation complete.")

    # Count number of features
    length = countFeatures(features)
    if length == 0:
        return

    # Reuse queried documents.
    data = data.rewind()

    # Append missing features to all records and assign common benign value.
    # Current benign value is an empty string.
    # print("Appending features to documents.")
    student_data = np.array([])
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = np.array([])
            temp = addFeatures(features, temp, doc['response'])
            if len(student_data) == 0:
                student_data = np.append(student_data, temp)
                student_data = np.reshape(student_data, (-1, length))
            else:
                student_data = np.append(student_data, [temp], axis=0)
    # print("Finished appending features to documents.")

    # Perform k-modes clustering
    # print("Clustering...")
    clusters = len(student_data)
    # K-modes implementation can't generate more than 255 centroids
    if clusters > 255:
        clusters = 255
    km = KModes(n_clusters=clusters, init='Cao', n_init=4, verbose=False)
    # print("Finished.")
    km.fit(student_data)

    # Print important information from clustering
    # Centroids are common values to each cluster
    centroids = km.cluster_centroids_
    # print("Centroids")
    # print(centroids)

    # Labels is a list indicating which cluster each record belongs to
    labels = km.labels_
    # print("Labels")
    # print(labels)

    # Cost is a value indicating possible error in the clusters. The ideal value
    # is 0.0. If the value is greater than 0.0, then the max number of clusters
    # was generated and some responses were assigned to an inexact cluster. This
    # would result in the largest cluster having documents it shouldn't.
    # Recommend re-clustering with fewer documents or more clusters if possible.
    cost = km.cost_
    # print("Cost")
    # print(cost)

    # Prints 5 largest cluster labels and number of records per cluster.
    most_common = Counter(labels).most_common(5)
    # print("Most populated centroids")
    # print(most_common)

    # Generate cluster dictionary to be inserted in the centroid_db.
    # Qual_id: qual_id of given documents
    # Features: Dictionary of all possible features in passed documents.
    # Centroids: List of generated centroids.
    # Cluster_sizes: Number of documents in each cluster.
    # Behavioral_traits: Behavioral traits associated with at least one
    # document assigned to the given centroid.
    # Screenshot_urls: A screenshot from one document within each cluster.
    # Centroids and behavioral_traits have the same lengths. The behavioral
    # traits in a given index of behavioral_traits is associated with the same
    # index of centroids.
    post = {
        'qual_id': qual_id,
        'features': features,
        'centroids': {},
        'cluster_sizes': {},
        'behavioral_traits': {},
        'screenshot_urls': {}
    }

    for i in Counter(labels).most_common(len(centroids)):
        if str(i[0]) not in post['cluster_sizes']:
            post['cluster_sizes'][str(i[0])] = str(i[1])

    for i in range(len(centroids.tolist())):
        if str(i) not in post['centroids']:
            post['centroids'][str(i)] = centroids.tolist()[i]

    # Reuse queried documents.
    data = data.rewind()
    label = 0
    for doc in data:
        if doc['response'] is None:
            continue
        elif str(labels[label]) not in post['screenshot_urls']:
            post['screenshot_urls'][str(labels[label])] = doc['screenshot_url']
            label += 1
        else:
            label += 1

    # Reuse queried documents.
    data = data.rewind()

    # Add associated behavioral traits to cluster dictionary.
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = np.array([])
            temp = addFeatures(features, temp, doc['response'])
            temp = np.reshape(temp, (-1, length))
            label = km.predict(temp)[0]
            if str(label) not in post['behavioral_traits']:
                post['behavioral_traits'][str(
                    label)] = doc['behavioral_traits']

    # Add generated cluster dictionary to centroid_db.
    # If a record shares the same qual_id as the generated cluster dictionary,
    # then the stored record will be overwritten.
    # print("Posting centroids to database centroids.")
    centroid_db.replace_one({'qual_id': qual_id}, post, upsert=True)
Example #25
data_to_cluster = data[[
    'Project Resource Category', 'Project Subject Category Tree',
    'Project Subject Subcategory Tree', 'Project Type', 'School Metro Type',
    'Region', 'Project Grade Level Category'
]]

### Find Optimal Clusters ###
n_clusters = np.arange(2, 1003, 100)
costs = []

for n in n_clusters:
    print("Working on {} clusters.".format(n))
    kproto = KModes(n_clusters=n, init='random', verbose=False)
    # KModes works directly on the unscaled categorical columns
    # (unlike k-prototypes, there are no numerical columns to declare)
    cluster_obj = kproto.fit(data_to_cluster)
    labels = cluster_obj.labels_
    cost = cluster_obj.cost_
    costs.append(cost)

# Plot cost vs. number of clusters
optimum_k = 100
fig, ax = plt.subplots()
plt.title("Cost vs. Number of Clusters - Random Centroid Initializations")
plt.plot(n_clusters, costs, linestyle='--', marker='o')
plt.axvline(x=optimum_k,
            color='black',
            linestyle='--',
            label='Best Number of Clusters: {}'.format(optimum_k))
plt.xlabel('Number of Clusters')
plt.ylabel('Cost')
plt.legend()
plt.show()
Example #26
cslice_counts.tail()


cluster_range = range(1, 11)


for n_clusters in cluster_range:
    km = KModes(n_clusters, init='Huang', n_init=10, verbose=1)
    km.fit(cslice)


# Plot costs by number of clusters, as recorded from the verbose runs above
plt.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [17513.0, 15391.0, 13947.0, 13507.0, 13236.0, 12803.0, 12625.0, 12467.0, 12292.0, 12101.0])
plt.xlabel('Clusters')
plt.ylabel('Costs')
plt.axis([0, 11, 12000.0, 18000.0])
plt.show()


Example #27
 def test_kmodes_epoch_costs(self):
     kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
     kmodes.fit(SOYBEAN)
     self.assertEqual(kmodes.epoch_costs_, [206.0, 204.0, 199.0, 199.0])
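
epoch_costs_ holds the cost after each iteration of the best run, so it can also be plotted directly; a small sketch (assuming matplotlib is available and SOYBEAN is the test array used above):

import matplotlib.pyplot as plt

kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
kmodes.fit(SOYBEAN)
plt.plot(kmodes.epoch_costs_, marker='o')
plt.xlabel('Iteration')
plt.ylabel('Cost')
plt.show()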
Example #28

showDistance1()
print()

## e)
dataFrame['Type'] = dataFrame['Type'].astype('category')
dataFrame['Origin'] = dataFrame['Origin'].astype('category')
dataFrame['DriveTrain'] = dataFrame['DriveTrain'].astype('category')
dataFrame['Cylinders'] = dataFrame['Cylinders'].astype('category')

cat_col = dataFrame.select_dtypes(['category']).columns
df = dataFrame[cat_col].apply(lambda x: x.cat.codes)

km = KModes(n_clusters=3, init='Huang', random_state=555)
clusters = km.fit(df)

cents = km.cluster_centroids_
predict_results = km.predict(df)
unique, counts = np.unique(predict_results, return_counts=True)
num_obs_in_each_cluster = dict(zip(unique, counts))


def showResult(i):
    print("The number of observations in cluster 1: %d" %
          num_obs_in_each_cluster[i])
    print("The number of observations in cluster 2: %d" %
          num_obs_in_each_cluster[i + 1])
    print("The number of observations in cluster 3: %d" %
          num_obs_in_each_cluster[i + 2])
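
The call site is not shown; presumably the helper is invoked with the first cluster id (an assumption):

showResult(0)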
Example #29
 def test_kmodes_random_soybean(self):
     kmodes_random = KModes(n_clusters=4, init='random', verbose=2)
     result = kmodes_random.fit(SOYBEAN)
     self.assertIsInstance(result, KModes)
Example #30
x = np.genfromtxt('test.csv',
                  delimiter=',')[:, 0:]  # test.csv
y = np.genfromtxt('data_category/dataset_extract.csv',
                  dtype=str,
                  delimiter=',',
                  usecols=(0))  #data_category/dataset_extract.csv

print(x.shape)
print(y.shape)
dataNum = y.shape[0]
n_clusters = [100, 300, 500, 600, 1000]
for nc in n_clusters:
    kmodes_huang = KModes(n_clusters=nc,
                          cat_dissim=multimatch_dissim,
                          init='Huang',
                          verbose=0)
    kmodes_huang.fit(x)

    # with open('summary'+str(nc)+'.txt','w') as f:
    # Print cluster centroids of the trained model.
    # f.write('k-modes (Huang) centroids:')
    # print(kmodes_huang.cluster_centroids_)
    # Print training statistics
    print('For number of clusters ', nc)
    print('Final training cost: {}'.format(kmodes_huang.cost_))
    print('Training iterations: {}'.format(kmodes_huang.n_iter_))

    print('Save tables:')
    np.savetxt('labels' + str(nc) + '.out',
               kmodes_huang.labels_,
               fmt='%i',
               delimiter=',')
Example #31
def fit(qual_id, count):
    # More than 500 documents results in slow training
    if count >= 500:
        count = 500
    # Query for passed qual_id with incorrect answers
    # May take a lengthy amount of time. Recommend optimizing the query.
    if FLAG_VERBOSE:
        print("Querying for", qual_id)
    data = collection.find({"qual_id": qual_id, "correct": False})[:count]
    if FLAG_VERBOSE:
        print("Query complete.")

    # Compile dictionary of all possible features in given list of records
    if FLAG_VERBOSE:
        print("Compiling dictionary of features.")
    num_examples = 0
    num_empty = 0
    features = {}
    for doc in data:
        doc_features = {}
        if doc['response'] is None:
            num_empty += 1
            continue
        doc_features = retrieveKeys(doc['response'], doc_features)
        features = mergeFeatures(doc_features, features, "")
        num_examples += 1
    if FLAG_VERBOSE:
        print("Feature compilation complete.")

    # Count number of features
    num_features = countFeatures(features)
    if FLAG_VERBOSE:
        print("*** Number of features: {}".format(num_features))
        print(
            "*** Number of non-empty records for [Q_ID:{}]: {}. (dropped {} with empty resp)"
            .format(qual_id, num_examples, num_empty))
    if num_features == 0:
        return

    # Reuse queried documents.
    data = data.rewind()

    # Append missing features to all records and assign common benign value.
    # Current benign value is an empty string.
    # print("Appending features to documents.")
    # faster to create a zeroed np array first, rather than appending
    student_data = np.zeros((num_examples, num_features), dtype='<U32')
    i = 0
    for doc in data:
        if doc['response'] is None:
            continue
        else:
            temp = addFeatures(features, [], doc['response'])
            student_data[i, :] = temp
            i += 1
    if FLAG_VERBOSE:
        print("Finished appending features to documents.")
        print(student_data)
    #print("*** Features: ***")
    #pprint(interpretFeatures(features, []))

    # print feature vectors
    #print("*** FEATURE VECTOR: ***")
    #i = 0
    #for row in student_data:
    #    print("[{}]: {}".format(i, row))
    #    i += 1
    #print(repr(student_data))

    # Perform k-modes clustering
    print("Clustering...")
    clusters = NUM_CLUSTERS
    # K-modes implementation can't generate more than 255 centroids
    if clusters > 255:
        clusters = 255
    if clusters > len(student_data):
        clusters = len(student_data)
    km = KModes(n_clusters=clusters, init='Cao', n_init=4, verbose=False)
    km.fit(student_data)
    print("Finished.")

    # Print important information from clustering
    # Centroids are common values to each cluster
    centroids = km.cluster_centroids_
    if FLAG_VERBOSE:
        print("*** CENTROIDS: ***")
        print(centroids)

    # Labels is a list indicating which cluster each record belongs to
    labels = km.labels_
    if FLAG_VERBOSE:
        print("*** LABELS: ***")
        print(labels)

    # Cost is value indicating possible error in the clusters. Ideal value is 0.0
    if FLAG_VERBOSE:
        cost = km.cost_
        print("*** COST: ***")
        print(cost)

    # Prints 5 largest cluster labels and number of records per cluster.
    if FLAG_VERBOSE:
        most_common = Counter(labels).most_common(5)
        print("Most populated centroids")
        print(most_common)

    # Generate cluster dictionary to be inserted in the centroid_db.
    # Qual_id: qual_id of given documents
    # Features: Dictionary of all possible features in passed documents.
    # Centroids: List of generated centroids.
    # Behavioral_traits: Behavioral traits associated with at least one
    # document assigned to the given centroid.
    # Centroids and behavioral_traits have the same lengths. The behavioral
    # traits in a given index of behavioral_traits is associated with the same
    # index of centroids.
    if FLAG_USE_CENTROID_DB:
        post = {
            'qual_id': qual_id,
            'features': features,
            'centroids': centroids.tolist(),
            'behavioral_traits': {}
        }

        # Reuse queried documents.
        data = data.rewind()

        # Add associated behavioral traits to cluster dictionary.
        for doc in data:
            if doc['response'] is None:
                continue
            else:
                temp = np.array([])
                temp = addFeatures(features, temp, doc['response'])
                temp = np.reshape(temp, (-1, num_features))
                label = km.predict(temp)[0]
                if str(label) not in post['behavioral_traits']:
                    post['behavioral_traits'][str(
                        label)] = doc['behavioral_traits']

        # Add generated cluster dictionary to centroid_db.
        # If a record shares the same qual_id as the generated cluster dictionary,
        # then the stored record will be overwritten.
        print("Posting centroids to database centroids.")
        centroid_db.replace_one({'qual_id': qual_id}, post, upsert=True)
        print(qual_id, "complete.")
        print()

    if FLAG_DO_ANALYSIS:
        # perform some automatic EDA on largest clusters and save
        # collect ids of examples
        data = data.rewind()
        X_ids = []
        for doc in data:
            if doc['response'] is None:
                continue
            else:
                X_ids.append(doc['_id'])
        out_dir = ANALYS_OUT_DIR
        if out_dir is None:
            out_dir = "./out/" + str(qual_id)
        analys = cluster_analyzer(collection, out_dir)
        analys.analyze(student_data, labels, centroids, X_ids, qual_id,
                       interpretFeatures(features, []))
Example #32
#!/usr/bin/env python

import numpy as np
from kmodes.kmodes import KModes

# reproduce results on small soybean data set
x = np.genfromtxt('soybean.csv', dtype=int, delimiter=',')[:, :-1]
y = np.genfromtxt('soybean.csv', dtype=str, delimiter=',', usecols=(35, ))

kmodes_huang = KModes(n_clusters=4, init='Huang', verbose=1)
kmodes_huang.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Huang) centroids:')
print(kmodes_huang.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_huang.cost_))
print('Training iterations: {}'.format(kmodes_huang.n_iter_))

kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=1)
kmodes_cao.fit(x)

# Print cluster centroids of the trained model.
print('k-modes (Cao) centroids:')
print(kmodes_cao.cluster_centroids_)
# Print training statistics
print('Final training cost: {}'.format(kmodes_cao.cost_))
print('Training iterations: {}'.format(kmodes_cao.n_iter_))

print('Results tables:')
for result in (kmodes_huang, kmodes_cao):
    # Cross-tabulate true classes (last character of each label in y)
    # against the predicted cluster for each sample.
    classtable = np.zeros((4, 4), dtype=int)
    for ii, _ in enumerate(y):
        classtable[int(y[ii][-1]) - 1, result.labels_[ii]] += 1
    print(classtable)
Example #33
def trainModelAndValidate(train, test):
    count = 0
    # select the required columns
    per = pd.DataFrame(np.c_[train.iloc[:, 31:73]])
    # k-modes clustering, starting from 500 clusters and reducing the number
    # of clusters gradually until the cost is minimized: each stage
    # re-clusters the centroids of the previous stage
    print("Cost of K clusters")
    centers = per
    km_final = None
    for n_clusters, init in [(500, 'Huang'), (250, 'Huang'), (125, 'Huang'),
                             (62, 'Huang'), (31, 'Huang'), (15, 'Huang'),
                             (10, 'Cao'), (8, 'Cao')]:
        km_final = KModes(n_clusters=n_clusters, max_iter=1000, init=init,
                          n_init=2, n_jobs=-1)
        model = km_final.fit(centers)
        print("{} clusters: {}".format(n_clusters, model.cost_))
        centers = model.cluster_centroids_
    mfin_clust = centers
    print()

    # The minimum cost is obtained when the number of clusters = 8;
    # refit the final 8-cluster model on the full data to label every user
    mfin = km_final.fit_predict(per)
    fin = pd.DataFrame(mfin)
    # print(mfin_clust)

    # select the required columns
    df1 = train.iloc[:, 20:73]
    # add a new column which has the final classification
    df1['clusters'] = mfin
    # In order to find the similarity between the users, we group the users
    # who belong to the same cluster
    df_fin = df1.groupby(['clusters'])

    fin_0 = df_fin.get_group(0)
    # print(np.std(fin_0['Horror']))

    fin_1 = df_fin.get_group(1)
    # print(np.std(fin_1['Horror']))

    fin_2 = df_fin.get_group(2)

    fin_3 = df_fin.get_group(3)

    fin_4 = df_fin.get_group(4)

    fin_5 = df_fin.get_group(5)

    fin_6 = df_fin.get_group(6)

    fin_7 = df_fin.get_group(7)

    # convert the centroids of a cluster into a list
    mfin_clust = list(mfin_clust)

    for i in range(test.shape[0]):
        row_hobby = list(test.iloc[i, 31:73])
        row_genre = list(test.iloc[i, 20:31])

        # Euclidean distance between the new user's hobby vector and the
        # centroid of each cluster. The calculated distances are stored in a
        # dictionary keyed by cluster number.
        distance = {}

        for i in range(0, 8):
            distance[i] = (math.sqrt(
                sum([(a - b) ** 2 for a, b in zip(mfin_clust[i], row_hobby)])))
        # minimum distance is calculated using the values of the dictionary
        min_clust = min(distance, key=distance.get)
        # the user is classified into the cluster
        df_clust = df_fin.get_group(min_clust)
        # similarity for y and df_clust
        # drop the columns containing movie genre as the similarity between the
        # users is calculated using the hobbies preferences
        df_clust2 = df_clust.drop(['Horror',
                                   'Romantic',
                                   'Comedy',
                                   'Thriller',
                                   'Sci-fi',
                                   'War',
                                   'Fantasy/Fairy tales',
                                   'Western',
                                   'Animated',
                                   'Documentary',
                                   'Action'],
                                  axis=1)
        # insert a new column called index as each user needs to have one
        # unique identity
        ind = list(range(0, len(df_clust2)))
        df_clust2.insert(0, 'Index', ind)

        # dictionary to store the user-user similarity
        xz_dict = {}
        for j in range(0, len(df_clust2)):
            xz = []
            # the list contains the column header and the preferences of the
            # jth row
            xz = list(df_clust2.iloc[j, :].items())
            # print(xz)
            xz1 = []
            # append only the preferences in a new list
            for i in range(1, 43):
                xz1.append(xz[i][1])
            # print(xz1)
            simi = sim(xz1, row_hobby)
            # store the user similarity in a dictionary
            xz_dict[j] = simi
        # find 5 users who are most similar to the new user
        top_5 = sorted(xz_dict, key=xz_dict.get, reverse=True)[:5]

        # dictionary used to store the rating for each genre based on user
        # similarities
        fin_rec = {}
        actual = {}
        # for each genre
        for k in range(1, 12):
            actual[k] = row_genre[k - 1]
            user_rating = []
            sum_sim = 0
            rec = 0
            # append the ratings of the similar users into a list for a
            # particular genre
            for i in top_5:
                user_rating.append(df_clust.iloc[i, k:k + 1].item())
            # calculate the rating of the new user as a similarity-weighted
            # average over the five most similar users
            for rating, j in zip(user_rating, top_5):
                rec = rec + (rating * xz_dict[j])
                sum_sim = sum_sim + xz_dict[j]
            # store the rating in the dictionary created
            fin_rec[k] = rec / sum_sim
        # select the top 3 genres based on rating
        top_3 = sorted(fin_rec, key=fin_rec.get, reverse=True)[:3]
        top_3_actual = sorted(actual, key=actual.get, reverse=True)[:3]
        # Thus recommend the genres to the user.
        for l in top_3:
            if l in top_3_actual:
                count += 1
    print("Accuracy", count / (3 * test.shape[0]))
    return count / (3 * test.shape[0])
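
A hedged usage sketch (the train/test split itself is not shown in the snippet; any split with the same column layout, hobbies in columns 31:73 and genres in 20:31, would do):

# Hypothetical call on an already-split survey DataFrame.
accuracy = trainModelAndValidate(train, test)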