Example #1
 def KModesRatio(self):
     '''
     Type: K-Modes
     Y-axis: No Reaction
     X-axis: Reaction
     '''
     if self.authenticated:
         from kmodes.kmodes import KModes as KMo
         algorithm = KMo(n_clusters=2)
         categories = algorithm.fit_predict(self.allCoord)
         print(algorithm.cluster_centroids_)
         plt.scatter(self.allCoord[categories == 0, 0],
                     self.allCoord[categories == 0, 1],
                     c="green")
         plt.scatter(self.allCoord[categories == 1, 0],
                     self.allCoord[categories == 1, 1],
                     c="red")
         plt.scatter(algorithm.cluster_centroids_[:, 0],
                     algorithm.cluster_centroids_[:, 1],
                     c="black",
                     marker="*")
         for i, txt in enumerate(self.labels):
             plt.annotate(txt, (self.allCoord[i][0], self.allCoord[i][1]))
         plt.ylabel("NO REACTION")
         plt.xlabel("REACTION")
         plt.annotate("NO INFLAMMATION", algorithm.cluster_centroids_[0])
         plt.annotate("CAUSES INFLAMMATION",
                      algorithm.cluster_centroids_[1])
         plt.title("K-Modes: Reaction, No Reaction")
         plt.show()
Example #2
def k_modes(questions):
    # Column names of the (module-level) one-hot encoded survey DataFrame.
    all_columns = list(df.columns)

    # Print the text of each selected question.
    for val in questions:
        key = 'Q' + str(val) + '-0'
        print(str(val) + ' - ' + mapping[key][0])

    # Collect every answer column that belongs to one of the selected questions.
    headers = []
    for q in questions:
        prefix = 'Q' + str(q) + '-'
        for col in all_columns:
            if prefix in col:
                headers.append(col)

    km = KModes(n_clusters=2)

    clusters = km.fit_predict(df[headers])

    # For each centroid, keep the columns whose mode is 1.
    columns = []
    for centroid in km.cluster_centroids_:
        active = []
        for i in range(len(centroid)):
            if centroid[i] == 1:
                active.append(headers[i])
        columns.append(active)

    # Print the active columns and their mapped answer labels per cluster.
    for column in columns:
        answers = [mapping[i][1] for i in column]
        print(column)
        print(answers)
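The function above relies on two module-level objects it never defines: a one-hot encoded survey DataFrame `df` and a `mapping` dict. Below is a minimal, hypothetical stand-in for both, inferred only from how the function indexes them (element 0 of each mapping entry is used as the question text, element 1 as the answer label); the concrete columns and values are made up for illustration.

import pandas as pd
from kmodes.kmodes import KModes

# Illustrative stand-ins for the module-level globals used by k_modes().
df = pd.DataFrame({'Q1-0': [1, 0, 1, 0],
                   'Q1-1': [0, 1, 0, 1]})
mapping = {'Q1-0': ('What is your role?', 'Developer'),
           'Q1-1': ('What is your role?', 'Manager')}

k_modes([1])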
Example #3
def makeClusters(data, year, numClusters):
    km = KModes(n_clusters=numClusters, init="Cao", n_init=1, verbose=1)
    # Keep the subset as a DataFrame (no .values) so its column names can label the centroid table.
    subsetData = data[data["Year"] == year].drop(["Year", "Community Area", "Beat"], axis=1)
    fitClusters = km.fit_predict(subsetData)
    clustersCentroidsData = pd.DataFrame(km.cluster_centroids_)
    clustersCentroidsData.columns = subsetData.columns
    return fitClusters, clustersCentroidsData
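A hypothetical call to the function above; `crimes` is a placeholder DataFrame of categorical crime records, and only the "Year", "Community Area" and "Beat" column names are taken from the function body.

# `crimes` is assumed to be a DataFrame with categorical columns
# including "Year", "Community Area" and "Beat".
labels, centroid_table = makeClusters(crimes, year=2016, numClusters=5)
print(centroid_table)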
Example #4
def cluster():

    # create a DataFrame to hold the categorical data
    df = pd.DataFrame(data)

    # remove all features not appropriate for clustering
    df = df.drop(['CVD_ID', 'Date_Published', 'Date_Modified', 'Vendor', 'Product'], axis=1)

    km = KModes(n_clusters=NUMBER_OF_CLUSTERS, init=CLUSTERING_ALGORITHM, verbose=0)
    km.fit_predict(df)
    centroids = km.cluster_centroids_
    labels = km.labels_
    cost = km.cost_

    # build a DataFrame of the centroids; cluster counts are appended below
    l = pd.DataFrame(centroids, columns=df.columns)

    # add assigned cluster to record
    df['Cluster'] = labels
    clusters = pd.DataFrame(df.groupby('Cluster')['Cluster'].count())
    clusters.rename(columns={'Cluster':'Cluster_Count'}, inplace=True)

    cnt = []
    for i in range(0, len(clusters)):
        cnt.append(clusters.iloc[i][0])

    l['Count'] = cnt
    print("\nTotal Cost of Selected Clustering Hyperparameters: ", cost)
    print("Number of Clusters:  ", NUMBER_OF_CLUSTERS)
    print("Algorithm: ", CLUSTERING_ALGORITHM)
    print("Cluster data is printed to Final_Clusters.csv.")
    print(l.sort_values('Count', ascending=False))
    l.sort_values('Count', ascending=False).to_csv("Data/Final_Clusters.csv", index=False)
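The cluster() function above reads three names it does not define. A hedged sketch of the module-level configuration they imply; the values and the loader are illustrative placeholders, not taken from the source.

NUMBER_OF_CLUSTERS = 8               # passed to KModes(n_clusters=...)
CLUSTERING_ALGORITHM = 'Huang'       # passed to KModes(init=...); 'Cao' and 'random' are also valid
data = load_vulnerability_records()  # placeholder loader returning categorical vulnerability records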
Example #5
 def test_kmodes_predict_soybean(self):
     kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
     kmodes_cao = kmodes_cao.fit(SOYBEAN)
     result = kmodes_cao.predict(SOYBEAN2)
     expected = np.array([2, 1, 3, 0])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #6
def clustering(user_session_subset, chicago_clustering,
               chicago_clustering_labels):
    #THIS WILL EXTRACT THE CHICAGO_ZERO_AND_ONE RESTAURANTS THAT MATCH WITH THE ONES FOUND FROM THE INITIAL SUBSET (required for clustering)
    #get count of user session subset
    user_session_subset_count = pd.crosstab(
        index=user_session_subset['Restaurant_ID'], columns="count")

    mask = np.zeros(len(chicago_clustering), dtype=bool)
    mask[user_session_subset_count['count'].index.values.astype(int)] = True

    chicago_clustering_labels = chicago_clustering_labels[mask]
    chicago_clustering = chicago_clustering[mask]

    #method - Huang, number of clusters - 3, verbose=0 means no textual output (1 prints progress)
    kmodes_huang = KModes(n_clusters=3, init='Huang', verbose=0, n_init=20)
    kmodes_huang.fit(chicago_clustering)

    #this joins the restaurant name
    cluster_results = np.column_stack(
        (chicago_clustering_labels, kmodes_huang.labels_))

    #convert numpy matrix to pandas dataframe
    cluster_result_df = pd.DataFrame(cluster_results)
    cluster_result_df.columns = ['Restaurant', 'Cluster']

    #JOIN THE CLUSTERING RESULTS WITH user_session_subset_count TO GET OUR FINAL RESULTS
    #remove existing indices so the new ones line up and the DataFrames can be joined
    cluster_result_df.reset_index(drop=True, inplace=True)
    user_session_subset_count.reset_index(drop=True, inplace=True)

    #join the cluster results with the restaurant counts
    clusters_with_counts = pd.concat(
        [cluster_result_df, user_session_subset_count], axis=1)

    return clusters_with_counts
Example #7
def k_elbow_plot(fpath, max_k=10):
    """
    Funzione per plottare il grafico "a gomito" che mostra il rapporto tra SSE del modello di clustering
    e il numero di cluster scelti. L'utilità sta nel poter selezionare il numero di k più appropriato
    in base all'ultimo valore k che comporta una buona diminuzione dell'errore ("punta del gomito")
    :param fpath: percorso del dataset processato (vedasi descrizione di argv[1] in cima allo script)
    :param max_k: numero massimo di k che si vuole utilizzare per produrre il grafo
    """
    if not path.isfile(fpath):
        print("Error: could not find specified CSV dataset.")
        return
    if max_k <= 0:
        print("Error: k must be a positive integer.")
        return

    data = refactor_data_frame(pd.read_csv(fpath))
    errors = []
    for k in range(1, max_k + 1):
        kmodes = KModes(n_clusters=k, random_state=42, n_init=1, init="random")
        kmodes.fit(data)
        errors.append(kmodes.cost_)
        print("DONE WITH K=" + str(k))
    plt.figure(figsize=(16, 8))
    plt.plot(range(1, max_k + 1), errors, 'bo-')
    plt.xlabel('#Clusters (K)')
    plt.ylabel('Error (0/1)')
    plt.title("K parameter vs. error for dataset " +
              path.basename(fpath))
    plt.show()
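A small follow-up sketch, not part of the original script, for reading the elbow numerically rather than by eye; it assumes `errors` holds `kmodes.cost_` for k = 1..max_k exactly as built in the loop above.

# Print the cost reduction gained by each extra cluster; pick the k after which
# the improvement flattens out.
drops = [errors[i - 1] - errors[i] for i in range(1, len(errors))]
for k, drop in enumerate(drops, start=2):
    print("k={}: cost drop {}".format(k, drop))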
Example #8
    def run_cluster(self):
        columns = self.board_game_data.columns.tolist()
        columns = [
            c for c in columns if c not in [
                'board_game_id', 'name', 'year', 'minplayer', 'maxplayer',
                'playingtime', 'avgratings', 'designer', 'category',
                'mechanic', 'publisher', 'age', 'rank'
            ]
        ]
        print(columns)
        cluster_df = self.board_game_data[columns]
        km = KModes(n_clusters=15, init='Huang', n_init=10, verbose=1)
        clusters = km.fit_predict(cluster_df)
        print(km.cluster_centroids_)

        centroids = km.cluster_centroids_
        for i in range(centroids.shape[0]):
            if sum(centroids[i, :]) == 0:
                print("\ncluster " + str(i) + ": ")
                print("no cluster")
            else:
                print("\ncluster " + str(i) + ": ")
                cent = centroids[i, :]
                for j in cluster_df.columns[np.nonzero(cent)]:
                    print(j)
Example #9
 def test_kmodes_ninit(self):
     kmodes = KModes(n_init=10, init='Huang')
     self.assertEqual(kmodes.n_init, 10)
     kmodes = KModes(n_init=10)
     self.assertEqual(kmodes.n_init, 1)
     kmodes = KModes(n_init=10, init=np.array([1, 1]))
     self.assertEqual(kmodes.n_init, 1)
Example #10
def run_kmodes(n_clusters=4):
    km_huang = KModes(n_clusters=n_clusters,
                      init="Huang",
                      verbose=1,
                      n_init=2,
                      max_iter=10)
    csv_data = pd.read_csv("kmodes_input.csv")
    input_data = csv_data.iloc[:, 1:]
    roadmap_id = csv_data.iloc[:, 0]
    clusters = km_huang.fit_predict(input_data)

    cluster_df = pd.DataFrame(clusters)
    cluster_df.columns = ["cluster_predicted"]
    cluster_df["roadmap_id"] = roadmap_id

    # # Align cluster_data's total row count with the roadmap ids so rows can be accessed directly without searching
    # # i.e. create as many rows as the total number of roadmaps, plus row 0
    # continuous_id_df = pd.DataFrame(list(range(roadmap_id[roadmap_id.index[-1]] + 1)))
    # continuous_id_df.columns = ["roadmap_id"]
    #
    # cluster_df = pd.merge(cluster_df, continuous_id_df, how="right", on="roadmap_id")
    print(cluster_df)
    # save as csv
    cluster_df.to_csv("clustering_result.csv",
                      sep=",",
                      na_rep="NaN",
                      index=False)
Example #11
 def test_kmodes_predict_soybean_ng(self):
     kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
     kmodes_cao = kmodes_cao.fit(SOYBEAN)
     result = kmodes_cao.predict(SOYBEAN2)
     expected = np.array([2, 1, 3, 0])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Example #12
 def test_kmodes_random_soybean(self):
     kmodes_random = KModes(n_clusters=4,
                            init='random',
                            verbose=2,
                            random_state=42)
     result = kmodes_random.fit(SOYBEAN)
     self.assertIsInstance(result, KModes)
Example #13
    def fit(self, data, verbose=0):
        best_scores = dict(zip(self.metric_names, -np.ones(len(self.metrics))))
        best_clusters = []
        score = dict()
        clustering_options = self.clustering_options
        for n_clusters in range(self.min_clusters, self.max_clusters + 1,
                                self.step):
            clustering_options["n_clusters"] = n_clusters
            km = KModes(**self.clustering_options)
            clusters = km.fit_predict(data)
            for name, metric in zip(self.metric_names, self.metrics):
                if name == "Incluster distances":
                    score[name] = metric(np.array(data),
                                         clusters,
                                         metric=matching_dissim,
                                         centroids=km.cluster_centroids_)
                else:
                    score[name] = metric(np.array(data),
                                         clusters,
                                         metric=matching_dissim)

            if score["Silhouette"] > best_scores["Silhouette"]:
                best_clusters = copy(clusters)
                best_scores = copy(score)
                self.centroids = copy(km.cluster_centroids_)
                self.km = deepcopy(km)
        self.best_scores = best_scores

        return best_clusters, best_scores
Example #14
    def kmode_calculation(self, data):
        """
        This function calculates the centroid using the k-mode algorithm.

        This functiontakes in the cleaned data and returns:

        - Column element mapping dictionary
        - Centroids
        - The output data with classification
        """
        col_dict = {}

        for col in data.columns:
            data[col] = data[col].astype('category')
            col_dict.update({col: dict(enumerate(data[col].cat.categories))})

        # Get all the cols in the DataFrame
        cols = [col for col in data.columns]

        # Transform all values into categorical and numerical values
        for col in cols:
            data[col] = data[col].astype('category')
            data[col] = data[col].cat.codes

        # Run k-modes using the algorithm
        kmodes_method = KModes(n_clusters=self.n_cluster,
                               init=self.init_method,
                               n_init=self.n_iter,
                               verbose=1)
        kmode_result = kmodes_method.fit_predict(data[cols])

        # Attach the output label for each data point
        data['classification'] = pd.Series(kmode_result, index=data.index)

        return col_dict, kmodes_method.cluster_centroids_, data
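A hypothetical follow-up showing what the returned col_dict is for: translating the numeric centroid codes back into the original category labels. The names `model` and `raw_df` are placeholders for an instance of the enclosing class and its cleaned input DataFrame.

col_dict, centroids, labelled = model.kmode_calculation(raw_df)
feature_cols = [c for c in labelled.columns if c != 'classification']
for centroid in centroids:
    # col_dict[col] maps each categorical code back to its original label
    readable = {col: col_dict[col][int(code)] for col, code in zip(feature_cols, centroid)}
    print(readable)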
Example #15
 def KModePercentTotal(self):
     '''
     Type: K-Modes
     Y-axis: % Reactions
     X-axis: # Observations
     '''
     if self.authenticated:
         from kmodes.kmodes import KModes as KMo
         algorithm = KMo(n_clusters=2)
         # partPercent = np.array([np.array([x, percent]) for j in self.stuff for _, x, _, percent in j])
         categories = algorithm.fit_predict(self.percentTotal)
         plt.scatter(self.percentTotal[categories == 0, 0],
                     self.percentTotal[categories == 0, 1],
                     c="green")
         plt.scatter(self.percentTotal[categories == 1, 0],
                     self.percentTotal[categories == 1, 1],
                     c="red")
         plt.scatter(algorithm.cluster_centroids_[:, 0],
                     algorithm.cluster_centroids_[:, 1],
                     c="black",
                     marker="*")
         for i, txt in enumerate(self.labels):
             plt.annotate(
                 txt, (self.percentTotal[i][0], self.percentTotal[i][1]))
         plt.ylabel("PERCENT")
         plt.xlabel("TOTAL")
         plt.annotate("NO INFLAMMATION", algorithm.cluster_centroids_[0])
         plt.annotate("CAUSES INFLAMMATION",
                      algorithm.cluster_centroids_[1])
         plt.title("K-Modes: # Observations, % Reactions")
         plt.show()
Example #16
def fit_kModes(data, n_cluster=2, N_trials=10):
    kmo = KModes(n_clusters=n_cluster,
                 n_init=N_trials,
                 init='Huang',
                 random_state=616)
    clusters = kmo.fit_predict(data)
    cluster_feature_weights = kmo.cluster_centroids_
    return clusters, cluster_feature_weights
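A minimal, hypothetical call to fit_kModes; the toy DataFrame is made up for illustration, and k-modes accepts string categories directly.

import pandas as pd

toy = pd.DataFrame({'color': ['red', 'red', 'blue', 'blue'],
                    'size': ['S', 'M', 'M', 'L']})
labels, centroid_modes = fit_kModes(toy, n_cluster=2, N_trials=5)
print(labels)          # cluster index for each row
print(centroid_modes)  # modal category of each feature per cluster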
Example #17
def clusterBitVec(data, max_clusters=5):
    best_k = findKBitVec(data, max_clusters)
    if best_k == 0:
        return 0, []
    else:
        kmodes = KModes(best_k)
        labels = kmodes.fit_predict(data)
        return best_k, labels
Example #18
 def test_kmodes_cao_soybean(self):
     kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
     result = kmodes_cao.fit_predict(SOYBEAN)
     expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                          1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Example #19
 def test_kmodes_fit_predict(self):
     """Test whether fit_predict interface works the same as fit and predict."""
     kmodes = KModes(n_clusters=4, init='Cao', random_state=42)
     sample_weight = [0.5] * TEST_DATA.shape[0]
     data1 = kmodes.fit_predict(TEST_DATA, sample_weight=sample_weight)
     data2 = kmodes.fit(TEST_DATA,
                        sample_weight=sample_weight).predict(TEST_DATA)
     assert_cluster_splits_equal(data1, data2)
Example #20
 def test_kmodes_predict_soybean_jaccard_dissim_label(self):
     kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                           cat_dissim=jaccard_dissim_label, random_state=42)
     kmodes_huang = kmodes_huang.fit(TEST_DATA)
     result = kmodes_huang.fit_predict(TEST_DATA_PREDICT)
     expected = np.array([1, 0, 1, 2])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #21
 def test_kmodes_cao_soybean_ng(self):
     kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2,
                         cat_dissim=ng_dissim)
     result = kmodes_cao.fit_predict(SOYBEAN)
     expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                          1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #22
 def test_kmodes_huang_soybean_parallel(self):
     kmodes_huang = KModes(n_clusters=4, n_init=4, init='Huang', verbose=2,
                           random_state=42, n_jobs=4)
     result = kmodes_huang.fit_predict(SOYBEAN)
     expected = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2,
                          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #23
 def test_kmodes_huang_soybean_ng(self):
     kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                           cat_dissim=ng_dissim, random_state=42)
     result = kmodes_huang.fit_predict(SOYBEAN)
     expected = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
                          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #24
def clusterCreationKmode():
    # random categorical data
    data = np.random.choice(20, (100, 10))

    km = KModes(n_clusters=4, init='Huang', n_init=5, verbose=1)

    clusters = km.fit_predict(data)

    return HttpResponse(km.cluster_centroids_)
Example #25
 def test_kmodes_huang_soybean(self):
     np.random.seed(42)
     kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2)
     result = kmodes_huang.fit_predict(SOYBEAN)
     expected = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                          0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1,
                          2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Example #26
 def test_kmodes_nunique_nclusters(self):
     data = np.array([[0, 1], [0, 1], [0, 1], [0, 2], [0, 2], [0, 2]])
     np.random.seed(42)
     kmodes_cao = KModes(n_clusters=6, init='Cao', verbose=2)
     result = kmodes_cao.fit_predict(data, categorical=[1])
     expected = np.array([0, 0, 0, 1, 1, 1])
     assert_cluster_splits_equal(result, expected)
     np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
                                   np.array([[0, 1], [0, 2]]))
Example #27
def do_clustering(newDF, number_cluster):
    clusters = []
    randomm = randint(2, 10)
    rand_clusters = randint(number_cluster, 2 * number_cluster)
    # NOTE: the cluster count is hard-coded to 4 here; rand_clusters above is computed but not used
    km = KModes(n_clusters=4, init='random', n_init=randomm, verbose=0)
    km.fit_predict(newDF)
    clusters = list(km.labels_)
    print(len(clusters))
    return clusters
Example #28
 def test_pickle_fitted(self):
     kmodes_huang = KModes(n_clusters=4,
                           n_init=2,
                           init='Huang',
                           verbose=2,
                           random_state=42)
     model = kmodes_huang.fit(SOYBEAN)
     serialized = pickle.dumps(model)
     self.assertTrue(isinstance(pickle.loads(serialized), model.__class__))
Example #29
def kmodes_samping(df):
    km = KModes(n_clusters=100, init='Huang', n_init=5, verbose=1, n_jobs=-1)
    #model = KPrototypes(n_clusters=100, init='Huang', n_init=5, verbose=1, n_jobs=1)
    data = df[[
        'PANDAID', 'JOBSTATUS', 'COMPUTINGSITE', 'FINAL_STATUS', 'IS_SCOUT',
        'DURATION'
    ]].values
    clusters = km.fit_predict(data)
    centers = [row[0] for row in km.cluster_centroids_]
    return df[df['PANDAID'].isin(centers)]
Example #30
def do_clustering(newDF, number_cluster):
    clusters = []
    randomm = randint(20, 100)

    km = KModes(n_clusters=number_cluster,
                init='Huang',
                n_init=randomm,
                verbose=0)
    km.fit_predict(newDF)
    clusters = list(km.labels_)
    return clusters
Example #31
 def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
     kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                           cat_dissim=jaccard_dissim_binary, random_state=42)
     # binary encoded variables are required
     bin_variables = SOYBEAN.astype(bool).astype(int)
     result = kmodes_huang.fit_predict(bin_variables)
     expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                          0, 3, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3,
                          3, 3, 1, 1, 3, 1, 3, 1, 1])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint16))
Example #32
 def test_kmodes_huang_soybean(self):
     np.random.seed(42)
     kmodes_huang = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2)
     result = kmodes_huang.fit_predict(SOYBEAN)
     expected = np.array([
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2,
         1, 2, 1
     ])
     assert_cluster_splits_equal(result, expected)
     self.assertTrue(result.dtype == np.dtype(np.uint8))
Example #33
def kmode(data, ncluster, n_init, verbose):
    # kmode for categorical data

    # random categorical data (note: this overwrites the `data` argument with demo data)
    data = np.random.choice(20, (100, 10))
    km = KModes(n_clusters=ncluster,
                init='Huang',
                n_init=n_init,
                verbose=verbose)

    clusters = km.fit_predict(data)
    return clusters
Example #34
 def test_kmodes_empty_init_cluster_soybean(self):
     # Check if the clustering does not crash in case of an empty cluster.
     init_vals = np.array(
         [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
           0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
          [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
           0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
          [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
           1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
          [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
           0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
     kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
     result = kmodes_init.fit(SOYBEAN)
     self.assertIsInstance(result, KModes)
Example #35
    def test_kmodes_init_soybean(self):
        init_vals = np.array(
            [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
              0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
             [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
              0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
              1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
              0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
        kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
        result = kmodes_init.fit_predict(SOYBEAN)
        expected = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                             1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        assert_cluster_splits_equal(result, expected)

        # 5 initial centroids, 4 n_clusters
        init_vals = np.array(
            [[0, 1],
             [4, 0],
             [4, 0],
             [3, 0],
             [3, 0]])
        kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
        with self.assertRaises(AssertionError):
            kmodes_init.fit(SOYBEAN)

        # wrong number of attributes
        init_vals = np.array(
            [0, 1, 2, 3])
        kmodes_init = KModes(n_clusters=4, init=init_vals, verbose=2)
        with self.assertRaises(AssertionError):
            kmodes_init.fit(SOYBEAN)
Example #36
 def test_kmodes_nunique_nclusters_ng(self):
     data = np.array([
         [0, 1],
         [0, 1],
         [0, 1],
         [0, 2],
         [0, 2],
         [0, 2]
     ])
     np.random.seed(42)
     kmodes_cao = KModes(n_clusters=6, init='Cao', verbose=2, cat_dissim=ng_dissim)
     result = kmodes_cao.fit_predict(data, categorical=[1])
     expected = np.array([0, 0, 0, 1, 1, 1])
     assert_cluster_splits_equal(result, expected)
     np.testing.assert_array_equal(kmodes_cao.cluster_centroids_,
                                   np.array([[0, 1],
                                             [0, 2]]))
Example #37
 def test_kmodes_predict_unfitted(self):
     kmodes_cao = KModes(n_clusters=4, init='Cao', verbose=2)
     with self.assertRaises(AssertionError):
         kmodes_cao.predict(SOYBEAN)
     with self.assertRaises(AttributeError):
         kmodes_cao.cluster_centroids_
Example #38
 def test_kmodes_random_soybean(self):
     kmodes_random = KModes(n_clusters=4, init='random', verbose=2)
     result = kmodes_random.fit(SOYBEAN)
     self.assertIsInstance(result, KModes)