コード例 #1
0
def cluster():
    """Cluster the module-level ``data`` with k-modes and report the result.

    Builds a DataFrame from the global ``data``, drops the identifier-like
    columns, fits ``KModes`` configured by ``NUMBER_OF_CLUSTERS`` and
    ``CLUSTERING_ALGORITHM``, prints a summary and writes the centroids
    together with their member counts to ``Data/Final_Clusters.csv``.

    Returns:
        None. Results are printed and written to disk.
    """
    # create a DataFrame to hold the categorical data
    df = pd.DataFrame(data)

    # remove all features not appropriate for clustering
    df = df.drop(['CVD_ID', 'Date_Published', 'Date_Modified', 'Vendor', 'Product'], axis=1)

    km = KModes(n_clusters=NUMBER_OF_CLUSTERS, init=CLUSTERING_ALGORITHM, verbose=0)
    km.fit_predict(df)
    centroids = km.cluster_centroids_
    labels = km.labels_
    cost = km.cost_

    # one row per centroid, one column per clustered feature
    summary = pd.DataFrame(centroids, columns=df.columns)

    # Count members by cluster id and align the counts to the centroid rows.
    # Reindexing (instead of reading group sizes positionally) keeps the
    # counts attached to the right centroid even if a cluster id has no
    # members, in which case its count is 0.
    counts = (
        pd.Series(labels, dtype='int64')
        .value_counts()
        .reindex(range(len(summary)), fill_value=0)
    )
    summary['Count'] = counts.to_numpy()

    ranked = summary.sort_values('Count', ascending=False)
    print("\nTotal Cost of Selected Clustering Hyperparameters: ", cost)
    print("Number of Clusters:  ", NUMBER_OF_CLUSTERS)
    print("Algorithm: ", CLUSTERING_ALGORITHM)
    print("Cluster data is printed to Final_Clusters.csv.")
    print(ranked)
    ranked.to_csv("Data/Final_Clusters.csv", index=False)
コード例 #2
0
ファイル: compare_cost.py プロジェクト: swsnu/swpp2020-team6
def compare_cost(build_tag_sample, build_roadmap_data):
    """Plot k-modes cost against cluster count for the Cao and Huang inits.

    Optionally regenerates the input files first, then reads
    ``kmodes_input.csv`` (ignoring its first column) and, for 3 to 6
    clusters, fits one model per init method and plots the resulting costs.

    Args:
        build_tag_sample: when truthy, ``preprocess_tags()`` is run first.
        build_roadmap_data: when truthy, ``preprocess_roadmaps()`` is run first.
    """
    # Rebuild tags.csv from the currently existing tags when requested.
    if build_tag_sample:
        preprocess_tags()

    # Rebuild kmodes_input.csv from the currently existing roadmaps.
    if build_roadmap_data:
        preprocess_roadmaps()

    # Everything but the first column is clustering input.
    features = pd.read_csv("kmodes_input.csv").iloc[:, 1:]

    x_axis = np.arange(3, 7)

    # One cost curve per initialisation method, Cao first, then Huang.
    for init_method in ("Cao", "Huang"):
        costs = []
        for k in range(3, 7):
            model = KModes(n_clusters=k, init=init_method, verbose=1)
            model.fit_predict(features)
            costs.append(model.cost_)
        plt.plot(x_axis, costs)
コード例 #3
0
ファイル: test.py プロジェクト: Brnawyah/Project1
def do_clustering(newDF, number_cluster):
    """Cluster ``newDF`` with randomly initialised k-modes.

    Args:
        newDF: the categorical data to cluster.
        number_cluster: number of clusters to form. Previously this argument
            was ignored and 4 clusters were always produced.

    Returns:
        A list with one cluster label per row of ``newDF``. The number of
        labels is also printed.
    """
    # number of restarts is chosen by ``randint(2, 10)`` on every call
    restarts = randint(2, 10)
    km = KModes(n_clusters=number_cluster, init='random', n_init=restarts, verbose=0)
    km.fit_predict(newDF)
    clusters = list(km.labels_)
    print(len(clusters))
    return clusters
コード例 #4
0
ファイル: RSTIdaone.py プロジェクト: Brnawyah/Project1
def do_clustering(newDF, number_cluster):
    """Cluster ``newDF`` with Huang-initialised k-modes.

    The number of restarts (``n_init``) is chosen by ``randint(20, 100)`` on
    every call.

    Returns:
        A list with one cluster label per row of ``newDF``.
    """
    model = KModes(
        n_clusters=number_cluster,
        init='Huang',
        n_init=randint(20, 100),
        verbose=0,
    )
    model.fit_predict(newDF)
    return list(model.labels_)
コード例 #5
0
ファイル: pipelineSteps.py プロジェクト: Vlets/MScThesisRepo
    def cluster_data(self, data_frame, number_of_segments=10):
        """Cluster ``data_frame`` with Cao-initialised k-modes.

        Every value is converted to ``str`` before clustering.

        Args:
            data_frame: the data to cluster.
            number_of_segments: number of clusters to form.

        Returns:
            A DataFrame holding the cluster centroids, with the same column
            names as ``data_frame``.
        """
        as_text = data_frame.astype(str)

        model = KModes(n_clusters=number_of_segments, init='Cao', verbose=1)
        model.fit_predict(as_text)

        centroid_table = pd.DataFrame(model.cluster_centroids_,
                                      columns=list(as_text.columns.values))
        print("Step 7/7 - Clustering, done...")
        return centroid_table
コード例 #6
0
def _resolve_calinski_harabasz():
    """Return scikit-learn's Calinski-Harabasz scorer under whichever name exists."""
    scorer = getattr(metrics, "calinski_harabasz_score", None)
    if scorer is None:
        # Releases that predate the spelling fix only ship the old name.
        scorer = metrics.calinski_harabaz_score
    return scorer


def _search_kmodes_cluster_count(data, candidates, n_init):
    """Fit Huang-initialised KModes per candidate count and print the best score."""
    scorer = _resolve_calinski_harabasz()
    max_score = 0
    opti_n_clusters = 0
    for n in candidates:
        kmodes = KModes(n_clusters=n, init="Huang", n_init=n_init, verbose=1)
        clusters = kmodes.fit_predict(data)
        score = scorer(data, clusters)
        print("Calinski-Harabasz Score——", "n_clusters=", n, "score:",
              score)
        if max_score < score:
            max_score = score
            opti_n_clusters = n
    print("max_score:", max_score, "opti_n_clusters:", opti_n_clusters)


def opti_para_select(cluster_name, data):
    """
    Search for the best clustering parameters by Calinski-Harabasz score.

    :param cluster_name: clustering method selector; either the
        ``SpectralClustering`` class itself or one of the strings
        ``"k_modes"`` / ``"k_means"``. Any other value does nothing.
    :param data: the data to be clustered
    :return: None; the best score and parameters are printed
    """
    if cluster_name == SpectralClustering:
        scorer = _resolve_calinski_harabasz()
        max_score = 0
        opti_gamma, opti_n_clusters = 0, 0
        for gamma in (0.01, 0.1, 1):
            for n_clusters in (15, 20, 25, 30):
                clusters = SpectralClustering(n_clusters=n_clusters,
                                              gamma=gamma).fit_predict(data)
                score = scorer(data, clusters)
                if max_score < score:
                    max_score = score
                    opti_gamma, opti_n_clusters = gamma, n_clusters
        print("max_score:", max_score, "opti_gamma:", opti_gamma,
              "opti_n_clusters:", opti_n_clusters)

    if cluster_name == "k_modes":
        _search_kmodes_cluster_count(
            data, [30, 40, 50, 60, 70, 80, 90, 100], n_init=5)

    if cluster_name == "k_means":
        # NOTE(review): this branch is selected by "k_means" but fits KModes,
        # exactly as the original did -- confirm whether KMeans was intended.
        _search_kmodes_cluster_count(data, range(2, 30), n_init=10)
コード例 #7
0
def get_cluster(df=None):
    """
    this function choose the best number of cluster and return an cluster algo
    Parameters
    ----------
    df : pandas.DataFrame, optional
        data frame of features that used to cluster. When omitted,
        ``choose_feature()`` is called at call time (it used to be evaluated
        once, when the function was defined).
    Returns
    ----------
    km:
        an unfitted KModes instance configured with the selected number of
        clusters and init method
    """
    if df is None:
        df = choose_feature()

    # choosing best number of cluster
    hyperparams = {"n_clusters": range(2, 11), "init": ["Huang", "Cao"]}
    candidates = list(hyperparams["n_clusters"])

    para_cost = {}

    for init in hyperparams["init"]:
        cost = []
        for n in candidates:
            km = KModes(n_clusters=n,
                        init=init,
                        n_init=1,
                        verbose=0,
                        random_state=1)
            km.fit_predict(df)
            cost.append(km.cost_)

        # Find the first step whose relative cost decrease drops below 2%;
        # the candidate just before that step is the elbow.
        first_slow = None
        for pos, (prev, cur) in enumerate(zip(cost, cost[1:]), start=1):
            # a zero cost cannot decrease any further, so count it as "slow"
            ratio = (prev - cur) / prev if prev else 0.0
            if ratio < 0.02:
                first_slow = pos
                break

        # no slow step found: keep the largest candidate
        idx = first_slow - 1 if first_slow is not None else len(cost) - 1
        para_cost[(init, candidates[idx])] = cost[idx]

    # lowest elbow cost across the init methods wins
    best_init, best_k = min(para_cost, key=para_cost.get)
    best_para_dict = {"n_clusters": best_k, "init": best_init}

    # build the final model; it is returned unfitted
    km = KModes(**best_para_dict, n_init=3, verbose=0)

    return km
コード例 #8
0
 def KModePercentTotal(self):
     '''
     Two-cluster K-Modes scatter plot of ``self.percentTotal``.

     Y-axis: % Reactions
     X-axis: # Observations

     Does nothing unless ``self.authenticated`` is truthy.
     '''
     if not self.authenticated:
         return

     from kmodes.kmodes import KModes as KMo
     model = KMo(n_clusters=2)
     assignments = model.fit_predict(self.percentTotal)

     # members of cluster 0 in green, cluster 1 in red
     for cluster_id, colour in ((0, "green"), (1, "red")):
         plt.scatter(self.percentTotal[assignments == cluster_id, 0],
                     self.percentTotal[assignments == cluster_id, 1],
                     c=colour)

     # centroids as black stars
     plt.scatter(model.cluster_centroids_[:, 0],
                 model.cluster_centroids_[:, 1],
                 c="black",
                 marker="*")

     # label every point with its entry from self.labels
     for idx, text in enumerate(self.labels):
         point = (self.percentTotal[idx][0], self.percentTotal[idx][1])
         plt.annotate(text, point)

     plt.ylabel("PERCENT")
     plt.xlabel("TOTAL")
     plt.annotate("NO INFLAMMATION", model.cluster_centroids_[0])
     plt.annotate("CAUSES INFLAMMATION", model.cluster_centroids_[1])
     plt.title("K-Modes: # Observations, % Reactions")
     plt.show()
コード例 #9
0
ファイル: train_data.py プロジェクト: swsnu/swpp2020-team6
def run_kmodes(n_clusters=4):
    """Cluster the rows of ``kmodes_input.csv`` and save the assignments.

    The first CSV column is used as the roadmap id and the remaining columns
    as clustering input. The resulting table (predicted cluster plus roadmap
    id) is printed and written to ``clustering_result.csv``.

    Args:
        n_clusters: number of clusters to form.
    """
    csv_data = pd.read_csv("kmodes_input.csv")
    roadmap_id = csv_data.iloc[:, 0]
    features = csv_data.iloc[:, 1:]

    model = KModes(n_clusters=n_clusters,
                   init="Huang",
                   verbose=1,
                   n_init=2,
                   max_iter=10)
    predicted = model.fit_predict(features)

    cluster_df = pd.DataFrame(predicted, columns=["cluster_predicted"])
    cluster_df["roadmap_id"] = roadmap_id

    # A disabled variant right-merged this table onto a continuous range of
    # roadmap ids (one row per id, including row 0) so that a roadmap could
    # be looked up directly by id without searching.
    print(cluster_df)

    # save as csv
    cluster_df.to_csv("clustering_result.csv",
                      sep=",",
                      na_rep="NaN",
                      index=False)
コード例 #10
0
ファイル: test_kmodes.py プロジェクト: ashishyadavppe/kmodes
    def test_kmodes_init_soybean(self):
        """Explicit initial centroids: a valid set reproduces the expected
        SOYBEAN split, malformed sets make ``fit`` raise AssertionError."""
        valid_centroids = np.array(
            [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
              0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
             [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
              0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
              1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
              0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
        model = KModes(n_clusters=4, init=valid_centroids, verbose=2)
        labels = model.fit_predict(SOYBEAN)
        wanted = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        assert_cluster_splits_equal(labels, wanted)

        # 5 initial centroids, 4 n_clusters
        too_many_centroids = np.array([[0, 1], [4, 0], [4, 0], [3, 0], [3, 0]])
        model = KModes(n_clusters=4, init=too_many_centroids, verbose=2)
        self.assertRaises(AssertionError, model.fit, SOYBEAN)

        # wrong number of attributes
        flat_centroids = np.array([0, 1, 2, 3])
        model = KModes(n_clusters=4, init=flat_centroids, verbose=2)
        self.assertRaises(AssertionError, model.fit, SOYBEAN)
コード例 #11
0
    def fit(self, data, verbose=0):
        """Try a range of cluster counts and keep the best one by Silhouette.

        For every count from ``self.min_clusters`` to ``self.max_clusters``
        (step ``self.step``) a KModes model is fitted and every configured
        metric is evaluated. The run with the highest "Silhouette" score is
        remembered in ``self.centroids``, ``self.km`` and ``self.best_scores``.

        Note: ``n_clusters`` is written into ``self.clustering_options`` on
        every iteration. ``verbose`` is accepted but not used.

        Returns:
            Tuple ``(best_clusters, best_scores)``.
        """
        best_scores = dict(
            zip(self.metric_names, np.full(len(self.metrics), -1.0)))
        best_clusters = []
        score = {}
        # alias of the shared options dict: updates below are visible on self
        options = self.clustering_options

        candidate_counts = range(self.min_clusters, self.max_clusters + 1,
                                 self.step)
        for n_clusters in candidate_counts:
            options["n_clusters"] = n_clusters
            km = KModes(**self.clustering_options)
            clusters = km.fit_predict(data)

            for name, metric in zip(self.metric_names, self.metrics):
                # only the in-cluster distance metric is given the centroids
                extra = {}
                if name == "Incluster distances":
                    extra["centroids"] = km.cluster_centroids_
                score[name] = metric(np.array(data),
                                     clusters,
                                     metric=matching_dissim,
                                     **extra)

            if not score["Silhouette"] > best_scores["Silhouette"]:
                continue
            best_clusters = copy(clusters)
            best_scores = copy(score)
            self.centroids = copy(km.cluster_centroids_)
            self.km = deepcopy(km)

        self.best_scores = best_scores
        return best_clusters, best_scores
コード例 #12
0
 def KModesRatio(self):
     '''
     Two-cluster K-Modes scatter plot of ``self.allCoord``.

     Y-axis: No Reaction
     X-axis: Reaction

     Does nothing unless ``self.authenticated`` is truthy.
     '''
     if not self.authenticated:
         return

     from kmodes.kmodes import KModes as KMo
     model = KMo(n_clusters=2)
     assignments = model.fit_predict(self.allCoord)
     print(model.cluster_centroids_)

     # members of cluster 0 in green, cluster 1 in red
     for cluster_id, colour in ((0, "green"), (1, "red")):
         plt.scatter(self.allCoord[assignments == cluster_id, 0],
                     self.allCoord[assignments == cluster_id, 1],
                     c=colour)

     # centroids as black stars
     plt.scatter(model.cluster_centroids_[:, 0],
                 model.cluster_centroids_[:, 1],
                 c="black",
                 marker="*")

     # label every point with its entry from self.labels
     for idx, text in enumerate(self.labels):
         point = (self.allCoord[idx][0], self.allCoord[idx][1])
         plt.annotate(text, point)

     plt.ylabel("NO REACTION")
     plt.xlabel("REACTION")
     plt.annotate("NO INFLAMMATION", model.cluster_centroids_[0])
     plt.annotate("CAUSES INFLAMMATION", model.cluster_centroids_[1])
     plt.title("K-Modes: Reaction, No Reaction")
     plt.show()
コード例 #13
0
    def kmode_calculation(self, data):
        """
        Run k-modes on the cleaned data and label every row.

        Each column of ``data`` is converted in place to its integer category
        codes, and a ``classification`` column with the assigned cluster is
        added to the same DataFrame.

        Returns a tuple of:

        - Column element mapping dictionary (column -> {code: category})
        - Centroids
        - The output data with classification
        """
        cols = list(data.columns)
        col_dict = {}

        # Encode each column as category codes, remembering how to decode them.
        for col in cols:
            as_category = data[col].astype('category')
            col_dict[col] = dict(enumerate(as_category.cat.categories))
            data[col] = as_category.cat.codes

        # Run k-modes using the configured algorithm settings
        model = KModes(n_clusters=self.n_cluster,
                       init=self.init_method,
                       n_init=self.n_iter,
                       verbose=1)
        assigned = model.fit_predict(data[cols])

        # Attach the output label for each data point
        data['classification'] = pd.Series(assigned, index=data.index)

        return col_dict, model.cluster_centroids_, data
コード例 #14
0
def k_modes(questions):
    """Cluster the answer columns of the given questions into two groups.

    Uses the module-level ``df`` and ``mapping``. Prints each question with
    its ``mapping`` entry, fits a two-cluster KModes on every ``df`` column
    whose name contains ``Q<question>-``, then prints, per centroid, the
    columns whose centroid value equals 1 together with their ``mapping``
    descriptions.
    """
    all_columns = list(df.columns)

    for question in questions:
        key = 'Q' + str(question) + '-0'
        print(str(question) + ' - ' + mapping[key][0])

    # every column that belongs to one of the requested questions
    headers = [
        name
        for question in questions
        for name in all_columns
        if 'Q' + str(question) + '-' in name
    ]

    km = KModes(n_clusters=2)
    km.fit_predict(df[headers])

    # per centroid: the columns whose centroid value is 1
    columns = [
        [headers[pos] for pos in range(0, len(centroid)) if centroid[pos] == 1]
        for centroid in km.cluster_centroids_
    ]

    for column in columns:
        descriptions = [mapping[name][1] for name in column]
        print(column)
        print(descriptions)
コード例 #15
0
ファイル: test_kmodes.py プロジェクト: yongledang/kmodes
    def test_kmodes_init_soybean(self):
        """Explicit initial centroids: a valid set reproduces the expected
        SOYBEAN split, malformed sets make ``fit`` raise AssertionError."""
        valid_centroids = np.array(
            [[0, 1, 2, 1, 0, 3, 1, 1, 0, 2, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 1, 2,
              0, 0, 0, 0, 0, 3, 4, 0, 0, 0, 0, 0, 1],
             [4, 0, 0, 1, 1, 1, 3, 1, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 1, 0, 3,
              0, 0, 0, 2, 1, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 2, 2, 0, 0, 0, 1, 0, 3, 0,
              1, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
             [3, 0, 2, 0, 1, 3, 1, 2, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1,
              0, 1, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 0]])
        model = KModes(n_clusters=4, init=valid_centroids, verbose=2)
        labels = model.fit_predict(SOYBEAN)
        wanted = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                           1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
        assert_cluster_splits_equal(labels, wanted)

        # 5 initial centroids, 4 n_clusters
        too_many_centroids = np.array([[0, 1], [4, 0], [4, 0], [3, 0], [3, 0]])
        model = KModes(n_clusters=4, init=too_many_centroids, verbose=2)
        self.assertRaises(AssertionError, model.fit, SOYBEAN)

        # wrong number of attributes
        flat_centroids = np.array([0, 1, 2, 3])
        model = KModes(n_clusters=4, init=flat_centroids, verbose=2)
        self.assertRaises(AssertionError, model.fit, SOYBEAN)
コード例 #16
0
    def run_cluster(self):
        """Cluster the board-game feature columns into 15 k-modes clusters.

        All columns of ``self.board_game_data`` except a fixed list of
        descriptive fields are clustered. For every centroid the names of the
        columns with a non-zero centroid value are printed, or "no cluster"
        when the centroid sums to zero.
        """
        excluded = (
            'board_game_id', 'name', 'year', 'minplayer', 'maxplayer',
            'playingtime', 'avgratings', 'designer', 'category',
            'mechanic', 'publisher', 'age', 'rank',
        )
        columns = [
            name for name in self.board_game_data.columns.tolist()
            if name not in excluded
        ]
        print(columns)

        cluster_df = self.board_game_data[columns]
        km = KModes(n_clusters=15, init='Huang', n_init=10, verbose=1)
        km.fit_predict(cluster_df)
        centroids = km.cluster_centroids_
        print(centroids)

        for i in range(centroids.shape[0]):
            cent = centroids[i, :]
            print("\ncluster " + str(i) + ": ")
            if sum(cent) == 0:
                print("no cluster")
                continue
            # list the features that are "on" in this centroid
            for feature in cluster_df.columns[np.nonzero(cent)]:
                print(feature)
コード例 #17
0
    def kmodes(self, K=20, N=int(1e5), T=50, type='huang', save=True):
        """Cluster ``self.to_numpy()`` with k-modes.

        Args:
            K: number of clusters.
            N: unused; kept for interface compatibility.
            T: unused; kept for interface compatibility.
            type: model variant -- ``'huang'`` (Huang init), ``'huang_ng'``
                (Huang init with the ``ng_dissim`` dissimilarity) or
                ``'cao'`` (Cao init).
            save: when truthy, the fitted model is stored through
                ``self.save(model, 'Clustering_kmodes_model')``.

        Returns:
            Tuple ``(centroids, labels)`` of the fitted model.

        Raises:
            ValueError: if ``type`` is not one of the supported variants.
                Previously this surfaced as an UnboundLocalError.
        """
        # Validate the variant first so a typo fails fast with a clear
        # message instead of an unbound ``model`` further down.
        if type == 'huang':
            options = dict(init='Huang', n_init=1, verbose=2)
        elif type == 'huang_ng':
            options = dict(init='Huang',
                           cat_dissim=ng_dissim,
                           n_init=1,
                           verbose=1)
        elif type == 'cao':
            options = dict(init='Cao', verbose=2)
        else:
            raise ValueError(
                "Unknown k-modes type %r; expected 'huang', 'huang_ng' or 'cao'"
                % (type,))

        data = self.to_numpy()

        model = KModes(n_clusters=K, **options)
        model.fit_predict(data)
        centroids = model.cluster_centroids_
        labels = model.labels_
        if save:
            self.save(model, 'Clustering_kmodes_model')
        return centroids, labels
コード例 #18
0
def makeClusters(data, year, numClusters):
    """Cluster one year's records with Cao-initialised k-modes.

    Args:
        data: DataFrame containing at least the columns ``Year``,
            ``Community Area`` and ``Beat``.
        year: value of ``Year`` selecting the rows to cluster.
        numClusters: number of clusters to form.

    Returns:
        Tuple ``(fitClusters, clustersCentroidsData)``: the label per selected
        row, and a DataFrame of centroids whose columns are the clustered
        feature names.
    """
    km = KModes(n_clusters=numClusters, init="Cao", n_init=1, verbose=1)

    # Keep the DataFrame so its column names stay available for the centroid
    # table; the original took ``.values`` here and then failed on
    # ``subsetData.columns`` because an ndarray has no such attribute.
    subsetData = data[data["Year"] == year].drop(
        ["Year", "Community Area", "Beat"], axis=1)

    fitClusters = km.fit_predict(subsetData.values)
    clustersCentroidsData = pd.DataFrame(km.cluster_centroids_,
                                         columns=subsetData.columns)
    return fitClusters, clustersCentroidsData
コード例 #19
0
def cat_clust(citycompile):
    ''' Clustering for categorical (object-dtype) features.
    Args:
        citycompile (Dataframe): Dataframe with CDP questionaire responses compiled at city level

    Output:
        CatClust_Lplot.jpg: plot of cost vs number of clusters (1 to 5), used to find the elbow.
        catclustresults.csv: csv file saved to data path with the centroids of a 3-cluster model.
    '''
    # only the object-dtype columns take part in the clustering
    citycatvars = citycompile.select_dtypes(include='object')
    catcols = citycatvars.columns

    # cost of a Cao-initialised model for each candidate cluster count
    candidate_counts = range(1, 6)
    cost = []
    for k in candidate_counts:
        trial = KModes(n_clusters=k, init="Cao", n_init=1)
        trial.fit_predict(citycatvars)
        cost.append(trial.cost_)

    # plot cost vs clusters
    plt.plot(np.array(candidate_counts), cost)
    plt.title('Cost vs Number of Clusters')
    plt.xlabel('Clusters')
    plt.ylabel('Cost')
    # saving to flask app static/images folder
    plot_path = Path(PROJECT_HOME, 'app', 'static', 'images',
                     'CatClust_Lplot.jpg')
    plt.savefig(plot_path)
    plt.close()

    # final model with a fixed number of clusters
    km = KModes(n_clusters=3, init='Cao', n_init=11)
    km.fit_predict(citycatvars)

    # one row per cluster mode, numbered from 1 in a leading 'Cluster' column
    clustdf = pd.DataFrame(data=km.cluster_centroids_, columns=catcols)
    clustdf = clustdf.reset_index().rename(columns={'index': 'Cluster'})
    clustdf.Cluster = clustdf.Cluster + 1
    clustdf.to_csv(Path(DATA_PATH, 'catclustresults.csv'), index=False)
コード例 #20
0
ファイル: test_kmodes.py プロジェクト: harry-b-harish/kmodes
 def test_kmodes_fit_predict(self):
     """Check that fit_predict gives the same split as fit followed by predict."""
     weights = [0.5] * TEST_DATA.shape[0]
     model = KModes(n_clusters=4, init='Cao', random_state=42)

     via_fit_predict = model.fit_predict(TEST_DATA, sample_weight=weights)
     refitted = model.fit(TEST_DATA, sample_weight=weights)
     via_predict = refitted.predict(TEST_DATA)

     assert_cluster_splits_equal(via_fit_predict, via_predict)
コード例 #21
0
ファイル: test_kmodes.py プロジェクト: ashishyadavppe/kmodes
 def test_kmodes_cao_soybean_ng(self):
     """Cao init with the ng_dissim dissimilarity on SOYBEAN: expected split and uint8 labels."""
     model = KModes(n_clusters=4, init='Cao', verbose=2, cat_dissim=ng_dissim)
     labels = model.fit_predict(SOYBEAN)
     wanted = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                        1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
     assert_cluster_splits_equal(labels, wanted)
     labels_are_uint8 = labels.dtype == np.dtype(np.uint8)
     self.assertTrue(labels_are_uint8)
コード例 #22
0
def clusterBitVec(data, max_clusters=5):
    """Cluster ``data`` with the cluster count chosen by ``findKBitVec``.

    Returns:
        ``(0, [])`` when ``findKBitVec`` reports 0 clusters, otherwise
        ``(best_k, labels)`` from a KModes model with ``best_k`` clusters.
    """
    best_k = findKBitVec(data, max_clusters)
    # guard: nothing to cluster
    if best_k == 0:
        return 0, []
    model = KModes(best_k)
    return best_k, model.fit_predict(data)
def fit_kModes(data, n_cluster=2, N_trials=10):
    """Fit Huang-initialised k-modes with a fixed random seed (616).

    Args:
        data: the data to cluster.
        n_cluster: number of clusters to form.
        N_trials: number of initialisations (``n_init``).

    Returns:
        Tuple ``(labels, centroids)`` of the fitted model.
    """
    model = KModes(
        n_clusters=n_cluster,
        n_init=N_trials,
        init='Huang',
        random_state=616,
    )
    labels = model.fit_predict(data)
    return labels, model.cluster_centroids_
コード例 #24
0
ファイル: test_kmodes.py プロジェクト: yongledang/kmodes
 def test_kmodes_cao_soybean(self):
     """Cao init on SOYBEAN: expected split and uint16 labels."""
     model = KModes(n_clusters=4, init='Cao', verbose=2)
     labels = model.fit_predict(SOYBEAN)
     wanted = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
                        1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0,
                        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
     assert_cluster_splits_equal(labels, wanted)
     labels_are_uint16 = labels.dtype == np.dtype(np.uint16)
     self.assertTrue(labels_are_uint16)
コード例 #25
0
ファイル: test_kmodes.py プロジェクト: yongledang/kmodes
 def test_kmodes_predict_soybean_jaccard_dissim_label(self):
     """Huang init with jaccard_dissim_label: expected split on TEST_DATA_PREDICT, uint16 labels.

     NOTE(review): the model fitted on TEST_DATA is then passed through
     ``fit_predict`` (not ``predict``) on TEST_DATA_PREDICT -- confirm that
     this is intended.
     """
     model = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                    cat_dissim=jaccard_dissim_label, random_state=42)
     fitted = model.fit(TEST_DATA)
     labels = fitted.fit_predict(TEST_DATA_PREDICT)
     wanted = np.array([1, 0, 1, 2])
     assert_cluster_splits_equal(labels, wanted)
     labels_are_uint16 = labels.dtype == np.dtype(np.uint16)
     self.assertTrue(labels_are_uint16)
コード例 #26
0
def clusterCreationKmode():
    """Cluster random categorical data and return the centroids in an HttpResponse.

    The data is a 100 x 10 array drawn with ``np.random.choice(20, ...)``,
    clustered into 4 groups with Huang-initialised k-modes.
    """
    # random categorical data
    samples = np.random.choice(20, (100, 10))

    model = KModes(n_clusters=4, init='Huang', n_init=5, verbose=1)
    model.fit_predict(samples)

    return HttpResponse(model.cluster_centroids_)
コード例 #27
0
ファイル: test_kmodes.py プロジェクト: yongledang/kmodes
 def test_kmodes_huang_soybean_ng(self):
     """Huang init with ng_dissim on SOYBEAN (seed 42): expected split and uint16 labels."""
     model = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                    cat_dissim=ng_dissim, random_state=42)
     labels = model.fit_predict(SOYBEAN)
     wanted = np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
                        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
     assert_cluster_splits_equal(labels, wanted)
     labels_are_uint16 = labels.dtype == np.dtype(np.uint16)
     self.assertTrue(labels_are_uint16)
コード例 #28
0
ファイル: test_kmodes.py プロジェクト: xuhuan666/kmodes
 def test_kmodes_nunique_nclusters(self):
     """Six clusters requested on data with two distinct rows: two centroids are expected."""
     samples = np.array([[0, 1], [0, 1], [0, 1], [0, 2], [0, 2], [0, 2]])
     np.random.seed(42)
     model = KModes(n_clusters=6, init='Cao', verbose=2)
     labels = model.fit_predict(samples, categorical=[1])

     wanted_labels = np.array([0, 0, 0, 1, 1, 1])
     assert_cluster_splits_equal(labels, wanted_labels)

     wanted_centroids = np.array([[0, 1], [0, 2]])
     np.testing.assert_array_equal(model.cluster_centroids_,
                                   wanted_centroids)
コード例 #29
0
ファイル: test_kmodes.py プロジェクト: yongledang/kmodes
 def test_kmodes_huang_soybean_parallel(self):
     """Huang init on SOYBEAN with n_jobs=4 (seed 42): expected split and uint16 labels."""
     model = KModes(n_clusters=4, n_init=4, init='Huang', verbose=2,
                    random_state=42, n_jobs=4)
     labels = model.fit_predict(SOYBEAN)
     wanted = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2,
                        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
     assert_cluster_splits_equal(labels, wanted)
     labels_are_uint16 = labels.dtype == np.dtype(np.uint16)
     self.assertTrue(labels_are_uint16)
コード例 #30
0
ファイル: test_kmodes.py プロジェクト: ashishyadavppe/kmodes
 def test_kmodes_huang_soybean(self):
     """Huang init on SOYBEAN with the global NumPy seed set to 42: expected split, uint8 labels."""
     np.random.seed(42)
     model = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2)
     labels = model.fit_predict(SOYBEAN)
     wanted = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 1,
                        2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 2, 1])
     assert_cluster_splits_equal(labels, wanted)
     labels_are_uint8 = labels.dtype == np.dtype(np.uint8)
     self.assertTrue(labels_are_uint8)
コード例 #31
0
def kmodes_samping(df):
    """Sample ``df`` down to the rows whose PANDAID appears in a k-modes centroid.

    Clusters six job columns into 100 groups (Huang init, all CPUs) and keeps
    the rows of ``df`` whose ``PANDAID`` equals the first component of any
    centroid.
    """
    km = KModes(n_clusters=100, init='Huang', n_init=5, verbose=1, n_jobs=-1)

    feature_names = [
        'PANDAID', 'JOBSTATUS', 'COMPUTINGSITE', 'FINAL_STATUS', 'IS_SCOUT',
        'DURATION'
    ]
    km.fit_predict(df[feature_names].values)

    # first centroid component corresponds to the PANDAID column
    centroid_ids = [centroid[0] for centroid in km.cluster_centroids_]
    keep = df['PANDAID'].isin(centroid_ids)
    return df[keep]
コード例 #32
0
ファイル: test_kmodes.py プロジェクト: xuhuan666/kmodes
 def test_kmodes_huang_soybean(self):
     """Huang init on SOYBEAN with the global NumPy seed set to 42: expected split, uint8 labels."""
     np.random.seed(42)
     model = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2)
     labels = model.fit_predict(SOYBEAN)
     wanted = np.array([
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3,
         3, 3, 3, 3, 3, 3, 3, 3, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2,
         1, 2, 1
     ])
     assert_cluster_splits_equal(labels, wanted)
     labels_are_uint8 = labels.dtype == np.dtype(np.uint8)
     self.assertTrue(labels_are_uint8)
コード例 #33
0
ファイル: test_kmodes.py プロジェクト: yongledang/kmodes
 def test_kmodes_huang_soybean_jaccard_dissim_binary(self):
     """Huang init with jaccard_dissim_binary on binarised SOYBEAN: expected split, uint16 labels."""
     # binary encoded variables are required
     binary_soybean = SOYBEAN.astype(bool).astype(int)
     model = KModes(n_clusters=4, n_init=2, init='Huang', verbose=2,
                    cat_dissim=jaccard_dissim_binary, random_state=42)
     labels = model.fit_predict(binary_soybean)
     wanted = np.array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                        0, 3, 1, 1, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3,
                        3, 3, 1, 1, 3, 1, 3, 1, 1])
     assert_cluster_splits_equal(labels, wanted)
     labels_are_uint16 = labels.dtype == np.dtype(np.uint16)
     self.assertTrue(labels_are_uint16)
コード例 #34
0
ファイル: test_kmodes.py プロジェクト: ashishyadavppe/kmodes
 def test_kmodes_nunique_nclusters_ng(self):
     """Six clusters requested (ng_dissim) on data with two distinct rows: two centroids are expected."""
     samples = np.array([[0, 1], [0, 1], [0, 1], [0, 2], [0, 2], [0, 2]])
     np.random.seed(42)
     model = KModes(n_clusters=6, init='Cao', verbose=2, cat_dissim=ng_dissim)
     labels = model.fit_predict(samples, categorical=[1])

     wanted_labels = np.array([0, 0, 0, 1, 1, 1])
     assert_cluster_splits_equal(labels, wanted_labels)

     wanted_centroids = np.array([[0, 1], [0, 2]])
     np.testing.assert_array_equal(model.cluster_centroids_,
                                   wanted_centroids)