# Esempio n. 1 (Example no. 1) — scraped snippet header, kept as a comment
# 0
    def kproto(self):  # TODO- solve clustering issue with PCA + K-means
        """Cluster ``self.data`` with K-Prototypes.

        The number of clusters comes from the silhouette analysis; the
        resulting labels are appended to the frame as a ``labels`` column,
        cached on ``self.data_clustered`` and the frame is returned.
        """
        frame = self.data
        best_k = self.silouhette_analysis(frame, prototype=True)

        model = KPrototypes(n_clusters=best_k)
        model.fit(frame, categorical=self.categorical_features)
        frame['labels'] = model.predict(
            frame, categorical=self.categorical_features)

        self.data_clustered = frame
        return frame
    # NOTE(review): orphaned fragment — `kproto`, `cost` and `k` are not
    # defined in the visible scope and the enclosing `for k in ...` loop
    # header is missing; this looks pasted from a different snippet. Confirm.
    kproto.fit_predict(df_model, categorical=[0, 1, 2])
    cost[k] = kproto.cost_

# Elbow-plot visualisation of the per-k clustering cost
k_values = list(cost.keys())
k_costs = list(cost.values())
sns.pointplot(x=k_values, y=k_costs)
plt.show()

# Save the model with 5 clusters, the number chosen from the elbow plot
import pickle

kproto = KPrototypes(n_clusters=5, random_state=75)
kproto = kproto.fit(df_model, categorical=[0, 1, 2])
# BUGFIX: the original `pickle.dump(kproto, open(..., 'wb'))` never closed
# the file handle; a context manager guarantees the file is flushed and
# closed even if dump() raises.
with open('best_cluster.pkl', 'wb') as model_file:
    pickle.dump(kproto, model_file)

# Determine the segment of every customer
clusters = kproto.predict(df_model, categorical=[0, 1, 2])
message = 'segmen_pelanggan: {}\n'.format(clusters)
print(message)

# Join the original data with the customer segments
df_final = df.assign(cluster=clusters)
print(df_final.head())

# Show the customers belonging to each cluster
for segment in range(5):
    print('\nPelanggan cluster {}\n'.format(segment))
    in_segment = df_final['cluster'] == segment
    print(df_final[in_segment])

# Box-plot visualisation of the clustering result
# NOTE(review): the loop body appears truncated — only an empty figure is
# created per numeric column, no plot is drawn; `kolom_numerik` is defined
# outside this view. Confirm against the original snippet.
for i in kolom_numerik:
    plt.figure(figsize=(6, 4))
# Esempio n. 3 (Example no. 3) — scraped snippet header, kept as a comment
# 0
class KPrototypesClustering(baseoperationclass.BaseOperationClass):
    """Clustering operation based on K-Prototypes for mixed-type data.

    Numerical columns use euclidean distance and categorical columns use
    matching (Hamming) dissimilarity, balanced by ``categorical_weight``
    (gamma).  When the dataset contains no categorical columns the
    operation falls back to plain K-Means.
    """

    _operation_name = 'K-Prototypes Clustering'
    _operation_code_name = 'KPrototypes'
    _type_of_operation = 'cluster'

    def __init__(self):
        super().__init__()
        self.cluster_number = CLUSTER_NUMBER          # k, number of clusters
        self.categorical_weight = CATEGORICAL_WEIGHT  # gamma; None or <0 -> auto
        self.selected_features = []                   # [] means "use all columns"
        self.model = None                             # fitted estimator
        self.labels = None                            # labels from the last run
        self.centers = None                           # merged cluster centroids

    def _preprocessed_data(self, data):
        # Restrict the dataset to the configured feature subset, if any.
        return data if not self.selected_features \
            else data.loc[:, self.selected_features]

    def set_parameters(self,
                       cluster_number,
                       categorical_weight=None,
                       features=None):
        """Update clustering parameters; ``None`` keeps the current value."""
        if cluster_number is not None:
            self.cluster_number = cluster_number
        if categorical_weight is not None:
            self.categorical_weight = categorical_weight
        if features is not None and isinstance(features, (list, tuple)):
            self.selected_features = list(features)
        return True

    def get_parameters(self):
        """Return the current parameters under their serialized key names."""
        return {
            'cluster_number_KPrototypes': self.cluster_number,
            'categorical_data_weight_KPrototypes': self.categorical_weight,
            'features_KPrototypes': self.selected_features
        }

    def _get_initial_centers(self, dataset, categorical_indices):
        """K-Means++-style seeding adapted to mixed data.

        The first centroid is picked uniformly at random; every further
        centroid is sampled with probability proportional to the combined
        (numerical + gamma * categorical) distance to the nearest centroid
        already chosen.  Returns ``[numerical_part, categorical_part]`` as
        expected by ``KPrototypes(init=...)``.
        """
        dataset_cat = dataset.take(categorical_indices, axis=1).values
        categorical_labels = [
            column for index, column in enumerate(dataset.columns)
            if index in categorical_indices
        ]
        dataset_num = dataset.drop(categorical_labels, axis=1).values

        categorical_weight = self.categorical_weight
        if categorical_weight is None or categorical_weight < 0:
            # Same auto default as get_labels() documents: half the
            # standard deviation of the numerical data.
            categorical_weight = 0.5 * dataset_num.std()
        initial_centroids_num = np.zeros(
            (self.cluster_number, dataset_num.shape[1]))
        initial_centroids_cat = np.zeros(
            (self.cluster_number, dataset_cat.shape[1]))
        rand_index = randint(0, dataset.shape[0] - 1)
        initial_centroids_num[0] = dataset_num[rand_index]
        initial_centroids_cat[0] = dataset_cat[rand_index]

        for i in range(1, self.cluster_number):
            # Consistent float64 buffers for both distance components
            # (the original left the categorical buffer at the default).
            distances_num_cat = [
                np.zeros((i, dataset.shape[0]), dtype=np.float64),
                np.zeros((i, dataset.shape[0]), dtype=np.float64)
            ]
            for j in range(0, i):
                distances_num_cat[0][j] = dissimilarity_python.euclidean(
                    dataset_num, initial_centroids_num[j])
                distances_num_cat[1][j] = matching_dissim(
                    dataset_cat, initial_centroids_cat[j])
            distances = np.amin(distances_num_cat[0] +
                                categorical_weight * distances_num_cat[1],
                                axis=0)
            total = np.sum(distances)
            if total > 0:
                probabilities = distances / total
            else:
                # BUGFIX: every point coincides with an already-chosen
                # centroid (e.g. duplicated rows) — dividing by zero would
                # yield NaN probabilities; sample uniformly instead.
                probabilities = np.full(dataset.shape[0],
                                        1.0 / dataset.shape[0])
            chosen_point = np.random.choice(range(0, dataset.shape[0]),
                                            p=probabilities)
            initial_centroids_num[i] = dataset_num[chosen_point]
            initial_centroids_cat[i] = dataset_cat[chosen_point]

        initial_centroids = [initial_centroids_num, initial_centroids_cat]
        return initial_centroids

    # Used if there's no categorical properties in the dataset
    def _fallback_algorithm(self, dataset):
        from . import KMeansClustering
        self.model = KMeansClustering.KMeansClustering()
        self.model.set_parameters(self.cluster_number, self.selected_features)
        self.labels = self.model.get_labels(dataset)
        self.centers = self.model.centers
        return self.labels

    # By default, K-Prototypes uses euclidean distance for numerical data and Hamming distance for categorical data
    # n_init is the number of time the k-modes algorithm will be run with different centroid seeds
    # gamma is the weight to balance numerical data against categorical.
    # If None, it defaults to half of standard deviation for numerical data
    def get_labels(self, data, reprocess=False):
        """Fit (or reuse) the model and return one cluster label per row.

        When no categorical columns are detected the K-Means fallback is
        used instead.  With ``reprocess=True`` (or no model yet) a fresh
        model is fitted; otherwise the existing model predicts.
        """
        data_original = data
        data = self._preprocessed_data(data)

        categorical_indices = get_categorical_indices(data)
        if not categorical_indices:
            return self._fallback_algorithm(data_original)

        if self.model is None or reprocess:
            data = encode_nominal_parameters(data)
            data = normalized_dataset(data, categorical_indices)

            initial_centers = self._get_initial_centers(
                data, categorical_indices)
            self.model = KPrototypes(n_clusters=self.cluster_number,
                                     max_iter=1000,
                                     init=initial_centers,
                                     n_init=10,
                                     gamma=self.categorical_weight,
                                     num_dissim=dissimilarity_python.euclidean,
                                     n_jobs=1)
            data = data.values
            self.model.fit(data, categorical=categorical_indices)
            self.labels = self.model.predict(data,
                                             categorical=categorical_indices)
            # cluster_centroids_ is [numerical, categorical]; re-insert the
            # categorical columns at their original positions so callers see
            # one centroid matrix in input column order.
            self.centers = self.model.cluster_centroids_
            centers = self.centers[0]
            for index, cat_index in enumerate(categorical_indices):
                centers = np.insert(centers,
                                    cat_index,
                                    values=self.centers[1].transpose()[index],
                                    axis=1)
            self.centers = centers
        else:
            # BUGFIX: the reuse path must mirror the fitting path — the data
            # has to be encoded/normalized and KPrototypes.predict() needs
            # the categorical indices; the original called
            # self.model.predict(data) on the raw frame without them.
            data = encode_nominal_parameters(data)
            data = normalized_dataset(data, categorical_indices)
            self.labels = self.model.predict(data.values,
                                             categorical=categorical_indices)

        return self.labels

    # Legacy methods

    def print_parameters(self):
        return self.get_parameters()

    def save_parameters(self):
        return self.get_parameters()

    def load_parameters(self, parameters):
        """Restore parameters from a serialized dict, defaulting when absent."""
        self.set_parameters(
            cluster_number=parameters.get('cluster_number_KPrototypes')
            or CLUSTER_NUMBER,
            categorical_weight=parameters.get(
                'categorical_data_weight_KPrototypes') or CATEGORICAL_WEIGHT,
            features=parameters.get('features_KPrototypes') or [])
        return True

    def save_results(self):
        """Serialize labels, centers and the pickled model as a hex dump."""
        return {
            'results': self.labels.tolist(),
            'centers': self.centers.tolist(),
            'dump': pickle.dumps(self.model).hex()
        }

    def load_results(self, results_dict):
        """Inverse of save_results().

        NOTE(review): pickle.loads on externally supplied data is unsafe —
        only feed this dicts produced by a trusted save_results().
        """
        if results_dict.get("results") is not None:
            self.labels = np.array(results_dict['results'])
        if results_dict.get("centers") is not None:
            self.centers = np.array(results_dict['centers'])
        if results_dict.get("dump") is not None:
            self.model = pickle.loads(bytes.fromhex(results_dict['dump']))
        return True

    def process_data(self, data):
        return self.get_labels(data)

    def predict(self, data):
        return self.get_labels(data)