def get_knee_results(data, cluster_lims, cores, categorical):
    """Scan a range of cluster counts and locate the knee of the cost curve.

    For every ``k`` in ``range(*cluster_lims)`` a ``KPrototypes`` model
    (``init="cao"``, ``random_state=0``) is fitted on ``data[cols]`` and its
    ``cost_`` recorded. The costs are handed to ``KneeLocator`` as a convex,
    decreasing curve.

    Side effects: writes the detected knee to ``OUT_DIR / "n_clusters.txt"``
    and the per-``k`` costs to ``OUT_DIR / "knee_results.csv"`` (no header).
    ``cols`` and ``OUT_DIR`` are module-level names defined outside this
    function.

    Args:
        data: Frame-like object indexable by ``cols``.
        cluster_lims: Arguments unpacked into ``range(...)``.
        cores: Forwarded to ``KPrototypes`` as ``n_jobs``.
        categorical: Forwarded to ``KPrototypes.fit`` as ``categorical``.

    Returns:
        The value of ``KneeLocator(...).knee``.
    """
    candidate_ks = range(*cluster_lims)

    def _fit_cost(k):
        # One deterministic fit per candidate cluster count.
        model = KPrototypes(k, init="cao", random_state=0, n_jobs=cores)
        model.fit(data[cols], categorical=categorical)
        return model.cost_

    costs = [_fit_cost(k) for k in tqdm(candidate_ks)]

    locator = KneeLocator(
        candidate_ks,
        costs,
        curve_nature="convex",
        curve_direction="decreasing",
    )
    knee = locator.knee

    with open(OUT_DIR / "n_clusters.txt", "w") as out:
        out.write(str(knee))

    cost_by_k = pd.Series(data=costs, index=candidate_ks)
    cost_by_k.to_csv(OUT_DIR / "knee_results.csv", header=False)
    return knee
def k_prototypes_fitness(self, individual):
    """Evaluate one weight vector by clustering with K-Prototypes.

    The weight vector is stored on ``self.individual`` (with a leading ``1``
    prepended when ``self.add_target`` is truthy). Results are memoised in
    ``self.results`` as ``weights + [fitness]`` rows; a lookup that matches
    to two decimals returns the stored fitness without re-clustering.

    Args:
        individual: Sequence of numeric weights (must support
            ``[1] + individual`` when ``self.add_target`` is set).

    Returns:
        A one-element tuple ``(fitness,)``.
    """
    self.individual = individual
    if self.add_target:
        self.individual = [1] + self.individual

    # Check whether this vector was already evaluated (to the 2nd decimal).
    # The stored rows hold the *effective* weights (including the prepended
    # 1 when add_target is set), so the lookup key must be built from
    # self.individual, not from the raw argument; otherwise the lengths
    # differ and the cache can never hit in add_target mode.
    rounded_current = [round(float(w), 2) for w in self.individual]
    for cached in self.results:
        rounded_cached = [round(float(w), 2) for w in cached[:-1]]
        if rounded_cached == rounded_current:
            print('já calculado')
            return float(cached[-1]),

    # Copy only after a cache miss so cache hits do no data work.
    df_cluster = self.X.copy()

    # Apply the weights to the numerical columns.
    for i in self.numerical_index:
        df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]

    random.seed(10)
    kproto = KPrototypes(n_clusters=self.cluster_param,
                         cat_dissim=self.matching_dissim_weighted,
                         max_iter=5, verbose=0, gamma=1, n_init=1,
                         init='random', random_state=10)
    kproto.fit(df_cluster.values, categorical=self.categorical_index)

    ftnss = self.calculate_fitness(kproto.labels_, kproto)
    self.save_scoring(self.individual, ftnss, kproto)
    self.results.append(self.individual + [ftnss])
    return ftnss,
def get_labels(data, n_clusters, cores, categorical):
    """Fit a K-Prototypes model on ``data[cols]`` and return its labels.

    The final cost of the fit is printed. ``cols`` is a module-level name
    defined outside this function.

    Args:
        data: Frame-like object indexable by ``cols``.
        n_clusters: Number of clusters to fit.
        cores: Forwarded to ``KPrototypes`` as ``n_jobs``.
        categorical: Forwarded to ``KPrototypes.fit`` as ``categorical``.

    Returns:
        The fitted model's ``labels_``.
    """
    # NOTE(review): the kmodes documentation lists 'Huang', 'Cao', 'random'
    # or a list of arrays for `init`; "matching" is not among them. Confirm
    # this value is accepted by the installed KPrototypes implementation.
    settings = dict(init="matching", n_init=50, random_state=0, n_jobs=cores)
    model = KPrototypes(n_clusters, **settings)

    features = data[cols]
    model.fit(features, categorical=categorical)

    print(model.cost_)
    return model.labels_
def kproto(self):
    """Cluster ``self.data`` with K-Prototypes and attach the labels.

    The cluster count comes from
    ``self.silouhette_analysis(..., prototype=True)``. The predicted labels
    are written to a ``'labels'`` column. The same object is stored as
    ``self.data_clustered`` and returned.

    NOTE: no copy of ``self.data`` is made, so the ``'labels'`` column is
    added to ``self.data`` itself.
    """
    # TODO: solve clustering issue with PCA + K-means
    frame = self.data
    n_clusters = self.silouhette_analysis(frame, prototype=True)

    model = KPrototypes(n_clusters=n_clusters)
    model.fit(frame, categorical=self.categorical_features)
    frame['labels'] = model.predict(frame, categorical=self.categorical_features)

    self.data_clustered = frame
    return frame
def return_best_cluster(self, df_cluster, cluster_param):
    """Fit, persist and return the final clustering model.

    For ``self.cluster_method == 'kprototypes'`` the numerical columns of
    ``df_cluster`` are scaled in place by the weights in ``self.individual``.
    If ``<self.folder>cluster_init.json`` exists, one model is fitted per
    stored initialisation and the one with the smallest summed per-cluster
    target range (max - min of ``self.target``) is kept. Otherwise a single
    model with ``init='Cao'`` is fitted.

    For ``'hdbscan'`` an ``HDBSCAN`` clusterer is fitted with
    ``min_cluster_size=cluster_param``.

    In both cases the model is dumped to ``<self.folder>best_model.joblib``
    and its labels are written to ``self.df['cluster']``.

    Args:
        df_cluster: DataFrame to cluster. NOTE: modified in place for the
            k-prototypes method (weights are applied to numerical columns).
        cluster_param: Number of clusters (k-prototypes) or minimum cluster
            size (hdbscan). When ``None``, the k-prototypes method falls
            back to ``self.cluster_param``.

    Returns:
        The fitted model, or ``None`` for an unrecognised cluster method.

    Raises:
        ValueError: If ``cluster_init.json`` exists but holds no
            initialisations.
    """
    if self.cluster_method == 'kprototypes':
        # Apply the weights to the numerical columns.
        for i in self.numerical_index:
            df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]

        # Use the argument consistently in both branches; the original used
        # self.cluster_param only when no init file was present.
        n_clusters = cluster_param if cluster_param is not None else self.cluster_param

        init_path = self.folder + 'cluster_init.json'
        if os.path.exists(init_path):
            with open(init_path) as f:
                cluster_init = json.load(f)
            if not cluster_init:
                raise ValueError(
                    'cluster_init.json contains no initialisations')

            ftnss = 100000
            winner_model = None
            for init in cluster_init:
                init = [np.array(init[0]), np.array(init[1])]
                kproto = KPrototypes(n_clusters=n_clusters,
                                     cat_dissim=self.matching_dissim_weighted,
                                     max_iter=5, verbose=1, gamma=1,
                                     n_init=1, init=init)
                kproto.fit(df_cluster.values,
                           categorical=self.categorical_index)

                x = pd.DataFrame([])
                x['cluster'] = kproto.labels_
                x['target'] = self.target
                by_cluster = x.groupby(['cluster'])['target']
                curr_ftnss = (by_cluster.max() - by_cluster.min()).values.sum()

                print(ftnss)
                print(curr_ftnss < ftnss)
                # Only replace the winner when this fit is strictly better.
                # The original reassigned the winner on every iteration, so
                # the last initialisation always won.
                if curr_ftnss < ftnss:
                    ftnss = curr_ftnss
                    winner_model = kproto

            if winner_model is None:
                # No candidate beat the initial threshold: keep the last
                # fitted model, which is what the original code returned.
                winner_model = kproto
        else:
            kproto = KPrototypes(n_clusters=n_clusters,
                                 cat_dissim=self.matching_dissim_weighted,
                                 max_iter=5, verbose=1, gamma=1,
                                 n_init=1, init='Cao')
            kproto.fit(df_cluster.values, categorical=self.categorical_index)
            # Result is not used here; the call is kept in case
            # calculate_fitness has side effects.
            self.calculate_fitness(kproto.labels_)
            winner_model = kproto

        dump(winner_model, self.folder + 'best_model.joblib')
        self.df['cluster'] = winner_model.labels_
        return winner_model

    elif self.cluster_method == 'hdbscan':
        clusterer = hdb.HDBSCAN(min_cluster_size=cluster_param,
                                prediction_data=True)
        clusterer.fit(df_cluster)
        dump(clusterer, self.folder + 'best_model.joblib')
        self.df['cluster'] = clusterer.labels_
        return clusterer
def making_model(self):
    """Fit a 5-cluster K-Prototypes model and persist it.

    The model is fitted on ``self.df_model`` with columns 0, 1 and 2 treated
    as categorical (``random_state=75``), pickled to ``cluster.pkl`` in the
    current working directory, and stored on ``self.kproto``.
    """
    kproto = KPrototypes(n_clusters=5, random_state=75)
    kproto = kproto.fit(self.df_model, categorical=[0, 1, 2])

    # Save the model; the context manager guarantees the file is flushed and
    # closed even if pickling fails.
    with open('cluster.pkl', 'wb') as model_file:
        pickle.dump(kproto, model_file)

    self.kproto = kproto
def get_clusters(self, df, var_list, k_values, map_sa_districts, path_out,
                 cat_list=None):
    """Run K-Prototypes for each ``k`` and plot every clustering.

    For each value in ``k_values`` a model is fitted on ``df[var_list]``.
    Its labels are written to ``df['KPrototype cluster labels']``,
    overwriting the previous ``k``. ``self.plot_clusters`` is then called.

    Args:
        df: DataFrame holding the variables; modified in place.
        var_list: Column names used for clustering.
        k_values: Iterable of cluster counts to try.
        map_sa_districts: Forwarded unchanged to ``self.plot_clusters``.
        path_out: Forwarded unchanged to ``self.plot_clusters``.
        cat_list: Passed to ``fit`` as ``categorical``. Defaults to an empty
            list.

    Returns:
        ``df``, carrying the labels of the last ``k`` processed (unchanged
        if ``k_values`` is empty).
    """
    # Avoid a shared mutable default argument.
    if cat_list is None:
        cat_list = []

    for k in k_values:
        model = KPro(n_clusters=k)
        fitted = model.fit(X=df[var_list], categorical=cat_list)
        df['KPrototype cluster labels'] = fitted.labels_

        self.plot_clusters(k, df, var_list, map_sa_districts, path_out)

    return df
def ClusterCreation(request, *args):
    """Cluster the training data with K-Prototypes and write a CSV report.

    Rows come from ``get_training_data()``. The first column holds the ids
    and is excluded from clustering; the remaining columns are the features.
    The fitted model is stored in the module-level global ``kproto`` and
    passed to ``clusterStatisticsCSV``.

    Args:
        request: Incoming request object (not used by this function).
        *args: Optional extra positional arguments. If any of them is not
            ``None`` the function returns ``None`` instead of a response.

    Returns:
        ``None`` when any extra argument is not ``None``; otherwise an
        ``HttpResponse`` confirming that the clustering ran.
    """
    global kproto

    # Get data from the database as a 2-D array.
    rows_array = np.array(get_training_data())

    # Keep only the variables that can be clustered (drop the id column).
    data_array = rows_array[:, 1:]

    # Clustering: feature columns 1-4 (after dropping the id) are categorical.
    kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2)
    kproto.fit(data_array, categorical=[1, 2, 3, 4])

    # Create CSV with cluster statistics.
    clusterStatisticsCSV(kproto)

    if any(argument is not None for argument in args):
        return None
    return HttpResponse('Clustering realizado y CSV report generado')
labels = ms.labels_
cluster_centers = ms.cluster_centers_
df["Labels"] = ms.predict(df_cust_num_norm)
# Summarise by the column written on the previous line. The original grouped
# by "labels" (lower case), which is a different column from "Labels" and so
# never reflected the mean-shift prediction stored above.
cols = customer_related_num + ["Labels"]
cc_mshift = df[cols].groupby("Labels").mean()
sizes = df["Labels"].value_counts()

######## Categorical ###########
### 1. Approach: K-Prototype with categorical and numerical Features
scaler = StandardScaler()
cust_norm = scaler.fit_transform(df[customer_related_num])
# Reuse df's index so the index-based join below pairs every scaled row with
# its own categorical values even when df does not have a default RangeIndex.
df_num_norm = pd.DataFrame(cust_norm, columns=customer_related_num,
                           index=df.index)
df_cust_norm = df_num_norm.join(df[customer_related_cat])

# create_elbowgraph(10, df_cust_norm, "kproto", [4,5,6,7,8] )
kproto = KPrototypes(n_clusters=3, init='random', random_state=1)
model = kproto.fit(df_cust_norm, categorical=[4, 5, 6, 7, 8, 9])

# Inverse normalization for interpretation.
# NOTE(review): this indexes cluster_centroids_ as a [numerical, categorical]
# pair; confirm the installed kmodes version exposes it in that form.
cc_kproto_num = pd.DataFrame(
    scaler.inverse_transform(X=model.cluster_centroids_[0]))
cc_kproto = pd.concat(
    [cc_kproto_num, pd.DataFrame(model.cluster_centroids_[1])], axis=1)
cc_kproto.columns = customer_related

###### 2. Approach: Categorical Kmodes ########
kmodes = KModes(n_clusters=4)
temp_kmodes = kmodes.fit_predict(df[customer_related_cat])
kmcc = pd.DataFrame(kmodes.cluster_centroids_, columns=customer_related_cat)
df["cat_cluster"] = temp_kmodes
########################################################################################################################################################################
import pickle

# Iterate over candidate cluster counts and record the cost of each fit.
cost = {}
for k in range(2, 10):
    kproto = KPrototypes(n_clusters=k, random_state=75)
    # Only the fitted cost is needed here, so fit() is enough; the labels
    # that fit_predict() additionally computed were discarded.
    kproto.fit(df_model, categorical=[0, 1, 2])
    cost[k] = kproto.cost_

# Visualise the elbow plot.
sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
plt.show()

# Save the model with 5 clusters, the number chosen from the elbow plot.
kproto = KPrototypes(n_clusters=5, random_state=75)
kproto = kproto.fit(df_model, categorical=[0, 1, 2])
# Use a context manager so the pickle file is flushed and closed.
with open('best_cluster.pkl', 'wb') as model_file:
    pickle.dump(kproto, model_file)

# Determine the segment of each customer.
clusters = kproto.predict(df_model, categorical=[0, 1, 2])
print('segmen_pelanggan: {}\n'.format(clusters))

# Combine the original data with the customer segments.
df_final = df.copy()
df_final['cluster'] = clusters
print(df_final.head())

# Show the customer rows belonging to each cluster.
for i in range(0, 5):
    print('\nPelanggan cluster {}\n'.format(i))
    print(df_final[df_final['cluster'] == i])
class KPrototypesClustering(baseoperationclass.BaseOperationClass):
    """Clustering operation based on K-Prototypes for mixed-type data.

    When the (feature-filtered) dataset has no categorical columns the
    operation delegates to ``KMeansClustering`` instead.

    Attributes set by this class: ``cluster_number``, ``categorical_weight``,
    ``selected_features``, ``model``, ``labels`` and ``centers``.
    """

    _operation_name = 'K-Prototypes Clustering'
    _operation_code_name = 'KPrototypes'
    _type_of_operation = 'cluster'

    def __init__(self):
        super().__init__()
        self.cluster_number = CLUSTER_NUMBER
        self.categorical_weight = CATEGORICAL_WEIGHT
        self.selected_features = []
        self.model = None
        self.labels = None
        self.centers = None

    def _preprocessed_data(self, data):
        """Return ``data`` restricted to the selected features, if any."""
        if not self.selected_features:
            return data
        return data.loc[:, self.selected_features]

    def _valid_categorical_weight(self):
        """Return the configured weight, or ``None`` if unset or negative.

        ``None`` means "derive the weight from the numerical data".
        """
        weight = self.categorical_weight
        if weight is None or weight < 0:
            return None
        return weight

    def set_parameters(self, cluster_number, categorical_weight=None,
                       features=None):
        """Update the parameters; arguments left as ``None`` are ignored.

        ``features`` is only applied when it is a list or tuple.
        Always returns ``True``.
        """
        if cluster_number is not None:
            self.cluster_number = cluster_number
        if categorical_weight is not None:
            self.categorical_weight = categorical_weight
        if features is not None and isinstance(features, (list, tuple)):
            self.selected_features = list(features)
        return True

    def get_parameters(self):
        """Return the current parameters as a dict."""
        return {
            'cluster_number_KPrototypes': self.cluster_number,
            'categorical_data_weight_KPrototypes': self.categorical_weight,
            'features_KPrototypes': self.selected_features
        }

    def _get_initial_centers(self, dataset, categorical_indices):
        """Pick ``cluster_number`` rows of ``dataset`` as initial centroids.

        The first centroid is a uniformly random row. Each further centroid
        is drawn with probability proportional to the row's distance to its
        nearest already-chosen centroid, where distance is
        ``euclidean(numerical) + weight * matching_dissim(categorical)``.

        Args:
            dataset: DataFrame with numerical and categorical columns.
            categorical_indices: Positional indices of the categorical
                columns.

        Returns:
            ``[numerical_centroids, categorical_centroids]`` as two arrays
            with one row per cluster.
        """
        dataset_cat = dataset.take(categorical_indices, axis=1).values
        categorical_labels = [
            column for index, column in enumerate(dataset.columns)
            if index in categorical_indices
        ]
        dataset_num = dataset.drop(categorical_labels, axis=1).values

        categorical_weight = self._valid_categorical_weight()
        if categorical_weight is None:
            categorical_weight = 0.5 * dataset_num.std()

        n_rows = dataset.shape[0]
        initial_centroids_num = np.zeros(
            (self.cluster_number, dataset_num.shape[1]))
        initial_centroids_cat = np.zeros(
            (self.cluster_number, dataset_cat.shape[1]))

        rand_index = randint(0, n_rows - 1)
        initial_centroids_num[0] = dataset_num[rand_index]
        initial_centroids_cat[0] = dataset_cat[rand_index]

        # Distance of every row to its nearest chosen centroid. Earlier
        # centroids never change, so a running minimum gives the same values
        # as recomputing all distances on every round.
        min_distances = None
        for i in range(1, self.cluster_number):
            newest = i - 1
            to_newest = (
                np.asarray(
                    dissimilarity_python.euclidean(
                        dataset_num, initial_centroids_num[newest]),
                    dtype=np.float64)
                + categorical_weight * np.asarray(
                    matching_dissim(
                        dataset_cat, initial_centroids_cat[newest]),
                    dtype=np.float64)
            )
            if min_distances is None:
                min_distances = to_newest
            else:
                min_distances = np.minimum(min_distances, to_newest)

            total = np.sum(min_distances)
            if total == 0:
                # Every row coincides with a chosen centroid; 0/0 would give
                # NaN probabilities and make the draw fail, so draw
                # uniformly instead.
                probabilities = None
            else:
                probabilities = min_distances / total
            chosen_point = np.random.choice(n_rows, p=probabilities)

            initial_centroids_num[i] = dataset_num[chosen_point]
            initial_centroids_cat[i] = dataset_cat[chosen_point]

        return [initial_centroids_num, initial_centroids_cat]

    # Used if there's no categorical properties in the dataset
    def _fallback_algorithm(self, dataset):
        """Cluster ``dataset`` with ``KMeansClustering`` and mirror its results."""
        from . import KMeansClustering
        self.model = KMeansClustering.KMeansClustering()
        self.model.set_parameters(self.cluster_number, self.selected_features)
        self.labels = self.model.get_labels(dataset)
        self.centers = self.model.centers
        return self.labels

    # By default, K-Prototypes uses euclidean distance for numerical data and
    # Hamming distance for categorical data.
    # n_init is the number of time the k-modes algorithm will be run with
    # different centroid seeds.
    # gamma is the weight to balance numerical data against categorical.
    # If None, it defaults to half of standard deviation for numerical data.
    def get_labels(self, data, reprocess=False):
        """Return cluster labels for ``data``, fitting a model if needed.

        A new model is fitted when none exists yet or ``reprocess`` is true;
        otherwise the existing model is used to predict.

        Args:
            data: DataFrame to cluster.
            reprocess: Force a refit even if a model already exists.

        Returns:
            The labels, also stored on ``self.labels``.
        """
        data_original = data
        data = self._preprocessed_data(data)
        categorical_indices = get_categorical_indices(data)
        if not categorical_indices:
            return self._fallback_algorithm(data_original)

        if self.model is None or reprocess:
            data = encode_nominal_parameters(data)
            data = normalized_dataset(data, categorical_indices)
            initial_centers = self._get_initial_centers(
                data, categorical_indices)
            self.model = KPrototypes(
                n_clusters=self.cluster_number,
                max_iter=1000,
                init=initial_centers,
                n_init=10,
                # A negative weight means "auto", exactly as in
                # _get_initial_centers; pass None so the library derives it
                # instead of clustering with a negative gamma.
                gamma=self._valid_categorical_weight(),
                num_dissim=dissimilarity_python.euclidean,
                n_jobs=1)
            values = data.values
            self.model.fit(values, categorical=categorical_indices)
            self.labels = self.model.predict(
                values, categorical=categorical_indices)

            # Merge the numerical and categorical centroids back into the
            # original column order.
            raw_centers = self.model.cluster_centroids_
            centers = raw_centers[0]
            categorical_columns = raw_centers[1].transpose()
            for index, cat_index in enumerate(categorical_indices):
                centers = np.insert(centers, cat_index,
                                    values=categorical_columns[index],
                                    axis=1)
            self.centers = centers
        elif isinstance(self.model, KPrototypes):
            # Apply the same preprocessing as at fit time and tell the model
            # which columns are categorical; KPrototypes.predict cannot split
            # the columns without it.
            # NOTE(review): assumes encode_nominal_parameters and
            # normalized_dataset give encodings/scales compatible with those
            # used at fit time - confirm.
            data = encode_nominal_parameters(data)
            data = normalized_dataset(data, categorical_indices)
            self.labels = self.model.predict(
                data.values, categorical=categorical_indices)
        else:
            # Model of another type (e.g. left by the fallback algorithm):
            # keep the original call.
            self.labels = self.model.predict(data)
        return self.labels

    # Legacy methods
    def print_parameters(self):
        """Alias of :meth:`get_parameters`."""
        return self.get_parameters()

    def save_parameters(self):
        """Alias of :meth:`get_parameters`."""
        return self.get_parameters()

    def load_parameters(self, parameters):
        """Restore parameters from a dict; missing/falsy values use defaults."""
        self.set_parameters(
            cluster_number=parameters.get('cluster_number_KPrototypes')
            or CLUSTER_NUMBER,
            categorical_weight=parameters.get(
                'categorical_data_weight_KPrototypes') or CATEGORICAL_WEIGHT,
            features=parameters.get('features_KPrototypes') or [])
        return True

    def save_results(self):
        """Return labels, centers and the hex-encoded pickled model."""
        return {
            'results': self.labels.tolist(),
            'centers': self.centers.tolist(),
            'dump': pickle.dumps(self.model).hex()
        }

    def load_results(self, results_dict):
        """Restore labels, centers and model from a ``save_results`` dict."""
        if results_dict.get("results") is not None:
            self.labels = np.array(results_dict['results'])
        if results_dict.get("centers") is not None:
            self.centers = np.array(results_dict['centers'])
        if results_dict.get("dump") is not None:
            # SECURITY: unpickling executes arbitrary code. Only load dumps
            # that were produced by save_results() and stored in a trusted
            # location.
            self.model = pickle.loads(bytes.fromhex(results_dict['dump']))
        return True

    def process_data(self, data):
        """Alias of :meth:`get_labels` with default arguments."""
        return self.get_labels(data)

    def predict(self, data):
        """Alias of :meth:`get_labels` with default arguments."""
        return self.get_labels(data)