Example no. 1
def get_knee_results(data, cluster_lims, cores, categorical):

    knee_results = []
    cluster_range = range(*cluster_lims)
    for n_clusters in tqdm(cluster_range):

        kp = KPrototypes(n_clusters, init="cao", random_state=0, n_jobs=cores)
        kp.fit(data[cols], categorical=categorical)  # cols and OUT_DIR are module-level globals in the source project

        knee_results.append(kp.cost_)

    kl = KneeLocator(
        cluster_range,
        knee_results,
        curve="convex",
        direction="decreasing",
    )

    n_clusters = kl.knee

    with open(OUT_DIR / "n_clusters.txt", "w") as f:
        f.write(str(n_clusters))

    knee_results = pd.Series(index=cluster_range, data=knee_results)
    knee_results.to_csv(OUT_DIR / "knee_results.csv", header=False)

    return n_clusters
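This snippet relies on module-level names (cols, OUT_DIR) and third-party imports that the excerpt omits. A minimal sketch of the surrounding setup, with illustrative stand-ins for the column list, output directory, and data (none of these values come from the original project):

from pathlib import Path

import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
from kneed import KneeLocator
from tqdm import tqdm

OUT_DIR = Path("output")            # hypothetical output directory
OUT_DIR.mkdir(exist_ok=True)
cols = ["income", "age", "region"]  # hypothetical feature columns

rng = np.random.default_rng(0)
data = pd.DataFrame({
    "income": rng.normal(50_000, 10_000, 200),
    "age": rng.integers(18, 80, 200),
    "region": rng.choice(["north", "south"], 200),
})

# categorical indexes into data[cols]; "region" sits at position 2 here
n_clusters = get_knee_results(data, cluster_lims=(2, 8), cores=1, categorical=[2])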
Example no. 2
File: gco.py Project: acardosoj/ml
    def k_prototypes_fitness(self, individual):
        self.individual = individual
        df_cluster = self.X.copy()
        if self.add_target:
            self.individual = [1] + self.individual

        # Check whether this individual was already evaluated, up to the 2nd decimal
        ind_curr = [round(float(y), 2) for y in individual]
        for x in self.results:
            ind_test_norm = [round(float(y), 2) for y in x[:-1]]
            if ind_test_norm == ind_curr:
                print('already computed')
                return float(x[-1]),

        # Apply the individual's weights to the numerical features
        for i in self.numerical_index:
            df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]
        random.seed(10)
        kproto = KPrototypes(n_clusters=self.cluster_param,
                             cat_dissim=self.matching_dissim_weighted,
                             # num_dissim=euclidean_dissim_weighted,
                             max_iter=5, verbose=0, gamma=1, n_init=1,
                             init='random', random_state=10)
        kproto.fit(df_cluster.values, categorical=self.categorical_index)
        ftnss = self.calculate_fitness(kproto.labels_, kproto)

        self.save_scoring(self.individual, ftnss, kproto)
        self.results.append(self.individual + [ftnss])

        return ftnss,
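The trailing commas in the return statements above are deliberate: each returns a one-element tuple, the fitness format expected by evolutionary-algorithm frameworks such as DEAP (an assumption; the framework itself is not shown in this excerpt). A minimal illustration:

# The trailing comma makes this a 1-tuple, so single- and
# multi-objective fitness functions share the same shape.
def fitness(individual):
    score = sum(individual)  # placeholder objective
    return score,            # returns (score,), not score

assert fitness([1, 2, 3]) == (6,)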
Example no. 3
def get_labels(data, n_clusters, cores, categorical):

    kp = KPrototypes(n_clusters,
                     init="Huang",  # original had init="matching"; kmodes supports 'Huang', 'Cao' and 'random'
                     n_init=50,
                     random_state=0,
                     n_jobs=cores)
    kp.fit(data[cols], categorical=categorical)  # cols is a module-level list of feature columns
    print(kp.cost_)

    return kp.labels_
Example no. 4
    def kproto(self):  # TODO- solve clustering issue with PCA + K-means
        cluster_data = self.data
        opt_k = self.silouhette_analysis(cluster_data, prototype=True)

        kp = KPrototypes(n_clusters=opt_k)
        kp.fit(cluster_data, categorical=self.categorical_features)
        # on the training data this equals kp.labels_
        labels = kp.predict(cluster_data,
                            categorical=self.categorical_features)

        cluster_data['labels'] = labels
        self.data_clustered = cluster_data

        return cluster_data
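silouhette_analysis is project-specific and not shown. For mixed numerical/categorical data, one common substitute is silhouette analysis over a precomputed Gower distance matrix; the sketch below assumes the third-party gower package and is not the project's actual method:

import gower
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score

def pick_k(df, categorical_idx, k_range=range(2, 8)):
    # Gower distance handles mixed numerical/categorical columns
    dist = gower.gower_matrix(df)
    best_k, best_score = None, -1.0
    for k in k_range:
        kp = KPrototypes(n_clusters=k, random_state=0)
        labels = kp.fit_predict(df.values, categorical=categorical_idx)
        score = silhouette_score(dist, labels, metric="precomputed")
        if score > best_score:
            best_k, best_score = k, score
    return best_k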
Example no. 5
File: gco.py Project: acardosoj/ml
    def return_best_cluster(self, df_cluster, cluster_param):
        if self.cluster_method == 'kprototypes':

            # Apply the individual's weights to the numerical features
            for i in self.numerical_index:
                df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]

            if os.path.exists(self.folder + 'cluster_init.json'):
                with open(self.folder + 'cluster_init.json') as f:
                    cluster_init = json.load(f)
                ftnss = 100000
                winner_model = None
                for init in cluster_init:
                    # each entry is a [numerical, categorical] centroid pair
                    init = [np.array(init[0]), np.array(init[1])]
                    kproto = KPrototypes(n_clusters=cluster_param,
                                         cat_dissim=self.matching_dissim_weighted,
                                         # num_dissim=euclidean_dissim_weighted,
                                         max_iter=5, verbose=1, gamma=1,
                                         n_init=1, init=init)
                    kproto.fit(df_cluster.values, categorical=self.categorical_index)

                    # fitness: sum over clusters of the target's max-min spread
                    x = pd.DataFrame([])
                    x['cluster'] = kproto.labels_
                    x['target'] = self.target
                    df_grouped = (x.groupby(['cluster'])['target'].max()
                                  - x.groupby(['cluster'])['target'].min())
                    curr_ftnss = df_grouped.values.sum()

                    # keep the initialization with the lowest fitness
                    if curr_ftnss < ftnss:
                        ftnss = curr_ftnss
                        winner_model = kproto
            else:
                kproto = KPrototypes(n_clusters=cluster_param,
                                     cat_dissim=self.matching_dissim_weighted,
                                     # num_dissim=euclidean_dissim_weighted,
                                     max_iter=5, verbose=1, gamma=1,
                                     n_init=1, init='Cao')
                kproto.fit(df_cluster.values, categorical=self.categorical_index)
                curr_ftnss = self.calculate_fitness(kproto.labels_)
                winner_model = kproto

            dump(winner_model, self.folder + 'best_model.joblib')
            self.df['cluster'] = winner_model.labels_
            return winner_model

        elif self.cluster_method == 'hdbscan':
            clusterer = hdb.HDBSCAN(min_cluster_size=cluster_param, prediction_data=True)
            clusterer.fit(df_cluster)
            dump(clusterer, self.folder + 'best_model.joblib')
            self.df['cluster'] = clusterer.labels_
            return clusterer
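The first branch above reads candidate initializations from cluster_init.json and rebuilds each as a [numerical, categorical] pair of centroid arrays. A hedged sketch of what such a file might contain (the values are illustrative, not taken from the project):

import json
import numpy as np

# Each entry is one candidate initialization: numerical centroids and
# categorical centroids, one row per cluster.
cluster_init = [
    [
        [[0.1, 2.3], [1.5, 0.7]],  # numerical part, shape (n_clusters, n_numerical)
        [[0, 1], [1, 0]],          # categorical part, shape (n_clusters, n_categorical)
    ],
]

with open('cluster_init.json', 'w') as f:
    json.dump(cluster_init, f)

# Round-trip the way the method consumes it:
with open('cluster_init.json') as f:
    loaded = json.load(f)
init = [np.array(loaded[0][0]), np.array(loaded[0][1])]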
Example no. 6
    def making_model(self):
        kproto = KPrototypes(n_clusters=5, random_state=75)
        kproto = kproto.fit(self.df_model, categorical=[0, 1, 2])

        # Save the fitted model
        with open('cluster.pkl', 'wb') as f:
            pickle.dump(kproto, f)

        self.kproto = kproto
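To reuse the pickled model later, a minimal loading sketch (the rows below are illustrative; they only need the same column order the model was fit on, with the first three columns categorical):

import pickle

import numpy as np

with open('cluster.pkl', 'rb') as f:
    kproto = pickle.load(f)

# Illustrative rows: three categorical columns first, numerical after
new_data = np.array([['a', 'x', 'low', 1.2, 30],
                     ['b', 'y', 'high', 3.4, 55]], dtype=object)
labels = kproto.predict(new_data, categorical=[0, 1, 2])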
Example no. 7
    def get_clusters(self,
                     df,
                     var_list,
                     k_values,
                     map_sa_districts,
                     path_out,
                     cat_list=None):
        # avoid a mutable default argument
        cat_list = cat_list if cat_list is not None else []

        for k in k_values:
            # k-prototypes for each candidate k
            KPro_model = KPro(n_clusters=k)
            # df_geo.loc[:, columns4] = preprocessing.normalize(df_geo.loc[:, columns4].values)
            KPro_fit = KPro_model.fit(X=df[var_list], categorical=cat_list)
            # overwritten on every iteration, so df keeps the labels of the last k
            df['KPrototype cluster labels'] = KPro_fit.labels_

            # plot
            self.plot_clusters(k, df, var_list, map_sa_districts, path_out)

        return df
Example no. 8
def ClusterCreation(request,*args):
    global kproto

    #Example of clustering with random data
    '''
    # random categorical data
    data = np.array([
            [0,'a',4],
            [1,'e',3],
            [6,'ffed',15],
            [5,'fdfd',16]
            ])

    kproto = KPrototypes(n_clusters=2, init='Cao', verbose=2)
    clusters = kproto.fit(data, categorical=[1])
    # Create CSV with cluster statistics
    clusterStatisticsCSV(kproto)
    for argument in args:
        if argument is not None:
            return
    '''
    # Get data from the database
    rows = get_training_data()
    # Cast to a numpy array
    rows_array = np.array(rows)
    # Split the data into variables and IDs
    data_array = rows_array[:, 1:]  # keep only the variables that can cluster the customer
    ids_array = rows_array[:, 0]    # store the IDs in a separate array

    # Clustering
    kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2)
    # clusters = kproto.fit(data_array, categorical=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])
    clusters = kproto.fit(data_array, categorical=[1, 2, 3, 4])
    # Create CSV with cluster statistics
    clusterStatisticsCSV(kproto)
    for argument in args:
        if argument is not None:
            return

    return HttpResponse('Clustering completed and CSV report generated')
Example no. 9
labels = ms.labels_
cluster_centers = ms.cluster_centers_
df["labels"] = ms.predict(df_cust_num_norm)
cols = customer_related_num + ["labels"]
cc_mshift = df[cols].groupby("labels").mean()
sizes = df["labels"].value_counts()

######## Categorical ###########
### 1. Approach: K-Prototype with categorical and numerical Features
scaler = StandardScaler()
cust_norm = scaler.fit_transform(df[customer_related_num])
df_num_norm = pd.DataFrame(cust_norm, columns=customer_related_num)
df_cust_norm = df_num_norm.join(df[customer_related_cat])
# create_elbowgraph(10, df_cust_norm, "kproto", [4,5,6,7,8] )
kproto = KPrototypes(n_clusters=3, init='random', random_state=1)
model = kproto.fit(df_cust_norm, categorical=[4, 5, 6, 7, 8, 9])
# Inverse Normalization for Interpretation
cc_kproto_num = pd.DataFrame(
    scaler.inverse_transform(X=model.cluster_centroids_[0]))
cc_kproto = pd.concat(
    [cc_kproto_num, pd.DataFrame(model.cluster_centroids_[1])], axis=1)
cc_kproto.columns = customer_related

###### 2. Approach: Categorical Kmodes ########
kmodes = KModes(n_clusters=4)
temp_kmodes = kmodes.fit_predict(df[customer_related_cat])
kmcc = pd.DataFrame(kmodes.cluster_centroids_, columns=customer_related_cat)

df["cat_cluster"] = temp_kmodes

################################################################################
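Note that model.cluster_centroids_ returning a [numerical, categorical] pair is behavior of older kmodes releases; newer versions return a single combined array with the numerical columns first. A small compatibility helper, assuming only the return shape changed between versions:

import numpy as np

def split_centroids(model, n_numerical):
    """Return (numerical, categorical) centroid arrays across kmodes versions."""
    centroids = model.cluster_centroids_
    if isinstance(centroids, list):  # older kmodes: [numerical, categorical] pair
        return centroids[0], centroids[1]
    centroids = np.asarray(centroids)  # newer kmodes: one combined array
    return centroids[:, :n_numerical], centroids[:, n_numerical:]

cc_num, cc_cat = split_centroids(model, len(customer_related_num))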
Example no. 10
# Iterate over k to collect the clustering cost
cost = {}
for k in range(2, 10):
    kproto = KPrototypes(n_clusters=k, random_state=75)
    kproto.fit_predict(df_model, categorical=[0, 1, 2])
    cost[k] = kproto.cost_

# Visualize the elbow plot
sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
plt.show()

# Save the model with 5 clusters, chosen from the elbow plot
import pickle

kproto = KPrototypes(n_clusters=5, random_state=75)
kproto = kproto.fit(df_model, categorical=[0, 1, 2])
pickle.dump(kproto, open('best_cluster.pkl', 'wb'))

# Assign each customer to a segment
clusters = kproto.predict(df_model, categorical=[0, 1, 2])
print('customer segments: {}\n'.format(clusters))

# Join the original data with the customer segments
df_final = df.copy()
df_final['cluster'] = clusters
print(df_final.head())

# Display the customers in each cluster
for i in range(0, 5):
    print('\nCustomers in cluster {}\n'.format(i))
    print(df_final[df_final['cluster'] == i])
Example no. 11
class KPrototypesClustering(baseoperationclass.BaseOperationClass):

    _operation_name = 'K-Prototypes Clustering'
    _operation_code_name = 'KPrototypes'
    _type_of_operation = 'cluster'

    def __init__(self):
        super().__init__()
        self.cluster_number = CLUSTER_NUMBER
        self.categorical_weight = CATEGORICAL_WEIGHT
        self.selected_features = []
        self.model = None
        self.labels = None
        self.centers = None

    def _preprocessed_data(self, data):
        return data if not self.selected_features \
            else data.loc[:, self.selected_features]

    def set_parameters(self,
                       cluster_number,
                       categorical_weight=None,
                       features=None):
        if cluster_number is not None:
            self.cluster_number = cluster_number
        if categorical_weight is not None:
            self.categorical_weight = categorical_weight
        if features is not None and isinstance(features, (list, tuple)):
            self.selected_features = list(features)
        return True

    def get_parameters(self):
        return {
            'cluster_number_KPrototypes': self.cluster_number,
            'categorical_data_weight_KPrototypes': self.categorical_weight,
            'features_KPrototypes': self.selected_features
        }

    def _get_initial_centers(self, dataset, categorical_indices):
        dataset_cat = dataset.take(categorical_indices, axis=1).values
        categorical_labels = [
            column for index, column in enumerate(dataset.columns)
            if index in categorical_indices
        ]
        dataset_num = dataset.drop(categorical_labels, axis=1).values

        categorical_weight = self.categorical_weight
        if categorical_weight is None or categorical_weight < 0:
            categorical_weight = 0.5 * dataset_num.std()
        initial_centroids_num = np.zeros(
            (self.cluster_number, dataset_num.shape[1]))
        initial_centroids_cat = np.zeros(
            (self.cluster_number, dataset_cat.shape[1]))
        rand_index = randint(0, dataset.shape[0] - 1)
        initial_centroids_num[0], initial_centroids_cat[0] = dataset_num[
            rand_index], dataset_cat[rand_index]

        for i in range(1, self.cluster_number):
            distances_num_cat = [
                np.zeros((i, dataset.shape[0]), dtype=np.float64),
                np.zeros((i, dataset.shape[0]))
            ]
            for j in range(0, i):
                distances_num_cat[0][j] = dissimilarity_python.euclidean(
                    dataset_num, initial_centroids_num[j])
                distances_num_cat[1][j] = matching_dissim(
                    dataset_cat, initial_centroids_cat[j])
            distances = np.amin(distances_num_cat[0] +
                                categorical_weight * distances_num_cat[1],
                                axis=0)
            probabilities = distances / np.sum(distances)
            chosen_point = np.random.choice(range(0, dataset.shape[0]),
                                            p=probabilities)
            initial_centroids_num[i] = dataset_num[chosen_point]
            initial_centroids_cat[i] = dataset_cat[chosen_point]

        initial_centroids = [initial_centroids_num, initial_centroids_cat]
        return initial_centroids

    # Used if there are no categorical columns in the dataset
    def _fallback_algorithm(self, dataset):
        from . import KMeansClustering
        self.model = KMeansClustering.KMeansClustering()
        self.model.set_parameters(self.cluster_number, self.selected_features)
        self.labels = self.model.get_labels(dataset)
        self.centers = self.model.centers
        return self.labels

    # By default, K-Prototypes uses Euclidean distance for numerical data
    # and simple matching (Hamming) distance for categorical data.
    # n_init is the number of times the algorithm is run with different centroid seeds.
    # gamma is the weight balancing numerical against categorical data;
    # if None, it defaults to half the standard deviation of the numerical data.
    def get_labels(self, data, reprocess=False):
        data_original = data
        data = self._preprocessed_data(data)

        categorical_indices = get_categorical_indices(data)
        if not categorical_indices:
            return self._fallback_algorithm(data_original)

        if self.model is None or reprocess:
            data = encode_nominal_parameters(data)
            data = normalized_dataset(data, categorical_indices)

            initial_centers = self._get_initial_centers(
                data, categorical_indices)
            self.model = KPrototypes(n_clusters=self.cluster_number,
                                     max_iter=1000,
                                     init=initial_centers,
                                     n_init=10,
                                     gamma=self.categorical_weight,
                                     num_dissim=dissimilarity_python.euclidean,
                                     n_jobs=1)
            data = data.values
            self.model.fit(data, categorical=categorical_indices)
            self.labels = self.model.predict(data,
                                             categorical=categorical_indices)
            self.centers = self.model.cluster_centroids_
            centers = self.centers[0]
            for index, cat_index in enumerate(categorical_indices):
                centers = np.insert(centers,
                                    cat_index,
                                    values=self.centers[1].transpose()[index],
                                    axis=1)
            self.centers = centers
        else:
            self.labels = self.model.predict(data, categorical=categorical_indices)

        return self.labels

    # Legacy methods

    def print_parameters(self):
        return self.get_parameters()

    def save_parameters(self):
        return self.get_parameters()

    def load_parameters(self, parameters):
        self.set_parameters(
            cluster_number=parameters.get('cluster_number_KPrototypes')
            or CLUSTER_NUMBER,
            categorical_weight=parameters.get(
                'categorical_data_weight_KPrototypes') or CATEGORICAL_WEIGHT,
            features=parameters.get('features_KPrototypes') or [])
        return True

    def save_results(self):
        return {
            'results': self.labels.tolist(),
            'centers': self.centers.tolist(),
            'dump': pickle.dumps(self.model).hex()
        }

    def load_results(self, results_dict):
        if results_dict.get("results") is not None:
            self.labels = np.array(results_dict['results'])
        if results_dict.get("centers") is not None:
            self.centers = np.array(results_dict['centers'])
        if results_dict.get("dump") is not None:
            self.model = pickle.loads(bytes.fromhex(results_dict['dump']))
        return True

    def process_data(self, data):
        return self.get_labels(data)

    def predict(self, data):
        return self.get_labels(data)
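A minimal usage sketch for this class. It assumes the surrounding module supplies CLUSTER_NUMBER, CATEGORICAL_WEIGHT, and the helpers imported elsewhere (get_categorical_indices, encode_nominal_parameters, normalized_dataset, dissimilarity_python); the DataFrame is illustrative:

import pandas as pd

df = pd.DataFrame({
    'age': [23, 45, 31, 52],
    'income': [40_000, 85_000, 52_000, 91_000],
    'segment': ['a', 'b', 'a', 'b'],
})

op = KPrototypesClustering()
op.set_parameters(cluster_number=2,
                  categorical_weight=0.5,
                  features=['age', 'income', 'segment'])
labels = op.process_data(df)  # fits K-Prototypes and returns cluster labels
print(op.get_parameters())
print(labels)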