def preparation_data(self, df):
    """Prepare *df* for clustering: standardise the numeric columns and
    label-encode the categorical columns, then merge both parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw customer data containing at least the columns
        'Umur', 'NilaiBelanjaSetahun', 'Jenis Kelamin', 'Profesi'
        and 'Tipe Residen'.

    Returns
    -------
    pandas.DataFrame
        Model-ready frame: label-encoded categoricals merged (on the
        index) with the standardised numeric columns.
    """
    # Numeric columns to standardise.
    kolom_numerik = ['Umur', 'NilaiBelanjaSetahun']

    # Summary statistics before standardisation, for comparison.
    print('Statistik Sebelum Standardisasi\n')
    print(df[kolom_numerik].describe().round(1))

    # Standardise (zero mean, unit variance); fit_transform returns a
    # plain ndarray, so rebuild a DataFrame to keep df's index and the
    # original column labels.
    df_std = StandardScaler().fit_transform(df[kolom_numerik])
    df_std = pd.DataFrame(data=df_std,
                          index=df.index,
                          columns=df[kolom_numerik].columns)

    # Show a sample and summary statistics of the standardised data.
    print('Contoh hasil standardisasi\n')
    print(df_std.head())
    print('Statistik hasil standardisasi\n')
    print(df_std.describe().round(0))

    # Categorical columns to convert to integer codes.
    kolom_kategorikal = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

    # Work on a copy so the caller's frame is not mutated.
    df_encode = df[kolom_kategorikal].copy()

    # Label-encode every categorical column independently.
    for col in kolom_kategorikal:
        df_encode[col] = LabelEncoder().fit_transform(df_encode[col])
    print(df_encode.head())

    # Merge encoded categoricals with standardised numerics on the index.
    df_model = df_encode.merge(df_std, left_index=True,
                               right_index=True, how='left')
    print(df_model.head())

    # BUGFIX: the prepared frame was previously built and printed but
    # never returned, so callers could not use the result.  Returning it
    # is backward compatible (the method used to return None, which
    # callers necessarily ignored).
    return df_model
# Label the PCA scatter axes after the two principal-component columns
# and render the figure.
# NOTE(review): these three calls look like the tail of a plotting
# helper (presumably `pca_plotting`, called just below) whose `def`
# line is outside this chunk — confirm their indentation against the
# full file.
plt.xlabel(principalDf.columns[0], fontsize=10)
plt.ylabel(principalDf.columns[1], fontsize=10)
plt.show()

# Run the PCA visualisation on the consumption features.
pca_plotting(featuresCons)

# 3.2.2 - Scaling
#K-means clustering is "isotropic" in all directions of space and therefore tends to produce more or less round (rather than elongated) clusters. #In this situation leaving variances unequal is equivalent to putting more weight on variables with smaller variance. So we scale the variables in order to remove the difference in magnitude of the clustering analysis.
# Standardise every feature (zero mean, unit variance); rebuild a
# DataFrame because fit_transform returns a bare ndarray and we want
# to keep the original index and column labels.
scaled_featuresCons = StandardScaler().fit_transform(featuresCons.values)
scaled_featuresCons = pd.DataFrame(scaled_featuresCons, index=featuresCons.index, columns=featuresCons.columns)
# Sanity check: after scaling, means should be ~0 and stds ~1.
scaled_featuresCons.describe()

# 3.2.3 - Elbow Graph
# Plot within-cluster inertia vs. k to pick a cluster count
# (elbow_graph is a helper defined elsewhere in this file).
elbow_graph(scaled_featuresCons)
#The Elbow Graph seems to be indicating a number between 2 and 5 for number of clusters. Lets see what the Dendogram shows.

# 3.2.4 - Hierarchical (Dendogram)
#This might help us decide the number of clusters.
# dendrogram_ is a helper defined elsewhere in this file.
dendrogram_(scaled_featuresCons)
#Again, 3 seems to be the best number of clusters.

# 3.2.5 - K-Means
# Setting up the K-Means model:
print(df[kolom_numerik].describe().round(1)) # Standardisasi df_std = StandardScaler().fit_transform(df[kolom_numerik]) # Membuat DataFrame df_std = pd.DataFrame(data=df_std, index=df.index, columns=df[kolom_numerik].columns) # Menampilkan contoh isi data dan summary statistic print('Contoh hasil standardisasi\n') print(df_std.head()) print('Statistik hasil standardisasi\n') print(df_std.describe().round(0)) # # [Konversi Kategorikal Data dengan Label Encoder](https://academy.dqlab.id/main/livecode/293/562/2809) # In[10]: from sklearn.preprocessing import LabelEncoder # Inisiasi nama kolom kategorikal kolom_kategorikal = ['Jenis Kelamin', 'Profesi', 'Tipe Residen'] # Membuat salinan data frame df_encode = df[kolom_kategorikal].copy() # Melakukan labelEncoder untuk semua kolom kategorikal for col in kolom_kategorikal:
#Create a dataframe df_ss = pd.DataFrame(df_ss) df_ss.columns = [ 'Sports', 'SUV', 'Wagon', 'Minivan', 'Pickup', 'AWD', 'RWD', 'Retail', 'Dealer', 'Engine', 'Cylinders', 'Horsepower', 'CityMPG', 'HighwayMPG', 'Weight', 'Wheelbase', 'Length', 'Width' ] # In[16]: #Check first few rows df_ss.head() # In[17]: df_ss.describe() # In[18]: #Reduce feature down to 3 comp = 3 pca, X_pca = do_pca(comp, df_ss) # In[19]: X_pca # In[23]: pca
#%% ##################################################################################### # clustering from sklearn.preprocessing import StandardScaler from scipy.cluster.hierarchy import dendrogram, linkage from sklearn.cluster import AgglomerativeClustering from sklearn.cluster import KMeans ############################## # Hierarchical clustering # USArrests data usarrest = pd.read_csv("./data/usarrest.csv") Xname = ['Murder', 'Assault', 'UrbanPop', 'Rape'] Xdata = StandardScaler().fit_transform(usarrest[Xname]) Xdata = pd.DataFrame(Xdata) Xdata.columns = Xname Xdata.describe() # Calculate the linkage: mergings mergings = linkage(Xdata, method='average') # Plot the dendrogram, using varieties as labels plt.figure(figsize=(20, 10)) dendrogram(mergings, leaf_rotation=90, leaf_font_size=20, labels=usarrest['State'].values) plt.show() # Calculate means for each cluster cluster = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='average') group = cluster.fit_predict(Xdata) group = pd.DataFrame(group)
# In[6]: Y_num = df['class'] X_num = df.drop(['class'], axis = 1) X_num # In[7]: # Normalise the Data X_num_std = StandardScaler().fit_transform(X_num) X_num_std = pd.DataFrame(X_num_std) X_num_std.describe() # In[8]: df_norm = pd.concat((X_num_std, Y_num), axis = 1) df_norm # In[9]: Y = df_norm.iloc[:,[57]].values X = np.asarray(df_norm.drop(['class'], axis = 1)) print ("Shape of X: ", X.shape)
#Statistik sebelum standarisasi print('Statistik Sebelum Standardisasi:') print(df[kolom_numerik].describe().round(1)) #Standarisasi numerik df_standar = StandardScaler().fit_transform(df[kolom_numerik]) #Membuat dataframe df_standar = pd.DataFrame(data=df_standar, index=df.index, columns=df[kolom_numerik].columns) #Statistik setelah standarisasi print('\nStatistik hasil standardisasi:') print(df_standar.describe().round(0)) #Konversi data kategorik ke data numerik from sklearn.preprocessing import LabelEncoder #Membuat salinan dataframe df_encode = df[kolom_kategorikal].copy() #Menerapkan Label Encoder ke semua kolom kategorik for col in kolom_kategorikal: df_encode[col] = LabelEncoder().fit_transform(df_encode[col]) print(df_encode.head()) #Menggabungkan dataframe df_model = df_encode.merge(df_standar, left_index=True,