Example #1
0
    def preparation_data(self, df):
        """Standardize the numeric columns, label-encode the categorical ones, and print each stage.

        Args:
            df: DataFrame containing the numeric columns 'Umur' and
                'NilaiBelanjaSetahun' and the categorical columns
                'Jenis Kelamin', 'Profesi' and 'Tipe Residen'.

        Returns:
            None. The combined frame is printed, not returned.
        """
        numeric_cols = ['Umur', 'NilaiBelanjaSetahun']
        categorical_cols = ['Jenis Kelamin', 'Profesi', 'Tipe Residen']

        # Summary of the raw numeric columns
        print('Statistik Sebelum Standardisasi\n')
        numeric_part = df[numeric_cols]
        print(numeric_part.describe().round(1))

        # Scale the numeric columns and restore the row index / column labels
        # that StandardScaler drops when it returns a plain array.
        scaled = pd.DataFrame(
            data=StandardScaler().fit_transform(numeric_part),
            index=df.index,
            columns=numeric_part.columns,
        )

        # Sample rows and summary of the scaled values
        print('Contoh hasil standardisasi\n')
        print(scaled.head())

        print('Statistik hasil standardisasi\n')
        print(scaled.describe().round(0))

        # Encode each categorical column with its own LabelEncoder, working on
        # a copy so the caller's frame is not modified.
        encoded = df[categorical_cols].copy()
        for name in categorical_cols:
            encoded[name] = LabelEncoder().fit_transform(encoded[name])
        print(encoded.head())

        # Put encoded and scaled columns side by side, matched on the row index
        combined = encoded.merge(scaled,
                                 how='left',
                                 left_index=True,
                                 right_index=True)
        print(combined.head())
    plt.xlabel(principalDf.columns[0], fontsize=10)
    plt.ylabel(principalDf.columns[1], fontsize=10)
    plt.show()


pca_plotting(featuresCons)

# 3.2.2 - Scaling
# K-means clustering is "isotropic" in all directions of space and therefore
# tends to produce more or less round (rather than elongated) clusters.
# In this situation leaving variances unequal is equivalent to putting more
# weight on variables with smaller variance, so we scale the variables to
# remove differences in magnitude from the clustering analysis.

# Scale the raw values, then restore the original row index and column labels.
scaled_featuresCons = pd.DataFrame(
    StandardScaler().fit_transform(featuresCons.values),
    index=featuresCons.index,
    columns=featuresCons.columns,
)
scaled_featuresCons.describe()

# 3.2.3 - Elbow Graph
elbow_graph(scaled_featuresCons)
# The elbow graph seems to indicate between 2 and 5 clusters.
# Let's see what the dendrogram shows.

# 3.2.4 - Hierarchical (Dendrogram)
# This might help us decide the number of clusters.

dendrogram_(scaled_featuresCons)

# Again, 3 seems to be the best number of clusters.

# 3.2.5 - K-Means

# Setting up the K-Means model:
Example #3
0
print(df[kolom_numerik].describe().round(1))

# Standardize the numeric columns and wrap the resulting array in a DataFrame
# that carries the original row index and column labels.
df_std = pd.DataFrame(
    data=StandardScaler().fit_transform(df[kolom_numerik]),
    index=df.index,
    columns=df[kolom_numerik].columns,
)

# Show sample rows and summary statistics of the standardized data
print('Contoh hasil standardisasi\n')
print(df_std.head())

print('Statistik hasil standardisasi\n')
print(df_std.describe().round(0))

# # [Konversi Kategorikal Data dengan Label Encoder](https://academy.dqlab.id/main/livecode/293/562/2809)

# In[10]:

from sklearn.preprocessing import LabelEncoder

# Names of the categorical columns
kolom_kategorikal = [
    'Jenis Kelamin',
    'Profesi',
    'Tipe Residen',
]

# Independent copy holding only the categorical columns
df_encode = df.loc[:, kolom_kategorikal].copy()

# Melakukan labelEncoder untuk semua kolom kategorikal
for col in kolom_kategorikal:
Example #4
0
# Wrap df_ss in a DataFrame and give its columns readable names.
# The assignment below requires df_ss to have exactly 18 columns.
# NOTE(review): df_ss is presumably an unlabeled array of scaled features
# produced earlier in the file — confirm upstream.
df_ss = pd.DataFrame(df_ss)
df_ss.columns = [
    'Sports', 'SUV', 'Wagon', 'Minivan', 'Pickup', 'AWD', 'RWD', 'Retail',
    'Dealer', 'Engine', 'Cylinders', 'Horsepower', 'CityMPG', 'HighwayMPG',
    'Weight', 'Wheelbase', 'Length', 'Width'
]

# In[16]:

# Check the first few rows (a bare expression: output appears only in a
# notebook/REPL, not when run as a script)
df_ss.head()

# In[17]:

# Summary statistics of every column (notebook display only)
df_ss.describe()

# In[18]:

# Reduce the features down to 3 components.
# NOTE(review): do_pca is defined elsewhere; it presumably returns the fitted
# PCA object and the transformed data, in that order — verify against its definition.
comp = 3
pca, X_pca = do_pca(comp, df_ss)

# In[19]:

# Display the second value returned by do_pca (notebook display only)
X_pca

# In[23]:

# Display the first value returned by do_pca (notebook display only)
pca
Example #5
0
#%%
#####################################################################################
# clustering
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
##############################
# Hierarchical clustering
# USArrests data
# Load the USArrests data and standardize the four numeric features.
usarrest = pd.read_csv("./data/usarrest.csv")
Xname = ['Murder', 'Assault', 'UrbanPop', 'Rape']
# StandardScaler returns a plain array, so re-attach the column names.
Xdata = pd.DataFrame(StandardScaler().fit_transform(usarrest[Xname]),
                     columns=Xname)
Xdata.describe()

# Calculate the linkage: mergings (average linkage on the standardized data)
mergings = linkage(Xdata, method='average')

# Plot the dendrogram, labelling each leaf with its 'State' value
plt.figure(figsize=(20, 10))
dendrogram(mergings,
           leaf_rotation=90,
           leaf_font_size=20,
           labels=usarrest['State'].values)
plt.show()

# Assign every row to one of 5 clusters using average linkage.
# The distance metric is deliberately left at its default (Euclidean): the
# keyword for it was renamed from `affinity` to `metric` in scikit-learn 1.2
# and `affinity` was removed in 1.4, so passing either name breaks on some
# releases while omitting it works on all of them.
cluster = AgglomerativeClustering(n_clusters=5, linkage='average')
group = cluster.fit_predict(Xdata)
# Wrap the cluster labels in a single-column DataFrame.
group = pd.DataFrame(group)
Example #6
0
# In[6]:


# Separate the target column ('class') from the feature columns.
Y_num = df['class']
X_num = df.drop(['class'], axis = 1)
X_num


# In[7]:


# Normalise the Data
X_num_std = StandardScaler().fit_transform(X_num)
# Keep the original row index: pd.concat(axis=1) below aligns rows by index
# label, so a fresh 0..n-1 index would misalign features and target (and
# introduce NaN rows) whenever df's index is not the default range.
# Column labels are left as positional integers, as before.
X_num_std = pd.DataFrame(X_num_std, index=X_num.index)
X_num_std.describe()


# In[8]:


# Scaled features followed by the target as the last column.
df_norm = pd.concat((X_num_std, Y_num), axis = 1)
df_norm


# In[9]:


# Select the target by name instead of by the hard-coded position 57, which
# only pointed at 'class' when there were exactly 57 feature columns.
# The result is still a 2-D array of shape (n_rows, 1).
Y = df_norm[['class']].values
X = np.asarray(df_norm.drop(['class'], axis = 1))
print ("Shape of X: ", X.shape)
Example #7
0
# Summary statistics before standardization
print('Statistik Sebelum Standardisasi:')
print(df[kolom_numerik].describe().round(1))

# Standardize the numeric columns and wrap the resulting array in a DataFrame
# that carries the original row index and column labels.
df_standar = pd.DataFrame(
    data=StandardScaler().fit_transform(df[kolom_numerik]),
    index=df.index,
    columns=df[kolom_numerik].columns,
)

# Summary statistics after standardization
print('\nStatistik hasil standardisasi:')
print(df_standar.describe().round(0))

#Konversi data kategorik ke data numerik
from sklearn.preprocessing import LabelEncoder

# Independent copy holding only the categorical columns
df_encode = df.loc[:, kolom_kategorikal].copy()

# Replace every categorical column with the integer codes produced by its
# own freshly fitted LabelEncoder.
for col in kolom_kategorikal:
    df_encode[col] = LabelEncoder().fit_transform(df_encode[col])

print(df_encode.head())

#Menggabungkan dataframe
df_model = df_encode.merge(df_standar,
                           left_index=True,