# K-means clustering of the mtcars data set: standardise every column, fit a
# two-cluster model, inspect it, then profile and plot the clusters.
# NOTE(review): StandardScaler, KMeans and plt are not imported in the visible
# part of this script -- confirm they are imported before this point.
from pydataset import data

mtcars = data('mtcars')
data = mtcars.copy()  # rebinds `data` from the pydataset loader to a DataFrame copy
data

# The columns live on very different scales (e.g. hp vs wt), so standardise
# them before computing distances.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data)
scaled_features[:5]  # first five standardised rows (z-scores, mostly within about -3..+3)

kmeans = KMeans(
    init='random',
    n_clusters=2,
    n_init=3,
    max_iter=300,
    random_state=42,
)
kmeans
kmeans.fit(scaled_features)

# Inspect the fitted model.
kmeans.inertia_
kmeans.cluster_centers_  # one representative (mean) point per cluster
kmeans.n_iter_  # number of iterations that were run
kmeans.labels_[:5]
kmeans.cluster_centers_.shape
kmeans.cluster_centers_[0:1]

# Per-cluster means of mileage, horse power and weight.
data.groupby(kmeans.labels_)[['mpg', 'hp', 'wt']].mean()

# Scatter of weight against mileage, coloured by cluster.
plt.scatter(data['wt'], data['mpg'], c=kmeans.labels_)
# Inspect the fitted k-means model, then profile and plot the clusters.
# Relies on `kmeans`, `scaled_features` and `data` created earlier in the script.
import matplotlib.pyplot as plt
import pandas as pd

kmeans.labels_[:5]
kmeans.cluster_centers_.shape
kmeans.cluster_centers_[0:1]
kmeans.predict(scaled_features)
scaled_features[1:5]

# The scaler output is a plain array with no `.columns` attribute (the old
# `scaled_features.columns` raised AttributeError). Wrap it in a DataFrame that
# carries the original column and row labels instead.
y = pd.DataFrame(scaled_features, columns=data.columns, index=data.index)
y
y.columns

clusterNos = kmeans.labels_
clusterNos

# Mean of mpg, hp and wt within each cluster.
data.groupby(clusterNos).agg({'mpg': 'mean', 'hp': 'mean', 'wt': 'mean'})

# Scatter of wt vs mpg, coloured by cluster.
plt.scatter(data.wt, data.mpg, c=clusterNos)
plt.xlabel('Weight')
plt.ylabel('Mileage')
plt.title('Color Cluster')
plt.show()

# Scatter of hp vs wt, coloured by cluster.
data.columns
plt.scatter(data.hp, data.wt, c=clusterNos)
plt.xlabel('Horse Power')
plt.ylabel('Weight')
plt.title('Color Cluster')
plt.show()
# Treemaps with squarify: a toy example, then the gear and transmission (am)
# distributions of the mtcars data set.
# All imports come first so that `plt` exists before the first plot is drawn.
import matplotlib.pyplot as plt
import squarify  # pip install squarify (treemap layout algorithm)
from pydataset import data

# Toy treemap with custom colours.
squarify.plot(
    sizes=[13, 22, 35, 5],
    label=["group A", "group B", "group C", "group D"],
    color=["red", "green", "blue", "grey"],
    alpha=.4,
)
plt.axis('off')
plt.show()

# mtcars - distribution of gears
mtcars = data('mtcars')
data = mtcars.copy()  # rebinds `data` from the pydataset loader to a DataFrame copy
gcount = data.gear.value_counts()
# value_counts() orders by frequency, not by gear number, so derive each label
# from the index instead of hard-coding "Gear3", "Gear4", "Gear5".
squarify.plot(
    sizes=gcount,
    label=["Gear{}".format(int(gear)) for gear in gcount.index],
    color=["red", "green", "blue"],
    alpha=.4,
)
plt.axis('off')
plt.show()

# Transmission type (am) without labels.
squarify.plot(sizes=data.am.value_counts(), alpha=.4)
plt.axis('off')
plt.show()

# Three equivalent ways of counting rows per transmission type.
data.groupby('am')['am'].agg(['count'])
data.groupby('am').size()
gcount2 = data.groupby('am').size()
gcount2.index  # was `gcount2.index?` -- IPython-only help syntax, invalid in plain Python

squarify.plot(
    sizes=gcount2,
    label=gcount2.index,
    color=['red', 'green'],
    alpha=.4,
)
plt.axis('off')
plt.show()
# Descriptive statistics (mean, median, mode) on the mtcars DataFrame `data`
# built earlier in the script.
import statistics

import numpy as np
import pandas as pd

# Show more columns and wider rows when printing frames.
pd.set_option('display.max_columns', 12)
pd.set_option('display.width', 1000)
# pd.reset_option("^display")  # restores the default display options
data.head()

#%%% Mean of columns / rows
# There is no builtin `mean` (a bare `mean([...])` raises NameError); use the
# standard library or NumPy instead.
statistics.mean([1, 2, 3, 10])
np.mean([1, 2, 3, 10])
np.mean(data.mpg)
data.mpg.mean()
data['mpg'].mean()
data[['mpg', 'hp', 'wt']].mean()
data.mean()
data.mean(axis=0)  # one mean per column
data.mean(axis=1)  # one mean per row
data.groupby('cyl').mean()
data.groupby('cyl').mean().round(1)
# df.mean(axis=1, skipna=True)  # variant for data with missing values

#%%% Median
data.mpg  # a single column is a Series
# data.mpg.sort()  # deprecated -- use sort_values()
data.mpg.sort_values()
data['mpg'].sort_values()
data.mpg.sort_values().iloc[15]  # value at sorted position 15 (0-based)
# Sorted positions 0, 7, 15, 23 and 31: rough quartile positions, assuming
# the 32-row mtcars frame.
data.mpg.sort_values().iloc[[0, 7, 15, 23, 31]]
data[['mpg', 'wt']].median(axis=0)
data.median(axis=0)  # column medians
# df.median(axis=1, skipna=True)  # variant for data with missing values

#%% Mode - value with the highest frequency
data.gear.value_counts()
data.groupby('gear').size()  # how many cars of each gear type
# K-means clustering of mtcars (2 clusters on standardised features), followed
# by per-cluster means of every column.
# NOTE(review): StandardScaler and KMeans are not imported in the visible part
# of this script, and `mtcars` must already be loaded -- confirm both.
data = mtcars.copy()
id(data)  # identity of the freshly made copy
data

# The columns live on very different scales, so standardise them before
# computing distances.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data)
scaled_features[:5]  # first five standardised rows

kmeans = KMeans(init='random', n_clusters=2, n_init=3, max_iter=300, random_state=42)
kmeans
kmeans.fit(scaled_features)

# Inspect the fitted model.
kmeans.inertia_
kmeans.cluster_centers_  # one representative (mean) point per cluster
kmeans.n_iter_  # number of iterations that were run
kmeans.labels_[:5]
kmeans.cluster_centers_.shape
kmeans.cluster_centers_[0:1]

# Group by the label array itself. The string 'kmeans.labels' is not a column
# of `data`, so grouping by it raised KeyError.
data.groupby(kmeans.labels_).mean()

clusterNos = kmeans.labels_
clusterNos
type(clusterNos)
data.groupby([clusterNos]).mean()

# https://realpython.com/k-means-clustering-python/
# agglomerative
# steps