import matplotlib.pyplot as plt
import pandas as pd
from pydataset import data
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load mtcars and work on a copy so the loaded frame stays untouched.
# NOTE: `data` is rebound here from the pydataset loader function to the
# DataFrame copy; everything below uses `data` as the DataFrame.
mtcars = data('mtcars')
data = mtcars.copy()
data

# Need for scaling: the columns (e.g. hp vs wt) are on very different scales,
# and k-means is distance based, so every column is standardised first.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data)  # NumPy array, not a DataFrame
scaled_features[:5]  # z-scores; typically within roughly -3 to +3

kmeans = KMeans(init='random',
                n_clusters=2,
                n_init=3,
                max_iter=300,
                random_state=42)
kmeans
kmeans.fit(scaled_features)
kmeans.inertia_  # within-cluster sum of squared distances
kmeans.cluster_centers_  # average or representative values (in scaled units)
kmeans.n_iter_  # number of iterations the best run needed to stabilise
kmeans.labels_[:5]
kmeans.cluster_centers_.shape  # (n_clusters, n_features)
kmeans.cluster_centers_[0:1]
kmeans.predict(scaled_features)
scaled_features[1:5]

# The scaled array carries no column names (an ndarray has no `.columns`),
# so wrap it in a DataFrame that reuses the original labels.
y = pd.DataFrame(scaled_features, columns=data.columns, index=data.index)
y
y.columns

clusterNos = kmeans.labels_
clusterNos

# Mean of mpg, hp, wt per cluster (grouping by the label array, which is
# aligned positionally with the rows of `data`).
data.groupby(clusterNos).agg({'mpg': 'mean', 'hp': 'mean', 'wt': 'mean'})

# Quick scatter of wt vs mpg coloured by cluster; shown on its own figure so
# it does not overlay the labelled plots below.
plt.scatter(x=data.wt, y=data.mpg, c=clusterNos)
plt.show()

## plot scatter wt vs mpg with color cluster
plt.scatter(data.wt, data.mpg, c=clusterNos)
plt.xlabel('Weight')
plt.ylabel('Mileage')
plt.title('Color Cluster')
plt.show()

## plot scatter hp vs wt with color cluster
data.columns
plt.scatter(data.hp, data.wt, c=clusterNos)
plt.xlabel('Horse Power')
plt.ylabel('Weight')
plt.title('Color Cluster')
plt.show()
# Example #3
# 0
import matplotlib.pyplot as plt
import squarify  # pip install squarify (algorithm for treemap)
from pydataset import data

# Treemap with explicit sizes, labels and colours.
squarify.plot(sizes=[13, 22, 35, 5],
              label=["group A", "group B", "group C", "group D"],
              color=["red", "green", "blue", "grey"],
              alpha=.4)
plt.axis('off')
plt.show()


# mtcars - distribution of Gears
# NOTE: `data` is rebound from the pydataset loader to the DataFrame copy.
mtcars = data('mtcars')
data = mtcars.copy()

gcount = data.gear.value_counts()
# Build the labels from the index of the counts so every label is guaranteed
# to sit on the rectangle whose size it describes.
gear_labels = ["Gear{}".format(gear) for gear in gcount.index]
squarify.plot(sizes=gcount, label=gear_labels,
              color=["red", "green", "blue"], alpha=.4)
plt.axis('off')
plt.show()

# Transmission type (am) counts, unlabelled.
squarify.plot(sizes=data.am.value_counts(), alpha=.4)
plt.axis('off')
plt.show()

# Three equivalent ways of counting rows per `am` value.
data.groupby('am')['am'].agg(['count'])
data.groupby('am').size()
gcount2 = data.groupby('am').size()
gcount2.index  # group keys, used as treemap labels below
squarify.plot(sizes=gcount2, label=gcount2.index,
              color=['red', 'green'], alpha=.4)
plt.axis('off')
plt.show()
# Example #4
# 0
import statistics

import numpy as np
import pandas as pd

# Widen the console display so all mtcars columns print on one line.
pd.set_option('display.max_columns', 12)
pd.set_option('display.width', 1000)
#pd.reset_option("^display")
data.head()

#%%% Mean of column /rows
# mean([1, 2, 3, 10])  # NameError: there is no built-in mean() in Python
statistics.mean([1, 2, 3, 10])  # standard-library alternative
np.mean([1, 2, 3, 10])
np.mean(data.mpg)
data.mpg.mean()
data['mpg'].mean()
data[['mpg', 'hp', 'wt']].mean()
data.mean()  # column means (axis=0 is the default)
data.mean(axis=0)  # column means
data.mean(axis=1)  # row means
data.groupby('cyl').mean()
data.groupby('cyl').mean().round(1)
#df.mean(axis = 1, skipna = True) #with missing values
#%%% Median
data.mpg  # single column is a Series
#data.mpg.sort()  # deprecated; use sort_values()
data.mpg.sort_values()
data['mpg'].sort_values()
data.mpg.sort_values().iloc[15]  # 16th smallest value, near the middle
data.mpg.sort_values().iloc[[0, 7, 15, 23, 31]]  # min, ~quartiles, max by position
data[['mpg', 'wt']].median(axis=0)
data.median(axis=0)  # column medians
#df.median(axis = 1, skipna = True) #with missing values
#%% Mode - Value with max frequency
data.gear.value_counts()
data.groupby('gear').size()  # how many of each gear type
# Example #5
# 0
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Fresh working copy of mtcars (expects `mtcars` to be loaded already).
data = mtcars.copy()
id(data)

data

# Need for scaling: the columns (e.g. hp vs wt) are on very different scales,
# and k-means is distance based, so every column is standardised first.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data)
scaled_features[:5]  # z-scores; typically within roughly -3 to +3

kmeans = KMeans(init='random', n_clusters=2, n_init=3, max_iter=300,
                random_state=42)
kmeans
kmeans.fit(scaled_features)
kmeans.inertia_  # within-cluster sum of squared distances
kmeans.cluster_centers_  # average or representative values (in scaled units)
kmeans.n_iter_  # number of iterations the best run needed to stabilise
kmeans.labels_[:5]
kmeans.cluster_centers_.shape  # (n_clusters, n_features)
kmeans.cluster_centers_[0:1]

# Group by the label array itself: the string 'kmeans.labels' is not a column
# of `data`, so passing it as a key raises KeyError.
data.groupby(kmeans.labels_).mean()
clusterNos = kmeans.labels_
clusterNos
type(clusterNos)

data.groupby([clusterNos]).mean()
#https://realpython.com/k-means-clustering-python/

#agglomerative
#steps