forked from aleks0and/CPA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
59 lines (50 loc) · 2.05 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.preprocessing import StandardScaler
def load_data(path="TelcoCustomerChurn.csv"):
    """Read a CSV file into a pandas DataFrame.

    Args:
        path: Location of the CSV file to read. Defaults to
            "TelcoCustomerChurn.csv", the file name that was previously
            hard-coded inside this function; passing ``None`` selects the
            same default.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for ``path``.
    """
    # The argument used to be overwritten unconditionally, so callers could
    # never load any other file. Only fall back when no path was supplied.
    if path is None:
        path = "TelcoCustomerChurn.csv"
    return pd.read_csv(path)
def standardize_data(data, standardization, column_names):
    """Optionally rescale the named columns of ``data`` with StandardScaler.

    Args:
        data: DataFrame whose columns are rescaled. It is modified in place.
        standardization: When falsy, ``data`` is returned untouched.
        column_names: Iterable of column labels to rescale.

    Returns:
        The same ``data`` object that was passed in.
    """
    if not standardization:
        return data

    for column in column_names:
        # A fresh scaler is fitted per column; the column is selected with a
        # list so the scaler receives a two-dimensional, single-column frame.
        single_column = data.loc[:, [column]]
        data[column] = StandardScaler().fit_transform(single_column)
    return data
def standardize_columns(data, standardization, column_names):
    """Return the values of ``data``, optionally z-scoring selected columns.

    Each column listed in ``column_names`` is transformed to
    ``(x - mean) / std`` using that column's own mean and standard deviation.
    The standard deviation comes from ``Series.std`` with its pandas default
    settings (sample standard deviation). A column whose standard deviation
    is zero is not special-cased.

    Args:
        data: DataFrame holding the columns to transform. It is not modified.
        standardization: When falsy, the values are returned unchanged.
        column_names: Iterable of column labels to standardize.

    Returns:
        ``DataFrame.values`` of the (possibly standardized) frame.
    """
    if not standardization:
        return data.values

    # Work on a copy so the caller's frame stays untouched, as it did before.
    scaled = data.copy()
    for name in column_names:
        mean = scaled[name].mean()
        std_dev = scaled[name].std()
        # The previous code computed this via ``apply`` but threw the result
        # away, so nothing was ever standardized; assign it back.
        scaled[name] = (scaled[name] - mean) / std_dev
    return scaled.values
def plot_clusters(data, predicted_clusters, initialized_kmeans, number_of_clusters):
    """Scatter-plot each cluster's first two features and the centroids.

    Args:
        data: Array indexed as ``data[mask, 0]`` and ``data[mask, 1]``; its
            first two columns are used as the x and y coordinates.
        predicted_clusters: Array of cluster labels; points whose label
            equals ``i`` are drawn as cluster ``i + 1``.
        initialized_kmeans: Object exposing ``cluster_centers_``, whose first
            two columns are drawn as the centroids (presumably a fitted
            scikit-learn KMeans; confirm against the caller).
        number_of_clusters: Number of labels (0 .. number_of_clusters - 1)
            to draw.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    for i in range(number_of_clusters):
        in_cluster = predicted_clusters == i
        cluster_color = cm.nipy_spectral(float(i) / number_of_clusters)
        # Pass the RGBA tuple through ``color=`` instead of ``c=``: a bare
        # 4-tuple given as ``c`` is ambiguous and is treated as four scalar
        # values to colour-map when the cluster happens to have four points.
        plt.scatter(data[in_cluster, 0],
                    data[in_cluster, 1],
                    s=50, color=cluster_color,
                    marker='o', edgecolor=cluster_color,
                    label='cluster %d' % (i + 1))

    # The original computed number_of_clusters / number_of_clusters, which is
    # always 1.0 and raised ZeroDivisionError for zero clusters.
    centroid_color = cm.nipy_spectral(1.0)
    centers = initialized_kmeans.cluster_centers_
    plt.scatter(centers[:, 0],
                centers[:, 1],
                s=250, marker='*',
                color=centroid_color, edgecolor='black',
                label='centroids')
    plt.legend(scatterpoints=1)
    plt.grid()
    plt.tight_layout()
    plt.show()
# ============================================TESTING=======================================================
# path = "TelcoCustomerChurn.csv"
# df_telco = pd.read_csv(path)
# df_preprocessed = data_preprocessing(df_telco)
# columns_to_standardize = ['tenure', 'MonthlyCharges', 'TotalCharges']
# df_preprocessed = standardize_columns(df_preprocessed, True, columns_to_standardize)