/
code_cluster_prediction.py
117 lines (106 loc) · 4.56 KB
/
code_cluster_prediction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from sklearn.cluster import KMeans
import os
# Any results you write to the current directory are saved as output.
# Load the dataset and keep the rows whose index label is <= 31 (.loc slices
# are label-inclusive). The explicit .copy() makes the slice an independent
# frame, so the column assignments performed on `yale` from here on write to
# this frame instead of triggering pandas' SettingWithCopy handling.
yale = pd.read_csv('../input/yalev1.csv').loc[:31, :].copy()
# Per-column count of missing values. As a bare expression this is only
# displayed in an interactive/notebook session; in a script it is discarded.
yale.isnull().sum()
# Label encoder: replace the 'County' values with integer codes.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(yale['County'].unique())
yale['County'] = le.transform(yale['County'])
#Label propagation
from sklearn.semi_supervised import LabelSpreading
def getLabelPropa(yale):
    """Predict a class for every row with scikit-learn's LabelSpreading.

    Side effect: a 'labels' column is written onto the caller's frame. It
    starts as a copy of 'Rank', and rows whose 'Town' is one of the five
    seed towns are overwritten with 1.

    The model is fitted on the float64/int64 columns of the frame (without
    'labels'), each column scaled by its maximum absolute value.

    Parameters
    ----------
    yale : pandas.DataFrame
        Must contain 'Rank' and 'Town' columns.

    Returns
    -------
    The array returned by ``LabelSpreading.predict`` for the same rows.
    """
    seed_towns = ['Greenwich', 'Westport', 'Fairfield', 'Trumbull', 'Ridgefield']

    yale['labels'] = yale['Rank']
    # One .loc call on the frame itself. The chained form
    # yale['labels'].loc[mask] = 1 can assign into a temporary object and
    # leave the frame unchanged (always so under pandas Copy-on-Write).
    yale.loc[yale['Town'].isin(seed_towns), 'labels'] = 1
    label = yale['labels']

    features = yale.select_dtypes(include=['float64', 'int64'])
    features = features.drop(['labels'], axis=1)
    features = preprocessing.normalize(features, axis=0, norm='max')

    # NOTE(review): n_neighbors is passed alongside kernel='rbf'; confirm
    # against the LabelSpreading docs whether it has any effect here.
    label_prop_model = LabelSpreading(alpha=0.1, kernel='rbf', n_neighbors=3,
                                      max_iter=300, gamma=2)
    label_prop_model.fit(features, label)
    return label_prop_model.predict(features)
# Run label spreading on the frame and store the predicted class of each row
# in a new 'label_lp' column.
label = getLabelPropa(yale)
yale['label_lp'] = label
# Bare expression selecting the rows predicted as class 1; the result is not
# stored, so it is only visible when run interactively (e.g. a notebook cell).
yale[yale['label_lp'] == 1]
def get_label_km(yale):
    """Split the rows into two KMeans clusters and return the cluster ids.

    The clustering input is every float64/int64 column of the frame except
    'Rank', with each column scaled by its maximum absolute value. Before
    clustering, the per-column variance of the scaled seed-town rows is
    printed as a diagnostic.

    Side effect: the cluster ids are stored in yale['labels'], and that
    column is what gets returned.
    """
    numeric_kinds = ['float64', 'int64']
    seed_towns = ['Greenwich', 'Westport', 'Fairfield', 'Trumbull', 'Ridgefield']

    seed_rows = yale.loc[yale['Town'].isin(seed_towns)]
    without_rank = yale.drop(['Rank'], axis=1)

    # Diagnostic output only; the value is not used further.
    seed_scaled = preprocessing.normalize(
        seed_rows.select_dtypes(include=numeric_kinds), axis=0, norm='max')
    print(seed_scaled.var(axis=0))

    scaled_features = preprocessing.normalize(
        without_rank.select_dtypes(include=numeric_kinds), axis=0, norm='max')

    clusterer = KMeans(n_clusters=2, random_state=0)
    clusterer.fit(scaled_features)

    yale['labels'] = clusterer.labels_
    return yale['labels']
# Cluster the rows with KMeans and store each row's cluster id in 'label_km'.
label = get_label_km(yale)
yale['label_km'] = label
# The next two bare expressions select the seed-town rows and the rows in
# cluster 0; their results are not stored, so they only display something
# when run interactively (e.g. a notebook cell).
yale.loc[yale['Town'].isin(['Greenwich','Westport','Fairfield','Trumbull','Ridgefield'])]
yale[yale['label_km'] == 0]
# float64/int64 columns of the rows that label spreading assigned to class 1;
# passed to get_cor() further down.
yale_train_cov = yale[yale['label_lp'] == 1].select_dtypes(include = ['float64','int64'])
import matplotlib.pyplot as plt
import seaborn as sns
def get_cor(train_SJ):
    """Draw an annotated heatmap of the pairwise column correlations.

    The frame is cast to float, its correlation matrix is computed with
    DataFrame.corr(), and the matrix is rendered with seaborn on a new
    14x12 matplotlib figure. Nothing is returned.
    """
    plt.figure(figsize=(14, 12))
    plt.title('Pearson Correlation of Features', y=1.05, size=15)

    correlations = train_SJ.astype(float).corr()
    heatmap_options = dict(
        linewidths=0.1,
        vmax=1.0,
        square=True,
        cmap=plt.cm.RdBu,
        linecolor='white',
        annot=True,
    )
    sns.heatmap(correlations, **heatmap_options)
# Plot the correlation heatmap for the numeric columns of the class-1 rows.
get_cor(yale_train_cov)
def param(x):
    """Return ``(mean, std)`` of ``x`` taken along axis 0.

    Both statistics come from NumPy (``np.mean`` / ``np.std``), so the
    standard deviation uses NumPy's default degrees of freedom.
    """
    return np.mean(x, axis=0), np.std(x, axis=0)
def preprocess(x, mean, std):
    """Standardise every column of ``x`` as ``(x - mean) / std``.

    Columns whose ``std`` entry is 0 are divided by 1 instead, so a constant
    column becomes all zeros rather than NaN/inf.

    Parameters
    ----------
    x : 2-D array-like of shape (rows, columns)
    mean, std : 1-D array-like with one entry per column of ``x``.
        NumPy arrays, lists and pandas Series are all accepted; they are
        read by position and are never modified.

    Returns
    -------
    numpy.ndarray
        A new float array with the same shape as ``x``.

    Raises
    ------
    ValueError
        If ``x`` is not two-dimensional.
    """
    x = np.asarray(x, dtype=float)
    if x.ndim != 2:
        raise ValueError("x must be a 2-D array of shape (rows, columns)")

    # Coerce to plain arrays so positional access works even when the caller
    # passes label-indexed pandas Series.
    column_mean = np.asarray(mean, dtype=float)
    column_std = np.asarray(std, dtype=float)

    # Build a separate divisor instead of writing 1 into the caller's std.
    safe_std = np.where(column_std == 0, 1.0, column_std)
    return (x - column_mean) / safe_std
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
def rmsle_cv(model, train_data, y):
    """Return the mean cross-validated RMSE of ``model``.

    Runs 5-fold cross-validation with shuffled folds (fixed seed 42), scores
    each fold with negative mean squared error, converts each score to an
    RMSE and averages them.

    Parameters
    ----------
    model : scikit-learn estimator
    train_data : feature matrix
    y : target values aligned with ``train_data``

    Returns
    -------
    float
        Mean of the per-fold root mean squared errors.
    """
    n_folds = 5
    # Pass the splitter object itself. Calling .get_n_splits() on it returns
    # the plain integer 5, which makes cross_val_score fall back to its
    # default splitting and discards shuffle/random_state.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train_data, y,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.mean(np.sqrt(-neg_mse))
from sklearn.linear_model import LassoCV, Lasso, ElasticNet
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
#Lasso = make_pipeline(RobustScaler(),LassoCV(alphas = [1, 0.1, 0.001, 0.0005], random_state = 1))
#lasso
# Work only with the rows that label spreading assigned to class 1.
train_x = yale[yale['label_lp'] == 1]

# Hold Fairfield out as the evaluation rows and fit on the remaining towns.
test_x = train_x[train_x.Town == 'Fairfield']
train_x = train_x[train_x.Town != 'Fairfield']
test_y = test_x['PPE']
train_y = train_x['PPE']

# Remove the target, the town name, the helper label columns and the two
# excluded features from both feature frames.
dropped_columns = ['PPE', 'Town', 'labels', 'label_lp', 'label_km',
                   'Average_Age', 'Population']
train_x = train_x.drop(dropped_columns, axis=1)
test_x = test_x.drop(dropped_columns, axis=1)

# First fit on the unscaled features, only to print the coefficients.
# The fitted model gets its own name so the imported Lasso class is not
# overwritten by an instance (which would break any later Lasso(...) call).
lasso_model = Lasso(alpha=0.001, tol=0.005).fit(train_x, train_y)
print(lasso_model.coef_)

# Standardise with statistics taken from the training rows only. Using
# .values yields plain arrays, so preprocess() can index them by position.
mean, std = param(train_x.values)
train_x = preprocess(train_x.values, mean, std)
test_x = preprocess(test_x.values, mean, std)

# Cross-validated error on the standardised training data, then refit on
# all of it and compare the held-out prediction with the actual value.
print(rmsle_cv(lasso_model, train_x, train_y))
lasso_model.fit(train_x, train_y)
print(lasso_model.predict(test_x))
print(test_y)