""" Here one focuse more on feature engineering with realistic data set
I will be using data from a Kaggle data can be found:
https://www.kaggle.com/harlfoxem/housesalesprediction
"""
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
pd.set_option('display.max_columns',50)
house = pd.read_csv('kc_house_data.csv')
# First, let us see whether we have any missing data in each column
# Looking for nulls
house.isnull().any()
house.isnull().sum()
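# A small convenience sketch (not part of the original workflow): the same
# check expressed as a percentage of all rows, sorted so any gaps stand out
missing_pct = 100 * house.isnull().sum() / len(house)
missing_pct.sort_values(ascending=False)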
# let us look at the feature names
house.columns
house.describe().transpose()
#%%
with sns.plotting_context("notebook", font_scale=2.5):
    g = sns.pairplot(house[['sqft_lot', 'sqft_above', 'price', 'sqft_living', 'bedrooms']],
                     hue='bedrooms', palette='tab20', height=1.4)  # 'size' was renamed to 'height' in seaborn 0.9
    g.set(xticklabels=[])
plt.subplots(figsize=(17,14))
# numeric_only=True keeps non-numeric columns (like 'date') out of the correlation
sns.heatmap(house.corr(numeric_only=True), annot=True, linewidths=0.5, linecolor="Black", fmt="1.1f")
plt.title("Data Correlation",fontsize=50)
plt.show()
#%%
# to get a better idea, let us plot the price and look at its distribution
plt.figure(figsize=(10,5))
sns.histplot(house['price'], kde=True)  # distplot is deprecated in recent seaborn
# as we can see there are some extreme points (very expensive houses, of which
# there are not many), so it is better to drop them before we build the model
sns.pairplot(house[['sqft_above','price','sqft_living','bedrooms']],hue='bedrooms', palette='husl');
sns.countplot(x= house['bedrooms'])
# we can see the bedroom count extends up to 33, but that bar is so small it is barely visible
sns.pairplot(house, hue='price')  # note: a continuous hue over the full frame is very slow to draw
# now let us look at the correlation of our target with the other features
house.corr(numeric_only=True)['price'].sort_values()
# we see "sqft_living" has a strong correlation with price
sns.scatterplot(x= house['sqft_living'], y = house['price'])
sns.scatterplot(x=house['grade'], y = house['price'])
sns.scatterplot(x=house['bedrooms'], y = house['price'])
# in our data we have "lattitude" and "longitude" which give the house position in
# king country, USA
plt.figure(figsize=(5,8))
sns.scatterplot(x= house['long'],y =house['lat'], hue = house['price'])
# because of those extreme points we are not getting a good color
# distribution, so now let us drop those extreme points (the top 1%)
house2 = house.sort_values('price', ascending=False)
len(house) * 0.01  # ~216 houses make up the top 1%
non_top_1_perc = house2[216:]
#non_top_1_perc = house.sort_values('price',ascending=False).iloc[216:]
# now let us plot it again, to get more color distribution
plt.figure(figsize=(12, 8))
sns.scatterplot(x='long', y='lat',
                data=non_top_1_perc, hue='price',
                palette='RdYlGn', edgecolor=None, alpha=0.2)
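# An equivalent, arguably more direct way to drop the top 1% is to filter on
# the 99th percentile of price; 'non_top_1_perc_alt' is just an illustrative name
price_cutoff = house['price'].quantile(0.99)
non_top_1_perc_alt = house[house['price'] <= price_cutoff]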
#%% 138
"""Feature Engineering"""
# now we can do some feature engineering on the data
# let us drop some features which are not very informative in this data set
# we look at the data again
house.head(5)
house = house.drop('id', axis=1)
house['date']
# next, if we look at the date, it is stored as a string ("dtype: object"),
# so we need to convert it using the "to_datetime" function from Pandas
house['date'] = pd.to_datetime(house['date'])
# now the dtype of the date column is "datetime64[ns]"
# this is feature engineering: useful features (year, month) are hidden inside
# the string date, and we extract them to engineer more information from the original data
house['year'] = house['date'].apply(lambda date:date.year)
house['month'] = house['date'].apply(lambda date:date.month)
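# The same extraction can also be done with pandas' vectorized .dt accessor,
# which is usually faster than .apply() on large frames (equivalent result):
# house['year'] = house['date'].dt.year
# house['month'] = house['date'].dt.month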
# we can do some exploratory visualization to see the impact of year and month on the sale price
sns.boxplot(x='year',y='price',data=house)
house.groupby('month').mean()['price'].plot()
house = house.drop('date', axis = 1)
# now if we look at the zipcode column we see it has 70 unique values,
# which is too many categories to encode usefully here
house['zipcode'].value_counts()
# so for this particular example I go ahead and drop this column
house = house.drop('zipcode', axis=1)
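# Instead of dropping zipcode entirely, one alternative (not used here) is to
# bucket zipcodes by their median price and one-hot encode the buckets. A
# minimal sketch, which would have to run *before* the drop above; the bin
# count of 5 is an arbitrary illustrative choice:
# zip_rank = house.groupby('zipcode')['price'].median().rank(pct=True)
# house['zip_bin'] = pd.cut(house['zipcode'].map(zip_rank), bins=5, labels=False)
# house = pd.get_dummies(house, columns=['zip_bin'])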
# keeping 'yr_renovated' as a number could make sense: a higher (more recent)
# year should correlate with more value
# another thing we can do is check whether a column is continuous or categorical
house['yr_renovated'].value_counts()
house['sqft_basement'].value_counts()
#%% 139
"""Scaling and Train Test Split"""
X = house.drop('price',axis=1).values
y = house['price'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=101)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# we can save a step by fitting and transforming in one call; note that the
# scaler is fit only on the training set, to avoid leaking test-set information
X_train= scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train.shape
""" Creating a Model"""
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam
model = Sequential()
# typically we base the number of neurons per layer on the number of
# features in the data (here X_train.shape[1] == 19)
model.add(Dense(19,activation='relu'))
model.add(Dense(19,activation='relu'))
model.add(Dense(19,activation='relu'))
model.add(Dense(19,activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')  # the string 'adam' is shorthand for Adam() with default settings
"""Training the Model"""
model.fit(x=X_train, y=y_train,
          validation_data=(X_test, y_test),
          batch_size=128, epochs=400)
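# With a fixed 400 epochs it is easy to overtrain; a common guard (not used
# above) is Keras's EarlyStopping callback, which halts training once val_loss
# stops improving. A sketch; the patience value is an arbitrary choice:
# from tensorflow.keras.callbacks import EarlyStopping
# early_stop = EarlyStopping(monitor='val_loss', patience=25, restore_best_weights=True)
# model.fit(x=X_train, y=y_train, validation_data=(X_test, y_test),
#           batch_size=128, epochs=400, callbacks=[early_stop])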
#%% 140
losses = pd.DataFrame(model.history.history)
# this data frame has two columns: 'loss' (training loss) and 'val_loss'
# (loss on the validation data, here the test set). Comparing the two shows
# whether the model is overfitting to the training data. Simply plot it:
losses.plot()
#%% 140 - now we can do some evaluation on our test data
"""Evaluation on Test Data"""
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score
predictions = model.predict(X_test)
mean_absolute_error(y_test,predictions)
house['price'].mean()
explained_variance_score(y_test,predictions)
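# RMSE is also worth reporting, since it is in the same units (dollars) as the
# price itself; compare it against the mean price above to judge the error size
rmse = np.sqrt(mean_squared_error(y_test, predictions))
rmse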
# Our predictions
plt.scatter(y_test,predictions)
# Perfect predictions
plt.plot(y_test,y_test,'r')
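#%%
# Finally, a sketch of scoring a single new house: reuse the first row of the
# data as a stand-in "new" house, scale it with the already-fitted scaler
# (19 is the number of features left after our drops), and predict its price
single_house = house.drop('price', axis=1).iloc[0]
single_house = scaler.transform(single_house.values.reshape(-1, 19))
model.predict(single_house)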