/
house_prices.py
195 lines (155 loc) · 6.59 KB
/
house_prices.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from mxnet import ndarray as nd
from mxnet import autograd
from mxnet import gluon
# s1 = pd.Series(['a', 'b'])
# s2 = pd.Series(['c', 'd'])
# print s1
# print s2
# print pd.concat(([s1, s2]))
# print pd.concat([s1, s2])
# Toggle between k-fold cross-validation (True) and final training/prediction (False).
DEBUG = False

# Sanity-check a previously written submission for non-positive predicted prices.
if DEBUG:
    debug_test_data = pd.read_csv("submission.csv")
    debug_test = debug_test_data.loc[:, 'SalePrice']
    for i, price in enumerate(debug_test, start=1):
        if price <= 0:
            print(price, i)

train = pd.read_csv("data/kaggle_house_pred_train.csv")
test = pd.read_csv("data/kaggle_house_pred_test.csv")

# Stack train and test features so that standardization and one-hot encoding
# are applied consistently to both.  'MSSubClass':'SaleCondition' selects every
# feature column, excluding Id and the SalePrice label.
all_X = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                   test.loc[:, 'MSSubClass':'SaleCondition']))

# Standardize every numeric column to zero mean and unit variance.
numeric_feats = all_X.dtypes[all_X.dtypes != "object"].index
all_X[numeric_feats] = all_X[numeric_feats].apply(lambda x: (x - x.mean()) / x.std())

# One-hot encode categorical columns; dummy_na=True adds an extra indicator
# column per feature for missing values.
all_X = pd.get_dummies(all_X, dummy_na=True)

# Remaining NaNs (in the numeric columns) are filled with the column mean.
all_X = all_X.fillna(all_X.mean())

num_train = train.shape[0]
# .values replaces the deprecated DataFrame.as_matrix() (removed in pandas 1.0).
x_train = all_X[:num_train].values
x_test = all_X[num_train:].values
y_train = train.SalePrice.values

# Convert to MXNet NDArrays.  NDArray.reshape returns a NEW array (it is not
# in-place), so the result must be re-assigned: the label vector becomes an
# (n, 1) column matching the network's output shape.
X_train = nd.array(x_train)
X_test = nd.array(x_test)
Y_train = nd.array(y_train).reshape((num_train, 1))

square_loss = gluon.loss.L2Loss()
def get_srme_log(net, X_train, Y_train):
    """Root-mean-squared error between log-predictions and log-labels.

    Predictions are clipped to [1, +inf) first so the logarithm is
    always defined.
    """
    preds = nd.clip(net(X_train), 1, float('inf'))
    loss_sum = nd.sum(square_loss(nd.log(preds), nd.log(Y_train))).asscalar()
    # L2Loss halves the squared error, hence the compensating factor of 2.
    return np.sqrt(2 * loss_sum / X_train.shape[0])
def direct_srme_log(output, label):
    """Log-RMSE computed directly from raw network outputs and labels."""
    safe_output = nd.clip(output, 1, float('inf'))  # guard nd.log against values < 1
    total = nd.sum(square_loss(nd.log(safe_output), nd.log(label))).asscalar()
    # Factor of 2 compensates for the 1/2 inside gluon's L2Loss.
    return np.sqrt(2 * total / output.shape[0])
def get_net():
    """Build and initialize the regression MLP.

    Architecture: one 8192-unit ReLU layer, 50% dropout, then a single
    linear output unit (the predicted sale price).
    """
    net = gluon.nn.Sequential()
    with net.name_scope():
        net.add(gluon.nn.Dense(8192, activation='relu'))
        net.add(gluon.nn.Dropout(0.5))
        net.add(gluon.nn.Dense(1))
    net.initialize()
    return net
import matplotlib as mpl
mpl.rcParams['figure.dpi']= 120
import matplotlib.pyplot as plt
def train(net, X_train, Y_train, X_test, Y_test, epochs,
          verbose_epoch, learning_rate, weight_decay):
    """Train `net` with RMSProp and plot the per-epoch log-RMSE curve.

    Parameters
    ----------
    net : gluon.nn.Block
        Network to train; its parameters are force-reinitialized first,
        so repeated calls (e.g. during k-fold CV) start from fresh weights.
    X_train, Y_train : NDArray
        Training features and labels.
    X_test, Y_test : NDArray or None
        Optional held-out set; pass None for both to skip evaluation.
    epochs : int
        Number of passes over the training data.
    verbose_epoch : int
        Epochs after this index print the current training loss.
    learning_rate, weight_decay : float
        RMSProp step size and L2 regularization strength ('wd').

    Returns
    -------
    (train_loss, test_loss) when a test set is given, else train_loss,
    where the losses are the log-RMSE of the final epoch.
    """
    train_loss = []
    if X_test is not None:
        test_loss = []
    batch_size = 100
    dataset_train = gluon.data.ArrayDataset(X_train, Y_train)
    data_iter_train = gluon.data.DataLoader(dataset_train, batch_size, shuffle=True)
    trainer = gluon.Trainer(net.collect_params(), 'RMSProp',
                            {'learning_rate': learning_rate, 'wd': weight_decay})
    net.collect_params().initialize(force_reinit=True)
    for e in range(epochs):
        for data, label in data_iter_train:
            with autograd.record():
                output = net(data)
                loss = square_loss(output, label)
            loss.backward()
            trainer.step(batch_size)
        cur_train_loss = get_srme_log(net, X_train, Y_train)
        if e > verbose_epoch:
            # print() (was a Python 2 print statement); also fixes the
            # "epoach" typo in the log message.
            print('epoch %d, current loss: %f' % (e, cur_train_loss))
        train_loss.append(cur_train_loss)
        if X_test is not None:
            cur_test_loss = get_srme_log(net, X_test, Y_test)
            test_loss.append(cur_test_loss)
    plt.plot(train_loss)
    plt.legend(['train'])
    if X_test is not None:
        plt.plot(test_loss)
        plt.legend(['train', 'test'])
    plt.show()
    if X_test is not None:
        return cur_train_loss, cur_test_loss
    return cur_train_loss
def k_fold_cross_valid(k, epochs, verbose_epoch, X_train, y_train,
                       learning_rate, weight_decay):
    """Run k-fold cross-validation; return (avg_train_loss, avg_test_loss).

    The first k*fold_size rows are split into k contiguous folds; each fold
    in turn serves as the validation set while the remaining folds are
    concatenated into the training set.  A fresh network is created for
    every fold.
    """
    assert k > 1
    fold_size = X_train.shape[0] // k
    train_loss_sum = 0.0
    test_loss_sum = 0.0
    for test_i in range(k):
        # Validation slice for this round.
        x_val_test = X_train[test_i * fold_size: (test_i + 1) * fold_size, :]
        y_val_test = y_train[test_i * fold_size: (test_i + 1) * fold_size]
        # Concatenate every other fold into the training set (None sentinel
        # replaces the original boolean flag).
        x_val_train = None
        y_val_train = None
        for j in range(k):
            if j == test_i:
                continue
            x_cur_fold = X_train[j * fold_size:(j + 1) * fold_size, :]
            y_cur_fold = y_train[j * fold_size:(j + 1) * fold_size]
            if x_val_train is None:
                x_val_train = x_cur_fold
                y_val_train = y_cur_fold
            else:
                x_val_train = nd.concat(x_val_train, x_cur_fold, dim=0)
                y_val_train = nd.concat(y_val_train, y_cur_fold, dim=0)
        net = get_net()
        train_loss, test_loss = train(
            net, x_val_train, y_val_train, x_val_test, y_val_test,
            epochs, verbose_epoch, learning_rate, weight_decay)
        # print() calls replace the Python-2-only print statements.
        print('after train')
        train_loss_sum += train_loss
        print("Test loss: %f" % test_loss)
        test_loss_sum += test_loss
    return train_loss_sum / k, test_loss_sum / k
################################################################################
# Hyperparameters.
k = 3  # number of cross-validation folds
epochs = 50  # training epochs per run
verbose_epoch = epochs -10  # print training loss only for the final 10 epochs
learning_rate = 0.0185  # RMSProp step size
weight_decay = 480  # L2 regularization strength (the trainer's 'wd')
# Prediction: train on the full training set, then write submission.csv.
def learn(epochs, verbose_epoch, X_train, y_train, test, learning_rate,
          weight_decay):
    """Train a fresh net on all training data and write test predictions.

    NOTE(review): predictions are computed from the module-level NDArray
    X_test, not from the `test` DataFrame parameter — `test` is used only
    for its 'Id' column and to hold the new 'SalePrice' column (it is
    mutated in place).  Verify the two stay in sync with the preprocessing
    at the top of the file.
    """
    net = get_net()
    train(net, X_train, y_train, None, None, epochs, verbose_epoch,
          learning_rate, weight_decay)
    preds = net(X_test).asnumpy()
    test['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test['Id'], test['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
# Entry point: cross-validate when DEBUG is set, otherwise train on the full
# data and write submission.csv.
if DEBUG:
    train_loss, test_loss = k_fold_cross_valid(k, epochs, verbose_epoch, X_train,
                                               Y_train, learning_rate, weight_decay)
    # print() replaces the Python-2-only print statement.
    print("%d-fold validation: Avg train loss: %f, Avg test loss: %f"
          % (k, train_loss, test_loss))
else:
    learn(epochs, verbose_epoch, X_train, Y_train, test, learning_rate, weight_decay)