# LR_model.py
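## Linear model applied recursively: each round refits every sensor on its neighbours' filled values (reported figure goes from 5.78 after round 1 to 5.56 after round 2)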
import numpy as np
from scipy.stats.mstats import mode
from copy import deepcopy
import pandas as pd
import time
from helper import *
from IDWmodel import *
from sklearn.linear_model import LinearRegression
# ### Building a reference table with the rounded average value of each sensor for every day_time slot
def build_avg_time_table(df_train, col_names=None):
    """Build a table of rounded mean sensor readings, indexed by ``day_time``.

    Side effect: a ``day_time`` column (``time % 10000``) is added to
    ``df_train`` in place.

    Readings equal to ``-1`` are excluded from the means.  The per-column
    tables are combined with left joins onto the first column's table, so
    the result only contains ``day_time`` values for which the first column
    has at least one reading different from ``-1``; the other columns hold
    NaN where they have no such reading.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain a ``time`` column and every column in ``col_names``.
    col_names : sequence of str, optional
        Sensor columns to average.  Defaults to ``'S1'`` .. ``'S56'``.

    Returns
    -------
    pandas.DataFrame
        One row per ``day_time`` value, one column per entry of
        ``col_names``, holding the rounded mean of that column.

    Raises
    ------
    ValueError
        If ``col_names`` is given but empty.
    """
    if col_names is None:
        # `range` works on both Python 2 and Python 3 (`xrange` does not).
        col_names = ['S' + str(i) for i in range(1, 57)]
    else:
        col_names = list(col_names)
    if not col_names:
        raise ValueError('col_names must contain at least one column')

    df_train['day_time'] = df_train['time'] % 10000

    def _rounded_mean_by_day_time(col_name):
        # Drop the -1 readings of this column before averaging.
        valid = df_train.loc[df_train[col_name] != -1, [col_name, 'day_time']]
        return valid.groupby('day_time').mean().apply(pd.Series.round)

    df_day_avg_values = _rounded_mean_by_day_time(col_names[0])
    for col_name in col_names[1:]:
        df_day_avg_values = df_day_avg_values.join(
            _rounded_mean_by_day_time(col_name))
    return df_day_avg_values
def lr_prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model):
    """Fill the ``-1`` readings of each sensor with a linear-regression estimate.

    For every column in ``col_names`` a ``LinearRegression`` is fitted on the
    rows where the column is not ``-1``, using the neighbouring sensors'
    values from ``df_model`` as features, and is then used to predict the
    rows where the column equals ``-1``.  The column is rounded and stored
    in a copy of ``df_train``.  Columns without any ``-1`` are only rounded.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain a ``day_time`` column and every column in ``col_names``.
    col_names : iterable of str
        Sensor columns; the text after the first character is parsed as the
        integer used to index ``adjacency_list`` (e.g. ``'S12'`` -> ``12``).
    df_day_avg_values : pandas.DataFrame
        Table indexed by ``day_time`` with one column per sensor.  It is only
        used to keep the rows whose ``day_time`` appears in its index; the
        averages themselves are not used as features.
    adjacency_list : indexable
        ``adjacency_list[n]`` yields the numbers of the sensors neighbouring
        sensor ``n``.
    df_model : pandas.DataFrame
        Frame providing the neighbour columns; joined to the target on the
        index.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_train`` with every column in ``col_names`` replaced by
        its rounded, filled version.
    """
    # Dataframe to store the model prediction
    df_model_lr = df_train.copy()

    for col in col_names:
        # Only the target and the merge key are needed, so avoid copying the
        # whole training frame for every sensor.
        target = df_train[[col, 'day_time']].rename(columns={col: 'Y'})

        # NOTE(review): this inner merge only filters rows (those whose
        # day_time is absent from the table are dropped and end up as NaN in
        # the result column after index alignment) - confirm this is intended.
        X = pd.merge(target, df_day_avg_values[[col]],
                     left_on='day_time', right_index=True)

        # Building the neighbors (from adjacency list) with missing values
        # filled as in model
        neighbors_col = ['S' + str(n) for n in adjacency_list[int(col[1:])]]
        X = X[['Y']].join(df_model[neighbors_col])

        missing = X['Y'] == -1
        col_values = X['Y']

        if missing.any():
            features = X.drop('Y', axis=1)
            lr = LinearRegression()
            lr.fit(features[~missing], X.loc[~missing, 'Y'])
            # Work on an explicit float copy: predictions are floats, and
            # `.loc` with a boolean mask replaces the removed `.ix` indexer
            # without writing through a view of X.
            col_values = col_values.astype(float)
            col_values.loc[missing] = lr.predict(features[missing])

        # Filling the result with the current sensor prediction
        df_model_lr[col] = np.round(col_values)

    return df_model_lr
### train the model, main code here
if __name__ == "__main__":
    # Script entry point: load the training data, obtain the IDW model,
    # run two rounds of the linear model and write the result to disk.
    # `print(...)` with a single string argument and `range` behave the same
    # on Python 2 and Python 3, unlike the print statement and `xrange`.
    df_train = load_train_data()

    if model_mode == 'full':
        idwmodel_file = 'data-final/IDWmodel-final.csv'
        submit_file = 'models-final/lr_model-final.csv'
    else:
        idwmodel_file = 'data/IDWmodel_train.csv'
        submit_file = 'models/lr_model.csv'

    # check if IDW model already exists, if not train it!
    if not file_exists(idwmodel_file):
        print('building IDW model first...')
        df_IDWmodel = build_IDWmodel()
    else:
        print('loading IDW model...')
        df_IDWmodel = pd.read_csv(idwmodel_file)

    print('computing time features...')
    # Also adds the `day_time` column to df_train, which lr_prediction needs.
    df_day_avg_values = build_avg_time_table(df_train)

    print('computing adj list...')
    adjacency_list = compute_adjlist(27.)

    col_names = ['S' + str(i) for i in range(1, 57)]

    print('running linear model, round #1 ...')
    df_model_lr = lr_prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_IDWmodel)  # 5.78

    # Round 2 uses the round-1 output as the source of neighbour values.
    print('running linear model, round #2 ...')
    df_model_lr = lr_prediction(df_train, col_names, df_day_avg_values, adjacency_list, df_model_lr)  # 5.56

    print('writing to file...')
    create_submission_file(df_model_lr, submit_file)
    print('done!')