-
Notifications
You must be signed in to change notification settings - Fork 0
/
ml_cms_heights.py
357 lines (303 loc) · 16 KB
/
ml_cms_heights.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
import cPickle
import os
import pandas as pd
import pdb
import numpy as np
import logging
import sys
import calendar
import matplotlib.pyplot as plt
from math import ceil
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.learning_curve import learning_curve
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import normalize
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_predict
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from ConfigParser import SafeConfigParser
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import train_test_split
import constants
import compute_stats
import rgeo
import utils
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Logging
cur_flname = os.path.splitext(os.path.basename(__file__))[0]
LOG_FILENAME = constants.log_dir + os.sep + 'Log_' + cur_flname + '.txt'
logging.basicConfig(filename=LOG_FILENAME, level=logging.INFO, filemode='w',
format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s',
datefmt="%m-%d %H:%M") # Logging levels are DEBUG, INFO, WARNING, ERROR, and CRITICAL
# Output to screen
logger = logging.getLogger(cur_flname)
if not logger.handlers:
logger.addHandler(logging.StreamHandler())
# loop_countries
# ; Loop over countries x crops
# train_ml_model
# create_train_df
# compute_ml_vars
# ; create ml model
# loop_forecasting
# create_forecast_df
# do_forecasting
class MLCms:
"""
"""
def __init__(self, config_file=''):
# Parse config file
self.parser = SafeConfigParser()
self.parser.read(config_file)
# machine learning specific variables
self.classify = constants.DO_CLASSIFICATION # Regress or classify?
self.vars_features = constants.fixed_vars
self.vars_target = constants.ML_TARGETS
if self.classify:
self.var_target = constants.ML_TARGETS
self.task = 'classification'
self.model = RandomForestClassifier(n_estimators=2500, n_jobs=constants.ncpu, random_state=0)
else:
self.var_target = constants.ML_TARGETS
self.task = 'regression'
self.model = RandomForestRegressor(n_estimators=2500, n_jobs=constants.ncpu, random_state=0) # SVR()
# Get path to input
self.path_inp = constants.base_dir + os.sep + constants.name_inp_fl
# Output directory is <dir>_<classification>_<2014>
self.path_out_dir = constants.out_dir
utils.make_dir_if_missing(self.path_out_dir)
# Model pickle
self.path_pickle_model = self.path_out_dir + os.sep + constants.model_pickle
self.path_pickle_features = self.path_out_dir + os.sep + 'pickled_features'
def output_model_importance(self, gs, name_gs, num_cols):
"""
:param gs:
:param name_gs:
:param num_cols:
:return:
"""
rows_list = []
name_vars = []
feature_importance = gs.best_estimator_.named_steps[name_gs].feature_importances_
importances = 100.0 * (feature_importance / feature_importance.max())
std = np.std([tree.feature_importances_ for tree in self.model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
# Store feature ranking in a dataframe
for f in range(num_cols):
dict_results = {'Variable': self.vars_features[indices[f]], 'Importance': importances[indices[f]]}
name_vars.append(self.vars_features[indices[f]])
rows_list.append(dict_results)
df_results = pd.DataFrame(rows_list)
num_cols = 10 if len(indices) > 10 else len(indices) # Plot upto a maximum of 10 features
plot.plot_model_importance(num_bars=num_cols, xvals=importances[indices][:num_cols],
std=std[indices][:num_cols], fname=self.task + '_importance_' + self.crop,
title='Importance of variable (' + self.country + ' ' + self.crop_lname + ')',
xlabel=name_vars[:num_cols], out_path=self.path_out_dir)
df_results.to_csv(self.path_out_dir + os.sep + self.task + '_importance_' + self.crop + '.csv')
def get_data(self):
"""
:return:
"""
df = pd.read_csv(self.path_inp)
cols = [col for col in df.columns if col not in self.vars_features]
# cols.extend(['DI', 'PI'])
# Add information on PI and DI of soils
# iterate over each row, get lat and lon
# Find corresponding DI and PI
lat_lons = zip(df['Long_round'], df['Lat_round'])
vals_di = []
vals_pi = []
# for idx, (lon, lat) in enumerate(lat_lons):
# print idx, len(lat_lons)
# vals_pi.append(rgeo.get_value_at_point('C:\\Users\\ritvik\\Documents\\PhD\\Projects\\CMS\\Input\\Soils\\PI.tif',
# lon, lat, replace_ras=False))
# vals_di.append(rgeo.get_value_at_point('C:\\Users\\ritvik\\Documents\\PhD\\Projects\\CMS\\Input\\Soils\\DI.tif',
# lon, lat, replace_ras=False))
#
# df['DI'] = vals_di
# df['PI'] = vals_pi
df = df[cols]
data = df.as_matrix(columns=cols[1:])
target = df.as_matrix(columns=[self.var_target]).ravel()
# Get training and testing splits
splits = train_test_split(data, target, test_size=0.2)
return cols, splits
def train_ml_model(self):
"""
:return:
"""
logger.info('#########################################################################')
logger.info('train_ml_model')
logger.info('#########################################################################')
######################################################
# Load dataset
######################################################
cols, splits = self.get_data()
data_train, data_test, target_train, target_test = splits
# clf = ExtraTreesRegressor(500, n_jobs=constants.ncpu)
# #clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
# #clf = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10], n_classes=3)
# data = df_train.as_matrix(columns=cols[1:]) # convert dataframe column to matrix
# #data = preprocessing.scale(data)
# target = df_train.as_matrix(columns=[self.var_target]).ravel() # convert dataframe column to matrix
# clf.fit(data, target)
#
# predict_val = clf.predict(after.as_matrix(columns=cols[1:]))
# results = compute_stats.ols(predict_val.tolist(), after_target.tolist())
# print results.rsquared
# import matplotlib.pyplot as plt
# plt.scatter(after_target, predict_val)
# plt.show()
# pdb.set_trace()
if not os.path.isfile(self.path_pickle_model):
# For details in scikit workflow: See http://stackoverflow.com/questions/
# 35256876/ensuring-right-order-of-operations-in-random-forest-classification-in-scikit-lea
# TODO Separate out a dataset so that even the grid search cv can be tested
############################
# Select features from model
############################
logger.info('Selecting important features from model')
if self.classify:
rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
else:
rf_feature_imp = ExtraTreesRegressor(150, n_jobs=constants.ncpu)
feat_selection = SelectFromModel(rf_feature_imp)
pipeline = Pipeline([
('fs', feat_selection),
('clf', self.model),
])
#################################
# Grid search for best parameters
#################################
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
logger.info('Tuning hyperparameters')
param_grid = {
'fs__threshold': ['mean', 'median'],
'fs__estimator__max_features': ['auto', 'log2'],
'clf__max_features': ['auto', 'log2'],
'clf__n_estimators': [1000, 2000]
#'clf__gamma': np.logspace(-9, 3, 13),
#'clf__C': np.logspace(-2, 10, 13)
}
gs = GridSearchCV(pipeline, param_grid=param_grid, verbose=2, n_jobs=constants.ncpu, error_score=np.nan)
# Fir the data before getting the best parameter combination. Different data sets will have
# different optimized parameter combinations, i.e. without data, there is no optimal parameter combination.
gs.fit(data_train, target_train)
logger.info(gs.best_params_)
data_test = pd.DataFrame(data_test, columns=cols[1:])
# Update features that should be used in model
selected_features = gs.best_estimator_.named_steps['fs'].transform([cols[1:]])
cols = selected_features[0]
data_test = data_test[cols]
# Update model with the best parameters learnt in the previous step
self.model = gs.best_estimator_.named_steps['clf']
predict_val = self.model.predict(data_test)
results = compute_stats.ols(predict_val.tolist(), target_test.tolist())
print results.rsquared
print cols
plt.scatter(target_test, predict_val)
plt.show()
pdb.set_trace()
###################################################################
# Output and plot importance of model features, and learning curves
###################################################################
self.output_model_importance(gs, 'clf', num_cols=len(cols[1:]))
if constants.plot_model_importance:
train_sizes, train_scores, test_scores = learning_curve(self.model, data, target, cv=k_fold,
n_jobs=constants.ncpu)
plot.plot_learning_curve(train_scores, test_scores, train_sizes=train_sizes, fname='learning_curve',
ylim=(0.0, 1.01), title='Learning curves', out_path=self.path_out_dir)
# Save the model to disk
logger.info('Saving model and features as pickle on disk')
with open(self.path_pickle_model, 'wb') as f:
cPickle.dump(self.model, f)
with open(self.path_pickle_features, 'wb') as f:
cPickle.dump(self.vars_features, f)
else:
# Read model from pickle on disk
with open(self.path_pickle_model, 'rb') as f:
logger.info('Reading model from pickle on disk')
self.model = cPickle.load(f)
logger.info('Reading features from pickle on disk')
self.vars_features = pd.read_pickle(self.path_pickle_features)
return df_cc
def do_forecasting(self, df_forecast, mon_names, available_target=False, name_target='yield'):
"""
1. Does classification/regression based on already built model.
2. Plots confusion matrix for classification tasks, scatter plot for regression
3. Plots accuracy statistics for classification/regression
:param df_forecast:
:param mon_names:
:param available_target: Is target array available?
:param name_target: Name of target array (defaults to yield)
:return:
"""
data = df_forecast.as_matrix(columns=self.vars_features) # convert dataframe column to matrix
predicted = self.model.predict(data)
if available_target:
expected = df_forecast.as_matrix(columns=[name_target]).ravel()
if not self.classify: # REGRESSION
# Compute stats
results = compute_stats.ols(predicted.tolist(), expected.tolist())
bias = compute_stats.bias(predicted, expected)
rmse = compute_stats.rmse(predicted, expected)
mae = compute_stats.mae(predicted, expected)
# Plot!
plot.plot_regression_scatter(expected, np.asarray(predicted),
annotate=r'$r^{2}$ ' + '{:0.2f}'.format(results.rsquared) + '\n' +
'peak NDVI date: ' + self.time_peak_ndvi.strftime('%b %d'),
xlabel='Expected yield',
ylabel='Predicted yield',
title=mon_names + ' ' + str(int(df_forecast[self.season].unique()[0])),
fname=self.task + '_' + '_'.join([mon_names]) + '_' + self.crop,
out_path=self.path_out_dir)
# global expected vs predicted
if self.debug:
# any non-existing index will add row
self.df_global.loc[len(self.df_global)] = [np.nanmean(expected), np.nanmean(predicted), mon_names,
self.forecast_yr]
return predicted, {'RMSE': rmse, 'MAE': mae, r'$r^{2}$': results.rsquared, 'Bias': bias}
else: # CLASSIFICATION
# Convert from crop condition class (e.g. 4) to string (e.g. exceptional)
expected, predicted = compute_stats.remove_nans(expected, predicted)
cm = confusion_matrix(expected, predicted, labels=self.dict_cc.keys()).T
# Compute and plot class probabilities
proba_cc = self.model.predict_proba(data)
df_proba = pd.DataFrame(proba_cc, columns=self.dict_cc.values())
plot.plot_class_probabilities(df_proba, fname='proba_' + '_'.join([mon_names]) + '_' + self.crop,
out_path=self.path_out_dir)
# Plot confusion matrix
plot.plot_confusion_matrix(cm, normalized=False, fname='cm_' + '_'.join([mon_names]) + '_' + self.crop,
xlabel='True class', ylabel='Predicted class', ticks=self.dict_cc.values(),
out_path=self.path_out_dir)
# Normalize and plot confusion matrix
cm_normalized = normalize(cm.astype(float), axis=1, norm='l1')
plot.plot_confusion_matrix(cm_normalized, fname='norm_cm_' + '_'.join([mon_names]) + '_' + self.crop,
xlabel='True class', ylabel='Predicted class', normalized=True,
ticks=self.dict_cc.values(), out_path=self.path_out_dir)
score_accuracy = accuracy_score(expected, predicted) * 100.0
score_precision = precision_score(expected, predicted, average='weighted') * 100.0
return predicted, {'Accuracy': score_accuracy, 'Precision': score_precision}
else:
return predicted, {'RMSE': np.nan, 'MAE': np.nan, r'$r^{2}$': np.nan, 'Bias': np.nan,
'Nash-Sutcliff': np.nan}
def do_ml_model():
obj = MLCms(config_file='config_CMS.txt')
obj.train_ml_model()
if __name__ == '__main__':
do_ml_model()