def val(self, val_chunk):
    """Score the fitted model on a validation chunk.

    Args:
        val_chunk: DataFrame-like chunk whose ``'y'`` column holds the
            target; all remaining columns are handed to ``self.predict``.

    Returns:
        The value of ``cal_r`` computed from the true and predicted targets.
    """
    Y = val_chunk['y'].values
    # ``axis`` must be passed by keyword: the positional form
    # ``drop('y', 1)`` was deprecated in pandas 1.3 and raises TypeError
    # starting with pandas 2.0.
    X = val_chunk.drop('y', axis=1)
    Y_pred = self.predict(X)
    return cal_r(Y, Y_pred)
def fit(self, tr_chunk):
    """Fit the preprocessing pipeline and the Gaussian-process regressor.

    The steps run in this order, each fitted on the training chunk only:
    median imputation, standard scaling, k-best feature selection
    (``mutual_info_regression`` with ``k=self.kbest_k``) and finally a
    ``GaussianProcessRegressor``. The fitted objects are stored on
    ``self.imputer``, ``self.normalization``, ``self.kbest`` and
    ``self.model``.

    Args:
        tr_chunk: DataFrame-like training chunk containing the columns
            ``'id'``, ``'y'`` and ``'timestamp'`` plus the feature columns.

    Returns:
        The value of ``cal_r`` for the training targets against the
        model's predictions on the same training data.
    """
    # 1. Removal of observations whose y is a saturated value is currently
    #    disabled; the original filter is kept here for reference:
    # tr_chunk = tr_chunk.loc[~tr_chunk['y'].isin(y_saturated_values)]

    # ``axis`` must be passed by keyword: the positional form
    # ``drop(labels, 1)`` was deprecated in pandas 1.3 and raises TypeError
    # starting with pandas 2.0.
    tr_X = tr_chunk.drop(['id', 'y', 'timestamp'], axis=1)
    tr_Y = tr_chunk['y'].values

    # 2. Replace NaN with the per-column median.
    #    (The original note suggested restricting this to
    #    SUPER_VALUES_selected_feature_ids to speed processing up; that
    #    restriction is not applied here.)
    self.imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    self.imputer.fit(tr_X)
    tr_X = self.imputer.transform(tr_X)

    # 3. Normalization.
    self.normalization = preprocessing.StandardScaler().fit(tr_X)
    tr_X = self.normalization.transform(tr_X)

    # 4. Keep only the k best features to limit over-fitting.
    self.kbest = SelectKBest(mutual_info_regression, k=self.kbest_k)
    self.kbest.fit(tr_X, tr_Y)
    tr_X = self.kbest.transform(tr_X)

    # 5. Apply regression and score it on the data it was trained on.
    self.model = GaussianProcessRegressor(alpha=self.alpha,
                                          random_state=self.seed)
    self.model.fit(tr_X, tr_Y)
    tr_Y_pred = self.model.predict(tr_X)
    return cal_r(tr_Y, tr_Y_pred)
# Scale the imputed training and validation features with the scaler that
# was fitted earlier in the script.
tr_X_norm = normalization.transform(tr_X_imputed)
val_X_norm = normalization.transform(val_X_imputed)

# alpha is handed to MLPRegressor below; the library default is 1e-4.
alpha = 1e-4
for hidden_layer_sizes in range(100, 1000, 100):
    print('hidden_layer_sizes', hidden_layer_sizes, end='-->')
    # activation : {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'
    #
    # The swept size is passed to the model explicitly; previously it was
    # only printed and every iteration trained the default architecture.
    # The first swept value (100) equals the default ``(100,)``, so the
    # single iteration that currently runs is unchanged.
    model = MLPRegressor(hidden_layer_sizes=(hidden_layer_sizes,),
                         solver='lbfgs',
                         alpha=alpha)
    t0 = time.time()
    model.fit(tr_X_norm, tr_Y)
    t1 = time.time()
    tr_Y_pred = model.predict(tr_X_norm)
    t2 = time.time()
    tr_r = utils.cal_r(tr_Y, tr_Y_pred)
    print('tr:', tr_r, end=',')
    val_Y_pred = model.predict(val_X_norm)
    val_r = utils.cal_r(val_Y, val_Y_pred)
    print('val:', val_r)
    #print('cost: traning',t1-t0,',prediction:',t2-t1)
    alpha /= 10
    # Unconditional break: only the first configuration is evaluated.
    break
def fit(self, tr_chunk, ts_chunk):
    """Train one SVR per K-fold split and keep the acceptable ones.

    The training chunk is median-imputed and standardized (both fitted on
    the training chunk and then applied to the test chunk). For each of
    the ``self.kf_k`` shuffled folds an SVR configured from
    ``self.model_config`` (``kernel``, ``C``, ``gamma``, ``tol``) is
    trained; its train, validation and test scores from ``utils.cal_r``
    are printed. Fold models whose validation score is greater than -1
    are stored in ``self.models``. Finally the averaged prediction of the
    stored models is scored on the full training chunk and the test chunk.

    Args:
        tr_chunk: DataFrame-like training chunk with ``'id'`` and ``'y'``
            columns plus feature columns.
        ts_chunk: DataFrame-like test chunk with the same layout.

    Returns:
        ``self``, with ``imputer``, ``normalization`` and ``models`` set.
    """
    model_Y = tr_chunk['y'].values
    # ``axis`` must be passed by keyword: the positional form
    # ``drop(labels, 1)`` was deprecated in pandas 1.3 and raises TypeError
    # starting with pandas 2.0.
    model_X = tr_chunk.drop(['id', 'y'], axis=1)
    test_Y = ts_chunk['y'].values
    test_X = ts_chunk.drop(['id', 'y'], axis=1)

    # 2. Replace NaN with the per-column median of the training chunk.
    self.imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    self.imputer.fit(model_X)
    model_X_imputed = self.imputer.transform(model_X)
    test_X_imputed = self.imputer.transform(test_X)

    # 3. Normalization, fitted on the training chunk only.
    self.normalization = preprocessing.StandardScaler().fit(
        model_X_imputed)
    model_X_norm = self.normalization.transform(model_X_imputed)
    test_X_norm = self.normalization.transform(test_X_imputed)

    # Apply regression: one model per fold.
    X = model_X_norm
    Y = model_Y
    skf = KFold(n_splits=self.kf_k, shuffle=True, random_state=self.seed)
    self.models = []
    for kf_i, (tr_idx, val_idx) in enumerate(skf.split(X)):
        print('KF', kf_i, end=',')
        tr_X, tr_Y = X[tr_idx, :], Y[tr_idx]
        val_X, val_Y = X[val_idx, :], Y[val_idx]

        model = SVR(kernel=self.model_config['kernel'],
                    C=self.model_config['C'],
                    gamma=self.model_config['gamma'],
                    tol=self.model_config['tol'])
        model.fit(tr_X, tr_Y)

        tr_r = utils.cal_r(tr_Y, model.predict(tr_X))
        print('tr:', tr_r, end=',')
        val_r = utils.cal_r(val_Y, model.predict(val_X))
        print('val:', val_r, end=',')
        test_r = utils.cal_r(test_Y, model.predict(test_X_norm))
        print('test:', test_r, end='')

        # Discard fold models whose validation R is too low.
        if val_r > -1:
            self.models.append(model)
            print(' (SAVED)')
        else:
            print(' (DISCARD)')

    # Every fold may have been discarded; averaging would then divide by
    # zero and score an all-NaN prediction, so report it instead.
    if self.models:
        tr_Y_sum = np.zeros(Y.shape[0])
        test_Y_sum = np.zeros(test_Y.shape[0])
        for model in self.models:
            tr_Y_sum += model.predict(X)
            test_Y_sum += model.predict(test_X_norm)
        tr_Y_pred = tr_Y_sum / len(self.models)
        test_Y_pred = test_Y_sum / len(self.models)
        tr_r = utils.cal_r(Y, tr_Y_pred)
        print('AVERAGE tr:', tr_r, end=',')
        test_r = utils.cal_r(test_Y, test_Y_pred)
        print('test:', test_r)
    else:
        print('WARNING: no model found.')
    return self
# Read the test chunk stored under 'output/test_data'.
test_chunk = utils.read_variable('output/test_data')
# Target column of the training chunk.
tr_Y = tr_chunk['y']
#%%
''' Linear Regression '''
# Fit ChunkLinearRegression with kf_k=5 and seed=13, passing both the
# training and the test chunk to fit(), then score its predictions on the
# training chunk itself.
model_linear = ChunkLinearRegression(kf_k=5, seed=13)
model_linear.fit(tr_chunk, test_chunk)
tr_Y_pred = model_linear.predict(tr_chunk)
print('r:',utils.cal_r(tr_chunk['y'],tr_Y_pred))
# The string below is recorded console output, kept for reference. In it
# the first two folds were discarded and the averaged test score is
# strongly negative.
''' KF 0,tr: 0.136013944038,val: -607.864299346,test: -51.432700711 (DISCARD) KF 1,tr: 0.126612607023,val: -42707538498.9,test: -245495431.615 (DISCARD) KF 2,tr: 0.140904353882,val: -0.37319746263,test: -9.83011647629 (SAVED) KF 3,tr: 0.132568523411,val: -0.625998947552,test: -43.6620637906 (SAVED) KF 4,tr: 0.126012579869,val: -0.266122719199,test: -48.915686375 (SAVED) AVERAGE tr: 0.0448775523981,test: -27.5904174498 r: 0.0448775523981 '''
#%%
''' Tree Regression '''
def fit(self, tr_chunk, ts_chunk):
    """Train one Gaussian-process regressor per K-fold split.

    Only the columns listed in ``self.selected_feature_ids`` are used.
    They are median-imputed and standardized (both fitted on the training
    chunk and then applied to the test chunk). For each of the
    ``self.kf_k`` shuffled folds a ``GaussianProcessRegressor`` configured
    from ``self.model_config`` (``alpha``, ``random_state``) is trained and
    its train, validation and test scores from ``utils.cal_r`` are
    printed. Fold models whose validation score is greater than 0 are
    stored in ``self.models``; if any were stored, their averaged
    prediction is scored on the full training chunk and the test chunk.

    Args:
        tr_chunk: DataFrame-like training chunk with ``'id'`` and ``'y'``
            columns plus feature columns.
        ts_chunk: DataFrame-like test chunk with the same layout.

    Returns:
        ``self``, with ``imputer``, ``normalization`` and ``models`` set.
    """
    model_Y = tr_chunk['y'].values
    # ``axis`` must be passed by keyword: the positional form
    # ``drop(labels, 1)`` was deprecated in pandas 1.3 and raises TypeError
    # starting with pandas 2.0.
    model_X = tr_chunk.drop(['id', 'y'], axis=1)
    test_Y = ts_chunk['y'].values
    test_X = ts_chunk.drop(['id', 'y'], axis=1)

    # 2. Replace NaN with the per-column median of the selected features.
    X = model_X[self.selected_feature_ids]
    self.imputer = Imputer(missing_values='NaN', strategy='median', axis=0)
    self.imputer.fit(X)
    model_X_imputed = self.imputer.transform(X)
    test_X_imputed = self.imputer.transform(
        test_X[self.selected_feature_ids])

    # 3. Normalization, fitted on the training chunk only.
    self.normalization = preprocessing.StandardScaler().fit(
        model_X_imputed)
    model_X_norm = self.normalization.transform(model_X_imputed)
    test_X_norm = self.normalization.transform(test_X_imputed)

    # 4. k-best step for the over-fitting problem: no selection is applied
    #    here, the normalized features are passed through unchanged.
    tr_X_kbest = model_X_norm
    test_X_kbest = test_X_norm

    # Apply regression: one model per fold.
    X = tr_X_kbest
    Y = model_Y
    skf = KFold(n_splits=self.kf_k, shuffle=True, random_state=self.seed)
    self.models = []
    for kf_i, (tr_idx, val_idx) in enumerate(skf.split(X)):
        print('KF', kf_i, end=',')
        tr_X, tr_Y = X[tr_idx, :], Y[tr_idx]
        val_X, val_Y = X[val_idx, :], Y[val_idx]

        model = GaussianProcessRegressor(
            alpha=self.model_config['alpha'],
            random_state=self.model_config['random_state'])
        model.fit(tr_X, tr_Y)

        tr_r = utils.cal_r(tr_Y, model.predict(tr_X))
        print('tr:', tr_r, end=',')
        val_r = utils.cal_r(val_Y, model.predict(val_X))
        print('val:', val_r, end=',')
        test_r = utils.cal_r(test_Y, model.predict(test_X_kbest))
        print('test:', test_r, end='')

        # Discard fold models whose validation R is too low.
        if val_r > 0:
            self.models.append(model)
            print(' (SAVED)')
        else:
            print(' (DISCARD)')

    # Average the retained fold models; skipped when none were retained.
    if self.models:
        tr_Y_sum = np.zeros(Y.shape[0])
        test_Y_sum = np.zeros(test_Y.shape[0])
        for model in self.models:
            tr_Y_sum += model.predict(X)
            test_Y_sum += model.predict(test_X_kbest)
        tr_Y_pred = tr_Y_sum / len(self.models)
        test_Y_pred = test_Y_sum / len(self.models)
        tr_r = utils.cal_r(Y, tr_Y_pred)
        print('AVERAGE tr:', tr_r, end=',')
        test_r = utils.cal_r(test_Y, test_Y_pred)
        print('test:', test_r)
    else:
        print('WARNING: no model found.')
    return self
# NOTE(review): this fragment reads kf_tr_chunk, tr_chunk, val_idx,
# lag_models, SUPER_alpha, SUPER_seed and best_model_2L_r, none of which are
# bound in the visible code — presumably it is the body of a K-fold loop;
# confirm against the enclosing code.
''' training '''
# Build the second-layer feature matrix: one row per training observation,
# one column per first-layer model, filled with that model's predictions.
lag_X = np.zeros([len(kf_tr_chunk.y), len(lag_models)])
for lag_model_i, lag_model in enumerate(lag_models):
    y_pred = lag_model.predict(kf_tr_chunk)
    lag_X[:, lag_model_i] = y_pred
    #print('-->',lag_model_i,utils.cal_r(kf_tr_chunk.y,y_pred))
# Fit the second-layer regressor on the first-layer predictions and score it
# on the same training fold.
model_2L = GaussianProcessRegressor(alpha=SUPER_alpha, random_state=SUPER_seed)
model_2L.fit(lag_X, kf_tr_chunk.y)
tr_Y_pred = model_2L.predict(lag_X)
tr_r = utils.cal_r(kf_tr_chunk.y, tr_Y_pred)
print('tr:', tr_r, end=',')
''' Val '''
# Rebuild the same feature matrix for the validation rows (lag_X is reused)
# and score the second-layer model on them.
kf_val_chunk = tr_chunk.iloc[val_idx]
lag_X = np.zeros([len(kf_val_chunk.y), len(lag_models)])
for lag_model_i, lag_model in enumerate(lag_models):
    y_pred = lag_model.predict(kf_val_chunk)
    lag_X[:, lag_model_i] = y_pred
val_Y_pred = model_2L.predict(lag_X)
val_r = utils.cal_r(kf_val_chunk.y, val_Y_pred)
print('val:', val_r, end='')
# Keep the model with the best validation score seen so far.
# NOTE(review): no update of best_model_2L_r is visible in this excerpt —
# confirm it is assigned right after this line, otherwise every model that
# beats the initial threshold replaces the previous best.
if val_r > best_model_2L_r:
    best_model_2L = model_2L
imputer.fit(X) model_X_imputed = imputer.transform(X) test_X_imputed = imputer.transform(test_X) normalization = preprocessing.StandardScaler().fit(model_X_imputed) model_X_norm = normalization.transform(model_X_imputed) test_X_norm = normalization.transform(test_X_imputed) t0 = time.time() model.fit(model_X_norm, model_Y) t1 = time.time() tr_Y_pred = model.predict(model_X_norm) t2 = time.time() tr_r = utils.cal_r(tr_Y, tr_Y_pred) print('tr:', tr_r, end=',') test_Y_pred = model.predict(test_X_norm) test_r = utils.cal_r(test_Y, test_Y_pred) print('test:', test_r) print('cost: traning', t1 - t0, ',prediction:', t2 - t1) alpha *= 10 #%% ''' alpha 0.1 tr: 0.995232505044,test: -0.115465678033 alpha 1.0
save Y_Pred of test chunk into files ''' for model_i in range(100): file_path = 'E:/two-sigma/output/chunk_gaussian_kbest/' + str(model_i) print('apply model', model_i, 'on chunk 98', end='...') t0 = time.time() if os.path.isfile(file_path): model = utils.read_variable(file_path) print('kbest i:', np.argmax(model.kbest.scores_), end='...') print('cost', int(time.time() - t0), 'sec') # predict testing chunk, and save Y in file to = time.time() test_Y_pred = model.predict(test_chunk_1L) test_r = utils.cal_r(test_chunk_1L_Y, test_Y_pred) print('test:', test_r, 'cost', int(time.time() - t0), 'sec') utils.save_variable( test_Y_pred, 'E:/two-sigma/output/chunk_98_gaussian_kbest_Y_pred/' + str(model_i)) else: print('No Model Found.') #%% ''' check model on chunk 0...kbest i: 88...cost 13 sec test: -0.00879861627212 cost 21 sec check model on chunk 1...kbest i: 88...cost 40 sec test: -0.00431009568512 cost 61 sec check model on chunk 2...kbest i: 88...cost 13 sec