def xgb_bagging(x_train, y_train, x_test, folds, max_round, n_splits=5, bags=5):
    """Bagged XGBoost model.

    Trains `bags` bagged XGBoost runs per fold via Cross_Validate.bagging and
    returns (trn_gini, y_trn, y_tst, y_tst_mrank, fscore).
    """
    # XGBoost hyper-parameters for this model.
    params = {
        'max_depth': 5,
        'objective': "binary:logistic",
        'eta': 0.04,  # learning rate
        'subsample': 0.8,
        'min_child_weight': 8,
        'colsample_bytree': 0.8,
        'gamma': 0.60,
        'n_jobs': -1,
        'reg_alpha': 10.4,
        'reg_lambda': 5,
        'silent': 1,
    }

    # Model-specific feature engineering.
    x_train, x_test = feature_engineering_3(x_train, x_test, y_train)

    # Bagged cross-validation (name=None: results are not persisted under a name).
    cv = Cross_Validate(None, n_splits, x_train.shape[0], x_test.shape[0],
                        -1, params, max_round, bags)
    cv.bagging(x_train, y_train, x_test, idx=1, verbose_eval=100)
    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.y_tst_mrank, cv.fscore
def log06(x_train, y_train, x_test, folds, max_round, n_splits=5):
    """L2-regularized logistic regression model (SAG solver, balanced classes).

    Returns (trn_gini, y_trn, y_tst, fscore) from the cross-validation run.
    """
    # Model definition — identical hyper-parameters to the original, regrouped.
    clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=0.005,
                             fit_intercept=True, intercept_scaling=1,
                             class_weight='balanced', random_state=None,
                             solver='sag', max_iter=200, multi_class='ovr',
                             verbose=0, warm_start=False, n_jobs=4)

    # Model-specific feature engineering.
    x_train, x_test = feature_engineering_6(x_train, x_test, y_train)

    # Cross-validate and collect out-of-fold / test predictions.
    cv = Cross_Validate(log06.__name__, n_splits,
                        x_train.shape[0], x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)
    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
def rgf04(x_train, y_train, x_test, folds, max_round, n_splits=5):
    """Regularized Greedy Forest model (log loss, sigmoid probabilities).

    Returns (trn_gini, y_trn, y_tst, fscore) from the cross-validation run.
    """
    # Model definition — identical hyper-parameters to the original.
    clf = RGFClassifier(max_leaf=1000,
                        algorithm="RGF",
                        loss="Log",
                        l2=0.01,
                        sl2=0.01,
                        normalize=False,
                        min_samples_leaf=7,    # was 10 in an earlier tuning pass
                        n_iter=None,
                        opt_interval=100,
                        learning_rate=.45,     # was .3 in an earlier tuning pass
                        calc_prob="sigmoid",
                        n_jobs=-2,
                        memory_policy="generous",
                        verbose=0)

    # Model-specific feature engineering.
    x_train, x_test = feature_engineering_4(x_train, x_test, y_train)

    # Cross-validate and collect out-of-fold / test predictions.
    cv = Cross_Validate(rgf04.__name__, n_splits,
                        x_train.shape[0], x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)
    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
def etc07(x_train, y_train, x_test, folds, max_round, n_splits=5):
    """Extra-Trees model (800 shallow trees, heavy leaf regularization).

    Returns (trn_gini, y_trn, y_tst, fscore) from the cross-validation run.
    """
    # Model definition — identical hyper-parameters to the original.
    clf = ExtraTreesClassifier(n_estimators=800,
                               criterion='gini',
                               max_depth=5,
                               min_samples_split=100,
                               min_samples_leaf=100,
                               max_features='auto',
                               min_impurity_decrease=0.0,
                               n_jobs=4,
                               verbose=0)

    # Model-specific feature engineering.
    x_train, x_test = feature_engineering_7(x_train, x_test, y_train)

    # Cross-validate and collect out-of-fold / test predictions.
    cv = Cross_Validate(etc07.__name__, n_splits,
                        x_train.shape[0], x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)
    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
def xgb03(x_train, y_train, x_test, folds, max_round, n_splits=5):
    """XGBoost model with shallow trees and heavy child-weight regularization.

    Returns (trn_gini, y_trn, y_tst, fscore) from the cross-validation run.
    """
    # XGBoost hyper-parameters for this model.
    # (reg_lambda was tried and disabled in an earlier tuning pass.)
    params = {
        'max_depth': 4,
        'objective': "binary:logistic",
        'eta': 0.025,  # learning rate
        'subsample': 0.9,
        'min_child_weight': 100,
        'colsample_bytree': 0.7,
        'gamma': 0.60,
        'n_jobs': -1,
        'reg_alpha': 4,
        'silent': 1,
    }

    # Model-specific feature engineering.
    x_train, x_test = feature_engineering_3(x_train, x_test, y_train)

    # Cross-validate and collect out-of-fold / test predictions.
    cv = Cross_Validate(xgb03.__name__, n_splits,
                        x_train.shape[0], x_test.shape[0], -1, params, max_round)
    cv.cross_validate_xgb(x_train, y_train, x_test, folds, verbose_eval=100)
    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
def cat05(x_train, y_train, x_test, folds, max_round, n_splits=5):
    """CatBoost model (Newton leaf estimation, log loss).

    Returns (trn_gini, y_trn, y_tst, fscore) from the cross-validation run.
    """
    # Model definition — identical hyper-parameters to the original.
    clf = CatBoostClassifier(iterations=900,
                             learning_rate=0.057,
                             depth=5,
                             l2_leaf_reg=23,
                             leaf_estimation_method='Newton',
                             loss_function='Logloss',
                             thread_count=7,
                             random_seed=177,
                             one_hot_max_size=10,
                             allow_writing_files=False)

    # Model-specific feature engineering.
    x_train, x_test = feature_engineering_5(x_train, x_test, y_train)

    # Cross-validate and collect out-of-fold / test predictions.
    cv = Cross_Validate(cat05.__name__, n_splits,
                        x_train.shape[0], x_test.shape[0], clf, -1, -1)
    cv.cross_validate(x_train, y_train, x_test, folds, verbose_eval=True)
    return cv.trn_gini, cv.y_trn, cv.y_tst, cv.fscore
def tune_seq(self, x_train, y_train, x_test, folds, verbose_eval=False):
    """
    Tune XGBoost parameters sequentially.

    For each parameter in self.params_dict, every candidate value is
    cross-validated; the best-scoring value is written back into
    self.params before moving on to the next parameter.  Every trial
    (params + score) is appended to self.sframe for later inspection.

    :param x_train: training features (must expose .shape)
    :param y_train: training target
    :param x_test: test features (must expose .shape)
    :param folds: fold definition passed through to cross_validate_xgb
    :param verbose_eval: forwarded to cross_validate_xgb
    """
    print("\ntuning starts...")
    for key in self.params_dict.keys():
        for item in self.params_dict[key]:
            print('Tuning for parameter %s with value %f' % (key, item))
            # BUG FIX: the original did `self.params_temp = self.params`,
            # which ALIASES the dict instead of copying it.  Every candidate
            # value — and the bogus 'score' key added below — was therefore
            # written permanently into self.params.  Copy, apply the
            # candidate, and hand the copy to the CV run.
            self.params_temp = self.params.copy()
            self.params_temp.update({key: item})
            cv = Cross_Validate(None,
                                n_splits=self.n_splits,
                                len_trn=x_train.shape[0],
                                len_tst=x_test.shape[0],
                                clf=-1,
                                params=self.params_temp,
                                max_round=self.max_round)
            cv.cross_validate_xgb(x_train, y_train, x_test, folds, verbose_eval)
            # Record the trial (params + score) as one column of sframe;
            # 'score' now lands only in the temporary copy.
            self.params_temp.update({'score': cv.trn_gini})
            self.sframe = pd.concat([
                self.sframe,
                pd.Series(list(self.params_temp.values()),
                          index=list(self.params_temp.keys()))
            ], axis=1)
            if cv.trn_gini > self.max_score:
                self.max_item = item
                self.max_score = cv.trn_gini
        # Keep the best value found for this parameter, then reset trackers.
        self.params.update({key: self.max_item})
        self.max_item = 0
        self.max_score = 0
    # One row per trial, trials as rows.
    self.sframe = self.sframe.transpose().reset_index()
def forward_selection(self, x_train, y_train, x_test, folds, cols):
    """
    Greedy forward feature selection over the candidate columns in `cols`.

    The candidate columns are removed to establish a baseline score, then
    each round tries re-adding every remaining candidate, keeps the single
    column that improves the gini the most, and repeats until no candidate
    improves the score.  Selected columns and scores accumulate in
    self.cols / self.scores.

    NOTE(review): mutates x_train/x_test (drops `cols`) and `cols` itself
    in place — callers see those changes.

    :param x_train: training features (pandas DataFrame)
    :param y_train: training target
    :param x_test: test features (pandas DataFrame)
    :param folds: fold definition passed through to cross_validate_xgb
    :param cols: candidate column names (list, mutated in place)
    """
    cv = Cross_Validate(None, n_splits=self.n_splits,
                        len_trn=x_train.shape[0], len_tst=x_test.shape[0],
                        clf=-1, params=self.params, max_round=self.max_round)
    # Hold the candidate columns aside and score the baseline without them.
    x_train_cols = x_train[cols]
    x_test_cols = x_test[cols]
    x_train.drop(cols, axis=1, inplace=True)
    x_test.drop(cols, axis=1, inplace=True)
    cv.cross_validate_xgb(x_train, y_train, x_test, folds)
    self.current_best = cv.trn_gini
    self.scores.append(self.current_best)

    for i in range(len(cols)):
        print("Round %i" % (i + 1))
        # FIX: ported from Python 2 (`print x_train.shape` is a syntax
        # error under Python 3).
        print("Shape of train", x_train.shape)
        for col in cols:
            # Try adding this one candidate column, score, then remove it.
            x_train = pd.concat([x_train, x_train_cols[col]], axis=1)
            x_test = pd.concat([x_test, x_test_cols[col]], axis=1)
            cv.cross_validate_xgb(x_train, y_train, x_test, folds)
            if cv.trn_gini > self.current_best:
                self.current_best = cv.trn_gini
                self.col_temp = col
            # BUG FIX: the original passed x_train_cols[col] (a Series of
            # VALUES) as the labels to drop, which raises in pandas; drop
            # by column label instead.
            x_train.drop(col, axis=1, inplace=True)
            x_test.drop(col, axis=1, inplace=True)
        if self.col_temp != 0:
            # Permanently keep the best column of this round.
            cols.remove(self.col_temp)
            x_train = pd.concat([x_train, x_train_cols[self.col_temp]], axis=1)
            x_test = pd.concat([x_test, x_test_cols[self.col_temp]], axis=1)
            self.cols.append(self.col_temp)
            self.scores.append(self.current_best)
            self.col_temp = 0  # reset sentinel for the next round
        else:
            # No candidate improved the score — stop.
            break