def func(x, X, y, start):
    score = numpy.dot(X, x)
    score = score / score.max()
    # sigmoid(score)
    thresh, score = mcc_optimize(score, y)
    if 1:  # numpy.random.random() < 0.1:
        logger.info('    thresh: %s, score: %s, rest: %s' %
                    (thresh, score, time.time() - start))
    if time.time() - start > 1200:
        logger.info('END thresh: %s, score: %s' % (thresh, score))
        with open('weight_n.pkl', 'wb') as f:
            pickle.dump(x, f, -1)
        # returning a non-numeric value deliberately aborts the optimizer
        # once the time budget is exhausted
        return 'aaa'
    return -score
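# The call site of func() is not in this fragment. A minimal sketch of how an
# objective with this signature is typically driven: Nelder-Mead over the
# per-column weight vector x, with `start` carrying the wall-clock reference
# that func checks against its time budget. The dummy data, the equal-weight
# x0, and maxiter are assumptions for illustration only.
import time

import numpy
from scipy.optimize import minimize

rng = numpy.random.RandomState(0)
X = rng.rand(1000, 5)                     # stacked model predictions (dummy)
y = (rng.rand(1000) > 0.9).astype(int)    # imbalanced binary target (dummy)
x0 = numpy.ones(X.shape[1]) / X.shape[1]  # start from equal weights
result = minimize(func, x0, args=(X, y, time.time()),
                  method='Nelder-Mead', options={'maxiter': 100000})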
def func(x, model, X, y, start):
    W = Parallel(n_jobs=-1, verbose=0, backend="threading")(
        delayed(parallel_helper)(e, 'predict_proba', X, check_input=False)
        for e in model.estimators_)
    W = numpy.array([w[:, 1] for w in W]).T
    # W = numpy.array([m.predict_proba(X)[:, 1] for m in model.estimators_]).T
    score = numpy.dot(W, x)
    score = score / score.max()
    # sigmoid(score)
    thresh, score = mcc_optimize(score, y)
    if 1:  # numpy.random.random() < 0.1:
        logger.info('    thresh: %s, score: %s, rest: %s' %
                    (thresh, score, time.time() - start))
    if time.time() - start > 1200:
        logger.info('END thresh: %s, score: %s' % (thresh, score))
        with open('weight.pkl', 'wb') as f:
            pickle.dump(x, f, -1)
        # same time-budget abort as above
        return 'aaa'
    return -score
ans = numpy.array(ans).T
insample_ans = numpy.array(insample_ans).T
if all_ans is None:
    all_ans = ans
    all_target = target[test_idx]
    all_ids = ids.ix[test_idx].values
else:
    all_ans = numpy.r_[all_ans, ans]
    all_target = numpy.r_[all_target, target[test_idx]]
    all_ids = numpy.r_[all_ids, ids.ix[test_idx]]

model = XGBClassifier(seed=0)
model.fit(ans, target[test_idx])
pred = model.predict_proba(ans)[:, 1]
logger.info('model thresh: %s, score: %s' %
            mcc_optimize(pred, target[test_idx]))
pred = ans.max(axis=1)
logger.info('max thresh: %s, score: %s' %
            mcc_optimize(pred, target[test_idx]))
pred = ans.min(axis=1)
logger.info('min thresh: %s, score: %s' %
            mcc_optimize(pred, target[test_idx]))
logger.info('mean thresh: %s, score: %s' %
            mcc_optimize(ans.mean(axis=1), target[test_idx]))
for j in range(ans.shape[1]):
    score = roc_auc_score(target[test_idx], ans[:, j])
    logger.info('score: %s' % score)
    logger.info('model thresh: %s, score: %s' %
                mcc_optimize(ans[:, j], target[test_idx]))
@jit
def mcc_scoring2(y_pred_prb, y):
    # brute-force scan over thresholds 0.01 .. 0.99
    list_thresh = numpy.arange(1, 100) / 100
    max_score = -1
    idx = None
    for thresh in list_thresh:
        y_pred = numpy.where(y_pred_prb >= thresh, 1, 0)
        score = mcc(y, y_pred)
        if score > max_score:
            max_score = score
            idx = thresh
    return idx, max_score


if __name__ == '__main__':
    logger.info('load start')
    data = pandas.read_csv('stack_1_pred.csv')
    target = data[TARGET_COLUMN_NAME].values
    pred = data['pred'].values
    logger.info('load end')
    logger.info('shape %s %s' % data.shape)
    logger.info('shape %s' % target.shape)
    logger.info('pos num: %s, pos rate: %s' %
                (sum(target), float(sum(target)) / target.shape[0]))

    thresh, score = mcc_optimize(pred, target)
    logger.info('model: %s, thresh: %s, total score: %s' % (0, thresh, score))
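# mcc_optimize is imported from utils throughout these scripts but its body is
# never shown. A minimal sketch of a vectorized equivalent of mcc_scoring2
# above: sort once, evaluate MCC at every cut point via cumulative counts, and
# return (best_threshold, best_mcc). Only the return contract is taken from
# the call sites; the incremental formulation and tie handling are assumptions.
import numpy


def mcc_optimize(y_prob, y_true):
    n = y_true.shape[0]
    n_pos = float(y_true.sum())
    order = numpy.argsort(y_prob)[::-1]                 # descending by score
    sorted_prob = y_prob[order]
    sorted_true = y_true[order]
    tp = numpy.cumsum(sorted_true, dtype=numpy.float64)  # positives above cut
    fp = numpy.arange(1, n + 1, dtype=numpy.float64) - tp
    fn = n_pos - tp
    tn = n - n_pos - fp
    denom = numpy.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    denom[denom == 0] = 1.0                              # undefined MCC -> 0
    mcc_all = (tp * tn - fp * fn) / denom
    best = mcc_all.argmax()
    return sorted_prob[best], mcc_all[best]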
booster = train(params,
                train_dmatrix,
                evals=[(test_dmatrix, 'eval')],
                feval=evalmcc_xgb_min,
                num_boost_round=924,
                early_stopping_rounds=924,
                verbose_eval=True)
avg_ntree += booster.best_ntree_limit

ans = booster.predict(test_dmatrix, ntree_limit=booster.best_ntree_limit)
tree_limit = booster.best_ntree_limit
score = roc_auc_score(target.ix[test_idx].values, ans)
logger.info('score: %s' % score)
logger.info('tree: %s' % tree_limit)
logger.info('model thresh: %s, score: %s' %
            mcc_optimize(ans, target.ix[test_idx].values))
logger.info('train_end')

if all_ans is None:
    all_ans = ans
    all_target = target[test_idx]
    all_ids = data.ix[test_idx].index.values.astype(int)
else:
    all_ans = numpy.r_[all_ans, ans]
    all_target = numpy.r_[all_target, target[test_idx]]
    all_ids = numpy.r_[all_ids,
                       data.ix[test_idx].index.values.astype(int)]

ans = booster.predict(test_dmatrix, ntree_limit=booster.best_iteration - 10)
logger.info('model thresh: %s, score: %s' %
            mcc_optimize(ans, target.ix[test_idx].values))
batchs = StratifiedKFold(train_target,
                         n_folds=n_iter, shuffle=True, random_state=ep)
avg_cost = 0.
logger.info('epoch: %s' % ep)
for i, (_, batch_idx) in enumerate(batchs):
    logger.info('  batch: %s/%s' % (i + 1, n_iter))
    batch_xs = train_data[batch_idx]
    batch_ys = train_target[batch_idx]
    _, cost = sess.run([train_step, loss],
                       feed_dict={x: batch_xs,
                                  y_: batch_ys.reshape(-1, 1)})
    # train_step.run({x: batch_xs, y_: batch_ys.reshape(-1, 1)})
    avg_cost += cost / n_iter
logger.info('loss: %s' % avg_cost)

pred = y.eval({x: test_data}, session=sess).reshape((1, -1))[0]
print(pred)
score = mcc_optimize(pred, test_target)
logger.info('score: %s %s' % score)
score = roc_auc_score(test_target, pred)
logger.info('auc: %s' % score)
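# The graph objects this loop assumes (x, y_, y, loss, train_step, sess) are
# defined elsewhere. A minimal sketch, assuming plain TF1-style logistic
# regression; the single dense layer and the Adam learning rate are
# assumptions, not the original architecture.
import tensorflow as tf

n_features = train_data.shape[1]
x = tf.placeholder(tf.float32, [None, n_features])
y_ = tf.placeholder(tf.float32, [None, 1])
logits = tf.layers.dense(x, 1)          # one linear output unit
y = tf.nn.sigmoid(logits)               # probability used at eval time
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=logits))
train_step = tf.train.AdamOptimizer(1e-3).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())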
    pred /= sum(params.values())
    return pred.clip(0, 1)


def predict(params):
    # use_preds = [pred for i, pred in enumerate(list_preds) if params[i]]
    use_preds = [params[path] * load_test(path) for path in list_dir]
    # pred = np.mean(use_preds, axis=0)
    pred = np.sum(use_preds, axis=0)
    pred /= sum(params.values())
    return pred.clip(0, 1)


trials = Trials()
min_params = optimize(list_dir, score_func, trials)
logger.info(f'min params: {min_params}')

preds = pred_func({i: min_params[path] for i, path in enumerate(list_dir)})
best_proba, sc = mcc_optimize(preds, y_train)
logger.warning('search: %s' % sc)

list_test = [load_test(path) for path in list_dir]
p_test = predict(min_params)

ids = np.loadtxt('ids.npy')
sub = pd.DataFrame()
sub['Id'] = ids.astype(int)
sub['Response'] = p_test >= best_proba
sub.to_csv(DIR + 'submit_ens.csv', index=False)
logger.info('exit')
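# `optimize` is not defined in this fragment. A plausible sketch using
# hyperopt (Trials above comes from hyperopt): one uniform weight per
# prediction file, minimizing score_func. The [0, 1] bounds and max_evals
# are assumptions.
from hyperopt import fmin, hp, tpe


def optimize(list_dir, score_func, trials, max_evals=100):
    space = {path: hp.uniform(path, 0.0, 1.0) for path in list_dir}
    return fmin(fn=score_func, space=space, algo=tpe.suggest,
                trials=trials, max_evals=max_evals)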
logger.info('train_end')
ans = numpy.array(ans).T
insample_ans = numpy.array(insample_ans).T
if all_ans is None:
    all_ans = ans
    all_target = target[test_idx]
    all_ids = ids.ix[test_idx].values
else:
    all_ans = numpy.r_[all_ans, ans]
    all_target = numpy.r_[all_target, target[test_idx]]
    all_ids = numpy.r_[all_ids, ids.ix[test_idx]]

model = XGBClassifier(seed=0)
model.fit(ans, target[test_idx])
pred = model.predict_proba(ans)[:, 1]
logger.info('model thresh: %s, score: %s' %
            mcc_optimize(pred, target[test_idx]))
pred = ans.max(axis=1)
logger.info('max thresh: %s, score: %s' %
            mcc_optimize(pred, target[test_idx]))
pred = ans.min(axis=1)
logger.info('min thresh: %s, score: %s' %
            mcc_optimize(pred, target[test_idx]))
score = roc_auc_score(target[test_idx], ans[:, -1])
logger.info('mean thresh: %s, score: %s' %
            mcc_optimize(ans.mean(axis=1), target[test_idx]))
logger.info('all thresh: %s, score: %s' %
            mcc_optimize(ans[:, -1], target[test_idx]))
logger.info('score: %s' % score)
score = roc_auc_score(target[test_idx], pred)
logger.info('INSAMPLE score: %s' % score)
pred = model.predict_proba(insample_ans)[:, 1]  # ans.max(axis=1)
score = roc_auc_score(target[train_idx], pred)
logger.info('INSAMPLE train score: %s' % score)
list_estimator.append(model)
    logger.info('%s/%s param: %s' % (i + 1, len(pg), params))
    pred_proba_all = []
    y_true = []
    for train_idx, test_idx in cv:
        model = NMOpt()
        model.fit(data[train_idx], target[train_idx])
        # pred_proba = data[test_idx, -1]
        pred_proba = model.predict_proba(data[test_idx])[:, 1]
        pred_proba_all = numpy.r_[pred_proba_all, pred_proba]
        y_true = numpy.r_[y_true, target[test_idx]]
        score = roc_auc_score(target[test_idx], pred_proba)
        # logger.info('  score: %s' % score)
        list_score.append(score)
        thresh, score = mcc_optimize(pred_proba, target[test_idx])
        logger.info('  thresh: %s, score: %s' % (thresh, score))

    score = numpy.mean(list_score)
    thresh, score = mcc_optimize(pred_proba_all, y_true)
    max_score = max(max_score, score)
    logger.info('thresh: %s, total score: %s, max_score: %s' %
                (thresh, score, max_score))
    if max_score == score:
        best_param = params
        best_thresh = thresh

logger.info('best_thresh: %s, total max score: %s' % (best_thresh, max_score))

# model = XGBClassifier(seed=0)
# model = LogisticRegression(n_jobs=-1, class_weight='balanced')
# model.set_params(**best_param)
model = NMOpt()
logger.info('train_end')
if all_ans is None:
    all_ans = ans
    all_target = target[test_idx]
    all_ids = data.ix[test_idx].index.values.astype(int)
else:
    all_ans = numpy.r_[all_ans, ans]
    all_target = numpy.r_[all_target, target[test_idx]]
    all_ids = numpy.r_[all_ids,
                       data.ix[test_idx].index.values.astype(int)]

score = roc_auc_score(target[test_idx], ans)
logger.info('score: %s' % score)
logger.info('model thresh: %s, score: %s' %
            mcc_optimize(ans, target[test_idx]))
logger.info('cv model thresh: %s, score: %s' %
            mcc_optimize(all_ans, all_target))

for i in ['']:
    logger.info('model: %s' % i)
    cols = [col for col in feature_column if 'L%s' % i in col]
    logger.info('model xg: %s' % i)
    model = XGBClassifier(seed=0)
    model.set_params(**params)
    model.fit(data[cols], target)

ids = pandas.read_csv('stack_1_id_1.csv')['0'].values
_data = pandas.read_csv('stack_1_data_1.csv')
logger.info('old data %s %s' % _data.shape)
for params in ParameterGrid(all_params):
    logger.info('param: %s' % (params))
    for train_idx, test_idx in list(cv)[:1]:
        with gzip.open('train_fm.svm', 'wb') as f:
            dump_svmlight_file(data[train_idx], target[train_idx], f)
        del output
        gc.collect()
        with gzip.open('test_svm.svm', 'wb') as f:
            dump_svmlight_file(data[test_idx], target[test_idx], f)

        model = TFFMClassifier(order=2,
                               rank=10,
                               optimizer=tf.train.AdamOptimizer(learning_rate=0.01),
                               n_epochs=50,
                               batch_size=100000,
                               init_std=0.001,
                               reg=0.001,
                               input_type='sparse')
        """
        model = FMClassification()
        """
        model.fit(data[train_idx], target[train_idx], show_progress=True)
        ans = model.predict_proba(data[test_idx])[:, 1]
        score = roc_auc_score(target[test_idx], ans)
        logger.info('score: %s' % score)
        logger.info('all thresh: %s, score: %s' %
                    mcc_optimize(ans, target[test_idx]))
        score = roc_auc_score(target[test_idx], ans)
gc.collect()
model.set_params(**params)
if 1:
    model.fit(data.ix[train_idx, cols], target[train_idx])
else:
    model.fit(data.ix[train_idx, cols],
              target[train_idx],
              eval_set=[(data.ix[test_idx, cols], target[test_idx])],
              eval_metric=evalmcc_xgb_min,
              early_stopping_rounds=1000,
              verbose=True)
ans = model.predict_proba(data.ix[test_idx, cols])[:, 1]
score = roc_auc_score(target[test_idx], ans)
thresh, mcc = mcc_optimize(ans, target[test_idx])
logger.info('auc: %s thresh: %s, score: %s' % (score, thresh, mcc))
"""
for t in range(1, 101):
    ans = model.predict_proba(data.ix[test_idx, cols], ntree_limit=t)[:, 1]
    score = roc_auc_score(target[test_idx], ans)
    logger.info('  score: %s' % score)
    logger.info('  model thresh: %s, score: %s' %
                mcc_optimize(ans, target[test_idx]))
"""
logger.info('train_end')
if all_ans is None:
    all_ans = ans
    all_target = target[test_idx]
    all_ids = ids.ix[test_idx].values
else:
TEST_DATA = os.path.join(DATA_DIR, 'test_simple_join.csv.gz')
TARGET_COLUMN_NAME = u'Response'

from utils import mcc_optimize, evalmcc_xgb_min
from feature import LIST_FEATURE_COLUMN_NAME

log_fmt = '%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s'
logging.basicConfig(format=log_fmt,
                    datefmt='%Y-%m-%d/%H:%M:%S',
                    level='INFO')
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    logger.info('load start')
    target = pandas.read_csv('stack_1_target_1.csv')['0'].values
    data = pandas.read_csv('stack_1_data_1.csv').values
    logger.info('load end')
    logger.info('shape %s %s' % data.shape)
    logger.info('shape %s' % target.shape)
    logger.info('pos num: %s, pos rate: %s' %
                (sum(target), float(sum(target)) / target.shape[0]))

    with open('list_xgb_model.pkl', 'rb') as f:
        list_model = pickle.load(f)

    for i in range(data.shape[1]):
        thresh, score = mcc_optimize(data[:, i], target)
        auc_score = roc_auc_score(target, data[:, i])
        print('"%s"' % list_model[i].__repr__(), auc_score, thresh, score, sep=',')
except AttributeError:
    ans.append(model.predict_proba(data.ix[test_idx, cols])[:, 1])

ans = numpy.array(ans).T
if all_ans is None:
    all_ans = ans
    all_target = target[test_idx]
    all_ids = data.ix[test_idx].index.values.astype(int)
else:
    all_ans = numpy.r_[all_ans, ans]
    all_target = numpy.r_[all_target, target[test_idx]]
    all_ids = numpy.r_[all_ids,
                       data.ix[test_idx].index.values.astype(int)]

for j in range(ans.shape[1]):
    score = roc_auc_score(target[test_idx], ans[:, j])
    logger.info('score: %s' % score)
    logger.info('model thresh: %s, score: %s' %
                mcc_optimize(ans[:, j], target[test_idx].values))

pandas.DataFrame(all_ans).to_csv('stack_1_data_1.csv.gz',
                                 index=False, compression='gzip')
pandas.DataFrame(all_target).to_csv('stack_1_target_1.csv.gz',
                                    index=False, compression='gzip')
pandas.DataFrame(all_ids).to_csv('stack_1_id_1.csv.gz',
                                 index=False, compression='gzip')
def loss_func(y, pred):
    best_proba, best_mcc = mcc_optimize(pred, y)
    return -best_mcc
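# One way loss_func could be wired into scikit-learn model selection (an
# assumption; its actual call site is not shown). With greater_is_better=False
# make_scorer negates the output, so the scorer reports MCC again.
from sklearn.metrics import make_scorer

mcc_scorer = make_scorer(loss_func, greater_is_better=False, needs_proba=True)
# e.g.: cross_val_score(XGBClassifier(seed=0), data, target, scoring=mcc_scorer)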
insample_ans = numpy.array(insample_ans).T
if all_ans is None:
    all_ans = ans
    all_target = target.ix[test_idx].values
    all_ids = data.ix[test_idx].index.values.astype(int)
else:
    all_ans = numpy.r_[all_ans, ans]
    all_target = numpy.r_[all_target, target.ix[test_idx].values]
    all_ids = numpy.r_[all_ids,
                       data.ix[test_idx].index.values.astype(int)]

model = XGBClassifier(seed=0)
model.fit(ans, target.ix[test_idx])
pred = model.predict_proba(ans)[:, 1]
logger.info('model thresh: %s, score: %s' %
            mcc_optimize(pred, target.ix[test_idx].values))
pred = ans.max(axis=1)
logger.info('max thresh: %s, score: %s' %
            mcc_optimize(pred, target.ix[test_idx].values))
pred = ans.min(axis=1)
logger.info('min thresh: %s, score: %s' %
            mcc_optimize(pred, target.ix[test_idx].values))
logger.info('mean thresh: %s, score: %s' %
            mcc_optimize(ans.mean(axis=1), target.ix[test_idx].values))
for j in range(ans.shape[1]):
    score = roc_auc_score(target.ix[test_idx].values, ans[:, j])
    logger.info('score: %s' % score)
    logger.info('model thresh: %s, score: %s' %
                mcc_optimize(ans[:, j], target.ix[test_idx].values))
def train():
    df = pd.concat([
        pd.read_feather('train_0713.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_feat_agg.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_feat_agg_sec.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_hash_cnt.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_hash_cnt_nos38.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_hash_cnt_sec.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_date_min.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_num_pass_sec.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_diff.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_time_mean.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_time_mean_norm.ftr', nthreads=8).astype(DTYPE),
        pd.read_feather('train_magic.ftr', nthreads=8)[[
            'magic1', 'magic2', 'magic3', 'magic4'
        ]].astype(DTYPE),
    ], axis=1)

    # drop every feature that earned zero importance in the previous runs
    for result_dir in ['result_0715_allfeat', 'result_0715_magic',
                       'result_0715_sec', 'result_0716_sec_hash',
                       'result_0716_num_sec', 'result_0716_rate001',
                       'result_0717_s38', 'result_0717_diff',
                       'result_0717_time_mean', 'result_0718_time_mean']:
        df_cols = pd.read_csv(f'{result_dir}/feature_importances.csv')
        drop_cols = df_cols[df_cols['imp'] == 0]['col'].values
        df.drop(drop_cols, axis=1, errors='ignore', inplace=True)

    logger.info(f'load 1 {df.shape}')
    y_train = df['Response'].values
    df.drop(['Response', 'Id'], axis=1, errors='ignore', inplace=True)
    logger.info(f'load dropcols {df.shape}')
    gc.collect()

    x_train = df.values  # sparse.csc_matrix(df.values, dtype=DTYPE)
    usecols = df.columns.values.tolist()
    del df
    gc.collect()
    logger.info('train data size {}'.format(x_train.shape))

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)
    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)

    # {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_bin': 511, 'max_depth': -1, 'metric': 'None', 'min_child_weight': 10, 'min_split_gain': 0, 'num_leaves': 31, 'objective': 'binary', 'reg_alpha': 1, 'scale_pos_weight': 1, 'seed': 114, 'subsample': 0.99, 'subsample_freq': 1, 'verbose': -1, 'xgboost_dart_mode': True}
    all_params = {
        'min_child_weight': [10],
        'subsample': [0.99],
        'subsample_freq': [1],
        'seed': [114],
        'colsample_bytree': [0.7],
        'learning_rate': [0.01],
        'max_depth': [-1],
        'min_split_gain': [0],
        'reg_alpha': [1],
        'max_bin': [511],
        'num_leaves': [31],
        'objective': ['binary'],
        'scale_pos_weight': [1],
        'verbose': [-1],
        'boosting_type': ['gbdt'],
        'metric': ["None"],
        'xgboost_dart_mode': [True],
        # 'device': ['gpu'],
    }

    use_score = 0
    min_score = (100, 100, 100)
    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        all_pred = np.zeros(y_train.shape[0])
        for train, test in cv.split(x_train, y_train):
            cnt += 1
            trn_x = x_train[train]
            val_x = x_train[test]
            trn_y = y_train[train]
            val_y = y_train[test]
            train_data = lgb.Dataset(trn_x, label=trn_y, feature_name=usecols)
            test_data = lgb.Dataset(val_x, label=val_y, feature_name=usecols)
            del trn_x
            gc.collect()
            clf = lgb.train(params,
                            train_data,
                            100000,  # num_boost_round
                            early_stopping_rounds=500,
                            valid_sets=[test_data],
                            feval=cst_metric_xgb,
                            # callbacks=[callback],
                            verbose_eval=10)
            pred = clf.predict(val_x).clip(0, 1)
            all_pred[test] = pred

            best_proba, best_mcc = mcc_optimize(pred, val_y)
            _score = -best_mcc
            _score2 = log_loss(val_y, pred)
            logger.info('   _score: %s' % _score)
            logger.info('   _best_proba: %s' % best_proba)
            logger.info('   _score2: %s' % _score2)
            list_score.append(_score)
            list_score2.append(_score2)
            if clf.best_iteration != 0:
                list_best_iter.append(clf.best_iteration)
            else:
                list_best_iter.append(100000)  # no early stop triggered

            with open(DIR + 'train_cv_pred_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(pred, f, -1)
            with open(DIR + 'model_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(clf, f, -1)
            gc.collect()
            break  # only the first fold is evaluated

        with open(DIR + 'train_cv_tmp.pkl', 'wb') as f:
            pickle.dump(all_pred, f, -1)

        logger.info('trees: {}'.format(list_best_iter))
        # trees = np.mean(list_best_iter, dtype=int)
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2), np.max(list_score2))
        logger.info('param: %s' % (params))
        logger.info('cv: {}'.format(list_score))
        logger.info('cv2: {}'.format(list_score2))
        logger.info('loss: {} (avg min max {})'.format(score[use_score], score))
        logger.info('logloss: {} (avg min max {})'.format(score2[use_score], score2))
        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params
        logger.info('best score: {} {}'.format(min_score[use_score], min_score))
        logger.info('best params: {}'.format(min_params))

    imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    imp['col'] = usecols
    n_features = imp.shape[0]
    imp = imp.sort_values('imp', ascending=False)
    imp.to_csv(DIR + 'feature_importances_0.csv')
    logger.info('imp use {} {}'.format(imp[imp.imp > 0].shape, n_features))

    del val_x
    del trn_y
    del val_y
    del train_data
    del test_data
    gc.collect()

    trees = np.mean(list_best_iter)
    logger.info('all data size {}'.format(x_train.shape))
    train_data = lgb.Dataset(x_train, label=y_train, feature_name=usecols)
    del x_train
    gc.collect()

    logger.info('train start')
    clf = lgb.train(min_params,
                    train_data,
                    int(trees * 1.1),
                    feval=cst_metric_xgb,
                    # valid_sets=[train_data],
                    verbose_eval=10,
                    callbacks=[callback])
    logger.info('train end')
    with open(DIR + 'model.pkl', 'wb') as f:
        pickle.dump(clf, f, -1)
    # del x_train
    gc.collect()
    logger.info('save end')
    return best_proba
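# `callback` is passed to the final lgb.train above but never defined in this
# fragment. A minimal sketch with the signature LightGBM expects, a callable
# taking a CallbackEnv; the logging interval is an assumption.
def callback(env):
    if env.iteration % 100 == 0:
        logger.info('iter %s: %s' % (env.iteration, env.evaluation_result_list))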
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1],
    'min_child_weight': [1],
    'subsample': [1],
    'reg_alpha': [0, 0.1, 0.01],
    'colsample_bytree': [1],
    'scale_pos_weight': [1]
}
_all_params = {'C': [10**i for i in range(-3, 2)],
               'penalty': ['l2']}

cv = StratifiedKFold(target, n_folds=5, shuffle=True, random_state=0)
list_score = []
max_score = -100
best_thresh = None
pg = list(ParameterGrid(all_params))

for i in range(data.shape[1]):
    thresh, score = mcc_optimize(data[:, i], target)
    logger.info('model: %s, thresh: %s, total score: %s, max_score: %s' %
                (i, thresh, score, max_score))

for i, params in enumerate(pg):
    logger.info('%s/%s param: %s' % (i + 1, len(pg), params))
    pred_proba_all = []
    y_true = []
    for train_idx, test_idx in cv:
        model = XGBClassifier(seed=0)
        # model = LogisticRegression(n_jobs=-1, class_weight='balanced')
        model.set_params(**params)
        model.fit(data[train_idx],
                  target[train_idx],
                  eval_metric=evalmcc_xgb_min,
                  verbose=False)
""" booster = train(params, train_dmatrix, evals=[(test_dmatrix, 'eval')], feval=evalmcc_xgb_min, num_boost_round=700, early_stopping_rounds=200, verbose_eval=True) """ avg_ntree += booster.best_ntree_limit ans = booster.predict(test_dmatrix, ntree_limit=booster.best_ntree_limit) tree_limit = booster.best_ntree_limit score = roc_auc_score(target.ix[test_idx].values, ans) logger.info('score: %s' % score) logger.info('tree: %s' % tree_limit) logger.info('model thresh: %s, score: %s' % mcc_optimize(ans, target.ix[test_idx].values)) logger.info('train_end') if all_ans is None: all_ans = ans all_target = target[test_idx] all_ids = data.ix[test_idx].index.values.astype(int) else: all_ans = numpy.r_[all_ans, ans] all_target = numpy.r_[all_target, target[test_idx]] all_ids = numpy.r_[all_ids, data.ix[test_idx].index.values.astype(int)] ans = booster.predict(test_dmatrix, ntree_limit=booster.best_iteration - 10) logger.info('model thresh: %s, score: %s' % mcc_optimize(ans, target.ix[test_idx].values)) ans = booster.predict(test_dmatrix,
def cst_metric_xgb(pred, dtrain):
    # LightGBM-style feval: returns (metric name, value, is_higher_better)
    label = dtrain.get_label().astype(int)
    best_proba, best_mcc = mcc_optimize(pred, label)
    return 'mcc', best_mcc, True
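# `evalmcc_xgb_min` (imported from utils in the other fragments) is the
# XGBoost-side counterpart of the feval above. A minimal sketch, assuming the
# same pattern; XGBoost early stopping minimizes the metric by default, hence
# the negated MCC and the "_min" suffix.
def evalmcc_xgb_min(pred, dtrain):
    label = dtrain.get_label().astype(int)
    best_proba, best_mcc = mcc_optimize(pred, label)
    return 'mcc_min', -best_mcc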
def train(x_train):
    # y_train = pd.read_feather('train_0713.ftr')['Response'].values
    # np.savetxt('y_train.npy', y_train)
    y_train = np.loadtxt('y_train.npy')
    usecols = x_train.columns.values.tolist()

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)
    with open(DIR + 'usecols.pkl', 'wb') as f:
        pickle.dump(usecols, f, -1)

    # keep one stratified fold as the working subset
    for _, test in cv.split(x_train, y_train):
        x_train = x_train.iloc[test].values
        y_train = y_train[test]
        np.savetxt('index.npy', test)
        break

    all_params = {
        'boosting_type': 'gbdt',
        'colsample_bytree': 0.8,
        'learning_rate': 0.01,
        'max_bin': 255,
        'max_depth': -1,
        'metric': 'None',
        'min_child_weight': 50,
        'min_split_gain': 0.01,
        'num_leaves': 15,
        'objective': 'xentropy',
        'reg_alpha': 0,
        'scale_pos_weight': 1,
        'seed': 114514,
        'subsample': 1,
        'subsample_freq': 0,
        'verbose': -1
    }
    """
    all_params = {'min_child_weight': [80],
                  'subsample': [1],
                  'subsample_freq': [0],
                  'seed': [114514],
                  'colsample_bytree': [0.8],
                  'learning_rate': [0.01],
                  'max_depth': [4],
                  'min_split_gain': [0.01],
                  'reg_alpha': [0.001],
                  'reg_lambda': [0.1],
                  'max_bin': [255],
                  'num_leaves': [15],
                  'objective': ['xentropy'],
                  'scale_pos_weight': [1],
                  'verbose': [-1],
                  'boosting_type': ['gbdt'],
                  'metric': ['rmse'],
                  # 'skip_drop': [0.7],
                  }
    """
    all_params = {k: [v] for k, v in all_params.items()}

    use_score = 0
    min_score = (100, 100, 100)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=871)
    for params in tqdm(list(ParameterGrid(all_params))):
        cnt = -1
        list_score = []
        list_score2 = []
        list_best_iter = []
        list_thresh = []
        all_pred = np.zeros(y_train.shape[0])
        for train, test in cv.split(x_train, y_train):
            cnt += 1
            trn_x = x_train[train]
            val_x = x_train[test]
            trn_y = y_train[train]
            val_y = y_train[test]
            train_data = lgb.Dataset(trn_x, label=trn_y, feature_name=usecols)
            test_data = lgb.Dataset(val_x, label=val_y, feature_name=usecols)
            del trn_x
            gc.collect()
            clf = lgb.train(params,
                            train_data,
                            100000,  # num_boost_round
                            early_stopping_rounds=100,
                            valid_sets=[test_data],
                            feval=cst_metric_xgb,
                            verbose_eval=10)
            pred = clf.predict(val_x).clip(0, 1)
            all_pred[test] = pred

            best_proba, best_mcc = mcc_optimize(pred, val_y)
            _score = -best_mcc  # minimize negative MCC, as in the other runs
            _score2 = _score  # - roc_auc_score(val_y, pred)
            logger.info('   _score: %s' % _score)
            logger.info('   _thresh: %s' % best_proba)
            logger.info('   _score2: %s' % _score2)
            list_thresh.append(best_proba)
            list_score.append(_score)
            list_score2.append(_score2)
            if clf.best_iteration != 0:
                list_best_iter.append(clf.best_iteration)
            else:
                list_best_iter.append(100000)  # no early stop triggered

            with open(DIR + 'train_cv_pred_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(pred, f, -1)
            with open(DIR + 'model_%s.pkl' % cnt, 'wb') as f:
                pickle.dump(clf, f, -1)
            gc.collect()

        with open(DIR + 'train_cv_tmp.pkl', 'wb') as f:
            pickle.dump(all_pred, f, -1)

        logger.info('trees: {}'.format(list_best_iter))
        # trees = np.mean(list_best_iter, dtype=int)
        score = (np.mean(list_score), np.min(list_score), np.max(list_score))
        score2 = (np.mean(list_score2), np.min(list_score2), np.max(list_score2))
        logger.info('param: %s' % (params))
        logger.info('cv: {}'.format(list_score))
        logger.info('thresh: {}'.format(list_thresh))
        logger.info('cv2: {}'.format(list_score2))
        logger.info('loss: {} (avg min max {})'.format(score[use_score], score))
        logger.info('all loss: {}'.format(
            np.sqrt(mean_squared_error(y_train, all_pred))))
        logger.info('score2: {} (avg min max {})'.format(score2[use_score], score2))
        if min_score[use_score] > score[use_score]:
            min_score = score
            min_params = params
        logger.info('best score: {} {}'.format(min_score[use_score], min_score))
        logger.info('best params: {}'.format(min_params))

    imp = pd.DataFrame(clf.feature_importance(), columns=['imp'])
    imp['col'] = usecols
    n_features = imp.shape[0]
    imp = imp.sort_values('imp', ascending=False)
    imp.to_csv(DIR + 'feature_importances_0.csv')
    logger.info('imp use {} {}'.format(imp[imp.imp > 0].shape, n_features))

    del val_x
    del trn_y
    del val_y
    del train_data
    del test_data
    gc.collect()

    trees = np.mean(list_best_iter)
    logger.info('all data size {}'.format(x_train.shape))
    train_data = lgb.Dataset(x_train, label=y_train, feature_name=usecols)
    del x_train
    gc.collect()

    logger.info('train start')
    clf = lgb.train(min_params,
                    train_data,
                    int(trees * 1.1),
                    valid_sets=[train_data],
                    verbose_eval=10)
    logger.info('train end')
    with open(DIR + 'model.pkl', 'wb') as f:
        pickle.dump(clf, f, -1)
    with open(DIR + 'list_thresh', 'wb') as f:
        pickle.dump(list_thresh, f, -1)
    # del x_train
    gc.collect()
    logger.info('save end')
# ans = model.predict_proba(...)[:, 1]
logger.info('train_end')
"""
if all_ans is None:
    all_ans = ans
    all_target = target[test_idx]
    all_ids = ids.ix[test_idx].values
else:
    all_ans = numpy.r_[all_ans, ans]
    all_target = numpy.r_[all_target, target[test_idx]]
    all_ids = numpy.r_[all_ids, ids.ix[test_idx]]
"""
score_auc = roc_auc_score(target[test_idx], ans)
logger.info('score: %s' % score_auc)
thresh, score = mcc_optimize(ans, target[test_idx])
logger.info('model thresh: %s, score: %s' % (thresh, score))

if add_col is None:
    base_score = score
    base_score_auc = score_auc
    continue
if score > base_score:
    logger.info('col: %s, mcc is good %s' % (add_col, score - base_score))
    feature_column_use.append(add_col)
    pandas.DataFrame(feature_column_use).to_csv('feature_column_use_mcc.csv')
if score_auc > base_score_auc:
        model.fit(data[train_idx],
                  target[train_idx],
                  eval_metric=evalmcc_xgb_min,
                  verbose=False)
        # pred_proba = data[test_idx, -1]
        pred_proba = model.predict_proba(data[test_idx])[:, 1]
        pred_proba_all = numpy.r_[pred_proba_all, pred_proba]
        y_true = numpy.r_[y_true, target[test_idx]]
        score = roc_auc_score(target[test_idx], pred_proba)
        # logger.info('  score: %s' % score)
        # thresh, score = mcc_scoring(model, data[test_idx], target[test_idx])
        list_score.append(score)
        # logger.info('  thresh: %s' % thresh)

    score = numpy.mean(list_score)
    thresh, score = mcc_optimize(pred_proba_all, y_true)
    max_score = max(max_score, score)
    logger.info('thresh: %s, total score: %s, max_score: %s' %
                (thresh, score, max_score))
    if max_score == score:
        best_param = params
        best_thresh = thresh

logger.info('best_thresh: %s, total max score: %s' % (best_thresh, max_score))

model = XGBClassifier(seed=0)
# model = LogisticRegression(n_jobs=-1, class_weight='balanced')
model.set_params(**best_param)
model.fit(data[train_idx],
          target[train_idx],
          eval_metric=evalmcc_xgb_min,
          verbose=False)
with open('stack_model_1.pkl', 'wb') as f:
    pickle.dump(model, f, -1)