# Training rows containing any missing value are dropped; missing values in the
# submission data are filled with 0 instead.
dataset = dataset.dropna()
# `.copy()` gives X its own data, so the column assignments below never write
# through to (or warn about) `dataset`.  A second `X.dropna()` is not needed:
# `dataset` is already NaN-free, and dropping rows from X alone would silently
# misalign it with `y`.
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1].values
submit = submit.fillna(0)

# Replacing 0's and unknown values of gender by unknown.
# The result is assigned back instead of calling `replace(..., inplace=True)` on
# the selected column: an in-place edit of a selected column is a chained
# assignment and is not guaranteed to reach the parent frame.
# NOTE(review): only the string '0' is replaced here, while `fillna(0)` above
# inserts the integer 0 into `submit` -- confirm whether 0 should be listed too,
# as it is for the degree column below.
X['Gender'] = X['Gender'].replace(['0', 'unknown'], ['unknown', 'unknown'])
submit['Gender'] = submit['Gender'].replace(['0', 'unknown'], ['unknown', 'unknown'])

# Replacing values of 0's by No in the university column
X['University Degree'] = X['University Degree'].replace(['0', 0], ['No', 'No'])
submit['University Degree'] = submit['University Degree'].replace(['0', 0], ['No', 'No'])

# Dropping the instance column only as it gives best result
X = X.drop(['Instance'], axis='columns')
submit = submit.drop(['Instance'], axis='columns')

# Train test split of 80 : 20
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Using Categorical Boost Regressor; the held-out split doubles as the eval set.
from catboost import CatBoostRegressor
model = CatBoostRegressor(task_type='GPU', iterations=100000, learning_rate=0.005)
model.fit(X_train, y_train, cat_features=[1, 3, 5, 6, 8], eval_set=(X_test, y_test))
model.score(X_test, y_test)

# Getting the predicted values
ans = model.predict(submit)
def train_model(X, X_test, y, params=None, folds=folds, model_type='lgb', plot_feature_importance=False, model=None):
    """Train one model per CV fold and average the test-set predictions.

    Parameters
    ----------
    X : training features; indexed with ``X.values[...]`` unless
        ``model_type == 'sklearn'``, in which case it is indexed directly.
    X_test : test features to predict with every fold's model.
    y : training target, indexed with the fold index arrays.
    params : dict of model hyper-parameters; ``None`` is treated as ``{}``.
    folds : splitter object providing ``split(X)``.
    model_type : one of ``'lgb'``, ``'xgb'``, ``'sklearn'``, ``'cat'``.
    plot_feature_importance : for ``'lgb'`` only, plot the top-50 features and
        additionally return the importance frame.
    model : estimator to fit when ``model_type == 'sklearn'``.

    Returns
    -------
    ``(oof, prediction)``, or ``(oof, prediction, feature_importance)`` when
    ``model_type == 'lgb'`` and ``plot_feature_importance`` is true.

    Raises
    ------
    ValueError
        If ``model_type`` is not supported or ``folds`` yields no split.
    """
    if model_type not in ('lgb', 'xgb', 'sklearn', 'cat'):
        # Previously an unknown type surfaced later as a NameError on y_pred_valid.
        raise ValueError('Unsupported model_type: {!r}'.format(model_type))
    # `**params` cannot unpack the default None.
    params = {} if params is None else params

    oof = np.zeros(X.shape[0])
    prediction = np.zeros(X_test.shape[0])
    scores = []
    feature_importance = pd.DataFrame()
    # Counted from the splitter itself rather than read from a global `n_fold`,
    # so the averages below always match the folds that were actually run.
    n_splits = 0

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        n_splits += 1
        print('Fold', fold_n, 'started at', time.ctime())
        if model_type == 'sklearn':
            X_train, X_valid = X[train_index], X[valid_index]
        else:
            X_train, X_valid = X.values[train_index], X.values[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=20000, nthread=4, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric='rmse',
                      verbose=1000, early_stopping_rounds=200)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
        elif model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist,
                              early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test.values), ntree_limit=model.best_ntree_limit)
        elif model_type == 'sklearn':
            # The caller-supplied estimator is re-fitted on every fold.
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1,)
            y_pred = model.predict(X_test)
        else:  # 'cat'
            model = CatBoostRegressor(iterations=20000, eval_metric='RMSE', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[],
                      use_best_model=True, verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1,)
        # Fold score is RMSE on the validation part.
        scores.append(mean_squared_error(y_valid, y_pred_valid) ** 0.5)
        prediction += y_pred

        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    if n_splits == 0:
        raise ValueError('folds.split(X) produced no folds')

    prediction /= n_splits
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance["importance"] /= n_splits
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')
            return oof, prediction, feature_importance
    return oof, prediction
# Snippet: splits the three score columns of X (and y) into train/test parts, then
# obtains regressors named model / model2 / model3.  Every training branch is
# disabled with `if False:`; the LightGBM `model` is therefore read from
# ./model_lgb.pk.
# NOTE(review): the original indentation is lost, so it is unclear whether the
# second `if False:` also wraps the CatBoost/XGBoost loading -- confirm before
# editing.
# NOTE(review): unpickling executes arbitrary code; only load model files you trust.
print("Train test split") X_train, X_test, y_train, y_test = train_test_split( X[['score_x', 'score_y', 'score']].values, y.values) if False: model = LGBMRegressor(n_estimators=200) model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True) with open("./model_lgb.pk", "wb") as f: pickle.dump(model, f) else: with open("./model_lgb.pk", "rb") as f: model = pickle.loads(f.read()) if False: if False: model2 = CatBoostRegressor(iterations=1000, learning_rate=0.1) model2.fit(X_train, y_train) with open("./model_cb.pk", "wb") as f: pickle.dump(model2, f) else: with open("./model_cb.pk", "rb") as f: model2 = pickle.load(f) if False: model3 = XGBRegressor(n_estimators=1000, verbosity=2) model3.fit(X_train, y_train) else: with open("./model_xgb.pk", "rb") as f: model3 = pickle.loads(f.read()) pool = Pool(20)
def test_predict_sklearn_regress():
    """Fit a two-iteration regressor on the file-backed pool, save it, and
    return the canonical-model comparison for the saved file."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    regressor = CatBoostRegressor(iterations=2, random_seed=0)
    regressor.fit(learn_pool)
    regressor.save_model(OUTPUT_MODEL_PATH)
    comparison = compare_canonical_models(OUTPUT_MODEL_PATH)
    return comparison
def test_wrong_params_regressor():
    """Constructing a regressor with an unknown keyword must raise CatboostError."""
    unknown_kwargs = {'wrong_param': 1}
    with pytest.raises(CatboostError):
        CatBoostRegressor(**unknown_kwargs)
shuffle=True) # num_bins = np.int(1 + np.log2(len(train))) # bins = pd.cut(train['Global_Sales'], bins=num_bins, labels=False) # for i, (train_idx, valid_idx) in enumerate(skf.split(train, bins.values)): for i, (train_idx, valid_idx) in enumerate(skf.split(train, publisher)): x_train, x_valid = train.iloc[train_idx], train.iloc[valid_idx] y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] # # Folds are split by Publisher, so drop the column from train only after the data has been split # x_train = x_train.drop(drop_column, axis=1) # x_valid = x_valid.drop(drop_column, axis=1) train_data = Pool(x_train, y_train) valid_data = Pool(x_valid, y_valid) model = CatBoostRegressor(**cab_params) model.fit(train_data, eval_set=valid_data, early_stopping_rounds=50, verbose=False, use_best_model=True) cab_valid_pred = model.predict(x_valid) score = mean_squared_error(y_valid, cab_valid_pred)**.5 print(f'Fold {i} CAB RMSLE: {score}') cab_oof_pred[valid_idx] = cab_valid_pred models.append(model) scores.append(score) model = lgbm.LGBMRegressor(**lgbm_params) model.fit(
def run_train_all_sklearn(file, fp_name, cv=5, verbose=0, seed=1):
    """Cross-validate a fixed set of regressors for every synergy target.

    For each target key the matching frame ``file[k]`` is used both as feature
    source and (column ``k``) as target; the target column is dropped inside each
    pipeline.  When the final key ``'name'`` is reached the collected results are
    pickled to disk.

    Parameters
    ----------
    file : mapping-like object indexed by the target keys below; each value is a
        DataFrame (has ``.columns`` and ``.drop``).
    fp_name : str, embedded in the name of the results pickle.
    cv : number of CV folds, also used as ``n_jobs`` for all models but CatBoost.
    verbose : forwarded to ``ProgIter`` and ``cross_validate``.
    seed : seed passed to ``np.random.seed``.

    Returns
    -------
    defaultdict mapping target key -> {model name -> {'test_pearsonr', 'test_rmse'}}.
    """
    np.random.seed(seed)
    c = defaultdict(list)
    targets = [
        'synergy_zip', 'synergy_bliss', 'synergy_loewe', 'synergy_hsa', 'css_ri',
        'name'
    ]
    # total follows the key list instead of a hard-coded 5 (there are 6 entries).
    for k in ProgIter(targets, verbose=verbose, total=len(targets)):
        v = file[k]
        if k != 'name':
            temp = dict()  # for results storage. Assuming that "name" comes last
            if 'drug_row_col' in v.columns:
                # Non-mutating drop: the caller's frame is left untouched.
                v = v.drop(columns=['drug_row_col'])
            cat_cols = ['cell_line_name']
            # manually find all available categories for one-hot
            categories = [v[column].unique() for column in v[cat_cols]]

            # pipelines
            encode = Pipeline(steps=[('one-hot-encode',
                                      OneHotEncoder(categories=categories))])
            processor = ColumnTransformer(transformers=[
                ('cat_encoding', encode, cat_cols),
                ('dropping', 'drop', [k])
            ], remainder='passthrough')
            # CatBoost receives the raw categorical column, so only the target is dropped.
            catbst = ColumnTransformer(transformers=[('dropping', 'drop', [k])],
                                       remainder='passthrough')

            # regressions
            lr = make_pipeline(processor, linear_model.LinearRegression())
            ridge = make_pipeline(processor, linear_model.Ridge())
            lasso = make_pipeline(processor, linear_model.Lasso())
            elastic = make_pipeline(processor, linear_model.ElasticNet())
            lassolars = make_pipeline(processor, linear_model.LassoLars())
            b_ridge = make_pipeline(processor, linear_model.BayesianRidge())
            kernel = DotProduct() + WhiteKernel()
            gpr = make_pipeline(processor, GaussianProcessRegressor(kernel=kernel))
            linSVR = make_pipeline(processor, LinearSVR())
            hist_gbr = make_pipeline(
                processor,
                HistGradientBoostingRegressor(warm_start=True, max_depth=6))
            rfr = make_pipeline(
                processor,
                RandomForestRegressor(warm_start=True, max_depth=6, n_jobs=3))
            iso = make_pipeline(processor, IsotonicRegression(increasing='auto'))
            xgb = make_pipeline(
                processor, XGBRegressor(tree_method='gpu_hist', max_depth=6))
            cbt = make_pipeline(
                catbst,
                CatBoostRegressor(task_type='GPU',
                                  depth=6,
                                  cat_features=np.array([0]),
                                  verbose=False))

            # NOTE(review): `gpr` is listed twice (the second result overwrites the
            # first under the same key) and `xgb` is built but never evaluated --
            # confirm whether the second `gpr` was meant to be `xgb`.
            mls = [
                cbt, rfr, gpr, hist_gbr, lr, ridge, lasso, elastic, lassolars,
                b_ridge, gpr, linSVR, iso
            ]
            mls_names = [
                "cbt", "rfr", "gpr", "hist_gbr", "lr", "ridge", "lasso", "elastic",
                "lassolars", "b_ridge", "gpr", "linSVR", "iso"
            ]

            # results
            start = time.time()
            for MODEL, name in zip(mls, mls_names):
                print(f'\n{name}')
                # The GPU CatBoost model is cross-validated in a single process.
                n_jobs = 1 if name == 'cbt' else cv
                cv_dict = cross_validate(
                    MODEL,
                    v,
                    v[k],
                    cv=cv,
                    scoring={
                        "pearsonr": pearson,
                        "rmse": rmse
                    },
                    return_train_score=False,
                    verbose=verbose,
                    n_jobs=n_jobs,
                )
                temp[name] = {
                    'test_pearsonr': np.nanmean(cv_dict['test_pearsonr']),
                    'test_rmse': abs(np.nanmean(cv_dict['test_rmse']))
                }
                print(temp[name])
            print(f'{k} took {int(time.time()-start)/60} mins')
            c[k] = temp
        else:
            nm = f'/tf/notebooks/code_for_pub/_logs_as_python_files/{fp_name}_13models_5foldCV_{time.ctime()}.pickle'
            # A distinct handle name: the original rebound the `file` parameter
            # here, which only worked because 'name' is the last key.
            with open(nm, 'wb') as out_handle:
                pickle.dump(c, out_handle)
            print(f'saving complete to {nm}')
    return c
from catboost import CatBoostRegressor

# Restore the fitted regressor; load_model() hands back the estimator it was
# called on, so the result is bound directly.
model = CatBoostRegressor()
model = model.load_model("fitted_model")

# Feature order of the single sample:
# floor_number, total_floors, area, latitude, longitude, apt_state
model.predict([
    [2, 5, 45, 53.908681, 27.572759, 1],
])
# Snippet: reads the month-3 train/test CSVs and passes the categorical column
# names in `cat_ff` to name_to_col_num (defined elsewhere; presumably returns
# column positions -- confirm) against the frame without 'value' and 'bulk_id'.
# When CROSS_VALIDATION is set, rows whose `date1` is below the int64 value of
# 2017-12-01 form the training part and the remaining rows the validation part.
# The snippet is cut off inside the final model.fit( call.
train = pd.read_csv('FINAL_TRAIN_month3.csv') test = pd.read_csv('FINAL_TEST_month3.csv') # getting cat features indexes cat_ff = [ 'date1', 'month', 'Класс объекта', 'Огорожена территория', 'Входные группы', 'Спортивная площадка', 'Автомойка', 'Кладовые', 'Колясочные', 'Кондиционирование', 'Вентлияция', 'Лифт', 'Система мусоротведения', 'Видеонаблюдение', 'Подземная парковка', 'Двор без машин', 'most_otdelka', 'most_vid', 'most_plan_size' ] cat_ff = name_to_col_num(train.drop(['value', 'bulk_id'], axis=1), cat_ff) if CROSS_VALIDATION: model = CatBoostRegressor(random_state=19, iterations=1500) # model = CatBoostRegressor(random_state=1, iterations=1300, learning_rate=0.03, depth=10) local_validation_cutoff = pd.DatetimeIndex(['2017-12-01' ]).astype(np.int64)[0] X_train = train[train.date1 < local_validation_cutoff].drop( ['value', 'bulk_id'], axis=1) y_train = train[train.date1 < local_validation_cutoff]['value'] X_validation = train[train.date1 >= local_validation_cutoff].drop( ['value', 'bulk_id'], axis=1) y_validation = train[train.date1 >= local_validation_cutoff]['value'] f_pool = Pool(X_train, y_train, cat_features=cat_ff) model.fit( X_train, y_train,
def main(args):
    """Run the GBDT-guided architecture search loop.

    A first batch of ``args.n`` sampled architectures is trained to seed the
    predictor.  Then, for ``args.iterations`` rounds: a CatBoost regressor is
    fitted on (architecture -> validation score), the search space is pruned,
    ``args.m`` new architectures are sampled and ranked by predicted score, the
    top ones (up to index ``args.k``) are trained, and the best ``args.k_test``
    architectures seen so far are evaluated on the test set.  Logits and the
    round state are pickled under ``embeddings/`` and ``cache/gbdt/``.

    Architectures whose training raises a CUDA ``RuntimeError`` are skipped;
    any other ``RuntimeError`` is re-raised.
    """
    # build search space
    data = load_data(args.dataset, args.seed)
    ss, _ = pruning_search_space_by_eda(data)
    if data.setting == 'inductive':
        trainer = InductiveTrainer()
    else:
        trainer = TransductiveTrainer()
    sampler = Sampler(args.dataset, ss)

    archs = []
    val_scores = []
    top_archs = []
    top_val_scores = []
    top_test_scores = []

    # init training data for GBDT
    sampled_archs = sampler.sample(args.n)
    i = 0
    while i < len(sampled_archs):
        arch = sampled_archs[i]
        data = sampler.load_data(arch)
        try:
            model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
            trainer.init_trainer(model, arch[7], arch[6])
            val_score = trainer.train(data)
        except RuntimeError as e:
            if "cuda" in str(e) or "CUDA" in str(e):  # CUDA OOM, sample another arch
                print(e)
                sampled_archs += sampler.sample(1)
                i += 1
                continue
            raise
        archs.append(arch)
        val_scores.append(val_score)
        print(arch, f'real val score: {val_score}')
        print(f'Number of evaluated archs: {len(archs)}')
        i += 1

    # train GBDT predictor
    for iter_round in range(1, args.iterations + 1):
        print(f'Iteration round {iter_round}, ReTraining model and sampling archs...',
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        # train GBDT; every architecture entry is treated as a categorical feature
        X = [[str(e) for e in row] for row in archs]
        y = np.array(val_scores)
        train_pool = Pool(X, y, cat_features=list(range(len(X[0]))))
        gbdt_model = CatBoostRegressor(
            learning_rate=args.gbdt_lr,
            verbose=False
        )
        gbdt_model.fit(train_pool)

        # pruning search space
        ss = pruning_search_space_by_shap(archs, gbdt_model, ss, args.p)
        sampler.update_search_space(ss)

        # predict some archs
        sampled_archs = sampler.sample(args.m)
        X = [[str(e) for e in row] for row in sampled_archs]
        test_pool = Pool(X, cat_features=list(range(len(X[0]))))
        predicted_val_scores = gbdt_model.predict(test_pool)

        # sort the archs according to the predicted value
        zipped = zip(sampled_archs, predicted_val_scores)
        zipped = sorted(zipped, key=lambda e: e[1], reverse=True)  # sort in decreasing order
        sampled_archs, predicted_val_scores = zip(*zipped)
        sampled_archs, predicted_val_scores = list(sampled_archs), list(predicted_val_scores)

        print(f'Iteration round {iter_round}, evaluating top k archs on valid set',
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        # evaluate top k archs
        i = 0
        while i < len(sampled_archs):
            arch = sampled_archs[i]
            data = sampler.load_data(arch)
            try:
                model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
                trainer.init_trainer(model, arch[7], arch[6])
                val_score = trainer.train(data)
            except RuntimeError as e:
                if "cuda" in str(e) or "CUDA" in str(e):  # CUDA OOM, sample another arch
                    print(e)
                    sampled_archs += sampler.sample(1)
                    i += 1
                    continue
                raise
            # Replacement archs appended after an OOM have no predicted score;
            # indexing past the prediction list used to raise IndexError.
            if i < len(predicted_val_scores):
                predicted_val_score = predicted_val_scores[i]
            else:
                predicted_val_score = None
            archs.append(arch)
            val_scores.append(val_score)
            print(arch, f'predicted val score: {predicted_val_score} | real val score: {val_score}')
            print(f'Number of evaluated archs: {len(archs)}')
            if i + 1 >= args.k:
                break
            i += 1

        # sort all the evaluated archs
        zipped = zip(archs, val_scores)
        zipped = sorted(zipped, key=lambda e: e[1], reverse=True)
        archs, val_scores = zip(*zipped)
        archs, val_scores = list(archs), list(val_scores)

        print(f'Iteration round {iter_round}, evaluating top k_test archs on test set',
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        # evaluate top k_test archs on test set
        i = 0
        while i < len(archs):
            arch = archs[i]
            data = sampler.load_data(arch)
            try:
                model = sampler.build_model(arch, data.x.shape[1], int(max(data.y)) + 1)
                trainer.init_trainer(model, arch[7], arch[6])
                val_score = trainer.train(data)
                test_score, z = trainer.test(data, return_logits=True)
                # Context manager so the file is flushed and closed right away.
                emb_path = f'embeddings/{args.dataset}_AutoGRL-round{iter_round}-top{i + 1}.pt'
                with open(emb_path, 'wb') as emb_file:
                    pickle.dump((z, data.y[data.test_mask]), emb_file)
            except RuntimeError as e:
                if "cuda" in str(e) or "CUDA" in str(e):  # CUDA OOM, skip this arch
                    print(e)
                    i += 1
                    continue
                raise
            top_archs.append(arch)
            top_val_scores.append(val_score)
            top_test_scores.append(test_score)
            print(arch)
            print(f'Testing... round {iter_round} | arch top {i + 1} | real val score {val_score} | real test score {test_score}',
                  datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            if i + 1 >= args.k_test:  # only test top k_test models for every round
                break
            i += 1

        zipped = zip(top_val_scores, top_test_scores)
        zipped = sorted(zipped, key=lambda e: e[0], reverse=True)
        best_val_score, corr_test_score = zipped[0][0], zipped[0][1]

        # logging
        print(f'Iteration {iter_round} | best val score {best_val_score} | corresponding test score {corr_test_score} | best test score {max(top_test_scores)}',
              datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        cache_path = f'cache/gbdt/{args.dataset}_seed{args.seed}_round{iter_round}.pt'
        with open(cache_path, 'wb') as cache_file:
            pickle.dump((ss, sampler, trainer, archs, val_scores, gbdt_model, sampled_archs,
                         predicted_val_scores, top_val_scores, top_test_scores), cache_file)
from catboost import CatBoostRegressor
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the data
X, y = load_diabetes(return_X_y=True)

# Split the dataset (30% held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Train the model
clf = CatBoostRegressor(
    iterations=800,
    learning_rate=0.8,
    depth=6,
    loss_function='RMSE',
)
fit_model = clf.fit(X_train, y_train)

# Model parameters
print(fit_model.get_params())

# Predict on the held-out part
y_pred = clf.predict(X_test)

# Evaluate the model
print(f'mean squared error: {mean_squared_error(y_test, y_pred)}')
print(f'r2 score: {r2_score(y_test, y_pred)}')
def main():
    """Predict a well log on every seismic slice with a per-slice CatBoost model.

    Reads the wells CSV and the list of SEG-Y attribute files named on the
    command line.  For each slice between ``startendslice[0]`` and
    ``startendslice[1]`` it extracts the attribute values, fits a
    ``CatBoostRegressor`` on the wells found at that slice, predicts the log at
    every trace, and collects QC metrics.  Outputs: a slices CSV, a QC CSV, a
    zeroed copy of the first SEG-Y filled with the predictions, a wells
    prediction CSV, and (every ``plotincrement`` slices) a cross-plot PDF plus
    the per-slice well/attribute CSV.
    """
    cmdl = getcommandline()
    if cmdl.wellscsv:
        allwells = pd.read_csv(cmdl.wellscsv)
        # Vertical increment is taken from the second column of the wells file.
        dz = np.diff(allwells[allwells.columns[1]])[2]
        print('Well Vertical increment {}'.format(dz))
        wdirsplit, wfextsplit = os.path.split(cmdl.wellscsv)
        wfname, wfextn = os.path.splitext(wfextsplit)
        wcols = allwells.columns.tolist()
        print(wcols)
        logname = wcols[-1]
        print('logname:', logname)
        lognamepred = logname + 'pred'
        wcols.append(lognamepred)
        if cmdl.outdir:
            outfw = os.path.join(cmdl.outdir, wfname) + "_pred.csv"
        else:
            outfw = os.path.join(wdirsplit, wfname) + "_pred.csv"

    # NOTE(review): everything below relies on names set in the wells branch
    # above (dz, wfname, logname, ...), so both inputs are effectively required.
    if cmdl.segyfileslist:
        sflist = process_segylist(cmdl.segyfileslist)
        dirsplit, fextsplit = os.path.split(sflist[0])
        fname, fextn = os.path.splitext(fextsplit)
        if cmdl.outdir:
            outfsegy = os.path.join(cmdl.outdir, wfname) + "_p%s.sgy" % (logname)
        else:
            outfsegy = os.path.join(dirsplit, wfname) + "_p%s.sgy" % (logname)
        print('Copying file, please wait ........')
        start_copy = datetime.now()
        copyfile(sflist[0], outfsegy)
        end_copy = datetime.now()
        print('Duration of copying: {}'.format(end_copy - start_copy))
        sr = get_samplerate(outfsegy)
        print('Seismic Sample Rate: {}'.format(sr))
        print('Zeroing segy file, please wait ........')
        start_zero = datetime.now()
        zero_segy(outfsegy)
        end_zero = datetime.now()
        print('Duration of zeroing: {}'.format(end_zero - start_zero))

        # NOTE(review): get_xy receives the bare file name of the first SEG-Y
        # (no directory) -- confirm this is intended.
        xclst, yclst = get_xy(fextsplit, cmdl.segyxhdr, cmdl.segyyhdr, cmdl.xyscalerhdr)
        xydf = pd.DataFrame({'XC': xclst, 'YC': yclst})
        preddf = xydf.copy()

        # One attribute column per SEG-Y file, named after the file.
        scols = list()
        for f in sflist:
            dirsplit, fextsplit = os.path.split(f)
            fname, fextn = os.path.splitext(fextsplit)
            scols.append(fname)

        sfname = 'allattrib'
        sstart = int(cmdl.startendslice[0] // dz)
        send = int(cmdl.startendslice[1] // dz)
        start_process = datetime.now()

        slicelst = list()
        slicenumlst = list()
        wnlst = list()
        slicewnlst = list()
        coef0lst = list()
        coef1lst = list()
        r2lst = list()

        for slicenum in range(sstart, send):
            if cmdl.outdir:
                outfslice = os.path.join(cmdl.outdir, sfname) + "_slice%d.csv" % slicenum
            else:
                outfslice = os.path.join(dirsplit, sfname) + "_slice%d.csv" % slicenum

            zslice = slicenum * dz
            if cmdl.intime:
                wdf = allwells[allwells.TIME == zslice]
            else:
                wdf = allwells[allwells.DEPTH == zslice]
            c = wdf.columns[4]  # log name
            # Number of wells with a log value at this slice.  Counted on the
            # column itself; the former `.count()[4]` relied on positional
            # integer lookup in a label-indexed Series.
            nw = wdf[c].notnull().sum()
            if cmdl.intime:
                print('# of wells for time slice {} is {}'.format(zslice, nw))
            else:
                print('# of wells for depth slice {} is {}'.format(zslice, nw))

            slicefiles = [get_slice(sfile, slicenum) for sfile in sflist]
            slicear = np.array(slicefiles).T
            slicedf = pd.DataFrame(slicear, columns=scols)
            alldata = pd.concat((xydf, slicedf), axis=1)
            if cmdl.intime:
                print('Slice#: {} @ Time : {} ms'.format(slicenum, zslice))
            else:
                print('Slice#: {} @ Depth : {} ms'.format(slicenum, zslice))
            if cmdl.slicesout:
                alldata.to_csv(outfslice, index=False)

            alldatas = process_sscalecols(alldata, includexy=cmdl.includexy)
            wdfsa = process_seiswellattrib(alldatas, wdf, cmdl.intime)
            print(wdfsa.tail())

            # Columns 4..-1 are the attributes, the last column is the log.
            X = wdfsa.iloc[:, 4:-1]
            y = wdfsa.iloc[:, -1]
            inshape = y.size  # number of real wells, before any generated samples
            if y.size > 2 and cmdl.generatesamples:
                X, y = gensamples(X,
                                  y,
                                  nsamples=cmdl.generatensamples,
                                  ncomponents=cmdl.generatencomponents,
                                  kind='r',
                                  func='cbr')
            Xpred = alldatas.iloc[:, 2:]

            model = CatBoostRegressor(iterations=cmdl.cbriterations,
                                      learning_rate=cmdl.cbrlearningrate,
                                      depth=cmdl.cbrdepth,
                                      loss_function='RMSE',
                                      random_seed=42,
                                      logging_level='Silent')
            # Fit model
            model.fit(X, y)
            # Get predictions
            ypred = model.predict(X)
            # Calculating Mean Squared Error
            mse = np.mean((ypred - y)**2)
            print('Metrics on input data: ')
            print('MSE: %.4f' % (mse))
            r2 = r2_score(y, ypred)
            print('R2 : %10.3f' % r2)
            ccmdl = sts.pearsonr(y, ypred)

            if slicenum == sstart:
                wellsdf = wdfsa[wdfsa.columns[:4]].copy()
                wellsdf[logname] = wdfsa[wdfsa.columns[-1]].copy()
                if cmdl.generatesamples:
                    wellsdf[lognamepred] = ypred[:inshape]
                else:
                    wellsdf[lognamepred] = ypred
                # Defined on the first slice too, so a single-slice run no
                # longer fails with an unbound `allwellspred` at the end.
                allwellspred = wellsdf
            else:
                wellsdf0 = wdfsa[wdfsa.columns[:4]].copy()
                wellsdf0[logname] = wdfsa[wdfsa.columns[-1]].copy()
                if cmdl.generatesamples:
                    wellsdf0[lognamepred] = ypred[:inshape]
                else:
                    wellsdf0[lognamepred] = ypred
                # DataFrame.append was removed in pandas 2.0; concat is equivalent.
                allwellspred = pd.concat([wellsdf, wellsdf0])
                wellsdf = allwellspred[wcols].copy()
                print(allwellspred.tail())
                print(allwellspred.shape)

            pred = model.predict(Xpred)
            alldatas[wdfsa.columns[4]] = pred
            slicestr = '{:.0f}'.format(zslice)
            preddf[slicestr] = pred

            # Linear fit of predicted vs actual, kept as QC coefficients.
            qc0 = np.polyfit(y, ypred, 1)
            xrngmin, xrngmax = y.min(), y.max()
            xvi = np.linspace(xrngmin, xrngmax)
            yvi0 = np.polyval(qc0, xvi)

            if slicenum % cmdl.plotincrement == 0:
                slicedepth = slicenum * dz
                fig, ax = plt.subplots()
                plt.scatter(y, ypred, alpha=0.5, c='b', s=15, label='Model Predicted')
                if cmdl.generatesamples:
                    ax.scatter(y[inshape:],
                               ypred[inshape:],
                               c='r',
                               marker='X',
                               s=25,
                               label='Generated Samples')
                plt.plot(xvi, yvi0, c='k', lw=2)
                ax.annotate('Model = %-.*f * Actual + %-.*f' % (2, qc0[0], 2, qc0[1]),
                            xy=(xvi[0], yvi0[0]),
                            xytext=(0.14, 0.85),
                            textcoords='figure fraction',
                            fontsize=10)
                ax.annotate('Model Pearson cc = %-.*f Pearson p = %-.*f' %
                            (2, ccmdl[0], 3, ccmdl[1]),
                            xy=(xvi[0], yvi0[0]),
                            xytext=(0.14, 0.81),
                            textcoords='figure fraction',
                            fontsize=10)
                ax.set_title(f'CBR Slice {slicedepth:.0f} Pseudo {logname}')
                ax.set_xlabel('Actual')
                ax.set_ylabel('Predicted')
                if not cmdl.hideplots:
                    plt.show()
                swfname = 'SWAttrib'
                if cmdl.outdir:
                    pdfcl = os.path.join(
                        cmdl.outdir,
                        swfname) + f"{slicedepth:.0f}" + "_cbr%s.pdf" % (logname)
                    wsdf = os.path.join(
                        cmdl.outdir,
                        swfname) + f"{slicedepth:.0f}" + "_cbr%s.csv" % (logname)
                else:
                    pdfcl = os.path.join(
                        dirsplit,
                        swfname) + f"{slicedepth:.0f}" + "_cbr%s.pdf" % (logname)
                    wsdf = os.path.join(
                        dirsplit,
                        swfname) + f"{slicedepth:.0f}" + "_cbr%s.csv" % (logname)
                fig.savefig(pdfcl)
                # Release the figure; otherwise one stays open per plotted slice.
                plt.close(fig)
                wdfsa.to_csv(wsdf, index=False)
                print(f'Successfully generated {wsdf}')

            slicelst.append(zslice)
            wnlst.append(nw)
            slicewnlst.append(wdfsa.shape[0])
            slicenumlst.append(slicenum)
            r2lst.append(r2)
            coef0lst.append(qc0[0])
            coef1lst.append(qc0[1])

        end_process = datetime.now()
        print('Duration of ML model building and prediction : {}'.format(
            end_process - start_process))

        qccols = [
            'SLICENUM', 'SLICEZ', 'WELLSFOUND', 'WELLSUSED', 'COEF0', 'COEF1', 'R2'
        ]
        qcdf = pd.DataFrame({
            'SLICENUM': slicenumlst,
            'SLICEZ': slicelst,
            'WELLSFOUND': wnlst,
            'WELLSUSED': slicewnlst,
            'COEF0': coef0lst,
            'COEF1': coef1lst,
            'R2': r2lst
        })
        qcdf = qcdf[qccols].copy()

        if cmdl.outdir:
            outseispred = os.path.join(cmdl.outdir, wfname) + "_slices.csv"
            outqc = os.path.join(cmdl.outdir, wfname) + "_qc.csv"
        else:
            outseispred = os.path.join(dirsplit, wfname) + "_slices.csv"
            outqc = os.path.join(dirsplit, wfname) + "_qc.csv"
        preddf.to_csv(outseispred, index=False)
        print('Successfully generated {}'.format(outseispred))
        print('DataFrame size: ', preddf.shape)
        # Number of predicted slices = all columns except XC and YC.
        endsmpl = preddf.shape[1] - 2
        qcdf.to_csv(outqc, index=False)
        print('Successfully generated {}'.format(outqc))

        # Write each trace's predicted samples into the zeroed SEG-Y copy.
        with sg.open(outfsegy, "r+") as srcp:
            for trnum, tr in enumerate(srcp.trace):
                trplog = preddf.iloc[trnum, 2:].values
                tr[sstart:(sstart + endsmpl)] = trplog
                srcp.trace[trnum] = tr
        print('Successfully generated {}'.format(outfsegy))

        allwellspred.to_csv(outfw, index=False)
        print('Successfully generated {}'.format(outfw))
        plotwells(allwellspred, hideplots=cmdl.hideplots)
pred = model.predict(X_test_sn)
print(sklearn.metrics.mean_absolute_error(y_test_sn, pred))

from sklearn.model_selection import KFold
from catboost import CatBoostRegressor

# Train one CatBoost model per fold; all of them are averaged at prediction time.
kfolds = 4
models = []
kfold = KFold(n_splits=kfolds, shuffle=True)
for i, (train_index, test_index) in enumerate(kfold.split(X)):
    print('Training cat model with fold {}...'.format(i + 1))
    # KFold yields positional indices, so `.iloc` is the matching indexer
    # (the `.ix` indexer used before was removed in pandas 1.0).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    # NOTE(review): assumes `y` supports positional indexing with an index
    # array (e.g. a NumPy array or a Series with a default index) -- confirm.
    y_train, y_test = y[train_index], y[test_index]
    model = CatBoostRegressor(iterations=200, learning_rate=0.03, depth=6,
                              l2_leaf_reg=3, loss_function='MAE', eval_metric='MAE')
    models.append(model.fit(X_train, y_train))

# One averaged prediction per month: the 'month' feature is overwritten and every
# fold model contributes 1/kfolds of the final value.
months = np.array([10, 11, 12])
y_pred = []
for i, month in enumerate(months):
    pred = 0
    print(month)
    if month != 0:
        X_validation['month'] = month
    for model in models:
        print('next model...')
        pred += model.predict(X_validation) / kfolds
    y_pred.append(pred)
from pytrends.request import TrendReq
import datetime
import pandas as pd
import regex as re
from catboost import CatBoostRegressor, Pool

# Both regressors are loaded once at import time from the models/ directory.
post_model = CatBoostRegressor()
doc_model = CatBoostRegressor()
post_model.load_model("models/posts_model")
doc_model.load_model("models/doc_model")


def trends(topic):
    """Return the mean hourly Google Trends interest for *topic* over the last week.

    The 7-day window ends at the current local time.  The last 168 hourly
    samples (7 * 24) are summed and divided by 168.

    Parameters
    ----------
    topic : str
        Search term passed to pytrends.

    Returns
    -------
    float
        Sum of the hourly interest values divided by 168.
    """
    hours_in_week = 7 * 24
    end = datetime.datetime.now()
    # Real date arithmetic: the former `day - 7` produced day numbers <= 0
    # (and the wrong month/year) during the first week of every month.
    start = end - datetime.timedelta(days=7)
    pytrends = TrendReq(hl='ru-RU', tz=360)
    interest = pytrends.get_historical_interest(
        [topic],
        year_start=start.year, month_start=start.month,
        day_start=start.day, hour_start=start.hour,
        year_end=end.year, month_end=end.month,
        day_end=end.day, hour_end=end.hour,
        cat=0, geo='', gprop='', sleep=0)[topic]
    # Positional tail slice: the former loop over `smth[-i]` for i in 0..166
    # read `smth[0]` at i == 0 and summed only 167 samples while dividing by 168.
    score = interest.iloc[-hours_in_week:].sum()
    return float(score / hours_in_week)
def test_benchmark_classification(self):
    """Benchmark every selection method on iris and check the per-feature
    scores of the deterministic selectors against their reference values."""
    data, label = get_data_label(load_iris())
    num_features = 3
    corr_threshold = 0.5
    alpha = 1000
    tree_params = {"random_state": 123, "n_estimators": 100}

    # Selectors are registered in a fixed order; the benchmark output follows it.
    selectors = {}
    for key, method in (("corr_pearson", "pearson"),
                        ("corr_kendall", "kendall"),
                        ("corr_spearman", "spearman")):
        selectors[key] = SelectionMethod.Correlation(corr_threshold, method=method)
    for key, method in (("univ_anova", "anova"),
                        ("univ_chi_square", "chi_square"),
                        ("univ_mutual_info", "mutual_info")):
        selectors[key] = SelectionMethod.Statistical(num_features, method=method)
    selectors["linear"] = SelectionMethod.Linear(num_features, regularization="none")
    selectors["lasso"] = SelectionMethod.Linear(num_features, regularization="lasso", alpha=alpha)
    selectors["ridge"] = SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha)
    selectors["random_forest"] = SelectionMethod.TreeBased(num_features)
    for key, estimator_cls in (("xgboost_clf", XGBClassifier),
                               ("xgboost_reg", XGBRegressor),
                               ("extra_clf", ExtraTreesClassifier),
                               ("extra_reg", ExtraTreesRegressor),
                               ("lgbm_clf", LGBMClassifier),
                               ("lgbm_reg", LGBMRegressor),
                               ("gradient_clf", GradientBoostingClassifier),
                               ("gradient_reg", GradientBoostingRegressor),
                               ("adaboost_clf", AdaBoostClassifier),
                               ("adaboost_reg", AdaBoostRegressor)):
        selectors[key] = SelectionMethod.TreeBased(num_features,
                                                   estimator=estimator_cls(**tree_params))
    # The CatBoost estimators additionally run silently.
    selectors["catboost_clf"] = SelectionMethod.TreeBased(
        num_features, estimator=CatBoostClassifier(**tree_params, silent=True))
    selectors["catboost_reg"] = SelectionMethod.TreeBased(
        num_features, estimator=CatBoostRegressor(**tree_params, silent=True))

    # Benchmark
    score_df, selected_df, runtime_df = benchmark(selectors, data, label, output_filename=None)
    _ = calculate_statistics(score_df, selected_df)

    # Reference scores, one value per iris feature, checked in this order.
    expected_scores = [
        ("corr_pearson",
         [0.7018161715727902, 0.47803395524999537, 0.8157648279049796, 0.7867331225527027]),
        ("corr_kendall",
         [0.6127053183332257, 0.35502921869499415, 0.6778502590804124, 0.6548312268837866]),
        ("corr_spearman",
         [0.7207411401565564, 0.4413611232398492, 0.7823000090067262, 0.7652468370362326]),
        ("univ_anova",
         [119.26450218449871, 49.16004008961098, 1180.1611822529776, 960.0071468018025]),
        ("univ_chi_square",
         [10.81782087849401, 3.7107283035324987, 116.31261309207022, 67.04836020011116]),
        ("univ_mutual_info",
         [0.4742659474041446, 0.2458627871667194, 0.9899864089960027, 0.9892550496360593]),
        ("linear",
         [0.28992981466266715, 0.5607438535573831, 0.2622507287680856, 0.04272068858604694]),
        ("lasso",
         [0.7644807315853743, 0.594582626209646, 0.3661598482641388, 1.0152555188158772]),
        ("ridge",
         [1.646830819860649e-15, 1.572815951552305e-15, 3.2612801348363973e-15, 5.773159728050814e-15]),
        ("random_forest",
         [0.09210348279677849, 0.03045933928742506, 0.4257647994615192, 0.45167237845427727]),
    ]
    for column, reference in expected_scores:
        self.assertListAlmostEqual(reference, score_df[column].to_list())
# Snippet: a 5-nearest-neighbour classifier is fitted on the NaN-free training
# features against the label (y_train > 170); its prediction is added to the
# train, validation and test frames as the extra feature 'high_low_ind'.  A
# CatBoostRegressor is then trained on GPU with the validation Pool as eval set
# and early stopping after 500 rounds, the fold's elapsed time is recorded via
# update_tracking, and the model is saved.  The snippet is cut off inside the
# save_model( call.
# For IJHN create 2 clusters for high and low group knclf = KNeighborsClassifier(n_neighbors=5) y_kn = [1 if x > 170 else 0 for x in y_train] knclf.fit(X_train_nona, y_kn) X_train['high_low_ind'] = knclf.predict(X_train_nona) X_valid['high_low_ind'] = knclf.predict(X_valid_nona) X_test_type['high_low_ind'] = knclf.predict( X_test_type[X_train_nona.columns]) train_dataset = Pool(data=X_train, label=y_train) valid_dataset = Pool(data=X_valid, label=y_valid) test_dataset = Pool(data=X_test_type) model = CatBoostRegressor( iterations=N_ESTIMATORS, learning_rate=LEARNING_RATE, depth=DEPTH, eval_metric=EVAL_METRIC, verbose=VERBOSE, random_state=RANDOM_STATE, thread_count=N_THREADS, #loss_function=EVAL_METRIC, task_type="GPU") # Train on GPU model.fit(train_dataset, eval_set=valid_dataset, early_stopping_rounds=500) now = timer() update_tracking(run_id, '{}_tr_sec_f{}'.format(bond_type, fold_n + 1), (now - fold_start), integer=True) logger.info('Saving model file') model.save_model('models/{}-{}-{}-{}.model'.format(
# Snippet: 5-fold cross-validation of a CatBoostRegressor; cut off inside the
# cross_validate( call.
# NOTE(review): `best_params` is defined but not passed to the model in the
# visible code, and it contains 'scale_pos_weight' -- confirm it is still needed.
# NOTE(review): scoring='roc_auc' is applied to a regressor trained with RMSE --
# confirm that the target is binary and that this metric is intended.
from sklearn.model_selection import cross_validate from catboost import CatBoostRegressor best_params = { 'bagging_temperature': 0.6, 'border_count': 200, 'depth': 8, 'iterations': 350, 'l2_leaf_reg': 30, 'learning_rate': 0.30, 'random_strength': 0.01, 'scale_pos_weight': 0.48 } model = CatBoostRegressor(iterations=1000, depth=3, learning_rate=0.1, loss_function='RMSE') # model = LogisticRegression( # penalty='l2', # C=1.0, # fit_intercept=True, # random_state=432, # solver = 'liblinear', # max_iter = 1000, # ) stats = cross_validate(model, X, y, groups=None, scoring='roc_auc', cv=5,
# Snippet (starts inside a column list, ends inside a cross_val_score call):
# builds an XGBoost regressor and a CatBoost regressor, fits the CatBoost model
# on train['registered_log'], assembles a StackingCVRegressor with a linear
# meta-regressor, and cross-validates the CatBoost model on the casual and
# registered log targets.
# NOTE(review): `params` is defined but CatBoostRegressor(1000) is built without
# it in the visible code -- confirm which settings are intended.
'Hour', 'week_day', 'Year', 'Day', 'season' ] xgb_model = xgb.XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.03, colsample_bytree=0.8, subsample=0.7, booster='gbtree') xgb_cols = [ 'weather', 'atemp', 'humidity', 'windspeed', 'holiday', 'workingday', 'Hour', 'week_day', 'Year', 'Day', 'season' ] params = {'depth': 6, 'learning_rate': 0.05, 'iterations': 150} cat_model = CatBoostRegressor(1000) cat_model.fit(train[xgb_cols], train['registered_log']) lr = LinearRegression() streg_model = StackingCVRegressor( regressors=[cat_model, rf_model, gbm_model, xgb_model], meta_regressor=lr) scores_casual_cat = cross_val_score(cat_model, train[xgb_cols], train['casual_log'], cv=5, scoring=make_scorer( log_rmsle, greater_is_better=False)) scores_r_cat = cross_val_score(cat_model, train[xgb_cols], train['registered_log'],
# Snippet (starts inside a parameter mapping whose visible entries are all
# commented out): builds a CatBoostRegressor from `param`, passes it to
# cluster_id (defined elsewhere) to obtain `pred_cat`, then creates a Lasso model.
# NOTE(review): cluster_id is called with param=None and skip=True although the
# model was built from `param` -- confirm this is intended.
# 'grow_policy': 'Depthwise', # 'l2_leaf_reg': 126, # 'learning_rate': 0.30065425194784257, # 'max_depth': 16, # #'max_leaves': 54, # 'min_data_in_leaf': 90, # 'random_strength': 10, # 'iterations':2000, # 'eval_metric': 'RMSE', # 'random_seed':13, # 'verbose':25, # 'task_type': 'GPU', # 'od_type':'Iter', # 'od_wait': 20 } model_cat = CatBoostRegressor(**param) # model.fit(train_pool, eval_set=validation_pool, verbose= True) # cost(np.log1p(y_test), model.predict(validation_pool)) pred_cat = cluster_id('cat', train_new, test_new, 'building_id', model_cat, num_cluster, num_iters, skip=True, param=None) print('Cat predict end') ####lasso model_lasso = Lasso(alpha=1, random_state=13)
def catboost_regressor_learner(df: pd.DataFrame,
                               features: List[str],
                               target: str,
                               learning_rate: float = 0.1,
                               num_estimators: int = 100,
                               extra_params: Dict[str, Any] = None,
                               prediction_column: str = "prediction",
                               weight_column: str = None) -> LearnerReturnType:
    """
    Fits a CatBoost regressor to the dataset. It first generates a Pool
    with the specified features and labels from `df`. Then it fits a CatBoost
    model to this Pool. Returns the predict function for the model and the
    predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All
        these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the
        model. This column should be numerical and continuous, since this is
        a regression model.

    learning_rate : float
        Float in range [0,1].
        Step size shrinkage used in update to prevent overfitting. After each
        boosting step, we can directly get the weights of new features, and
        eta shrinks the feature weights to make the boosting process more
        conservative.
        See the eta hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    num_estimators : int
        Int in range [0, inf]
        Number of boosted trees to fit.
        See the n_estimators hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the CatBoost model. See the list in:
        https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.

    weight_column : str, optional
        The name of the column with scores to weight the data.
    """
    # Imported lazily so catboost is only required when this learner is used.
    from catboost import Pool, CatBoostRegressor
    import catboost

    weights = df[weight_column].values if weight_column else None
    params = extra_params if extra_params else {}
    # `learning_rate` always wins over an "eta" entry in `extra_params`.
    params = assoc(params, "eta", learning_rate)

    feature_names = list(map(str, features))
    dtrain = Pool(df[features].values, df[target].values,
                  weight=weights, feature_names=feature_names)

    cat_boost_regressor = CatBoostRegressor(iterations=num_estimators, **params)
    cbr = cat_boost_regressor.fit(dtrain, verbose=0)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
        dtest = Pool(new_df[features].values, feature_names=feature_names)
        col_dict = {prediction_column: cbr.predict(dtest)}

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(cbr)
            shap_values = list(explainer.shap_values(new_df[features]))
            shap_expected_value = explainer.expected_value

            shap_output = {
                "shap_values": shap_values,
                "shap_expected_value": np.repeat(shap_expected_value, len(shap_values))
            }
            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    # `p` accepts `apply_shap`, so the generated docstring must be built with
    # shap=True for that argument to be documented.
    p.__doc__ = learner_pred_fn_docstring("CatBoostRegressor", shap=True)

    # NOTE(review): the log key is 'catboost_regression_learner' while the
    # function is named 'catboost_regressor_learner'; kept as-is because log
    # consumers may already depend on this key.
    log = {
        'catboost_regression_learner': {
            'features': features,
            'target': target,
            'prediction_column': prediction_column,
            'package': "catboost",
            'package_version': catboost.__version__,
            'parameters': assoc(params, "num_estimators", num_estimators),
            'feature_importance': cbr.feature_importances_,
            'training_samples': len(df)
        }
    }

    return p, p(df), log
def run_train(file, fp_name, cv=10, for_valid=0.4, ordered=False,
              ram_fraction=0.95, save=False, cv_params=None):
    """
    Train CatBoost regressors with repeated random train/validation splits.

    For each of the five target keys looked up in `file`, the corresponding
    frame is split `cv` times (seeds 1..cv) into a validation part of
    `for_valid` * n_rows rows and a training part holding the rest. A
    CatBoostRegressor is fitted on each split, and the Pearson correlation
    and RMSE on the validation part are recorded.

    Parameters
    ----------
    file : mapping
        Maps 'synergy_zip', 'synergy_bliss', 'synergy_loewe', 'synergy_hsa'
        and 'css_ri' to DataFrames that contain a column of the same name
        (used as the regression target), and 'name' to the fingerprint name.
    fp_name : str
        Used in the pickle file name when `save` is true.
    cv : int
        Number of random splits per target.
    for_valid : float
        Fraction of rows sampled (without replacement) for validation.
    ordered : bool
        If true, sets CatBoost's ``boosting_type`` to ``'Ordered'``.
    ram_fraction : float
        Passed to CatBoost as ``gpu_ram_part``.
    save : bool
        If true, true labels and predictions are kept per split and the
        result is pickled to disk.
    cv_params : dict, optional
        CatBoostRegressor keyword arguments; must contain 'cat_features'.
        The dict is copied, the caller's object is never modified.

    Returns
    -------
    collections.defaultdict
        target key -> [list of per-split results]; 'name' ->
        [[fingerprint name, for_valid, cv]].
    """
    if cv_params is None:
        cv_params = {
            'bootstrap_type': 'Poisson',
            'l2_leaf_reg': 9,
            'learning_rate': 0.15,
            'depth': 10,
            'cat_features': ['cell_line_name'],
            'use_best_model': True,
            'early_stopping_rounds': 50,
            'iterations': 5000,
            'task_type': 'GPU',
        }
    else:
        # Copy: the keys added below must not leak into the caller's dict.
        cv_params = dict(cv_params)

    if ordered:
        cv_params['boosting_type'] = 'Ordered'
    cat_features = cv_params['cat_features']
    cv_params['gpu_ram_part'] = ram_fraction

    keys = [
        'synergy_zip', 'synergy_bliss', 'synergy_loewe', 'synergy_hsa',
        'css_ri', 'name'
    ]
    c = defaultdict(list)
    for k in ProgIter(keys, total=len(keys), verbose=1):
        v_temp = file[k]

        if k == 'name':
            # name of the fp, valid percentage, number of cv folds.
            # The entry itself is stored, not the frame left over from the
            # previous target.
            c['name'].append([v_temp, for_valid, cv])
            continue

        if 'drug_row_col' in v_temp.columns:
            v = v_temp.drop(columns=['drug_row_col'], inplace=False)
        else:
            v = v_temp

        size = int(v.shape[0] * for_valid)  # rows held out for validation
        a = []
        for i in range(1, 1 + cv):
            print(k)

            # sampling: the seed equals the split number, so splits are
            # reproducible
            np.random.seed(i)
            idx_valid = pd.Index(
                np.random.choice(v.index, size, replace=False))
            idx_train = v.index.difference(idx_valid)

            train = v.loc[idx_train, :]
            valid = v.loc[idx_valid, :]

            # prep datasets: pop() removes the target column from the features
            true_labels = valid.pop(k)
            y = train.pop(k)
            eval_dataset = Pool(valid, true_labels, cat_features=cat_features)

            # create a model
            model = CatBoostRegressor(**cv_params)
            model.fit(train, y, eval_set=eval_dataset, plot=False,
                      verbose=1000)

            # get stats
            preds = model.predict(valid)
            corr = pearsonr(true_labels, preds)
            rmse = np.sqrt(mean_squared_error(true_labels, preds))
            print(f'iteration: {i}, pearson: {corr}, rmse: {rmse}')

            if save:
                a.append([corr, rmse, true_labels, preds])
            else:
                a.append([corr, rmse])
        c[k].append(a)

    if save:
        nm = f'/tf/notebooks/code_for_pub/_logs_as_python_files/{fp_name}_noreplicates_{for_valid}_{time.ctime()}.pickle'
        # Separate handle name so the `file` argument is not shadowed.
        with open(nm, 'wb') as fh:
            pickle.dump(c, fh)
    return c
# Only the explicitly chosen hyperparameters are passed. Every other
# constructor argument was spelled out as None, which CatBoost treats as
# "use the default", and several of them were aliases of the values set here
# (n_estimators/num_boost_round/num_trees, max_depth, eta, objective).
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.026,
    depth=4,
    loss_function='RMSE',
    thread_count=10,
)
def test_invalid_loss_regressor():
    """Creating/fitting a regressor with an unknown loss must raise CatboostError."""
    # The pool is built outside the `raises` block: an error while loading the
    # data must fail the test instead of being mistaken for the expected
    # invalid-loss error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoostRegressor(loss_function="fee")
        model.fit(pool)
def fun_cat_fs(x, *args):
    """
    Objective/evaluation function for tuning a CatBoostRegressor.

    Parameters
    ----------
    x : sequence of float
        Candidate solution. x[0] is the learning rate, x[1] the number of
        estimators (rounded), x[2] the depth (rounded), x[3] `l2_leaf_reg`
        and x[4] `bagging_temperature`. When len(x) > 6 the entries of
        x[2:] greater than 0.5 are additionally read as a feature mask.
    *args : (X, y, flag, n_splits, random_seed)
        Data matrix, target, mode flag, number of K-fold splits and the seed
        used for both the estimator and the fold shuffling.

    Returns
    -------
    float or dict
        With ``flag == 'eval'`` the cross-validated RMSE (1e12 if the
        cross-validation failed). Otherwise the estimator is fitted on the
        selected columns and a dict with predictions, parameters, the fitted
        estimator and the error metrics is returned.
    """
    X, y, flag, n_splits, random_seed = args
    clf = CatBoostRegressor(random_state=int(random_seed), verbose=0)
    _, n_var = X.shape

    p = {
        'learning_rate': x[0],
        'n_estimators': int(round(x[1])),
        'depth': int(round(x[2])),
        'loss_function': 'RMSE',
        'l2_leaf_reg': x[3],
        'bagging_temperature': x[4],
    }
    clf.set_params(**p)

    if len(x) <= 6:
        # No feature mask supplied: keep every column.
        ft = np.ones(n_var, dtype=int)
    else:
        # NOTE(review): the mask starts at x[2], which overlaps the
        # hyperparameter slots read above - confirm the intended offset.
        ft = np.array([1 if k > 0.5 else 0 for k in x[2::]])
    ft = np.where(ft > 0.5)

    # NOTE(review): cross-validation runs on all columns of X, while the final
    # fit below uses only the masked columns - confirm this is intended.
    try:
        cv = KFold(n_splits=n_splits, shuffle=True,
                   random_state=int(random_seed))
        y_p = cross_val_predict(clf, X, y.ravel(), cv=cv, n_jobs=1)
        r = RMSE(y_p, y)
        r2 = MAPE(y_p, y)
        r3 = RRMSE(y_p, y)
        # sklearn's r2_score is not symmetric: the signature is
        # (y_true, y_pred).
        r4 = -r2_score(y, y_p)
    except Exception:
        # Best effort: an infeasible candidate gets a huge penalty on every
        # metric, so the report dict below can still be built.
        y_p = [None]
        r = r2 = r3 = r4 = 1e12

    if flag == 'eval':
        return r

    clf.fit(X[:, ft].squeeze(), y)
    return {
        'Y_TRUE': y,
        'Y_PRED': y_p,
        'EST_PARAMS': p,
        'PARAMS': x,
        'EST_NAME': 'CAT',
        'ESTIMATOR': clf,
        'ACTIVE_VAR': ft,
        'DATA': X,
        'SEED': random_seed,
        'ERROR_TRAIN': {
            'RMSE': r,
            'MAPE': r2,
            'RRMSE': r3,
            'R2_SCORE': r4
        }
    }
# Clamp negative MLP predictions to zero, in place.
y_pred_mlp_rd[y_pred_mlp_rd < 0] = 0

test_eval = eval_metrics(y_test, y_pred_mlp_rd)

# Simple ensemble: element-wise mean of the three rounded predictions,
# truncated to int.
y_test_total2 = (
    (pd.DataFrame(y_pred_test_rd) + pd.DataFrame(y_pred_X_test_rd.ravel()) +
     pd.DataFrame(y_pred_mlp_rd.ravel())) / 3).astype(int)
metrics_total2 = eval_metrics(y_test, y_test_total2[0].values)

from catboost import Pool, CatBoostRegressor, cv
from sklearn.metrics import accuracy_score

model = CatBoostRegressor()

# Fit model.
# The script used to call fit() twice in a row: once with
# eval_set=(X_test, y_test) and then again without it. The second call
# retrains from scratch and discards the first model, so only the fit that
# determines the predictions below is kept.
model.fit(X_train, y_train, plot=True)

# Get predictions
pred_cat = model.predict(X_test)
pred_cat_rd = np.round(pred_cat)
cat_eval = eval_metrics(y_test, pred_cat_rd)
"Feature", "importance" ]].groupby("Feature").mean().sort_values(by="importance", ascending=False) ############################################### from catboost import CatBoostRegressor # model = "cat" for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_x.values, train_y.values)): print("fold {}".format(fold_)) trn_x, trn_y = train_x.iloc[trn_idx], train_y.iloc[trn_idx] val_x, val_y = train_x.iloc[val_idx], train_y.iloc[val_idx] num_round = 10000 clf = CatBoostRegressor( objective="RMSE", # MultiClass 0.8957 n_estimators=num_round, max_depth=6, reg_lambda=0.01, random_seed=2019, verbose=True, ) clf.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], verbose=200, early_stopping_rounds=100 # cat_features=cat_features ) # n*6矩阵 oof[val_idx] = clf.predict(train_x.iloc[val_idx]) fold_importance_df = pd.DataFrame() fold_importance_df["Feature"] = features
def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae', columns=None,
                           plot_feature_importance=False, model=None, verbose=10000,
                           early_stopping_rounds=200, n_estimators=50000, mol_type=-1):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - hyperparameters forwarded to the lgb / xgb / catboost model
    :params: folds - folds to split data
    :params: model_type - type of model to use: 'lgb', 'xgb', 'sklearn' or 'cat'
    :params: eval_metric - metric to use: 'mae', 'group_mae' or 'mse'
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - logging period passed to lgb / xgb training
    :params: early_stopping_rounds - early stopping patience for lgb / xgb
    :params: n_estimators - maximum number of trees for lgb
    :params: mol_type - suffix of the feature-importance csv file name
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    # NOTE(review): 'group_mae' has no 'sklearn_scoring_function', so it cannot
    # be combined with model_type='sklearn'.
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # xgboost gets the names of the columns actually used; taking them from
    # X.columns breaks as soon as `columns` selects a subset.
    xgb_feature_names = list(columns)

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if isinstance(X, np.ndarray):
            # NOTE(review): on a plain ndarray `X[columns]` indexes rows, not
            # columns - confirm how array inputs are expected to be passed.
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose, early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=xgb_feature_names)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=xgb_feature_names)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # early_stopping_rounds now honours the function argument (its
            # default, 200, equals the value that used to be hard-coded).
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=xgb_feature_names),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=xgb_feature_names),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            # The estimator passed in via `model` is refitted on every fold.
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            # 1-based fold number, consistent with the "started at" message.
            print(f'Fold {fold_n + 1}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            # NOTE(review): for eval_metric='mse' this passes 'MSE' to CatBoost
            # as both metric and loss - confirm CatBoost accepts that name.
            model = CatBoostRegressor(iterations=20000,
                                      eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                      **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[],
                      use_best_model=True, verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            # The grouped metric additionally needs the 'type' column.
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            # Plot only the 50 features with the highest mean importance.
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature",
                        data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')

            feature_importance.to_csv(log_path / f"importance_{mol_type}.csv")
            result_dict['feature_importance'] = feature_importance

    return result_dict
def test_benchmark_regression(self):
    """Run `benchmark` over every selection method on a reduced Boston dataset
    and compare the resulting per-selector scores with pinned reference values.
    """
    data, label = get_data_label(load_boston())
    # Eight columns are dropped up front.
    # NOTE(review): presumably this leaves five features, matching the five
    # expected scores per selector below - confirm.
    data = data.drop(columns=["CHAS", "NOX", "RM", "DIS", "RAD", "TAX", "PTRATIO", "INDUS"])

    num_features = 3
    corr_threshold = 0.5
    alpha = 1000
    # Shared by all tree-based estimators; the fixed seed keeps the scores
    # comparable between runs.
    tree_params = {"random_state": 123, "n_estimators": 100}

    # One entry per selection method under test, keyed by the column name the
    # assertions below read from `score_df`.
    selectors = {
        "corr_pearson": SelectionMethod.Correlation(corr_threshold, method="pearson"),
        "corr_kendall": SelectionMethod.Correlation(corr_threshold, method="kendall"),
        "corr_spearman": SelectionMethod.Correlation(corr_threshold, method="spearman"),
        "univ_anova": SelectionMethod.Statistical(num_features, method="anova"),
        "univ_chi_square": SelectionMethod.Statistical(num_features, method="chi_square"),
        "univ_mutual_info": SelectionMethod.Statistical(num_features, method="mutual_info"),
        "linear": SelectionMethod.Linear(num_features, regularization="none"),
        "lasso": SelectionMethod.Linear(num_features, regularization="lasso", alpha=alpha),
        "ridge": SelectionMethod.Linear(num_features, regularization="ridge", alpha=alpha),
        "random_forest": SelectionMethod.TreeBased(num_features),
        "xgboost_clf": SelectionMethod.TreeBased(num_features, estimator=XGBClassifier(**tree_params)),
        "xgboost_reg": SelectionMethod.TreeBased(num_features, estimator=XGBRegressor(**tree_params)),
        "extra_clf": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesClassifier(**tree_params)),
        "extra_reg": SelectionMethod.TreeBased(num_features, estimator=ExtraTreesRegressor(**tree_params)),
        "lgbm_clf": SelectionMethod.TreeBased(num_features, estimator=LGBMClassifier(**tree_params)),
        "lgbm_reg": SelectionMethod.TreeBased(num_features, estimator=LGBMRegressor(**tree_params)),
        "gradient_clf": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingClassifier(**tree_params)),
        "gradient_reg": SelectionMethod.TreeBased(num_features, estimator=GradientBoostingRegressor(**tree_params)),
        "adaboost_clf": SelectionMethod.TreeBased(num_features, estimator=AdaBoostClassifier(**tree_params)),
        "adaboost_reg": SelectionMethod.TreeBased(num_features, estimator=AdaBoostRegressor(**tree_params)),
        "catboost_clf": SelectionMethod.TreeBased(num_features, estimator=CatBoostClassifier(**tree_params, silent=True)),
        "catboost_reg": SelectionMethod.TreeBased(num_features, estimator=CatBoostRegressor(**tree_params, silent=True))
    }

    # Benchmark
    score_df, selected_df, runtime_df = benchmark(selectors, data, label, output_filename=None)
    # Called only to check that it runs; its result is discarded.
    _ = calculate_statistics(score_df, selected_df)

    # Pinned reference scores, one list per selector column of `score_df`.
    self.assertListAlmostEqual([0.4787777784012165, 0.47170429073431874, 0.5596288196730658, 0.4400410275414326, 0.5674082968785575],
                               score_df["corr_pearson"].to_list())

    self.assertListAlmostEqual([0.5357134888110283, 0.48128808343101986, 0.5132201793752295, 0.3384081264406572, 0.49448886053070107],
                               score_df["corr_kendall"].to_list())

    self.assertListAlmostEqual([0.6542231557010167, 0.5538583519391704, 0.6267310661636885, 0.3924548536221991, 0.5984933578623318],
                               score_df["corr_spearman"].to_list())

    self.assertListAlmostEqual([89.48611475768125, 75.25764229895405, 83.47745921923685, 63.05422911249312, 601.6178711099022],
                               score_df["univ_anova"].to_list())

    self.assertListAlmostEqual([0, 0, 0, 0, 0],
                               score_df["univ_chi_square"].to_list())

    self.assertListAlmostEqual([0.3421450205863028, 0.1806168920395521, 0.31266011627421086, 0.16107911083428794, 0.666208499757925],
                               score_df["univ_mutual_info"].to_list())

    self.assertListAlmostEqual([0.06901111285092865, 0.05408618283036938, 0.06145227292569164, 0.006510036424819454, 0.9546615660373198],
                               score_df["linear"].to_list())

    self.assertListAlmostEqual([0.05682706487290267, 0.051008405488957305, 0.05319245109490162, 0.007176306398647428, 0.9231211889322195],
                               score_df["lasso"].to_list())

    self.assertListAlmostEqual([0.0690214777400926, 0.054087779998048285, 0.06144441861097637, 0.006510854482697315, 0.95459417786841],
                               score_df["ridge"].to_list())

    self.assertListAlmostEqual([0.10947144861974874, 0.020211076089938374, 0.08416074180466389, 0.045604950489313435, 0.7405517829963355],
                               score_df["random_forest"].to_list())
# Missing values are encoded with the sentinel -999 in both frames, in place.
print("Replacing NaN values by -999 !!")
for _frame in (train_df, test_df):
    _frame.fillna(-999, inplace=True)

print("Training time !!")
X_train, y_train = train_df[train_features], train_df['血糖']
print(X_train.shape, y_train.shape)

X_test = test_df[train_features]
print(X_test.shape)

# Seed ensemble: the same CatBoost configuration is trained once per seed and
# the test predictions are averaged.
num_ensembles = 5
y_pred_cat = 0.0
for i in tqdm(range(num_ensembles)):
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.03,
        depth=6,
        l2_leaf_reg=3,
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=i,
    )
    model.fit(X_train, y_train, cat_features=cat_feature_inds)
    y_pred_cat = y_pred_cat + model.predict(X_test) if i == 0 else y_pred_cat
    if i != 0:
        y_pred_cat += model.predict(X_test)
y_pred_cat /= num_ensembles

# Drop the names `train` and `test` and force a garbage-collection pass.
del train, test
gc.collect()

################
################
##    OLS     ##
################
################

np.random.seed(17)
def test_regression_ctr():
    """Train a 5-iteration regressor with explicit CTR descriptions, save it
    and return the canonical-model comparison for the saved file."""
    ctr_spec = [
        'Borders:TargetBorderCount=5:TargetBorderType=Uniform',
        'Counter',
    ]
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    regressor = CatBoostRegressor(
        iterations=5,
        random_seed=0,
        ctr_description=ctr_spec,
    )
    regressor.fit(train_pool)
    regressor.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)