def fit(
    self,
    X,
    y,
    sample_weight=None,
    X_validation=None,
    y_validation=None,
    sample_weight_validation=None,
    log_to_file=None,
    max_time=None,
):
    """Train the wrapped CatBoost model and optionally log its learning curve.

    Args:
        X: Training features; must expose ``shape`` and ``iloc`` when
            ``self.cat_features`` has to be detected.
        y: Training labels.
        sample_weight: Optional per-row training weights.
        X_validation: Optional validation features.
        y_validation: Optional validation labels.
        sample_weight_validation: Optional per-row validation weights.
        log_to_file: Optional path; when given, a header-less CSV with the
            columns (iteration, train score, validation score) is written.
        max_time: Forwarded to ``self._assess_iterations``.

    Returns:
        None. Sets ``self.best_ntree_limit`` (and ``self.cat_features`` when
        it was ``None``) as a side effect.
    """
    if self.is_fitted():
        print("CatBoost model already fitted. Skip fit().")
        return

    if self.cat_features is None:
        # Detect categorical columns by position when none were configured.
        self.cat_features = [
            i
            for i in range(X.shape[1])
            if PreprocessingUtils.is_categorical(X.iloc[:, i])
        ]

    eval_set = None
    if X_validation is not None and y_validation is not None:
        eval_set = Pool(
            data=X_validation,
            label=y_validation,
            cat_features=self.cat_features,
            weight=sample_weight_validation,
        )

    if self.params.get("num_boost_round") is None:
        # No explicit round count: let the helper pick one and hand back the
        # model it trained while doing so.
        model_init, new_iterations = self._assess_iterations(
            X, y, sample_weight, eval_set, max_time
        )
        self.model.set_params(iterations=new_iterations)
    else:
        model_init = None
        self.model.set_params(iterations=self.params.get("num_boost_round"))
        self.early_stopping_rounds = self.params.get("early_stopping_rounds", 50)

    self.model.fit(
        X,
        y,
        sample_weight=sample_weight,
        cat_features=self.cat_features,
        init_model=model_init,
        eval_set=eval_set,
        early_stopping_rounds=self.early_stopping_rounds,
        verbose_eval=False,
    )

    if self.model.best_iteration_ is not None:
        if model_init is not None:
            self.best_ntree_limit = (
                self.model.best_iteration_ + model_init.tree_count_ + 1
            )
        else:
            self.best_ntree_limit = self.model.best_iteration_ + 1
    else:
        # just take all the trees
        # the warm-up trees are already included
        # dont need to add +1
        self.best_ntree_limit = self.model.tree_count_

    if log_to_file is not None:
        evals = self.model.evals_result_
        metric_name = list(evals["learn"].keys())[0]
        train_scores = evals["learn"][metric_name]
        # Without a validation set there is no "validation" entry; indexing it
        # unconditionally used to raise KeyError.
        has_validation = "validation" in evals
        validation_scores = (
            evals["validation"][metric_name] if has_validation else None
        )
        if model_init is not None:
            # Prepend the history of the warm-up model so the curve is complete.
            train_scores = (
                model_init.evals_result_["learn"][metric_name] + train_scores
            )
            if has_validation:
                validation_scores = (
                    model_init.evals_result_["validation"][metric_name]
                    + validation_scores
                )
        if validation_scores is None:
            # Keep the three-column layout; the missing scores are written as
            # empty fields.
            validation_scores = [float("nan")] * len(train_scores)
        result = pd.DataFrame({
            "iteration": range(len(train_scores)),
            "train": train_scores,
            "validation": validation_scores,
        })
        result.to_csv(log_to_file, index=False, header=False)
'max_ctr_complexity': 1,
'depth': 8,
'leaf_estimation_method': 'Gradient',
'use_best_model': True,
'iterations': 100000,
'early_stopping_rounds': 5000,
'verbose': 500
}
# Columns handed to CatBoost as categorical features (referenced by name).
cate_cols = [
    'Uid', 'Category', 'Subcategory', 'Concept', 'Mediatype', 'hour', 'day',
    'weekday', 'week_hour', 'year_weekday', 'Geoaccuracy', 'ispro',
    'Ispublic', 'img_model'
]
# Pool over the submission features; built once, before the fold loop.
submit_data = Pool(data=submit_feature_df,
                   label=submit_label_df['label'],
                   cat_features=cate_cols)
valid_ans = []
submit_proba = []
# 5-fold split with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=2020)
k = 0
for train_idx, valid_idx in kfold.split(train_feature_df, train_label_df):
    # NOTE(review): KFold yields positional indices but `.loc` is label-based;
    # presumably both frames carry a default RangeIndex - confirm.
    fold_valid_x, fold_valid_y = train_feature_df.loc[
        valid_idx], train_label_df['label'].loc[valid_idx]
    valid_data = Pool(data=fold_valid_x,
                      label=fold_valid_y,
                      cat_features=cate_cols)
# Repeated stratified cross-validation of a multi-class CatBoost model.
folds = RepeatedStratifiedKFold(n_splits=n_splits,
                                n_repeats=n_repeats,
                                random_state=random_state)
df = train.copy()
# columns = all_features
# categoric_columns = all_features
# X_train = df[columns]
# NOTE(review): the X_train assignment above is commented out, so X_train,
# categoric_columns and all_features must be defined before this point.
y_train = df['target']
logloss_all = []
_proba = np.zeros((X_train.shape[0], y_train.nunique()))
# Accumulator for out-of-fold class probabilities, one column per class.
_probas = np.zeros((X_train.shape[0], y_train.nunique()))
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    # NOTE(review): n_fold counts splits across all repeats, so the printed
    # numerator can exceed n_splits when n_repeats > 1.
    print("Fold --> " + str(n_fold + 1) + "/" + str(n_splits))
    train_X, train_y = X_train.iloc[train_idx].copy(), y_train.iloc[train_idx]
    valid_X, valid_y = X_train.iloc[valid_idx].copy(), y_train.iloc[valid_idx]
    dataset = Pool(train_X, train_y, categoric_columns)
    evalset = Pool(valid_X, valid_y, categoric_columns)
    model = CatBoostClassifier(task_type="GPU",
                               depth=4,
                               iterations=iterations,
                               od_wait=1000,
                               od_type='Iter',
                               learning_rate=0.02,
                               use_best_model=True,
                               loss_function='MultiClass',
                               verbose=False)
    # verbose is passed again here with a different value than in the
    # constructor call above.
    model.fit(dataset, plot=False, verbose=500, eval_set=evalset)
    _proba = model.predict_proba(valid_X[all_features])
    logloss_of_fold = log_loss(list(valid_y), _proba)
    logloss_all.append(logloss_of_fold)
    # Every repeat contributes 1/n_repeats of the row's final prediction.
    _probas[valid_idx, :] += _proba / n_repeats
RS=2305 # Seed for partition and model random part TS=0.3 # Validation size esr=50 # Early stopping rounds (when validation does not improve in these rounds, stops) from sklearn.model_selection import train_test_split x_tr, x_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size=TS, random_state=RS) # Categorical positions for catboost Pos=list() As_Categorical=Categorical.tolist() As_Categorical.remove('ID') for col in As_Categorical: Pos.append((X_train.columns.get_loc(col))) # To Pool Class (for catboost only) pool_tr=Pool(x_tr, y_tr,cat_features=Pos) pool_val=Pool(x_val, y_val,cat_features=Pos) # By-hand paramter tuning. A grid-search is expensive # We test different combinations # See parameter options here: # "https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/" model_catboost_val = CatBoostClassifier( eval_metric='AUC', iterations=20000, # Very high value, to find the optimum od_type='Iter', # Overfitting detector set to "iterations" or number of trees random_seed=RS, # Random seed for reproducibility verbose=100) # Shows train/test metric every "verbose" trees # "Technical" parameters of the model: params = {'objective': 'Logloss',
logger.info('CV mean score: {0:.4f}, std: {1:.4f}.'.format( np.mean(bond_scores), np.std(bond_scores))) oof[valid_idx] = y_pred_valid.reshape(-1, ) prediction_type += y_pred fold_count += 1 now = timer() logger.info( 'Completed training and predicting for bond {} fold {}-of-{} in {:0.4f} seconds' .format(bond_type, fold_n + 1, fold_count, now - fold_start)) elif MODEL_TYPE == 'catboost': fold_start = timer() logger.info('Running Type {} - Fold {} of {}'.format( bond_type, fold_count, folds.n_splits)) X_train, X_valid = X_type.iloc[train_idx], X_type.iloc[valid_idx] y_train, y_valid = y_type.iloc[train_idx], y_type.iloc[valid_idx] train_dataset = Pool(data=X_train.drop('type', axis=1), label=y_train) valid_dataset = Pool(data=X_valid.drop('type', axis=1), label=y_valid) test_dataset = Pool(data=X_test_type.drop('type', axis=1)) DEPTH = 4 update_tracking(run_id, 'depth', DEPTH) model = CatBoostRegressor(iterations=N_ESTIMATORS, learning_rate=LEARNING_RATE, depth=DEPTH, eval_metric=EVAL_METRIC, verbose=VERBOSE, random_state=RANDOM_STATE, thread_count=N_THREADS, loss_function=EVAL_METRIC, task_type="GPU") # Train on GPU
def test_feature_importance():
    """Fit a 5-iteration classifier and snapshot its feature importances.

    The importances are saved to FIMP_PATH and returned as a canonical file.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(train_pool)
    importances = np.array(classifier.feature_importances_)
    np.save(FIMP_PATH, importances)
    return local_canonical_file(FIMP_PATH)
def _fit(self, X_train, y_train, X_val=None, y_val=None, time_limit=None, num_gpus=0, **kwargs):
    """Fit a CatBoost model, optionally budgeted by time and memory.

    With ``time_limit`` set, a short sample run of ``num_sample_iter``
    iterations is trained first; its duration and pickled size are used to cap
    the iteration count of the final run, which continues from the sample
    model via ``init_model``. Without ``time_limit`` a single run with the
    configured parameters is trained.

    Args:
        X_train: Training features; ``len()`` and ``.columns`` are read before
            it is passed through ``self.preprocess``.
        y_train: Training labels (``y_train.shape[1]`` is read for SOFTCLASS).
        X_val: Optional validation features.
        y_val: Optional validation labels.
        time_limit: Optional time budget, compared against ``time.time()``
            differences.
        num_gpus: Any non-zero value sets ``task_type='GPU'`` unless
            ``task_type`` is already configured.
        **kwargs: Only ``verbosity`` is read (default 2).

    Raises:
        NotEnoughMemoryError: The crude memory estimate exceeds the allowed
            share of available memory.
        TimeLimitExceeded: At most 40% of ``time_limit`` is left once
            preprocessing is done.

    NOTE(review): block nesting was reconstructed from syntax; please confirm
    against the original formatting.
    """
    try_import_catboost()
    from catboost import CatBoostClassifier, CatBoostRegressor, Pool
    if self.problem_type == SOFTCLASS:
        try_import_catboostdev()  # Need to first import catboost then catboost_dev not vice-versa.
        from catboost_dev import CatBoostClassifier, CatBoostRegressor, Pool
        from .catboost_softclass_utils import SoftclassCustomMetric, SoftclassObjective
        self._set_default_param_value(
            'eval_metric',
            construct_custom_catboost_metric(
                self.stopping_metric, True,
                not self.stopping_metric.needs_pred, self.problem_type))
        self.params['loss_function'] = SoftclassObjective.SoftLogLossObjective()
        self.params['eval_metric'] = SoftclassCustomMetric.SoftLogLossMetric()
        self._set_default_param_value(
            'early_stopping_rounds',
            50)  # Speeds up training with custom (non-C++) losses

    model_type = CatBoostClassifier if self.problem_type in PROBLEM_TYPES_CLASSIFICATION else CatBoostRegressor
    # The metric name is later used as a key into get_best_score()['validation'].
    if isinstance(self.params['eval_metric'], str):
        metric_name = self.params['eval_metric']
    else:
        metric_name = type(self.params['eval_metric']).__name__
    num_rows_train = len(X_train)
    num_cols_train = len(X_train.columns)
    if self.problem_type == MULTICLASS:
        if self.num_classes is not None:
            num_classes = self.num_classes
        else:
            num_classes = 10  # Guess if not given, can do better by looking at y_train
    elif self.problem_type == SOFTCLASS:  # TODO: delete this elif if it's unnecessary.
        num_classes = y_train.shape[1]
        self.num_classes = num_classes
    else:
        num_classes = 1

    # TODO: Add ignore_memory_limits param to disable NotEnoughMemoryError Exceptions
    max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
    approx_mem_size_req = num_rows_train * num_cols_train * num_classes / 2  # TODO: Extremely crude approximation, can be vastly improved
    if approx_mem_size_req > 1e9:  # > 1 GB
        available_mem = psutil.virtual_memory().available
        ratio = approx_mem_size_req / available_mem
        if ratio > (1 * max_memory_usage_ratio):
            logger.warning(
                '\tWarning: Not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                % (round(approx_mem_size_req / 1e9, 3),
                   round(available_mem / 1e9, 3)))
            raise NotEnoughMemoryError
        elif ratio > (0.2 * max_memory_usage_ratio):
            logger.warning(
                '\tWarning: Potentially not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                % (round(approx_mem_size_req / 1e9, 3),
                   round(available_mem / 1e9, 3)))

    start_time = time.time()
    X_train = self.preprocess(X_train)
    # Columns with pandas 'category' dtype are passed to CatBoost as categorical.
    cat_features = list(X_train.select_dtypes(include='category').columns)
    # From here on X_train is a catboost Pool, not a frame.
    X_train = Pool(data=X_train, label=y_train, cat_features=cat_features)

    if X_val is not None:
        X_val = self.preprocess(X_val)
        X_val = Pool(data=X_val, label=y_val, cat_features=cat_features)
        eval_set = X_val
        # Above 10000 training rows, patience and sample size shrink in
        # proportion to the row count.
        if num_rows_train <= 10000:
            modifier = 1
        else:
            modifier = 10000 / num_rows_train
        early_stopping_rounds = max(round(modifier * 150), 10)
        num_sample_iter_max = max(round(modifier * 50), 2)
    else:
        eval_set = None
        early_stopping_rounds = None
        num_sample_iter_max = 50

    # These keys are removed from self.params itself before the model is built.
    invalid_params = ['num_threads', 'num_gpus']
    for invalid in invalid_params:
        if invalid in self.params:
            self.params.pop(invalid)
    train_dir = None
    if 'allow_writing_files' in self.params and self.params['allow_writing_files']:
        if 'train_dir' not in self.params:
            try:
                # TODO: What if path is in S3?
                os.makedirs(os.path.dirname(self.path), exist_ok=True)
            # NOTE(review): bare except silently swallows every error (even
            # KeyboardInterrupt); train_dir then simply stays None.
            except:
                pass
            else:
                train_dir = self.path + 'catboost_info'
    logger.log(15, f'\tCatboost model hyperparameters: {self.params}')

    # TODO: Add more control over these params (specifically early_stopping_rounds)
    verbosity = kwargs.get('verbosity', 2)
    if verbosity <= 1:
        verbose = False
    elif verbosity == 2:
        verbose = False
    elif verbosity == 3:
        verbose = 20
    else:
        verbose = True

    # State of the optional sample run; stays None without a time limit.
    init_model = None
    init_model_tree_count = None
    init_model_best_iteration = None
    init_model_best_score = None

    params = self.params.copy()
    num_features = len(self.features)

    if num_gpus != 0:
        if 'task_type' not in params:
            params['task_type'] = 'GPU'
            # TODO: Confirm if GPU is used in HPO (Probably not)
            # TODO: Adjust max_bins to 254?

    if params.get('task_type', None) == 'GPU':
        if 'colsample_bylevel' in params:
            params.pop('colsample_bylevel')
            logger.log(
                30,
                f'\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
            )
        if 'rsm' in params:
            params.pop('rsm')
            logger.log(
                30,
                f'\t\'rsm\' is not supported on GPU, using default value (Default = 1).'
            )

    if self.problem_type == MULTICLASS and 'rsm' not in params and 'colsample_bylevel' not in params and num_features > 1000:
        if time_limit:
            # Reduce sample iterations to avoid taking unreasonable amounts of time
            num_sample_iter_max = max(round(num_sample_iter_max / 2), 2)
        # Subsample columns to speed up training
        if params.get('task_type', None) != 'GPU':  # RSM does not work on GPU
            params['colsample_bylevel'] = max(
                min(1.0, 1000 / num_features), 0.05)
            logger.log(
                30,
                f'\tMany features detected ({num_features}), dynamically setting \'colsample_bylevel\' to {params["colsample_bylevel"]} to speed up training (Default = 1).'
            )
            logger.log(
                30,
                f'\tTo disable this functionality, explicitly specify \'colsample_bylevel\' in the model hyperparameters.'
            )
        else:
            params['colsample_bylevel'] = 1.0
            logger.log(
                30,
                f'\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
            )

    if time_limit:
        time_left_start = time_limit - (time.time() - start_time)
        if time_left_start <= time_limit * 0.4:  # if 60% of time was spent preprocessing, likely not enough time to train model
            raise TimeLimitExceeded
        # Sample run: a few iterations to measure time and memory per iteration.
        params_init = params.copy()
        num_sample_iter = min(num_sample_iter_max, params_init['iterations'])
        params_init['iterations'] = num_sample_iter
        if train_dir is not None:
            params_init['train_dir'] = train_dir
        self.model = model_type(**params_init, )
        self.model.fit(
            X_train,
            eval_set=eval_set,
            use_best_model=True,
            verbose=verbose,
            # early_stopping_rounds=early_stopping_rounds,
        )

        init_model_tree_count = self.model.tree_count_
        init_model_best_iteration = self.model.get_best_iteration()
        # NOTE(review): reads the 'validation' entry unconditionally, so this
        # path appears to need X_val whenever time_limit is set - confirm.
        init_model_best_score = self.model.get_best_score()['validation'][metric_name]

        time_left_end = time_limit - (time.time() - start_time)
        time_taken_per_iter = (time_left_start - time_left_end) / num_sample_iter
        estimated_iters_in_time = round(time_left_end / time_taken_per_iter)
        init_model = self.model

        params_final = params.copy()

        # TODO: This only handles memory with time_limits specified, but not with time_limits=None, handle when time_limits=None
        available_mem = psutil.virtual_memory().available
        if self.problem_type == SOFTCLASS:  # TODO: remove this once catboost-dev is no longer necessary and SOFTCLASS objectives can be pickled.
            model_size_bytes = 1  # skip memory check
        else:
            model_size_bytes = sys.getsizeof(pickle.dumps(self.model))

        # Cap the iterations so the extrapolated model size stays within 30%
        # (scaled by max_memory_usage_ratio) of the available memory.
        max_memory_proportion = 0.3 * max_memory_usage_ratio
        mem_usage_per_iter = model_size_bytes / num_sample_iter
        max_memory_iters = math.floor(
            available_mem * max_memory_proportion / mem_usage_per_iter)

        # Remaining iterations: whatever the configured count and the time
        # estimate both allow.
        params_final['iterations'] = min(
            params['iterations'] - num_sample_iter, estimated_iters_in_time)
        if params_final['iterations'] > max_memory_iters - num_sample_iter:
            if max_memory_iters - num_sample_iter <= 500:
                logger.warning(
                    '\tWarning: CatBoost will be early stopped due to lack of memory, increase memory to enable full quality models, max training iterations changed to %s from %s'
                    % (max_memory_iters,
                       params_final['iterations'] + num_sample_iter))
            params_final['iterations'] = max_memory_iters - num_sample_iter
    else:
        params_final = params.copy()

    if train_dir is not None:
        params_final['train_dir'] = train_dir
    if params_final['iterations'] > 0:
        self.model = model_type(**params_final, )

        # TODO: Strangely, this performs different if clone init_model is sent in than if trained for same total number of iterations. May be able to optimize catboost models further with this
        self.model.fit(
            X_train,
            eval_set=eval_set,
            verbose=verbose,
            early_stopping_rounds=early_stopping_rounds,
            # use_best_model=True,
            init_model=init_model,
        )

        if init_model is not None:
            final_model_best_score = self.model.get_best_score()['validation'][metric_name]
            # Keep whichever run holds the better score; the comparison
            # direction depends on which side of the metric's optimum the
            # score lies.
            if self.stopping_metric._optimum > final_model_best_score:
                if final_model_best_score > init_model_best_score:
                    best_iteration = init_model_tree_count + self.model.get_best_iteration()
                else:
                    best_iteration = init_model_best_iteration
            else:
                if final_model_best_score < init_model_best_score:
                    best_iteration = init_model_tree_count + self.model.get_best_iteration()
                else:
                    best_iteration = init_model_best_iteration

            # Drop every tree after the best iteration.
            self.model.shrink(ntree_start=0, ntree_end=best_iteration + 1)

    self.params_trained['iterations'] = self.model.tree_count_
weight )
# Chronological split: the first PCT_TRAIN share of rows trains, the rest tests.
num_train = int(df.shape[0] * PCT_TRAIN)# DONE
y = df[DIRECTION] # DONE
y_train = y[: num_train]
y_test = y[num_train:]
w_train = w[: num_train]
w_test = w[num_train:]
# Features are all columns except the first one.
X = df.iloc[:, 1:]
X_train = X.iloc[:num_train, :]
X_test = X.iloc[num_train:, :]
train_pool = Pool(X_train, y_train, weight=w_train)
test_pool = Pool(X_test, y_test, weight=w_test)
# %%
# %%time
# train_pool, test_pool = get_train_test_pools(N,DIRECTION, WEIGHTS_QUNATILE, CACHE=True)
# best_iter = next(map(lambda x: len(x[1])-1, model.eval_metrics(train_pool,['Precision']).items()))
# model.plot_tree(best_iter)
# %%
# %%time
N=5
DIRECTION='is_turnpt'
WEIGHTS_QUNATILE = 0.01
model = CatBoostClassifier(iterations=10**5, # set very large number and set early stops
def test_pool_cat_features():
    """The pool loaded from TRAIN_FILE must report CAT_FEATURES as categorical."""
    loaded_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    detected_indices = loaded_pool.get_cat_feature_indices()
    assert np.all(detected_indices == CAT_FEATURES)
# Train a multi-class CatBoost model on train_df and report accuracy.
# Features are every column except the target 'Ans'.
x = train_df.drop('Ans', axis = 1)
y = train_df.Ans
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
import catboost
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    logging_level='Silent',
    loss_function='MultiClass'
)
from sklearn.model_selection import train_test_split
# 75% / 25% train-validation split with a fixed seed.
x_train, x_validation, y_train, y_validation = train_test_split(x, y, train_size=0.75, random_state=27)
# NOTE(review): these two pools are not used by the fit call below, which
# receives the raw frames instead.
train_pool = Pool(x_train, y_train)
validate_pool = Pool(x_validation, y_validation)
model.fit(
    x_train, y_train,
    eval_set=(x_validation, y_validation),
);
from sklearn.metrics import accuracy_score
print('Train Accuracy:', accuracy_score(y_train, model.predict(x_train)))
print('Validation Accuracy:', accuracy_score(y_validation, model.predict(x_validation)))
# Only the exact answer 'n' aborts; any other input continues.
print('Save current model? Y/n')
ans = input()
if ans == 'n' :
    print('Model will not save')
    # NOTE(review): `sys` is not imported in the visible part of this script -
    # confirm it is imported earlier.
    sys.exit(0)
df = pd.read_csv(args.infile) #df=pd.read_csv('C:/TSU_GIT/MedicalDataAnalysisService/R_scripts/Arizona_informative.csv') # кодирование признака в цифры le = LabelEncoder() encode_feature = le.fit_transform(df.Class) df = df.drop(['Class'], axis=1) X_train, X_test, y_train, y_test = train_test_split(df, encode_feature, test_size=args.cross, stratify=encode_feature) train_pool = Pool(data=X_train, label=y_train) test_pool = Pool(data=X_test, label=y_test) model = CatBoostClassifier(iterations=args.itera, depth=6, learning_rate=args.learn, loss_function='MultiClass', eval_metric='Accuracy', logging_level='verbose') model.fit(train_pool, eval_set=test_pool, logging_level='Verbose') pred = model.predict(data=test_pool, prediction_type='Class') acc_test = accuracy_score(y_test, pred)
def catboost_bootstrap(dir_,
                       learn_name,
                       test_name,
                       cd_file,
                       classes,
                       learning_rate=None,
                       border_count=32,
                       cnt_values=20,
                       file_result_to=sys.stdout,
                       file_info_to=sys.stdout,
                       iterations=1500):
    """Compare feature-preparation wrappers by bootstrapped test metrics.

    For every wrapper class in ``classes`` a CatBoost classifier is trained on
    the wrapper-processed learn pool. The test set is then resampled with
    replacement ``cnt_values`` times (seed ``seed * 10 + 300``); for each
    sample the iteration index with minimal Logloss is chosen and AUC and
    Logloss at that index are recorded. Learning-curve plots are written per
    wrapper, all AUC lists are dumped to ``<dir_>/boot.txt``, and a Wilcoxon
    test is printed for every ordered pair of wrappers.

    Args:
        dir_: Directory holding the data files; outputs are written there too.
        learn_name: File name of the learn set inside ``dir_``.
        test_name: File name of the test set inside ``dir_``.
        cd_file: File name of the column description inside ``dir_``.
        classes: Wrapper classes exposing ``WRAPPER_NAME``,
            ``handle_learn_pool``, ``handle_test_matrix``, ``prior``, ``score``.
        learning_rate: Mapping from wrapper class to learning rate. ``None``
            (the default) passes ``learning_rate=None`` to CatBoost for every
            wrapper instead of failing on the lookup.
        border_count: Forwarded to ``CatBoostClassifier``.
        cnt_values: Number of bootstrap resamples per wrapper.
        file_result_to: Stream receiving result lines.
        file_info_to: Stream receiving timing and prior information.
        iterations: Forwarded to ``CatBoostClassifier``.
    """
    logloss = {}
    auc = {}
    # Loop-invariant: query the CPU count once.
    n_threads = multiprocessing.cpu_count()
    for clazz in classes:
        wrapper_name = clazz.WRAPPER_NAME
        # The default learning_rate=None used to crash on learning_rate[clazz].
        step = None if learning_rate is None else learning_rate[clazz]
        print('class={}'.format(wrapper_name))
        print('class={}; step={}'.format(wrapper_name, step),
              file=file_result_to)
        file_result_to.flush()
        auc[wrapper_name] = []
        logloss[wrapper_name] = []
        tree_counts = []
        logloss_curves = []
        auc_curves = []
        cl = clazz()
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(
                                     dir_, cd_file))
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        source_test_label = np.array(source_test_pool.get_label())
        source_test_features = np.array(source_test_pool.get_features())
        cat = CatBoostClassifier(max_ctr_complexity=1,
                                 custom_metric='AUC',
                                 boosting_type='Plain',
                                 random_seed=0,
                                 border_count=border_count,
                                 iterations=iterations,
                                 learning_rate=step,
                                 thread_count=n_threads)
        # From here on beg/end measure the training time reported below.
        beg = time.time()
        # NOTE(review): use_best_model=True is requested without an eval_set -
        # confirm the installed CatBoost version accepts that.
        cat.fit(learn_pool, use_best_model=True)
        end = time.time()
        # Loop-invariant: candidate row indices for the bootstrap.
        idx = list(range(source_test_features.shape[0]))
        for seed in range(cnt_values):
            np.random.seed(seed * 10 + 300)
            boot_idx = np.random.choice(idx, len(idx), replace=True)
            boot_test_features = source_test_features[boot_idx]
            boot_test_label = source_test_label[boot_idx]
            X, y = cl.handle_test_matrix(boot_test_features, boot_test_label,
                                         False)
            metrics = cat.eval_metrics(Pool(X, y), ['Logloss', 'AUC'],
                                       eval_period=1,
                                       thread_count=n_threads)
            for num, loss in enumerate(metrics['Logloss']):
                print('iter={:10}: loss={:.10}'.format(num + 1, loss))
            # Zero-based index of the iteration with minimal Logloss.
            cnt_trees = np.argmin(metrics['Logloss'])
            print('choose cnt_trees={}'.format(cnt_trees))
            print('overfit={}; AUC={}; logloss={}'.format(
                cnt_trees, metrics['AUC'][cnt_trees],
                metrics['Logloss'][cnt_trees]),
                  file=file_result_to)
            tree_counts.append(cnt_trees)
            file_result_to.flush()
            logloss_curves.append(metrics['Logloss'])
            auc_curves.append(metrics['AUC'])
            auc[wrapper_name].append(metrics['AUC'][cnt_trees])
            logloss[wrapper_name].append(metrics['Logloss'][cnt_trees])
        print('class={}, learn_time={}, mean_tree_count={}'.format(
            wrapper_name, end - beg,
            sum(tree_counts) / len(tree_counts)),
              file=file_result_to)
        print('mean_AUC={}, mean_logloss={}'.format(
            sum(auc[wrapper_name]) / len(auc[wrapper_name]),
            sum(logloss[wrapper_name]) / len(logloss[wrapper_name])),
              file=file_result_to)
        file_result_to.flush()
        logloss_fig = create_learning_curves_plot(
            logloss_curves, 'logloss {}'.format(wrapper_name))
        auc_fig = create_learning_curves_plot(
            auc_curves, 'AUC {}'.format(wrapper_name))
        logloss_file = os.path.join(
            dir_, 'fig_{}_{}'.format('Logloss', wrapper_name))
        AUC_file = os.path.join(dir_,
                                'fig_{}_{}'.format('AUC', wrapper_name))
        plot(logloss_fig, filename=logloss_file, auto_open=False)
        plot(auc_fig, filename=AUC_file, auto_open=False)
    file_name = os.path.join(dir_, 'boot.txt')
    with open(file_name, 'w') as file_to:
        json.dump(auc, file_to)
    # NOTE(review): the pairs include each wrapper against itself, where all
    # differences are zero - confirm wilcoxon handles that as intended.
    for cl1 in classes:
        for cl2 in classes:
            stat, p_value = wilcoxon(auc[cl1.WRAPPER_NAME],
                                     auc[cl2.WRAPPER_NAME],
                                     zero_method="pratt")
            print('for {} & {}: stat: {}, p_value: {}'.format(
                cl1.WRAPPER_NAME, cl2.WRAPPER_NAME, stat, p_value),
                  file=file_result_to)
import sys
from pathlib import Path
import json
import pandas as pd
from catboost import CatBoostClassifier, Pool

# Score the processed test set with the saved heart model and print the
# predictions. Paths are resolved from the project root, two directories
# above the one containing this file.
project_dir = Path(__file__).resolve().parents[2]
# Use a context manager so the parameter file is closed deterministically.
with open(f"{project_dir}/src/models/params.json") as params_file:
    params = json.load(params_file)
model = CatBoostClassifier()
model.load_model(f"{project_dir}/models/heart.cbm")
test_df = pd.read_csv(f"{project_dir}/data/processed/test.csv")
# pop() removes the target column, leaving only feature columns in test_df.
target = test_df.pop(params['data_params']['target'])
X = test_df
test_pool = Pool(X, target, params['data_params']['cat_features'])
print(model.predict(test_pool))
sys.exit(0)
# Every column whose dtype is not float64 is treated as categorical.
# `np.float` was only an alias of the builtin `float` and no longer exists in
# NumPy >= 1.24; the builtin gives the identical comparison.
categorical_features_indices = np.where(train_data.dtypes != float)[0]
params = {
    'iterations': 1000,
    'learning_rate': 0.16129990013229004,
    'eval_metric': 'F1',
    'random_seed': 42,
    'logging_level': 'Silent',
    'l2_leaf_reg': 1.0,
    'depth' : 5,
    'random_strength' : 1,
    'use_best_model': True
}
#train_pool = Pool(train_data, train_label, cat_features=categorical_features_indices)
# NOTE(review): the indices are computed from `train_data` but applied to
# `data` and `test_data` - presumably all share one column layout; confirm.
train_pool = Pool(data, data_label, cat_features=categorical_features_indices)
validate_pool = Pool(test_data, test_label, cat_features=categorical_features_indices)
model = CatBoostClassifier(**params)
# logging_level is passed again here with a different value than in `params`.
model.fit(train_pool,eval_set=validate_pool,
          logging_level='Verbose',
          plot=False
)
##cv_params = model.get_params()
##cv_params.update({
##    'loss_function': 'Logloss'
##})
##cv_data = cv(
def test_real_numbers_cat_features():
    """Declaring real-valued columns as categorical must raise CatboostError."""
    # Build the inputs outside the raises-block so that only the Pool
    # construction can satisfy the expectation.
    data = np.random.rand(100, 10)
    label = np.random.randint(2, size=100)
    with pytest.raises(CatboostError):
        Pool(data, label, [1, 2])
def test_predict_sklearn_regress():
    """Fit a 2-iteration regressor, save it, and compare with the canonical model."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    regressor = CatBoostRegressor(iterations=2, random_seed=0)
    regressor.fit(learn_pool)
    regressor.save_model(OUTPUT_MODEL_PATH)
    comparison = compare_canonical_models(OUTPUT_MODEL_PATH)
    return comparison
def test_load_file():
    """A pool loaded from TRAIN_FILE must pass the shape check."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_shape(file_pool)
def test_invalid_loss_base():
    """CatBoost must reject an unknown loss function with CatboostError."""
    # Load the data outside the raises-block: a data-loading failure must not
    # be mistaken for the expected invalid-loss error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoost({"loss_function": "abcdef"})
        model.fit(pool)
def test_load_ndarray():
    """A pool rebuilt from numpy arrays must pass the shape check."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_indices = file_pool.get_cat_feature_indices()
    mapped_features = map_cat_features(file_pool.get_features(), cat_indices)
    feature_array = np.array(mapped_features)
    label_array = np.array(file_pool.get_label())
    array_pool = Pool(feature_array, label_array, cat_indices)
    assert _check_shape(array_pool)
def test_invalid_loss_classifier():
    """CatBoostClassifier must reject an unknown loss function with CatboostError."""
    # Load the data outside the raises-block: a data-loading failure must not
    # be mistaken for the expected invalid-loss error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoostClassifier(loss_function="abcdef")
        model.fit(pool)
def train_features(models_dict, pools_dict, features, x_train, x_test, y_train,
                   y_test):
    """Train one CatBoost regressor per target feature and report test MAPE.

    Args:
        models_dict: Dict filled in place with ``feature -> fitted model``.
        pools_dict: Dict filled in place with ``feature -> test Pool``.
        features: Names of the target columns; one model is trained for each.
        x_train: Training inputs shared by all models.
        x_test: Test inputs shared by all models.
        y_train: Training targets, convertible to a frame with ``features``
            as columns.
        y_test: Test targets, convertible in the same way.

    Returns:
        The ``(models_dict, pools_dict)`` pair that was passed in, now filled.
    """
    learn_mape_train_df = pd.DataFrame()
    learn_rmse_train_df = pd.DataFrame()
    learn_mape_test_df = pd.DataFrame()
    learn_rmse_test_df = pd.DataFrame()
    categorical_features_indices = []

    # The conversion does not depend on the loop variable, so do it once
    # instead of rebuilding both frames on every iteration.
    y_train = pd.DataFrame(data=y_train, columns=features)
    y_test = pd.DataFrame(data=y_test, columns=features)

    for feature in features:
        train_pool = Pool(data=x_train,
                          label=y_train[feature],
                          cat_features=categorical_features_indices)

        # Targets with 'ssim' in their name get MAE loss and a higher learning
        # rate; all other hyper-parameters are shared.
        is_ssim = 'ssim' in feature
        loss_funct = 'MAE' if is_ssim else 'MAPE'
        learning_rate = 0.05 if is_ssim else 0.005
        num_trees = 500
        depth = 1
        l2_leaf_reg = 0.2

        models_dict[feature] = CatBoostRegressor(depth=depth,
                                                 num_trees=num_trees,
                                                 l2_leaf_reg=l2_leaf_reg,
                                                 learning_rate=learning_rate,
                                                 loss_function=loss_funct)
        # train the model
        print('Training QoE model:', feature)
        models_dict[feature].fit(train_pool)
        pools_dict[feature] = Pool(data=x_test,
                                   label=y_test[feature],
                                   cat_features=categorical_features_indices)

        # One pass per pool yields both metric curves.
        train_metrics = models_dict[feature].eval_metrics(
            train_pool, ['MAPE', 'RMSE'])
        test_metrics = models_dict[feature].eval_metrics(
            pools_dict[feature], ['MAPE', 'RMSE'])
        learn_mape_train_df[feature] = train_metrics['MAPE']
        learn_mape_test_df[feature] = test_metrics['MAPE']
        learn_rmse_train_df[feature] = train_metrics['RMSE']
        learn_rmse_test_df[feature] = test_metrics['RMSE']

    st.write('QoE model test set MAPE:')
    st.write(learn_mape_test_df.min())
    st.write(learn_mape_test_df.min().describe())
    return models_dict, pools_dict
def test_invalid_loss_regressor():
    """CatBoostRegressor must reject an unknown loss function with CatboostError."""
    # Load the data outside the raises-block: a data-loading failure must not
    # be mistaken for the expected invalid-loss error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoostRegressor(loss_function="fee")
        model.fit(pool)
def Model_cv(MODEL, k, X_train, X_test, y, RE, makepred=True, CatPos=None):
    """Run stratified k-fold cross-validation and collect out-of-fold predictions.

    Args:
        MODEL: Estimator exposing ``fit`` and ``predict_proba``.
        k: Number of folds.
        X_train: Training frame.
            NOTE(review): rows are selected with ``.loc`` and positional fold
            indices, which presumably requires a default RangeIndex - confirm.
        X_test: Test frame predicted by each fold model when ``makepred``.
        y: Binary target aligned with ``X_train``.
        RE: Random seed for the fold split.
        makepred: When true, also average the k test-set predictions.
        CatPos: Positions of categorical columns; when given, the training
            fold is wrapped in a catboost ``Pool`` with these positions.

    Returns:
        ``(Level_1_train, test_frame, score_total)`` when ``makepred`` is
        true, otherwise only ``score_total`` (AUC of the out-of-fold
        predictions).
    """
    # Create the k folds
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=RE)

    # First-level train and test containers
    Level_1_train = pd.DataFrame(np.zeros((X_train.shape[0], 1)),
                                 columns=['train_yhat'])
    if makepred:
        Level_1_test = pd.DataFrame()

    # Main loop over the folds. Stratify on the `y` argument; the previous
    # code read a module-level `Y_train` instead of its own parameter.
    for count, (train_index, test_index) in enumerate(kf.split(X_train, y),
                                                      start=1):
        train_rows = train_index.tolist()
        test_rows = test_index.tolist()

        # Define train and test for the current fold
        fold_train = X_train.loc[train_rows, :]
        fold_test = X_train.loc[test_rows, :]
        fold_ytrain = y[train_rows]
        fold_ytest = y[test_rows]

        # Fit on the (k-1) training folds
        if CatPos:
            # Use the positions passed in; the previous code read a
            # module-level `Pos` and ignored the CatPos value.
            pool_train = Pool(fold_train, fold_ytrain, cat_features=CatPos)
            MODEL.fit(X=pool_train)
        else:
            MODEL.fit(fold_train, fold_ytrain)

        # Predict on the held-out fold to evaluate the metric, and on the
        # training folds to expose overfitting
        p_fold = MODEL.predict_proba(fold_test)[:, 1]
        p_fold_train = MODEL.predict_proba(fold_train)[:, 1]

        # Score on the held-out fold
        score = roc_auc_score(fold_ytest, p_fold)
        score_train = roc_auc_score(fold_ytrain, p_fold_train)
        print(k, '-cv, Fold ', count, '\t --test AUC: ', round(score, 4),
              '\t--train AUC: ', round(score_train, 4), sep='')

        # Store the out-of-fold predictions
        Level_1_train.loc[test_rows, 'train_yhat'] = p_fold

        # Predict the real test set; the k columns are averaged below
        if makepred:
            name = 'p_' + str(count)
            real_pred = MODEL.predict_proba(X_test)[:, 1]
            real_pred = pd.DataFrame({name: real_pred}, columns=[name])
            Level_1_test = pd.concat((Level_1_test, real_pred), axis=1)

    # Metric of the concatenated out-of-fold predictions
    score_total = roc_auc_score(y, Level_1_train['train_yhat'])
    print('\n', k, '- cv, TOTAL AUC:', round((score_total) * 100, 4), '%')

    if makepred:
        # Mean of the k test-set predictions
        Level_1_test['model'] = Level_1_test.mean(axis=1)
        return (Level_1_train,
                pd.DataFrame({'test_yhat': Level_1_test['model']}),
                score_total)
    return score_total
def test_no_eval_set():
    """Requesting use_best_model without an eval set must raise CatboostError."""
    # Only the fit call is expected to fail; setup stays outside the
    # raises-block so that a setup error cannot pass the test spuriously.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    with pytest.raises(CatboostError):
        model.fit(pool, use_best_model=True)
# 5-fold stratified cross-validation with a fixed seed.
details = []
answers = []
mean_f1 = 0
n_splits = 5
sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
cnt = 0
for train, test in sk.split(train_data, label):
    # `train` and `test` are positional row indices for this fold.
    x_train = train_data.iloc[train]
    y_train = label.iloc[train]
    x_test = train_data.iloc[test]
    y_test = label.iloc[test]
    train_dataset = Pool(data=x_train,
                         label=y_train,
                         cat_features=cat_features)
    eval_dataset = Pool(data=x_test, label=y_test, cat_features=cat_features)
    # `model` is created outside the visible code and refitted on every fold.
    model.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)
    # NOTE(review): loop nesting of the three lines below was reconstructed
    # from collapsed source - confirm they belong inside the loop.
    importance_df = pd.DataFrame()
    importance_df["feature"] = x_train.columns.tolist()
    importance_df["importance"] = model.feature_importances_
def test_fit_no_label():
    """Fitting on features without labels must raise CatboostError."""
    # Only the fit call is expected to fail; setup stays outside the
    # raises-block so that a setup error cannot pass the test spuriously.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    features = pool.get_features()
    with pytest.raises(CatboostError):
        model.fit(features)
def calc_attributes(name, city):
    """Return the most important attribute values for one pokemon in one city.

    A binary CatBoost classifier ("is this sighting the pokemon ``name``?") is
    trained on the rows of ``data/final.csv`` whose city equals ``city``. The
    encoded features are ranked by importance, and for each attribute group
    (location, weather, day, density, time) at most the two highest-ranked
    values are kept.

    Args:
        name: Pokemon name used to build the binary target.
        city: City used to filter the data.

    Returns:
        DataFrame with columns ``loc``, ``weather``, ``day``, ``density``,
        ``time``. Rows are built with ``zip``, so the row count equals the
        shortest of the five per-group lists.
    """
    data = pd.read_csv('data/final.csv')
    # Work on a copy: assigning columns to a filtered slice of `data` triggers
    # pandas' chained-assignment warning and is not guaranteed to stick.
    pokemon = data[data['city'] == city].copy()
    pokemon['temp_new'] = pokemon['temperature'].apply(temperature_changer)

    weather_classes = [
        'Foggy', 'Clear', 'PartlyCloudy', 'MostlyCloudy', 'Overcast', 'Rain',
        'BreezyandOvercast', 'LightRain', 'Drizzle', 'BreezyandPartlyCloudy',
        'HeavyRain', 'BreezyandMostlyCloudy', 'Breezy', 'Windy',
        'WindyandFoggy', 'Humid', 'Dry', 'WindyandPartlyCloudy',
        'DangerouslyWindy', 'DryandMostlyCloudy', 'DryandPartlyCloudy',
        'DrizzleandBreezy', 'LightRainandBreezy', 'HumidandPartlyCloudy',
        'HumidandOvercast', 'RainandWindy'
    ]
    day_of_week = [
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "Sunday"
    ]
    time_of_day = ["Morning", "Afternoon", "Evening", "Night"]

    mapper = DataFrameMapper([
        ('close_to_water', LabelEncoder()),
        ('weather', MultiLabelBinarizer(classes=weather_classes)),
        ('temp_new', MultiLabelBinarizer(classes=['Cool', 'Mild', 'Hot'])),
        ('day', MultiLabelBinarizer(classes=day_of_week)),
        ('time', MultiLabelBinarizer(classes=time_of_day)),
        (['level_one'], [
            SimpleImputer(strategy='constant', fill_value='most_frequent'),
            LabelBinarizer()
        ]),
        ('population_density',
         MultiLabelBinarizer(classes=['Low', 'Medium', 'High'])),
    ], df_out=True)

    # Binary target: 1 for sightings of the requested pokemon, 0 otherwise.
    pokemon['target'] = (pokemon['name'] == name).astype(int)
    target = 'target'
    y = pokemon[target]
    X = pokemon.drop(target, axis=1)
    X = mapper.fit_transform(X)

    model = cb.CatBoostClassifier(iterations=150,
                                  logging_level='Silent',
                                  custom_loss=['AUC'],
                                  depth=None,
                                  l2_leaf_reg=7)
    model.fit(X, y, plot=False, verbose=False)

    ranking = pd.DataFrame(zip(X.columns, model.feature_importances_),
                           columns=['Feature',
                                    'Score']).sort_values(by='Score',
                                                          ascending=False)

    loc = []
    weather = []
    day = []
    density = []
    time = []
    # Checked in this order; a feature goes to the first group whose key it
    # contains and which still holds fewer than two values.
    groups = (
        ("level_one", loc),
        ("weather", weather),
        ("time", time),
        ("day", day),
        ("density", density),
    )
    # A separate loop variable keeps `ranking` intact (the old code reused the
    # frame's own name for the loop variable).
    for feature_name in ranking.Feature.values:
        for key, bucket in groups:
            if key in feature_name and len(bucket) <= 1:
                # The value is the part after the last underscore.
                bucket.append(feature_name.split('_')[-1])
                break

    res = pd.DataFrame(zip(loc, weather, day, density, time),
                       columns=[
                           'loc',
                           'weather',
                           'day',
                           'density',
                           'time',
                       ])
    return res
def test_predict_without_fit():
    """Predicting with an unfitted classifier must raise CatboostError."""
    # Only the predict call is expected to fail; setup stays outside the
    # raises-block so that a setup error cannot pass the test spuriously.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    with pytest.raises(CatboostError):
        model.predict(pool)
# 'nthread':12 } params['silent'] = 1 watchlist = [(xgb_train, 'train'), (xgb_eval, 'eval')] xgb_model = xgb.train(params, xgb_train, 5000, watchlist, early_stopping_rounds=40, verbose_eval=40) train_model_pred['xgb_pred'].iloc[test_index] += xgb_model.predict( xgb_eval) test_model_pred['xgb_pred'] += xgb_model.predict(xgb_test) print('开始cb训练...') train_pool = Pool(train_feat[predictors].iloc[train_index], train_feat['loan_sum'].iloc[train_index]) eval_pool = Pool(train_feat[predictors].iloc[test_index], train_feat['loan_sum'].iloc[test_index]) test_pool = Pool(test_feat[predictors]) cb_model = cb.CatBoostRegressor(iterations=400, depth=7, learning_rate=0.06, eval_metric='RMSE', od_type='Iter', od_wait=20, random_seed=42, thread_count=7, bagging_temperature=0.85, rsm=0.85, verbose=False) cb_model.fit(train_pool)
# DIVIDE TRAINDATA AND PREDDATA DX = D[D['DateTime'] < pd.to_datetime('2015')].copy() DY = D[D['DateTime'] >= pd.to_datetime('2015')].copy() DY = DY[DY['DateTime'] < pd.to_datetime('2016')].copy() # HOURLY CONCENTRATION ESTIMATION ############################################## RDX = DX[DX['ConcentrationObs'].notnull()].copy() m = CatBoostRegressor(learning_rate=LR_R, iterations=NT_R, logging_level='Silent') m.fit(RDX[FEAT], y=RDX['ConcentrationObs'], cat_features=CATS) DX['ConcentrationPred'] = m.predict(Pool(DX[FEAT], cat_features=CATS)) DY['ConcentrationPred'] = m.predict(Pool(DY[FEAT], cat_features=CATS)) FEAT += ['ConcentrationPred'] # HOURLY HIGH-CONCENTRATION PROBABILITY ESTIMATION ############################# m = CatBoostClassifier(learning_rate=LR_C, iterations=NT_C, logging_level='Silent') m.fit(DX[FEAT], y=DX['HourTarget'], cat_features=CATS) # DAILY HIGH-CONCENTRATION PROBABILITY ESTIMATION ############################## fday = lambda x: 1 - np.prod(1 - x.nlargest(5))