def test_load_generated():
    """Build a Pool from random in-memory arrays and check the data round-trips."""
    n_rows, n_cols = 100, 10
    features = np.round(np.random.normal(size=(n_rows, n_cols)), decimals=3)
    target = np.random.randint(2, size=n_rows)
    generated = Pool(features, target)
    # What the pool hands back must match what was put in.
    assert _check_data(generated.get_features(), features)
    assert _check_data(generated.get_label(), target)
def test_load_df_vs_load_from_file():
    """A Pool read from file must compare equal to one built from the same table in memory."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    # Re-read the same file as strings and split off the target column by hand.
    table = read_table(TRAIN_FILE, header=None, dtype=str)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    table.drop([TARGET_IDX], axis=1, inplace=True)

    array_pool = Pool(np.array(table), target, file_pool.get_cat_feature_indices())
    assert file_pool == array_pool
def test_zero_baseline():
    """Train a classifier on a pool with an all-zero baseline.

    Returns the result of compare_canonical_models for the saved model.
    """
    train = Pool(TRAIN_FILE, column_description=CD_FILE)
    train.set_baseline(np.zeros(train.num_row()))

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_non_ones_weight():
    """Train a classifier on a pool whose row weights are 1, 2, ..., num_row.

    Returns the result of compare_canonical_models for the saved model.
    """
    train = Pool(TRAIN_FILE, column_description=CD_FILE)
    row_weights = np.arange(1, train.num_row() + 1)
    train.set_weight(row_weights)

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_load_df():
    """A Pool built from a DataFrame must hold the same data as one read from file."""
    file_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)

    # Load the same file with pandas and separate the target column.
    frame = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(frame.iloc[:, TARGET_IDX])
    frame.drop([TARGET_IDX], axis=1, inplace=True)

    frame_pool = Pool(frame, target, file_pool.get_cat_feature_indices())
    assert _check_data(file_pool.get_features(), frame_pool.get_features())
    assert _check_data(file_pool.get_label(), frame_pool.get_label())
def test_fit_data():
    """Fit a MultiClass model from raw data with weights, baseline and an eval set.

    A first model supplies raw-formula predictions that are used as the
    baseline for both the training data and the evaluation pool of a second
    model. Returns the result of compare_canonical_models for the saved model.
    """
    train = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    holdout = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    def new_classifier():
        # Both stages use exactly the same hyperparameters.
        return CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")

    first_stage = new_classifier()
    first_stage.fit(train)
    train_baseline = np.array(first_stage.predict(train, prediction_type='RawFormulaVal'))
    holdout_baseline = np.array(first_stage.predict(holdout, prediction_type='RawFormulaVal'))
    holdout.set_baseline(holdout_baseline)

    second_stage = new_classifier()
    features = map_cat_features(train.get_features(), train.get_cat_feature_indices())
    row_weights = np.arange(1, train.num_row() + 1)
    second_stage.fit(
        features,
        train.get_label(),
        train.get_cat_feature_indices(),
        sample_weight=row_weights,
        baseline=train_baseline,
        use_best_model=True,
        eval_set=holdout,
    )
    second_stage.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_load_series():
    """A Pool built from pandas Series must hold the same data as one read from file."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    frame = read_table(TRAIN_FILE, header=None)
    target = Series(frame.iloc[:, TARGET_IDX])
    frame.drop([TARGET_IDX], axis=1, inplace=True)
    # One Series element per row, each element being that row's feature values.
    rows = Series(list(frame.values))

    series_pool = Pool(rows, target, file_pool.get_cat_feature_indices())
    assert _check_data(file_pool.get_features(), series_pool.get_features())
    assert _check_data(file_pool.get_label(), series_pool.get_label())
def test_pool_after_fit():
    """Fitting a model must leave the features of the training pool unchanged."""
    untouched = Pool(TRAIN_FILE, column_description=CD_FILE)
    fitted_on = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_data(untouched.get_features(), fitted_on.get_features())

    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(fitted_on)

    # Same comparison again, now that one of the pools has been used for training.
    assert _check_data(untouched.get_features(), fitted_on.get_features())
def test_no_cat_in_predict():
    """Predicting on raw mapped features must match predicting on an explicit Pool."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train_pool)

    def mapped_test_features():
        # Test features with categorical columns mapped using the training pool's indices.
        return map_cat_features(test_pool.get_features(),
                                train_pool.get_cat_feature_indices())

    raw_prediction = classifier.predict(mapped_test_features())
    wrapped = Pool(mapped_test_features(),
                   cat_features=train_pool.get_cat_feature_indices())
    pool_prediction = classifier.predict(wrapped)
    assert _check_data(raw_prediction, pool_prediction)
def test_load_dumps():
    """Dump random integer data to a TSV file and check a Pool loaded from it matches."""
    shape = (100, 10)
    features = np.random.randint(10, size=shape)
    target = np.random.randint(2, size=shape[0])
    array_pool = Pool(features, target)

    # One tab-separated line per row: the label first, then the feature values.
    rows = [
        '\t'.join([str(row_label)] + [str(value) for value in row])
        for row_label, row in zip(target, features)
    ]
    with open('test_data_dumps', 'w') as f:
        f.write('\n'.join(rows))

    file_pool = Pool('test_data_dumps')
    assert _check_data(array_pool.get_features(), file_pool.get_features())
    assert _check_data(array_pool.get_label(), file_pool.get_label())
def test_fit_no_label():
    """Fitting on a bare feature matrix, without labels, must raise CatboostError."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    features = pool.get_features()
    # Keep the fixture setup outside the `raises` block: only the fit call is
    # expected to fail, so a CatboostError raised while loading the pool must
    # not be mistaken for the error under test.
    with pytest.raises(CatboostError):
        model.fit(features)
def main_task():
    """Sample rows, score them with the model and render the results in Streamlit.

    Reads the module-level names ``manual_entry``, ``df``, ``num_prediccions``,
    ``cols_input`` and ``cols_empty``. Writes two text lines and a Plotly chart
    to the Streamlit page.

    Returns:
        A tuple ``(test_data, res, shap_values, tmp_df, fig)``: the Pool that
        was scored, the processed feature frame, the values returned by
        ``get_feature_importance(type='ShapValues')``, the sampled input frame
        and the Plotly figure.
    """
    # A manual entry is scored as a single row; otherwise draw the configured number.
    sample_size = 1 if manual_entry else num_prediccions
    tmp_df = get_sample(df, sample_size)

    res = process_data(tmp_df, cols_input, cols_empty, manual_entry)
    # Remove both spellings of the target column so neither is fed to the model.
    test_real_labels = res.pop('Pedido_real')
    res.pop('Pedido real')

    # Every column that is not float32/float64 is passed to CatBoost as categorical.
    column_types = res.dtypes
    is_categorical = (column_types != 'float32') & (column_types != 'float64')
    cat_features = np.where(is_categorical)[0]

    test_data = Pool(res, test_real_labels, cat_features=cat_features)
    model = get_model()
    # NOTE: no label scaling is applied; predictions are used exactly as returned.
    preds = model.predict(test_data)
    rounded_preds = [int(np.round(p)) for p in preds]

    st.text("Predicción de pedidos: {}".format(rounded_preds))
    st.text("Pedido real realizado: {}".format(list(test_real_labels)))

    resultados = pd.DataFrame(
        np.stack([rounded_preds, np.array(test_real_labels)], axis=1),
        columns=['Predicciones', 'Pedidos reales'],
    )

    fig = go.Figure()
    # Real orders are drawn first, then predictions; the column name doubles as legend label.
    for column in ('Pedidos reales', 'Predicciones'):
        fig.add_trace(go.Scatter(x=resultados.index,
                                 y=resultados[column],
                                 mode='lines+markers',
                                 name=column))
    st.plotly_chart(fig, use_container_width=True)

    shap_values = model.get_feature_importance(
        data=test_data,
        type='ShapValues',
        shap_calc_type='Approximate',
    )
    return test_data, res, shap_values, tmp_df, fig
#print(clf.predict(X_test[104])) cm = confusion_matrix(y_test, y_pred) from sklearn.metrics import recall_score, precision_score print(recall_score(y_test, y_pred, average='macro')) print(precision_score(y_test, y_pred, average='micro')) print(accuracy_score(y_test, y_pred)) #cr0ss validati0n cv_params = clf.get_params() cv_params.update({'loss_function': 'Logloss'}) cv_data = cv(Pool(X, y, cat_features=cat_featuresind), cv_params, plot=True) print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format( np.max(cv_data['test-Accuracy-mean']), cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])], np.argmax(cv_data['test-Accuracy-mean']))) print('Precise validation accuracy score: {}'.format( np.max(cv_data['test-Accuracy-mean']))) """ importances = clf.feature_importances_ print(clf.feature_importances_) plt.title('Feature Importances ') plt.barh(range(len(cat_featuresind)), importances[cat_featuresind], color='b', align='center') #plt.yticks(dataset[i][0] for i in cat_featuresind)
def fit(self, Xc: FloatTensor, Xe: LongTensor, y: FloatTensor):
    """Fit ``self.model`` on the given inputs and targets.

    The inputs are first passed through ``filter_nan(..., 'all')``
    (presumably NaN filtering; confirm against its definition), then combined
    by ``self.xtrans`` into the feature data of a CatBoost ``Pool``. The target
    tensor is flattened to one dimension before it is used as the label.
    """
    Xc, Xe, y = filter_nan(Xc, Xe, y, 'all')
    features = self.xtrans(Xc=Xc, Xe=Xe)
    targets = y.numpy().reshape(-1)
    self.model.fit(Pool(data=features, label=targets))
def test_fit_data():
    """Fit a MultiClass model from raw data with weights, baseline and an eval set.

    A first model supplies raw-formula predictions that are used as the
    baseline for both the training data and the evaluation pool of a second
    model. Returns the result of local_canonical_file for the saved model.
    """
    train = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    holdout = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    def new_classifier():
        # Both stages use exactly the same hyperparameters.
        return CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")

    first_stage = new_classifier()
    first_stage.fit(train)
    train_baseline = np.array(first_stage.predict(train, prediction_type='RawFormulaVal'))
    holdout_baseline = np.array(first_stage.predict(holdout, prediction_type='RawFormulaVal'))
    holdout._set_baseline(holdout_baseline)

    second_stage = new_classifier()
    features = map_cat_features(train.get_features(), train.get_cat_feature_indices())
    row_weights = np.arange(1, train.num_row() + 1)
    second_stage.fit(
        features,
        train.get_label(),
        train.get_cat_feature_indices(),
        sample_weight=row_weights,
        baseline=train_baseline,
        use_best_model=True,
        eval_set=holdout,
    )
    second_stage.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
x_train, x_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=2019) y_pred_lgb = np.zeros(len(x_test)) catboost_model = CatBoostRegressor(custom_metric='MAE', eval_metric='MAE', learning_rate=0.1, l2_leaf_reg=5, early_stopping_rounds=100, num_trees=2000, loss_function='MAE', verbose=True) train_pool = Pool(x_train, y_train) val_pool = Pool(x_valid, y_valid) catboost_model.fit(train_pool, eval_set=val_pool, verbose_eval=100) test_pool = Pool(x_test) y_pred_lgb = catboost_model.predict(test_pool) result = pd.read_csv('/cos_person/tencent/train/test_id.csv') ['sample_id', 'ad_id'] result['ecpm'] = y_pred_lgb result_tmp = result[:10] request_ecpm = pd.read_csv('/cos_person/tencent/train/max_total.csv', header=None) request_ecpm.columns = [ 'Ad_Request_id', 'Ad_Request_Time', 'user_id', 'Ad_pos_id', 'test_ad_id',
def test_predict_without_fit():
    """Calling predict on a model that was never fitted must raise CatboostError."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    # Only the predict call is expected to fail; a CatboostError raised while
    # loading the pool must not be mistaken for the error under test.
    with pytest.raises(CatboostError):
        model.predict(pool)
def test_invalid_loss_regressor():
    """An unknown loss_function on CatBoostRegressor must raise CatboostError."""
    # Load the pool outside the `raises` block so that a data-loading failure
    # cannot make this test pass for the wrong reason.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # Construction stays inside the block: the invalid loss may be rejected
    # either when the model is created or when it is fitted.
    with pytest.raises(CatboostError):
        model = CatBoostRegressor(loss_function="fee")
        model.fit(pool)
def test_invalid_loss_classifier():
    """An unknown loss_function on CatBoostClassifier must raise CatboostError."""
    # Load the pool outside the `raises` block so that a data-loading failure
    # cannot make this test pass for the wrong reason.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # Construction stays inside the block: the invalid loss may be rejected
    # either when the model is created or when it is fitted.
    with pytest.raises(CatboostError):
        model = CatBoostClassifier(loss_function="abcdef")
        model.fit(pool)
def test_python_export_with_cat_features():
    """Train on the default train file and export the model in "python" format.

    Returns the result of local_canonical_file for the exported file.
    """
    training_params = {'iterations': 20, 'random_seed': 0}
    booster = CatBoost(training_params)
    booster.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    booster.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")
    return local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)
def test_predict_sklearn_regress():
    """Train a two-iteration CatBoostRegressor and save it.

    Returns the result of compare_canonical_models for the saved model.
    """
    regressor = CatBoostRegressor(iterations=2, random_seed=0)
    regressor.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    regressor.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_pool_cat_features():
    """The categorical feature indices read from file must equal CAT_FEATURES."""
    loaded = Pool(TRAIN_FILE, column_description=CD_FILE)
    indices_match = loaded.get_cat_feature_indices() == CAT_FEATURES
    assert np.all(indices_match)
def _fit(self,
         X,
         y,
         X_val=None,
         y_val=None,
         time_limit=None,
         num_gpus=0,
         sample_weight=None,
         sample_weight_val=None,
         **kwargs):
    """Train a CatBoost model on ``X``/``y`` and store it in ``self.model``.

    When ``time_limit`` is set, training happens in two stages: a short
    "sample" fit measures the time and pickled size per iteration, then a
    final fit runs with an iteration budget capped by the remaining time and
    by available memory. Afterwards the better of the two models (by best
    validation score) is kept. Without a time limit a single fit is done with
    the configured parameters.

    Args:
        X: Training features; must expose ``.columns`` and be accepted by
            ``self.preprocess`` (which must return an object with
            ``select_dtypes``).
        y: Training labels. For SOFTCLASS problems ``y.shape[1]`` is read as
            the number of classes.
        X_val: Optional validation features. When ``None`` no eval set is
            used and early stopping is disabled.
        y_val: Validation labels, used only with ``X_val``.
        time_limit: Optional time budget, in the same unit as ``time.time()``.
        num_gpus: When non-zero and ``task_type`` is not already set in the
            model params, training is switched to ``task_type='GPU'``.
        sample_weight: Optional per-row weights for the training pool.
        sample_weight_val: Optional per-row weights for the validation pool.
        **kwargs: Only ``verbosity`` (default 2) is read.

    Raises:
        NotEnoughMemoryError: If the crude memory estimate exceeds
            ``max_memory_usage_ratio`` times the available memory.
        TimeLimitExceeded: If at most 40% of ``time_limit`` remains once
            preprocessing is done.
    """
    try_import_catboost()
    from catboost import CatBoostClassifier, CatBoostRegressor, Pool
    ag_params = self._get_ag_params()
    params = self._get_model_params()
    if self.problem_type == SOFTCLASS:
        # FIXME: This is extremely slow due to unoptimized metric / objective sent to CatBoost
        from .catboost_softclass_utils import SoftclassCustomMetric, SoftclassObjective
        params['loss_function'] = SoftclassObjective.SoftLogLossObjective()
        params['eval_metric'] = SoftclassCustomMetric.SoftLogLossMetric()

    model_type = CatBoostClassifier if self.problem_type in PROBLEM_TYPES_CLASSIFICATION else CatBoostRegressor
    # The metric name is needed later to look up the best validation score.
    if isinstance(params['eval_metric'], str):
        metric_name = params['eval_metric']
    else:
        metric_name = type(params['eval_metric']).__name__
    num_rows_train = len(X)
    num_cols_train = len(X.columns)
    if self.problem_type == MULTICLASS:
        if self.num_classes is not None:
            num_classes = self.num_classes
        else:
            num_classes = 10  # Guess if not given, can do better by looking at y
    elif self.problem_type == SOFTCLASS:  # TODO: delete this elif if it's unnecessary.
        num_classes = y.shape[1]
    else:
        num_classes = 1

    # TODO: Add ignore_memory_limits param to disable NotEnoughMemoryError Exceptions
    max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
    approx_mem_size_req = num_rows_train * num_cols_train * num_classes / 2  # TODO: Extremely crude approximation, can be vastly improved
    if approx_mem_size_req > 1e9:  # > 1 GB
        available_mem = psutil.virtual_memory().available
        ratio = approx_mem_size_req / available_mem
        if ratio > (1 * max_memory_usage_ratio):
            logger.warning(
                '\tWarning: Not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                % (round(approx_mem_size_req / 1e9, 3), round(available_mem / 1e9, 3)))
            raise NotEnoughMemoryError
        elif ratio > (0.2 * max_memory_usage_ratio):
            logger.warning(
                '\tWarning: Potentially not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                % (round(approx_mem_size_req / 1e9, 3), round(available_mem / 1e9, 3)))

    start_time = time.time()
    X = self.preprocess(X)
    cat_features = list(X.select_dtypes(include='category').columns)
    X = Pool(data=X, label=y, cat_features=cat_features, weight=sample_weight)

    if X_val is None:
        eval_set = None
        num_sample_iter_max = 50
        early_stopping_rounds = None
    else:
        X_val = self.preprocess(X_val)
        X_val = Pool(data=X_val, label=y_val, cat_features=cat_features, weight=sample_weight_val)
        eval_set = X_val
        # Use fewer sample iterations once there are more than 10000 rows.
        modifier = min(1.0, 10000 / num_rows_train)
        num_sample_iter_max = max(round(modifier * 50), 2)
        early_stopping_rounds = ag_params.get('ag.early_stop', 'auto')
        if isinstance(early_stopping_rounds, str):
            early_stopping_rounds = self._get_early_stopping_rounds(
                num_rows_train=num_rows_train, strategy=early_stopping_rounds)

    if params.get('allow_writing_files', False):
        if 'train_dir' not in params:
            try:
                # TODO: What if path is in S3?
                os.makedirs(os.path.dirname(self.path), exist_ok=True)
            except Exception:
                # Best effort: if the directory cannot be created, train_dir is
                # simply left unset. `Exception` (not a bare except) so that
                # KeyboardInterrupt / SystemExit still propagate.
                pass
            else:
                params['train_dir'] = self.path + 'catboost_info'

    # TODO: Add more control over these params (specifically early_stopping_rounds)
    verbosity = kwargs.get('verbosity', 2)
    if verbosity <= 1:
        verbose = False
    elif verbosity == 2:
        verbose = False
    elif verbosity == 3:
        verbose = 20
    else:
        verbose = True

    init_model = None
    init_model_tree_count = None
    init_model_best_score = None

    num_features = len(self._features)

    if num_gpus != 0:
        if 'task_type' not in params:
            params['task_type'] = 'GPU'
            logger.log(
                20,
                f'\tTraining {self.name} with GPU, note that this may negatively impact model quality compared to CPU training.'
            )
            # TODO: Confirm if GPU is used in HPO (Probably not)
            # TODO: Adjust max_bins to 254?

    if params.get('task_type', None) == 'GPU':
        if 'colsample_bylevel' in params:
            params.pop('colsample_bylevel')
            logger.log(
                30,
                f'\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
            )
        if 'rsm' in params:
            params.pop('rsm')
            logger.log(
                30,
                f'\t\'rsm\' is not supported on GPU, using default value (Default = 1).'
            )

    if self.problem_type == MULTICLASS and 'rsm' not in params and 'colsample_bylevel' not in params and num_features > 1000:
        if time_limit:
            # Reduce sample iterations to avoid taking unreasonable amounts of time
            num_sample_iter_max = max(round(num_sample_iter_max / 2), 2)
        # Subsample columns to speed up training
        if params.get('task_type', None) != 'GPU':  # RSM does not work on GPU
            params['colsample_bylevel'] = max(
                min(1.0, 1000 / num_features), 0.05)
            logger.log(
                30,
                f'\tMany features detected ({num_features}), dynamically setting \'colsample_bylevel\' to {params["colsample_bylevel"]} to speed up training (Default = 1).'
            )
            logger.log(
                30,
                f'\tTo disable this functionality, explicitly specify \'colsample_bylevel\' in the model hyperparameters.'
            )
        else:
            params['colsample_bylevel'] = 1.0
            logger.log(
                30,
                f'\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
            )

    logger.log(15, f'\tCatboost model hyperparameters: {params}')

    if time_limit:
        time_left_start = time_limit - (time.time() - start_time)
        if time_left_start <= time_limit * 0.4:  # if 60% of time was spent preprocessing, likely not enough time to train model
            raise TimeLimitExceeded
        # Stage 1: a short sample fit to measure per-iteration time and model size.
        params_init = params.copy()
        num_sample_iter = min(num_sample_iter_max, params_init['iterations'])
        params_init['iterations'] = num_sample_iter
        self.model = model_type(**params_init, )
        self.model.fit(
            X,
            eval_set=eval_set,
            use_best_model=True,
            verbose=verbose,
            # early_stopping_rounds=early_stopping_rounds,
        )

        init_model_tree_count = self.model.tree_count_
        init_model_best_score = self._get_best_val_score(
            self.model, metric_name)

        time_left_end = time_limit - (time.time() - start_time)
        time_taken_per_iter = (time_left_start - time_left_end) / num_sample_iter
        estimated_iters_in_time = round(time_left_end / time_taken_per_iter)
        init_model = self.model

        if self.stopping_metric._optimum == init_model_best_score:
            # Done, pick init_model
            params_final = None
        else:
            params_final = params.copy()

            # TODO: This only handles memory with time_limit specified, but not with time_limit=None, handle when time_limit=None
            available_mem = psutil.virtual_memory().available
            if self.problem_type == SOFTCLASS:  # TODO: remove this once catboost-dev is no longer necessary and SOFTCLASS objectives can be pickled.
                model_size_bytes = 1  # skip memory check
            else:
                model_size_bytes = sys.getsizeof(pickle.dumps(self.model))

            max_memory_proportion = 0.3 * max_memory_usage_ratio
            mem_usage_per_iter = model_size_bytes / num_sample_iter
            max_memory_iters = math.floor(
                available_mem * max_memory_proportion / mem_usage_per_iter)
            if params.get('task_type', None) == 'GPU':
                # Cant use init_model
                iterations_left = params['iterations']
            else:
                iterations_left = params['iterations'] - num_sample_iter
            params_final['iterations'] = min(iterations_left, estimated_iters_in_time)
            if params_final['iterations'] > max_memory_iters - num_sample_iter:
                # The warning is only emitted when the memory cap is small;
                # the cap itself is always applied.
                if max_memory_iters - num_sample_iter <= 500:
                    logger.warning(
                        '\tWarning: CatBoost will be early stopped due to lack of memory, increase memory to enable full quality models, max training iterations changed to %s from %s'
                        % (max_memory_iters, params_final['iterations'] + num_sample_iter))
                params_final['iterations'] = max_memory_iters - num_sample_iter
    else:
        params_final = params.copy()

    # Stage 2 (or the only stage when there is no time limit): the final fit.
    if params_final is not None and params_final['iterations'] > 0:
        self.model = model_type(**params_final, )
        fit_final_kwargs = dict(
            eval_set=eval_set,
            verbose=verbose,
            early_stopping_rounds=early_stopping_rounds,
        )

        # TODO: Strangely, this performs different if clone init_model is sent in than if trained for same total number of iterations. May be able to optimize catboost models further with this
        warm_start = False
        if params_final.get('task_type', None) == 'GPU':
            # Cant use init_model
            fit_final_kwargs['use_best_model'] = True
        elif init_model is not None:
            fit_final_kwargs['init_model'] = init_model
            warm_start = True
        self.model.fit(X, **fit_final_kwargs)

        if init_model is not None:
            final_model_best_score = self._get_best_val_score(
                self.model, metric_name)
            if self.stopping_metric._optimum == init_model_best_score:
                # Done, pick init_model
                self.model = init_model
            else:
                # A score above the metric's optimum is negated, so that
                # ">=" below compares both scores as higher-is-better.
                if (init_model_best_score > self.stopping_metric._optimum
                        ) or (final_model_best_score > self.stopping_metric._optimum):
                    init_model_best_score = -init_model_best_score
                    final_model_best_score = -final_model_best_score

                if warm_start:
                    if init_model_best_score >= final_model_best_score:
                        self.model = init_model
                    else:
                        # The warm-started model's best iteration is offset by
                        # the init model's tree count before shrinking.
                        best_iteration = init_model_tree_count + self.model.get_best_iteration()
                        self.model.shrink(ntree_start=0,
                                          ntree_end=best_iteration + 1)
                else:
                    if init_model_best_score >= final_model_best_score:
                        self.model = init_model

    self.params_trained['iterations'] = self.model.tree_count_
'%Y-%m-%d') in hk_2018_holidays) * 1 y = dataset['speed'] X = dataset.drop(["id", "date", "speed"], axis=1) X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=1) # Initialize CatBoostRegressor model = CatBoostRegressor(iterations=1100, learning_rate=0.09, depth=10, use_best_model=True, l2_leaf_reg=2) eval_dataset = Pool(X_eval, y_eval) # Fit model model.fit(X_train, y_train, eval_set=eval_dataset, verbose=False) # Get predictions print(model.get_best_score()) test_set = pd.read_csv('test.csv') test_set["date"] = pd.to_datetime(test_set["date"], format="%d/%m/%Y %H:%M") test_set["year"] = test_set["date"].apply(lambda x: x.year) test_set["month"] = test_set["date"].apply(lambda x: x.month) test_set["day"] = test_set["date"].apply(lambda x: x.day) test_set["hour"] = test_set["date"].apply(lambda x: x.hour) test_set["weekday"] = test_set["date"].apply(lambda x: x.isoweekday()) test_set['hour_of_week'] = (test_set["weekday"] * 24 - 24) + test_set["hour"] test_set['hour_of_month'] = (test_set["month"] * 24 - 24) + test_set["hour"] test_set["quarter"] = test_set["date"].apply(lambda x: x.quarter)
def test_real_numbers_cat_features():
    """Declaring real-valued columns as categorical features must raise CatboostError."""
    data = np.random.rand(100, 10)
    label = np.random.randint(2, size=100)
    # Only the Pool construction is expected to fail, so the random data is
    # generated outside the `raises` block.
    with pytest.raises(CatboostError):
        Pool(data, label, [1, 2])
def test_predict_sklearn_class():
    """Train a two-iteration CatBoostClassifier and save it.

    Returns the result of local_canonical_file for the saved model.
    """
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    classifier.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
def test_feature_importance():
    """Save the fitted model's feature_importances_ to FIMP_PATH.

    Returns the result of local_canonical_file for that file.
    """
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    importances = np.array(classifier.feature_importances_)
    np.save(FIMP_PATH, importances)
    return local_canonical_file(FIMP_PATH)
def test_invalid_loss():
    """An unknown loss_function passed to CatBoost must raise CatboostError."""
    # Load the pool outside the `raises` block so that a data-loading failure
    # cannot make this test pass for the wrong reason.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # Construction stays inside the block: the invalid loss may be rejected
    # either when the model is created or when it is fitted.
    with pytest.raises(CatboostError):
        model = CatBoost({"loss_function": "abcdef"})
        model.fit(pool)
def test_load_file():
    """A Pool loaded from the train file must pass the _check_shape helper."""
    loaded = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_shape(loaded)
def test_one_doc_feature_importance():
    """Save the 'Doc' feature importance of a single all-ones document.

    Returns the result of local_canonical_file for the saved array.
    """
    train = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(train)

    # One synthetic document: an integer 1 in every column of the training pool.
    document = np.ones(train.num_col(), dtype=int)
    doc_importance = classifier.get_feature_importance(
        document,
        0,
        cat_features=train.get_cat_feature_indices(),
        fstr_type='Doc',
    )
    np.save(FIMP_PATH, np.array(doc_importance))
    return local_canonical_file(FIMP_PATH)
plt.ylabel(labl) plt.title("mean " + labl+": "+ str(round(obj.mean(),1))) plt.show() plt.figure(0) plot1(rrmse, "RRMSE") plt.figure(1) plot1(rmse, "RMSE") plt.figure(2) plot1(r1 , "Rsquared") #ligtht X_train, Y_train, X_test, Y_test = preprocessing(ap, False, False, select_hr=None) #catboost X_train, Y_train, X_test, Y_test = preprocessing(ap, False, False, select_hr=None) train_pool = Pool(X_train, Y_train, cat_features=["hours"]) test_pool = Pool(X_test, cat_features=["hours"]) model = CatBoostRegressor(iterations=300, depth=6, learning_rate=0.007, loss_function='RMSE') #train the model model.fit(train_pool) # make the prediction using the resulting model preds = model.predict(test_pool) print(preds) #1 = np.delete(r1, np.where(r1 ==16))
fold_importance["fold"] = fold_n + 1 feature_importance = pd.concat( [feature_importance, fold_importance], axis=0) bond_scores.append(mean_absolute_error(y_valid, y_pred_valid)) logger.info('CV mean score: {0:.4f}, std: {1:.4f}.'.format( np.mean(bond_scores), np.std(bond_scores))) oof[valid_idx] = y_pred_valid.reshape(-1,) prediction_type += y_pred elif MODEL_TYPE == 'catboost': fold_start = timer() logger.info('Running Type {} - Fold {} of {}'.format(bond_type, fold_count, folds.n_splits)) X_train, X_valid = X_type.iloc[train_idx], X_type.iloc[valid_idx] y_train, y_valid = y_type.iloc[train_idx], y_type.iloc[valid_idx] train_dataset = Pool(data=X_train.drop('type', axis=1), label=y_train) valid_dataset = Pool(data=X_valid.drop('type', axis=1), label=y_valid) test_dataset = Pool(data=X_test_type.drop('type', axis=1)) DEPTH = 7 update_tracking(run_id, 'depth', DEPTH) model = CatBoostRegressor(iterations=N_ESTIMATORS, learning_rate=LEARNING_RATE, depth=DEPTH, eval_metric=EVAL_METRIC, verbose=VERBOSE, random_state = RANDOM_STATE, thread_count=N_THREADS, #loss_function=EVAL_METRIC, task_type = "GPU") # Train on GPU model.fit(train_dataset,
def _get_train_test_pool(dataset):
    """Return ``(train_pool, test_pool)`` for ``dataset``.

    The file paths come from ``_get_train_test_cd_path``; both pools share
    the same column-description file.
    """
    train_path, test_path, cd_path = _get_train_test_cd_path(dataset)
    pools = tuple(Pool(path, column_description=cd_path)
                  for path in (train_path, test_path))
    return pools
# Separate the features from the target; the model is trained on log(price).
trained = trained_set.drop('price', axis=1)
trained_price = np.log(trained_set['price'])
# First split: hold back 33% of the rows (shuffle=False keeps the row order).
X_trained, X_test, y_trained, y_test = train_test_split(trained, trained_price, test_size=0.33, random_state=42, shuffle=False)
# Second split: the held-back rows are divided again. X_test / y_test are
# rebound to the part kept for testing; X_values / y_values become the
# validation part used as the eval set.
X_test, X_values, y_test, y_values = train_test_split(X_test, y_test, test_size=0.33, random_state=42, shuffle=False)
trained_pool = Pool(X_trained.values, y_trained.values)
test_pool = Pool(X_test.values)
values_pool = Pool(X_values.values, y_values.values)
cbr = CatBoostRegressor(iterations=99, depth=10, learning_rate=0.3, loss_function='RMSE', random_seed=42, eval_metric='RMSE', use_best_model=True)
cbr.fit(trained_pool, eval_set=values_pool, early_stopping_rounds=80)
predictions = cbr.predict(test_pool)
# Report RMSE only. It is computed on the log-price scale, because both
# y_test and the predictions derive from the log-transformed target.
print('RMSE: {}'.format(
    math.sqrt(mean_squared_error(y_test.values, predictions))))
def get_pool(data, labels, cat_f):
    """Wrap ``data`` and ``labels`` in a CatBoost Pool.

    ``cat_f`` is forwarded unchanged as the Pool's ``cat_features`` argument.
    """
    pool = Pool(data, labels, cat_features=cat_f)
    return pool
c.append(i[0]) data.columns = c # Use select features print("Preparing data...") data = data.loc[:, [ 'suburb', 'rooms', 'type', 'price', 'postcode', 'bathroom', 'car' ]] data = data.dropna() X = data.drop(columns=["price"]) y = data["price"] # Split data for training and evaluation X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) categorical = ['suburb', 'type', 'postcode'] train_pool = Pool(X_train, y_train, cat_features=categorical) test_pool = Pool(X_test, cat_features=categorical) # Find optimal hyper-parameters print("Training model...") model = CatBoostRegressor(loss_function="RMSE", logging_level=None) grid = { 'learning_rate': [0.01, 0.03, 0.06, 0.09, 0.12], 'depth': [4, 6, 8, 10], 'l2_leaf_reg': [1, 3, 5, 7, 9], 'random_strength': [2, 4], 'bagging_temperature': [0, 1], } grid_search_result = model.randomized_search(grid, X=train_pool, search_by_train_test_split=True,
def test_cv_logging():
    """Run a five-iteration Logloss cross-validation on the default train file.

    Returns the result of local_canonical_file for the JSON log after it has
    been passed through remove_time_from_json.
    """
    cv_params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    train = Pool(TRAIN_FILE, column_description=CD_FILE)
    cv(train, cv_params)
    cleaned_log = remove_time_from_json(JSON_LOG_PATH)
    return local_canonical_file(cleaned_log)
## Submission for the earlier model's predictions
sub_df = pd.DataFrame({"ID_code":test_df["ID_code"].values})
sub_df["target"] = predictions
sub_df.to_csv("lgb_submission.csv", index=False)

## Catboost : https://www.kaggle.com/wakamezake/starter-code-catboost-baseline
from catboost import Pool, CatBoostClassifier
model = CatBoostClassifier(loss_function="Logloss", eval_metric="AUC")
kf = KFold(n_splits=5, random_state=42, shuffle=True)
# Out-of-fold predictions, same index as `target`, initialised to zero.
y_valid_pred = 0 * target
# Running sum of the per-fold test predictions; averaged after the loop.
y_test_pred = 0
for idx, (train_index, valid_index) in enumerate(kf.split(train_df)):
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    X_train, X_valid = train_df[features].iloc[train_index,:], train_df[features].iloc[valid_index,:]
    _train = Pool(X_train, label=y_train)
    _valid = Pool(X_valid, label=y_valid)
    print( "\nFold ", idx)
    # The same model object is re-fitted on every fold.
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=200
                          )
    # Probability of the positive class on the held-out fold.
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( " auc = ", roc_auc_score(y_valid, pred) )
    y_valid_pred.iloc[valid_index] = pred
    y_test_pred += fit_model.predict_proba(test_df[features])[:,1]
# Average over the folds; read the count from the splitter so it cannot
# drift from n_splits above (previously a hard-coded 5).
y_test_pred /= kf.get_n_splits()
## Submission for the CatBoost predictions
sub_df1 = pd.DataFrame({"ID_code":test_df["ID_code"].values})
sub_df1["target"] = y_test_pred
def test_load_ndarray():
    """A Pool rebuilt from ndarrays of a file-backed pool must pass _check_shape."""
    source = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_idx = source.get_cat_feature_indices()
    feature_array = np.array(map_cat_features(source.get_features(), cat_idx))
    label_array = np.array(source.get_label())
    rebuilt = Pool(feature_array, label_array, cat_idx)
    assert _check_shape(rebuilt)
def test_cv_with_not_binarized_target():
    """Run a five-iteration Logloss cross-validation on the 'adult_not_binarized' data.

    Returns the result of local_canonical_file for the JSON log after it has
    been passed through remove_time_from_json.
    """
    dataset = 'adult_not_binarized'
    train = Pool(data_file(dataset, 'train_small'),
                 column_description=data_file(dataset, 'train.cd'))
    cv_params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    cv(train, cv_params)
    cleaned_log = remove_time_from_json(JSON_LOG_PATH)
    return local_canonical_file(cleaned_log)
# Fit model # index of ORIGIN col; only categorical cat_ft_indices = np.where(X_train.dtypes != np.int64)[0] cb_model.fit(X_train, y_train, cat_features=cat_ft_indices, eval_set=(X_test, y_test), plot=True) # Accuracy & Cross-Validation cb_accuracy = cb_model.score(X_train, y_train) # cv train_pool = Pool(X_train, y_train, cat_ft_indices) cross_val_paramt = cb_model.get_params() cross_val_results = cv(pool=train_pool, params=cross_val_paramt, fold_count=10, plot=True) cb_cross_val_acc_avg = np.mean(cross_val_results['test-MultiClass-mean']) cb_cross_val_acc_min = np.min(cross_val_results['test-MultiClass-mean']) cb_cross_val_acc_max = np.max(cross_val_results['test-MultiClass-mean']) print('Average Cross Validation Score:', round(cb_cross_val_acc_avg * 100,