def test_copy_model():
    """A copied model must predict identically to its source and serialize canonically."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    source_model = CatBoostRegressor(iterations=5, random_seed=0)
    source_model.fit(pool)
    cloned_model = source_model.copy()
    # Predictions from the clone must match the original exactly.
    source_predictions = source_model.predict(pool)
    cloned_predictions = cloned_model.predict(pool)
    assert _check_data(source_predictions, cloned_predictions)
    cloned_model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_shap():
    """SHAP values must sum (including the bias term) to the raw prediction,
    and must serialize to the canonical fstr file.

    Bug fix: the original called ``out.write(shap_values)`` with a numpy
    array; ``TextIOWrapper.write`` requires ``str`` and raises TypeError.
    """
    train_pool = Pool([[0, 0], [0, 1], [1, 0], [1, 1]], [0, 1, 5, 8], cat_features=[])
    test_pool = Pool([[0, 0], [0, 1], [1, 0], [1, 1]])
    model = CatBoostRegressor(iterations=1, random_seed=0, max_ctr_complexity=1, depth=2)
    model.fit(train_pool)
    shap_values = model.get_feature_importance(test_pool, fstr_type='ShapValues')

    dataset = [(0.5, 1.2), (1.6, 0.5), (1.8, 1.0), (0.4, 0.6), (0.3, 1.6), (1.5, 0.2)]
    labels = [1.1, 1.85, 2.3, 0.7, 1.1, 1.6]
    train_pool = Pool(dataset, labels, cat_features=[])

    model = CatBoost({'iterations': 10, 'random_seed': 0, 'max_ctr_complexity': 1})
    model.fit(train_pool)

    testset = [(0.6, 1.2), (1.4, 0.3), (1.5, 0.8), (1.4, 0.6)]
    predictions = model.predict(testset)
    shap_values = model.get_feature_importance(Pool(testset), fstr_type='ShapValues')
    assert len(predictions) == len(shap_values)
    for pred_idx in range(len(predictions)):
        # SHAP contributions (last column is the bias) must reproduce the prediction.
        assert abs(sum(shap_values[pred_idx]) - predictions[pred_idx]) < 1e-9
    with open(FIMP_PATH, 'w') as out:
        # file.write needs a string — convert the numpy array explicitly.
        out.write(str(shap_values))
    local_canonical_file(FIMP_PATH)
def test_coreml_import_export():
    """Round-trip a QueryRMSE model through CoreML format without changing predictions."""
    train_pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)
    test_pool = Pool(QUERYWISE_TEST_FILE, column_description=QUERYWISE_CD_FILE)
    model = CatBoost(params={'loss_function': 'QueryRMSE', 'random_seed': 0,
                             'iterations': 20, 'thread_count': 8})
    model.fit(train_pool)
    model.save_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    canon_pred = model.predict(test_pool)
    # Reload through the CoreML format and verify predictions are unchanged.
    reloaded = CatBoostRegressor()
    reloaded.load_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    assert all(canon_pred == reloaded.predict(test_pool))
    return local_canonical_file(OUTPUT_COREML_MODEL_PATH)
# # # mod = xgb.XGBClassifier(n_estimators=10000,learning_rate=.4) # # # eval_set = [(select_input_columns(ts), select_output_columns_as_row(ts))] # # mod.fit(select_input_columns(tr), select_output_columns_as_row(tr) ,eval_metric=xgb_f1, eval_set=eval_set, verbose=True) #g=f1_score(select_output_columns_as_row(ts),mod.predict(select_input_columns(ts))) ######################################################################################################################## #def r(): #global input_columns_classify clas = CatBoostClassifier(iterations=10000, eval_metric='F1') reg = CatBoostRegressor(iterations=np.random.randint(1, 4), eval_metric='MAE') cat_features = [] if 'weekday' in input_columns_regress: cat_features.append('weekday') if 'time' in input_columns_regress: cat_features.append('time') reg.fit( select_input_columns_regress(pd_vstack([tr, ts, oo])), select_output_columns_as_row_regress(pd_vstack([tr, ts, oo])), eval_set=[ ((select_input_columns_regress(remove_label_true(pd_vstack([tr, ts])))), select_output_columns_as_row_regress( remove_label_true(pd_vstack([tr, ts])))), ((select_input_columns_regress(remove_label_false(pd_vstack([tr, ts])))), select_output_columns_as_row_regress(
def catboost_train_regression(
    training_data_path: InputPath('CSV'),
    model_path: OutputPath('CatBoostModel'),
    starting_model_path: InputPath('CatBoostModel') = None,
    label_column: int = 0,
    loss_function: str = 'RMSE',
    num_iterations: int = 500,
    learning_rate: float = None,
    depth: int = 6,
    random_seed: int = 0,
    cat_features: list = None,
    additional_training_options: dict = None,
):
    '''Train a CatBoost regressor model.

    Args:
        training_data_path: Path for the training data in CSV format.
        model_path: Output path for the trained model in binary CatBoostModel format.
        starting_model_path: Path for the existing trained model to start from.
        label_column: Column containing the label data.
        loss_function: The metric to use in training and also selector of the machine learning
            problem to solve. Default = 'RMSE'. Possible values:
            'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value'
        num_iterations: Number of trees to add to the ensemble.
        learning_rate: Step size shrinkage used in update to prevents overfitting.
            Default value is selected automatically for binary classification with other parameters set to default.
            In all other cases default is 0.03.
        depth: Depth of a tree. All trees are the same depth. Default = 6
        random_seed: Random number seed. Default = 0
        cat_features: A list of Categorical features (indices or names).
        additional_training_options: A dictionary with additional options to pass to CatBoostRegressor

    Outputs:
        model: Trained model in binary CatBoostModel format.

    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import tempfile
    from pathlib import Path

    from catboost import CatBoostRegressor, Pool

    # Fix: avoid a mutable default argument ({}), which would be shared across calls.
    additional_training_options = additional_training_options or {}

    column_descriptions = {label_column: 'Label'}
    # Fix: write through the NamedTemporaryFile handle itself instead of leaking
    # the open descriptor and re-opening the file by name.
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as column_description_file:
        column_description_path = column_description_file.name
        for idx, kind in column_descriptions.items():
            column_description_file.write('{}\t{}\n'.format(idx, kind))

    train_data = Pool(
        training_data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostRegressor(
        iterations=num_iterations,
        depth=depth,
        learning_rate=learning_rate,
        loss_function=loss_function,
        random_seed=random_seed,
        verbose=True,
        **additional_training_options,
    )

    model.fit(
        train_data,
        cat_features=cat_features,
        init_model=starting_model_path,
        #verbose=False,
        #plot=True,
    )
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save_model(model_path)
# Load the names of categorical columns maintained in an external exclusion list,
# and add them to the feature set.
categorical_features = open("../excluded_categorical_columns.txt").read().splitlines()
independent_variables += categorical_features
# X=X.fillna(-1)
X = pd.DataFrame(data, columns=independent_variables)
# print(X.columns)
y = data[dependent_variable]
# convert categorical columns to integers
# NOTE(review): cat_dims is computed but never used below, and [:-1] silently drops
# the last categorical column — confirm intent before relying on it.
cat_dims = [X.columns.get_loc(i) for i in categorical_features[:-1]]
# Integer-encode every categorical column in place (pandas category codes).
for header in categorical_features:
    X[header] = X[header].astype('category').cat.codes
# 70/30 split; randomness is not seeded here, so splits differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = CatBoostRegressor()
# grid_parameters = {'depth': [3,1,2,6,4,5,7,8,9,10],
#                    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.13, 0.15,0,2],
#                    'iterations': [30, 50, 100,200,400,600,800,100],
#                    # 'loss_function': ['RMSE', 'MultiRMSE', 'MAE', 'Quantile', 'LogLinQuantile', 'Poisson'],
#                    'l2_leaf_reg': [1, 3, 5, 7, 9, 10,50, 100],
#                    # 'border_count':[32,5,10,20,50,100,200],
#                    # 'ctr_border_count':[50,5,10,20,100,200],
#                    }
# paramter old used
# grid_parameters = {'depth': [3,1,2,6,4,5,7,8,9,10],
#                    'learning_rate': [0.01,0.02,0.03,0.05,0.07, 0.1,0.15],
#                    'iterations': [30, 50, 100,200,400,600,800,1000,1200],
#                    'l2_leaf_reg': [1, 3, 5, 7, 9, 10,50, 100],
#                    # 'border_count':[32,5,10,20,50,100,200],
#                    }
def main(iterations):
    """Train a CatBoost model on MSRank with per-sample baselines, save/reload it,
    and verify that Pool baselines and manually-added baselines agree."""
    # Download train and validation datasets
    train_df, test_df = msrank()

    # Column 0 contains label values, column 1 contains group ids — drop both from features.
    X_train = train_df.drop([0, 1], axis=1).values
    y_train = train_df[0].values
    X_test = test_df.drop([0, 1], axis=1).values
    y_test = test_df[0].values

    # Split train data in half: first part feeds the baseline, second part the major model.
    X_train_first, X_train_second, y_train_first, y_train_second = train_test_split(
        X_train, y_train, test_size=0.5)

    catboost_model = CatBoostRegressor(iterations=iterations, verbose=False)

    # Simple baseline: mean target over the first part of the train pool.
    baseline_value = y_train_first.mean()
    train_baseline = np.array([baseline_value] * y_train_second.shape[0])
    test_baseline = np.array([baseline_value] * y_test.shape[0])

    # Pools carry the baselines alongside features/labels.
    train_pool = Pool(X_train_second, y_train_second, baseline=train_baseline)
    test_pool = Pool(X_test, y_test, baseline=test_baseline)

    # Train, persist, and reload the model.
    catboost_model.fit(train_pool, eval_set=test_pool, verbose=True, plot=False,
                       save_snapshot=True)
    catboost_model.save_model("example.cbm")
    catboost_model = CatBoostRegressor()
    catboost_model.load_model("example.cbm")

    # Path 1: predict on the pool that already carries baselines.
    preds_from_pool = catboost_model.predict(test_pool)
    # Path 2: predict on raw features and add the baselines by hand.
    preds_manual = test_baseline + catboost_model.predict(X_test)

    # Both paths must agree to within float tolerance.
    assert (np.abs(preds_from_pool - preds_manual) < 1e-6).all()
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """Fit a CatBoost model (regressor or classifier) for the DAI wrapper.

    Fixes relative to the original:
    - ``hasattr(self, 'testcount')`` checked a misspelled attribute while the code
      uses ``self.test_count``, so the counter reset on every fit.
    - ``for coli in X_names`` iterated column *names* and then indexed
      ``X_names[coli]`` with a string (TypeError); use enumerate instead.
    - ``remove(pickle_path)`` could be called with ``None`` when no pickle was saved.
    """
    logger = None
    if self._make_logger:
        # Example use of logger, with required import of:
        # from h2oaicore.systemutils import make_experiment_logger, loggerinfo
        # Can use loggerwarning, loggererror, etc. for different levels
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(
                experiment_id=self.context.experiment_id,
                tmp_dir=self.context.tmp_dir,
                experiment_tmp_dir=self.context.experiment_tmp_dir)

    if self._show_logger_test:
        loggerinfo(logger, "TestLOGGER: Fit CatBoost")

    if self._show_task_test:
        # Example task sync operations
        if hasattr(self, 'test_count'):  # FIX: was 'testcount' — counter always reset
            self.test_count += 1
        else:
            self.test_count = 0
        # The below generates a message in the GUI notifications panel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            warning = "TestWarning: First CatBoost fit for this model instance"
            loggerwarning(logger, warning)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id,
                          progress=dict(type='warning', data=warning))
                task.flush()
        # The below generates a message in the GUI top-middle panel above the progress wheel
        if self.test_count == 0 and self.context and self.context.experiment_id:
            message = "Tuning CatBoost"
            loggerinfo(logger, message)
            task = kwargs.get('task')
            if task:
                task.sync(key=self.context.experiment_id,
                          progress=dict(type='update', message=message))
                task.flush()

    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

    # label encode target and setup type of problem
    lb = LabelEncoder()
    if self.num_classes >= 2:
        lb.fit(self.labels)
        y = lb.transform(y)
        if eval_set is not None:
            valid_X = eval_set[0][0]
            valid_y = eval_set[0][1]
            valid_y = lb.transform(valid_y)
            eval_set = [(valid_X, valid_y)]
        self.params.update({'objective': 'Logloss'})
    if self.num_classes > 2:
        self.params.update({'objective': 'MultiClass'})

    if isinstance(X, dt.Frame):
        orig_cols = list(X.names)
        numeric_cols = list(X[:, [bool, int, float]].names)
    else:
        orig_cols = list(X.columns)
        numeric_cols = list(X.select_dtypes([np.number]).columns)

    # unlike lightgbm that needs label encoded categoricals, catboost can take raw strings etc.
    self.params['cat_features'] = [
        i for i, x in enumerate(orig_cols)
        if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
    ]

    if not self.get_uses_gpus(self.params):
        # monotonicity constraints not available for GPU for catboost
        # get names of columns in same order
        X_names = list(dt.Frame(X).names)
        X_numeric = self.get_X_ordered_numerics(X)
        X_numeric_names = list(X_numeric.names)
        self.set_monotone_constraints(X=X_numeric, y=y, params=self.params)
        numeric_constraints = copy.deepcopy(self.params['monotone_constraints'])
        # if non-numerics, then fix those to have 0 constraint
        self.params['monotone_constraints'] = [0] * len(X_names)
        colnumi = 0
        # FIX: iterate (index, name) pairs; the original indexed X_names with a string.
        for coli, name in enumerate(X_names):
            if name in X_numeric_names:
                self.params['monotone_constraints'][coli] = numeric_constraints[colnumi]
                colnumi += 1

    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> catboost internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(
            X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
        if eval_set is not None:
            valid_X = eval_set[0][0].to_numpy()  # don't assign back to X so don't damage during predict
            valid_X = np.ascontiguousarray(
                valid_X,
                dtype=np.float32 if config.data_precision == "float32" else np.float64)
            valid_y = eval_set[0][1]
            eval_set = [(valid_X, valid_y)]

    if eval_set is not None:
        valid_X_shape = eval_set[0][0].shape
    else:
        valid_X_shape = None

    X, eval_set = self.process_cats(X, eval_set, orig_cols)

    # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
    self.acquire_gpus_function(train_shape=X.shape, valid_shape=valid_X_shape)

    params = copy.deepcopy(self.params)  # keep separate, since then can be pulled form lightgbm params
    params = self.transcribe_and_filter_params(params, eval_set is not None)

    if logger is not None:
        loggerdata(
            logger,
            "CatBoost parameters: params_base : %s params: %s catboost_params: %s"
            % (str(self.params_base), str(self.params), str(params)))

    if self.num_classes == 1:
        model = CatBoostRegressor(**params)
    else:
        model = CatBoostClassifier(**params)
    # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.

    if self.num_classes == 1:
        # assume not mae, which would use median
        # baseline = [np.mean(y)] * len(y)
        baseline = None
    else:
        baseline = None

    kargs = dict(X=X, y=y, sample_weight=sample_weight, baseline=baseline,
                 eval_set=eval_set)
    pickle_path = None
    if config.debug_daimodel_level >= 2:
        self.uuid = str(uuid.uuid4())[:6]
        pickle_path = "catboost%s.pickle" % self.uuid
        save_obj((model, kargs), pickle_path)

    # FIT
    model.fit(**kargs)

    # FIX: only remove the debug pickle when one was actually written.
    if config.debug_daimodel_level <= 2 and pickle_path is not None:
        remove(pickle_path)

    # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
    # need to move to wrapper
    if model.get_best_iteration() is not None:
        iterations = model.get_best_iteration() + 1
    else:
        iterations = self.params['n_estimators']
    # must always set best_iterations
    self.model_path = None
    importances = copy.deepcopy(model.feature_importances_)
    if not self._save_by_pickle:
        self.uuid = str(uuid.uuid4())[:6]
        model_file = "catboost_%s.bin" % str(self.uuid)
        self.model_path = os.path.join(self.context.experiment_tmp_dir, model_file)
        model.save_model(self.model_path)
        with open(self.model_path, mode='rb') as f:
            model = f.read()
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances,
                              iterations=iterations)
# Separate features from the 'weight' target column.
X = dw.drop(['weight'], axis=1)
y = dw.weight

from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.8, random_state=42)

# In[34]:
# FIX: `np.float` was deprecated in NumPy 1.20 and removed in 1.24; it was an
# alias of the builtin `float`, so comparing against `float` is equivalent.
categorical_features_indices = np.where(X.dtypes != float)[0]

# In[35]:
model = CatBoostRegressor(iterations=1,
                          depth=10,
                          learning_rate=0.1,
                          loss_function='RMSE',
                          use_best_model=True)
model.fit(X_train,
          y_train,
          cat_features=categorical_features_indices,
          eval_set=(X_validation, y_validation))

# In[84]:
# Shuffle the full dataset.
da = ds.sample(frac=1)


def conv(s):
    # NOTE(review): this helper looks truncated in this chunk — values >= 15
    # fall through and return None; confirm against the original notebook.
    if s < 15:
        return 0
cab_oof_pred = np.zeros_like(y, dtype=np.float) lgbm_oof_pred = np.zeros_like(y, dtype=np.float) scores, models = [], [] skf = StratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_SEED, shuffle=True) for i, (train_idx, valid_idx) in enumerate(skf.split(train, train['Publisher'])): x_train, x_valid = train.iloc[train_idx], train.iloc[valid_idx] y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx] # Publisherでfoldを割ってるので、trainはデータを分割した後にカラムをドロップ x_train = x_train.drop(drop_column, axis=1) x_valid = x_valid.drop(drop_column, axis=1) train_data = Pool(x_train, y_train) valid_data = Pool(x_valid, y_valid) model = CatBoostRegressor(**cab_params) model.fit(train_data, eval_set=valid_data, early_stopping_rounds=50, verbose=False, use_best_model=True) cab_valid_pred = model.predict(x_valid) score = mean_squared_error(y_valid, cab_valid_pred) ** .5 print(f'Fold {i} CAB RMSLE: {score}') cab_oof_pred[valid_idx] = cab_valid_pred models.append(model) scores.append(score) model = lgbm.LGBMRegressor(**lgbm_params) model.fit(x_train, y_train,
]) labels.append(evt.r) energy_true.append(evt.E0) print(len(features)) #train_features = np.array(features[:820000]) test_features = np.array(features[820000:]) #eval_features = features[770000:820000] #train_labels = labels[:820000] test_labels = labels[820000:] #energy_test = energy_true[820000:] #eval_labels = labels[770000:820000] print(len(test_features)) model = CatBoostRegressor( ) #learning_rate = 0.1, iterations = 3000, depth=10, loss_function='RMSE')# l2_leaf_reg = 14, od_type = "Iter",od_wait = 50) model.load_model("models/vertex.model") #fit_model = model.fit(train_features, train_labels) #eval_set = (eval_features,eval_labels)) predictions = model.predict(test_features) #print (fit_model.get_params()) #mse = mean_squared_error(test_labels, predictions) #print("MSE: %.4f" % mse) #model.save_model("vertex.model", format="cbm", export_parameters=None) print(findmaximum(predictions - test_labels), findsigma(predictions - test_labels)) plt.hist(predictions - test_labels, bins=100, range=[-750, 750])
# Minimal CatBoost regression example: fit on a tiny dataset and print params.
import numpy
from catboost import CatBoostRegressor

dataset = numpy.array([[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60], [20, 15, 85, 60]])
train_labels = [1.2, 3.4, 9.5, 24.5]
model = CatBoostRegressor(learning_rate=1, depth=6, loss_function='RMSE')
fit_model = model.fit(dataset, train_labels)
# FIX: original used a Python 2 print statement, a SyntaxError under Python 3.
print(fit_model.get_params())
def get_base_estimator(self, model, create_nn_model=None):
    """Return a fresh, unfitted estimator for the given model key.

    Seeds keras/torch RNGs for reproducibility first, then dispatches the
    key to the matching estimator factory. Raises on unknown keys.
    """
    # keras config
    tf.random.set_seed(42)
    # torch config
    # for reproducibility
    torch.manual_seed(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # gpu or cpu
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Dispatch table of zero-argument factories — evaluated lazily so only
    # the requested estimator is ever constructed.
    factories = {
        'log_reg': lambda: LogisticRegression(solver='lbfgs'),
        'log_reg_cv': lambda: LogisticRegressionCV(),
        'linear_reg': lambda: LinearRegression(),
        'lasso': lambda: Lasso(),
        'ridge': lambda: Ridge(),
        'svc': lambda: SVC(),
        'svr': lambda: SVR(),
        'l_svc': lambda: LinearSVC(),
        'l_svr': lambda: LinearSVR(),
        'rf_clf': lambda: RandomForestClassifier(),
        'rf_reg': lambda: RandomForestRegressor(),
        'gbdt_clf': lambda: GradientBoostingClassifier(),
        'gbdt_reg': lambda: GradientBoostingRegressor(),
        'knn_clf': lambda: KNeighborsClassifier(),
        'knn_reg': lambda: KNeighborsRegressor(),
        'g_mix': lambda: GaussianMixture(),
        'g_nb': lambda: GaussianNB(),
        'preceptron': lambda: Perceptron(),
        'sgd_clf': lambda: SGDClassifier(),
        'sgd_reg': lambda: SGDRegressor(),
        'dt_clf': lambda: DecisionTreeClassifier(),
        'dt_reg': lambda: DecisionTreeRegressor(),
        'xgb_clf': lambda: XGBClassifier(),
        'xgb_reg': lambda: XGBRegressor(),
        'lgb_clf': lambda: LGBMClassifier(),
        'lgb_reg': lambda: LGBMRegressor(),
        'catb_clf': lambda: CatBoostClassifier(),
        'catb_reg': lambda: CatBoostRegressor(),
        'rgf_clf': lambda: RGFClassifier(),
        'rgf_reg': lambda: RGFRegressor(),
        'keras_clf': lambda: MyKerasClassifier(build_fn=create_nn_model),
        'keras_reg': lambda: MyKerasRegressor(build_fn=create_nn_model),
        'torch_clf': lambda: NeuralNetClassifier(
            module=create_nn_model(), device=device, train_split=None),
        'torch_reg': lambda: NeuralNetRegressor(
            module=create_nn_model(), device=device, train_split=None),
        'tabnet_clf': lambda: TabNetClassifier(),
        'tabnet_reg': lambda: TabNetRegressor(),
    }
    if model not in factories:
        logger.error('NOT IMPLEMENTED BASE MODEL: %s' % model)
        raise Exception('NOT IMPLEMENTED')
    return factories[model]()
k_y_train = Y_data[train_index] k_x_vali = X_data[vali_index] k_y_vali = Y_data[vali_index] cb_params = { 'n_estimators': 1000000, 'loss_function': 'MAE', 'eval_metric': 'MAE', 'learning_rate': 0.02, 'depth': 6, 'use_best_model': True, 'subsample': 0.6, 'bootstrap_type': 'Bernoulli', 'reg_lambda': 3, 'one_hot_max_size': 2, } model_cb = CatBoostRegressor(**cb_params) # train the model model_cb.fit(k_x_train, k_y_train, eval_set=[(k_x_vali, k_y_vali)], verbose=300, early_stopping_rounds=300) oof_cb[vali_index] = model_cb.predict(k_x_vali, ntree_end=model_cb.best_iteration_) predictions_cb += model_cb.predict(X_test, ntree_end=model_cb.best_iteration_) / kfolder.n_splits predictions_train_cb += model_cb.predict(X_data, ntree_end=model_cb.best_iteration_) / kfolder.n_splits print("catboost score: {:<8.8f}".format(mean_absolute_error(np.expm1(oof_cb), np.expm1(Y_data)))) output_path = path + '/user_data/' # 测试集输出 predictions = predictions_cb predictions[predictions < 0] = 0 sub = pd.DataFrame() sub['SaleID'] = TestA_data.SaleID sub['price'] = predictions
def grid(self, method, params=None, ver=2, griall=False):
    """Grid-search the chosen regressor, refit it with the best parameters,
    and print the test RMSE.

    Args:
        method: model key — 'rf', 'dt', 'mlp', 'lr', 'gbm', 'xgb', 'lgbm',
            'cat', 'svm' or 'knn'.
        params: optional user-supplied parameter grid; when falsy, a built-in
            grid for the method is used. (FIX: default was a mutable ``{}``.)
        ver: verbosity forwarded to GridSearchCV.
        griall: when True, also append the RMSE to ``self.rmse``.

    Fixes relative to the original: both 'knn' branches assigned the estimator
    to a local ``knn`` instead of ``classifier``, causing a NameError (params
    path) or scoring the wrong model (best-params path).
    """
    from sklearn.model_selection import GridSearchCV
    if method == 'mlp':
        # MLP gets scaled data (original note, translated: "if you use mlp it scales your data").
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        scaler.fit(self.X_train)
        X_train = scaler.transform(self.X_train)
        X_test = scaler.transform(self.X_test)
    else:
        X_train = self.X_train
        X_test = self.X_test

    if params:
        # Caller supplied a grid: just build the estimator.
        if method == 'rf':
            from sklearn.ensemble import RandomForestRegressor
            classifier = RandomForestRegressor()
            grid_params = params
        elif method == 'dt':
            from sklearn.tree import DecisionTreeRegressor
            classifier = DecisionTreeRegressor()
            grid_params = params
        elif method == 'mlp':
            from sklearn.neural_network import MLPRegressor
            classifier = MLPRegressor()
            grid_params = params
        elif method == 'lr':
            from sklearn.linear_model import LinearRegression
            classifier = LinearRegression()
            grid_params = params
        elif method == 'gbm':
            from sklearn.ensemble import GradientBoostingRegressor
            classifier = GradientBoostingRegressor()
            grid_params = params
        elif method == 'xgb':
            from xgboost import XGBRegressor
            classifier = XGBRegressor()
            grid_params = params
        elif method == 'lgbm':
            from lightgbm import LGBMRegressor
            classifier = LGBMRegressor()
            grid_params = params
        elif method == 'cat':
            from catboost import CatBoostRegressor
            classifier = CatBoostRegressor(silent=True)
            grid_params = params
        elif method == 'svm':
            from sklearn.svm import SVR
            classifier = SVR()
            grid_params = params
        elif method == 'knn':
            from sklearn.neighbors import KNeighborsRegressor
            classifier = KNeighborsRegressor()  # FIX: was assigned to `knn`
            grid_params = params
        else:
            print('Unknown method')
            return
    else:
        # No grid supplied: use the built-in defaults per method.
        if method == 'rf':
            from sklearn.ensemble import RandomForestRegressor
            classifier = RandomForestRegressor()
            grid_params = {
                "max_depth": [8, 10, 11, 13, 15, 18],
                "max_features": [5, 10, 15, 20],
                "n_estimators": [5, 10, 50, 100, 200, 500],
                "min_samples_split": [3, 5, 10],
                "criterion": ['mse', 'mae']
            }
        elif method == 'dt':
            from sklearn.tree import DecisionTreeRegressor
            classifier = DecisionTreeRegressor()
            grid_params = {
                "max_depth": range(1, 10),
                "min_samples_split": list(range(2, 50)),
                "criterion": ['mse', 'mae']
            }
        elif method == 'mlp':
            from sklearn.neural_network import MLPRegressor
            classifier = MLPRegressor()
            grid_params = {
                'alpha': [0.1, 0.01, 0.001, 0.005, 0.0001, 0.00001],
                'hidden_layer_sizes': [(10, 10, 10), (45, 50, 60), (25, 35, 45),
                                       (15, 15), (100, ), (100, 100)],
                'solver': ['lbfgs', 'adam', 'sgd'],
                'activation': ['relu', 'logistic', 'tanh', 'identity']
            }
        elif method == 'gbm':
            from sklearn.ensemble import GradientBoostingRegressor
            classifier = GradientBoostingRegressor()
            grid_params = {
                'loss': ['ls', 'lad', 'huber', 'quantile'],
                'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
                'n_estimators': [100, 500, 1000, 1500],
                'max_depth': [3, 5, 6],
                'min_samples_split': [2, 5, 10, 15],
                'subsample': [0.6, 1.0]
            }
        elif method == 'xgb':
            from xgboost import XGBRegressor
            classifier = XGBRegressor()
            grid_params = {
                'colsample_bytree': [0.6, 1.0],
                'n_estimators': [100, 200, 500, 1000],
                'max_depth': [4, 5, 6, 7],
                'min_child_weight': [0.8, 0.9, 1],
                'learning_rate': [0.1, 0.01, 0.02, 0.05]
            }
        elif method == 'lgbm':
            from lightgbm import LGBMRegressor
            classifier = LGBMRegressor()
            grid_params = {
                'subsample': [0.6, 0.8, 1.0],
                'n_estimators': [100, 500, 1000, 1500],
                'max_depth': [4, 5, 6, 7],
                'min_child_samples': [10, 20],
                'learning_rate': [0.2, 0.1, 0.01, 0.02, 0.05],
                'importance_type': ['gains', 'split']
            }
        elif method == 'cat':
            from catboost import CatBoostRegressor
            classifier = CatBoostRegressor(silent=True)
            grid_params = {
                'iterations': [200, 500],
                'learning_rate': [0.01, 0.02, 0.05],
                'depth': [3, 5, 8]
            }
        elif method == 'svm':
            from sklearn.svm import SVR
            classifier = SVR()
            grid_params = {
                'C': np.arange(0.1, 2, 0.1),
                'kernel': ['linear', 'rbf', 'poly']
            }
        elif method == 'knn':
            from sklearn.neighbors import KNeighborsRegressor
            classifier = KNeighborsRegressor()
            grid_params = {
                'n_neighbors': np.arange(1, 40),
                'weights': ['uniform', 'distance'],
                'metric': ['minkowski', 'euclidean', 'manhattan']
            }
        else:
            print('Unknown method')
            return

    grid_cv = GridSearchCV(classifier, grid_params, cv=5, n_jobs=-1, verbose=ver)
    grid_cv_model = grid_cv.fit(X_train, self.y_train)
    # Print the best parameters, then rebuild the estimator with them.
    print("En iyi parametlerler: " + str(grid_cv_model.best_params_))
    if method == 'rf':
        classifier = RandomForestRegressor(
            max_depth=grid_cv_model.best_params_['max_depth'],
            max_features=grid_cv_model.best_params_['max_features'],
            n_estimators=grid_cv_model.best_params_['n_estimators'],
            min_samples_split=grid_cv_model.best_params_['min_samples_split'],
            criterion=grid_cv_model.best_params_['criterion'])
    elif method == 'dt':
        classifier = DecisionTreeRegressor(
            max_depth=grid_cv_model.best_params_['max_depth'],
            min_samples_split=grid_cv_model.best_params_['min_samples_split'],
            criterion=grid_cv_model.best_params_['criterion'])
    elif method == 'mlp':
        classifier = MLPRegressor(
            alpha=grid_cv_model.best_params_['alpha'],
            hidden_layer_sizes=grid_cv_model.best_params_['hidden_layer_sizes'],
            solver=grid_cv_model.best_params_['solver'],
            activation=grid_cv_model.best_params_['activation'])
    elif method == 'gbm':
        from sklearn.ensemble import GradientBoostingRegressor
        classifier = GradientBoostingRegressor(
            learning_rate=grid_cv_model.best_params_['learning_rate'],
            n_estimators=grid_cv_model.best_params_['n_estimators'],
            max_depth=grid_cv_model.best_params_['max_depth'],
            min_samples_split=grid_cv_model.best_params_['min_samples_split'],
            loss=grid_cv_model.best_params_['loss'],
            subsample=grid_cv_model.best_params_['subsample'])
    elif method == 'xgb':
        from xgboost import XGBRegressor
        classifier = XGBRegressor(
            colsample_bytree=grid_cv_model.best_params_['colsample_bytree'],
            n_estimators=grid_cv_model.best_params_['n_estimators'],
            max_depth=grid_cv_model.best_params_['max_depth'],
            min_child_weight=grid_cv_model.best_params_['min_child_weight'],
            learning_rate=grid_cv_model.best_params_['learning_rate'])
    elif method == 'lgbm':
        from lightgbm import LGBMRegressor
        classifier = LGBMRegressor(
            subsample=grid_cv_model.best_params_['subsample'],
            n_estimators=grid_cv_model.best_params_['n_estimators'],
            max_depth=grid_cv_model.best_params_['max_depth'],
            min_child_samples=grid_cv_model.best_params_['min_child_samples'],
            learning_rate=grid_cv_model.best_params_['learning_rate'],
            importance_type=grid_cv_model.best_params_['importance_type'])
    elif method == 'cat':
        from catboost import CatBoostRegressor
        classifier = CatBoostRegressor(
            silent=True,
            iterations=grid_cv_model.best_params_['iterations'],
            learning_rate=grid_cv_model.best_params_['learning_rate'],
            depth=grid_cv_model.best_params_['depth'])
    elif method == 'svm':
        from sklearn.svm import SVR
        classifier = SVR(C=grid_cv_model.best_params_['C'],
                         kernel=grid_cv_model.best_params_['kernel'])
    elif method == 'knn':
        from sklearn.neighbors import KNeighborsRegressor
        # FIX: was assigned to `knn`, so the tuned model was never evaluated.
        classifier = KNeighborsRegressor(
            n_neighbors=grid_cv_model.best_params_['n_neighbors'],
            weights=grid_cv_model.best_params_['weights'],
            metric=grid_cv_model.best_params_['metric'])

    print('Result for ', method)
    from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
    classifier.fit(X_train, self.y_train)
    y_pred = classifier.predict(X_test)
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    if griall:
        self.rmse.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))
def default_processes(self):
    """Fit all standard regressors with default hyperparameters, print each
    test RMSE, and plot a sorted bar chart of the results.

    (Original Turkish comment, translated: applies the familiar methods
    without changing any hyperparameters.)

    Fixes relative to the original:
    - The SVR was fitted but never used for prediction; the stale MLP
      predictions were printed under a 'Gradient Boosting' label.
    - The GradientBoostingRegressor was trained on raw features but
      predicted on the scaled test set.
    """
    from warnings import filterwarnings
    filterwarnings('ignore')
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.neural_network import MLPRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    from xgboost import XGBRegressor
    from lightgbm import LGBMRegressor
    from catboost import CatBoostRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score

    acc = []
    acc_colmns = [
        'lr', 'dtc', 'rfc', 'mlpc', 'svm', 'gbm', 'xgb', 'lgbc', 'catboost', 'knn'
    ]

    lr = LinearRegression()
    lr.fit(self.X_train, self.y_train)
    y_pred = lr.predict(self.X_test)
    print('Results for default LinearRegression ')
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    dtc = DecisionTreeRegressor()
    dtc.fit(self.X_train, self.y_train)
    y_pred = dtc.predict(self.X_test)
    print('Results for default decision tree')
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    rfc = RandomForestRegressor()
    rfc.fit(self.X_train, self.y_train)
    y_pred = rfc.predict(self.X_test)
    print('Results for default random forest')
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    # MLP needs scaled inputs; scale train/test with a scaler fitted on train only.
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(self.X_train)
    X_train_scaled = scaler.transform(self.X_train)
    X_test_scaled = scaler.transform(self.X_test)
    mlpc = MLPRegressor()
    mlpc.fit(X_train_scaled, self.y_train)
    y_pred = mlpc.predict(X_test_scaled)
    print('Results for default MLP')
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    from sklearn.svm import SVR
    svm = SVR().fit(self.X_train, self.y_train)
    # FIX: actually score the fitted SVR (the original reused the MLP
    # predictions and printed a 'Gradient Boosting' header here).
    y_pred = svm.predict(self.X_test)
    print('Results for default SVR')
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    gbc = GradientBoostingRegressor()
    gbc.fit(self.X_train, self.y_train)
    # FIX: predict on the raw test set — the model was trained on unscaled features.
    y_pred = gbc.predict(self.X_test)
    print('Results for default Gradient Boosting ')
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    xgb = XGBRegressor().fit(self.X_train, self.y_train)
    y_pred = xgb.predict(self.X_test)
    print('Results for default XGBoost ')
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    lgbm = LGBMRegressor().fit(self.X_train, self.y_train)
    print('Results for default LGBM')
    y_pred = lgbm.predict(self.X_test)
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    cat = CatBoostRegressor(silent=True).fit(self.X_train, self.y_train)
    print('Results for default CatBoost')
    y_pred = cat.predict(self.X_test)
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    knn = KNeighborsRegressor().fit(self.X_train, self.y_train)
    print('Results for default KNeighborsRegressor')
    y_pred = knn.predict(self.X_test)
    print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
    acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

    # NOTE(review): RMSE is scaled by 100 before plotting — presumably for
    # bar-chart readability; confirm intent.
    acc = [i * 100 for i in acc]
    accuracy = pd.DataFrame({"RMSE": acc}, index=acc_colmns)
    accuracy.sort_values(by="RMSE", axis=0, ascending=True).plot(kind="barh", color="r")
df_kag = sc_x.transform(df_kag) # ### train-test split # In[6]: X_train, X_test, y_train, y_test = model_selection.train_test_split( df, income, test_size=0.15) # ### Train on catboostregressor # In[7]: reg = CatBoostRegressor(iterations=2000, eval_metric='RMSE', depth=8, bagging_temperature=0.2, learning_rate=0.02) reg.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True) # ### predict and get RMSE # In[8]: # Test y_pred = reg.predict(X_test) print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred))) # ### publish results
def train_model(X, X_test, y, params=None, folds=folds, model_type='lgb', plot_feature_importance=False, model=None):
    """Train a regressor with K-fold CV and return OOF/test predictions.

    :param X: training features, pd.DataFrame or np.ndarray
    :param X_test: test features with the same columns as X
    :param y: training target
    :param params: hyper-parameters forwarded to the booster constructor
    :param folds: fold splitter; NOTE the default binds the module-level
        ``folds`` object at definition time
    :param model_type: one of 'lgb', 'xgb', 'sklearn', 'cat'
    :param plot_feature_importance: for LGBM, plot averaged importances
    :param model: pre-built estimator, used only when model_type='sklearn'
    :return: ``(oof, prediction, feature_importance)`` for LGBM with
        plotting enabled, otherwise ``(oof, prediction, scores)``
    """
    oof = np.zeros(len(X))               # out-of-fold predictions on train rows
    prediction = np.zeros(len(X_test))   # test predictions accumulated per fold
    scores = []                          # per-fold MAE
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        # ndarrays use plain indexing; DataFrames need .iloc
        if type(X) == np.ndarray:
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=50000, n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric='mae',
                      verbose=10000,
                      early_stopping_rounds=200)
            y_pred_valid = model.predict(X_valid)
            # Predict with the early-stopped iteration count.
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=500,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = mean_absolute_error(y_valid, y_pred_valid)
            print(f'Fold {fold_n}. MAE: {score:.4f}.')
            print('')
            y_pred = model.predict(X_test).reshape(-1, )
        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000, eval_metric='MAE', **params)
            model.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)
        oof[valid_index] = y_pred_valid.reshape(-1, )
        scores.append(mean_absolute_error(y_valid, y_pred_valid))
        prediction += y_pred
        if model_type == 'lgb':
            # feature importance, one frame per fold
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
    # Average test predictions over folds; n_fold comes from module scope.
    prediction /= n_fold
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))
    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            # Keep the 50 features with the highest mean importance.
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]
            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            return oof, prediction, feature_importance
        return oof, prediction, scores
    else:
        return oof, prediction, scores
def catboost_regressor_learner(df: pd.DataFrame,
                               features: List[str],
                               target: str,
                               learning_rate: float = 0.1,
                               num_estimators: int = 100,
                               extra_params: Dict[str, Any] = None,
                               prediction_column: str = "prediction",
                               weight_column: str = None) -> LearnerReturnType:
    """
    Fits a CatBoost regressor to the dataset. It first generates a Pool
    with the specified features and labels from `df`. Then it fits a CatBoost
    model to this Pool. Returns the predict function for the model and the
    predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model.
        All these names should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the
        model. This column should be numerical and continuous, since this is
        a regression model.

    learning_rate : float
        Float in range [0,1]. Step size shrinkage used in update to prevent
        overfitting. After each boosting step, we can directly get the weights
        of new features, and eta actually shrinks the feature weights to make
        the boosting process more conservative.
        See the eta hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    num_estimators : int
        Int in range [0, inf]. Number of boosted trees to fit.
        See the n_estimators hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the CatBoost model. See the list in:
        https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.

    weight_column : str, optional
        The name of the column with scores to weight the data.

    Returns
    -------
    A tuple ``(p, p(df), log)``: the prediction function, the input frame
    with predictions appended, and a training log dictionary.
    """
    from catboost import Pool, CatBoostRegressor
    import catboost

    weights = df[weight_column].values if weight_column else None

    params = extra_params if extra_params else {}
    # CatBoost names the learning rate "eta".
    params = assoc(params, "eta", learning_rate)

    dtrain = Pool(df[features].values,
                  df[target].values,
                  weight=weights,
                  feature_names=list(map(str, features)))

    cat_boost_regressor = CatBoostRegressor(iterations=num_estimators,
                                            **params)
    cbr = cat_boost_regressor.fit(dtrain, verbose=0)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
        # Build a Pool with the same feature layout used for training.
        dtest = Pool(new_df[features].values,
                     feature_names=list(map(str, features)))
        col_dict = {prediction_column: cbr.predict(dtest)}

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(cbr)
            shap_values = list(explainer.shap_values(new_df[features]))
            shap_expected_value = explainer.expected_value
            shap_output = {
                "shap_values": shap_values,
                "shap_expected_value": np.repeat(shap_expected_value,
                                                 len(shap_values))
            }
            col_dict = merge(col_dict, shap_output)
        return new_df.assign(**col_dict)

    # NOTE(review): shap=False here even though p supports apply_shap —
    # confirm whether the docstring helper should advertise SHAP support.
    p.__doc__ = learner_pred_fn_docstring("CatBoostRegressor", shap=False)

    log = {
        'catboost_regression_learner': {
            'features': features,
            'target': target,
            'prediction_column': prediction_column,
            'package': "catboost",
            'package_version': catboost.__version__,
            'parameters': assoc(params, "num_estimators", num_estimators),
            'feature_importance': cbr.feature_importances_,
            'training_samples': len(df)
        }
    }

    return p, p(df), log
'max_depth': 7, 'boosting': 'gbdt', 'objective': 'regression', 'metric': 'mse', 'is_training_metric': False, 'seed': 18 } #lgb_model = lgb.train(params, lgb.Dataset(x1, label=y1), 100, lgb.Dataset(x2, label=y2), feval=lgb_rmse, verbose_eval=10, early_stopping_rounds=20) #test['item_cnt_month'] = lgb_model.predict(test[col], num_iteration=lgb_model.best_iteration) #test[['ID','item_cnt_month']].to_csv('lgb_submission.csv', index=False) #CatBoost cb_model = CatBoostRegressor(iterations=100, learning_rate=0.2, depth=7, loss_function='RMSE', eval_metric='RMSE', random_seed=18, od_type='Iter', od_wait=20) cb_model.fit(x1, y1, eval_set=(x2, y2), use_best_model=True, verbose=False) print( 'RMSE:', np.sqrt( metrics.mean_squared_error(y2.clip(0., 20.), cb_model.predict(x2).clip(0., 20.)))) test['item_cnt_month'] += cb_model.predict(test[col]) test['item_cnt_month'] /= 2 test[['ID', 'item_cnt_month']].to_csv('cb_blend_submission.csv', index=False) # In[ ]:
# High-cardinality categoricals get a target-based GLMM encoding.
hc_pipeline = make_pipeline(ce.GLMMEncoder())
# Route each column group through its own preprocessing pipeline;
# anything not matched by a selector is dropped.
column_transformer = ColumnTransformer(transformers=\
    [('numeric_pipeline', numeric_pipeline, select_numeric_features),\
     ('oh_pipeline', oh_pipeline, select_oh_features),\
     ('hc_pipeline', hc_pipeline, select_hc_features)
     ],\
    n_jobs=n_threads, remainder='drop')

#### create pipeline ####
cat = CatBoostRegressor(thread_count=n_threads,
                        n_estimators=N_ESTIMATORS,
                        random_state=SEED,
                        verbose=False)
# Full pipeline: preprocess -> drop constant features -> keep the top 90%
# of features by univariate F-score -> CatBoost regressor.
pipe = Pipeline(steps=[('column_transformer', column_transformer),\
                       ('variancethreshold', VarianceThreshold(threshold=0.0)),\
                       ('selectpercentile', SelectPercentile(f_regression, percentile=90)),\
                       ('model', cat)])
# Target is log-transformed upstream (log_y_train) — TODO confirm.
_ = pipe.fit(train_df, log_y_train)
# Derive a binary high/low target indicator with a 5-NN classifier trained on
# the NaN-free feature frame; threshold 170 splits y_train into the two
# classes — presumably a domain-specific cutoff, TODO confirm.
knclf = KNeighborsClassifier(n_neighbors=5)
y_kn = [1 if x > 170 else 0 for x in y_train]
knclf.fit(X_train_nona, y_kn)
# Add the indicator as an extra feature to train/valid/test.
X_train['high_low_ind'] = knclf.predict(X_train_nona)
X_valid['high_low_ind'] = knclf.predict(X_valid_nona)
X_test_type['high_low_ind'] = knclf.predict(
    X_test_type[X_train_nona.columns])
train_dataset = Pool(data=X_train, label=y_train)
valid_dataset = Pool(data=X_valid, label=y_valid)
test_dataset = Pool(data=X_test_type)
model = CatBoostRegressor(
    iterations=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    depth=DEPTH,
    eval_metric=EVAL_METRIC,
    verbose=VERBOSE,
    random_state=RANDOM_STATE,
    thread_count=N_THREADS,
    #loss_function=EVAL_METRIC,
    # bootstrap_type='Poisson',
    # bagging_temperature=5,
    task_type="GPU")  # Train on GPU
# Stop if the validation metric fails to improve for 500 rounds.
model.fit(train_dataset, eval_set=valid_dataset, early_stopping_rounds=500)
now = timer()
# Record per-fold training wall time (seconds) for this bond type.
update_tracking(run_id, '{}_tr_sec_f{}'.format(bond_type, fold_n + 1),
                (now - fold_start), integer=True)
logger.info('Saving model file')
def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None, verbose=10000, early_stopping_rounds=200, n_estimators=50000, mol_type=-1, fold_group=None, skip_folds=None, phase_mark="", skipped_mark=[]):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and,
    if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose / early_stopping_rounds / n_estimators - booster controls
    :params: mol_type - unused in this function body (kept for interface
        compatibility — TODO confirm callers)
    :params: fold_group - groups passed to folds.split for grouped CV
    :params: skip_folds - list of fold indices to restore from pickles
        instead of retraining (combined with phase_mark/skipped_mark)
    :params: phase_mark / skipped_mark - tags controlling which phases may
        reuse cached fold artifacts from ``mid_path``
    """
    assert isinstance(skip_folds, list) or skip_folds is None
    print(f"skip_folds :{skip_folds}")
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters: per-metric names for each backend
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(
            folds.split(X, groups=fold_group)):
        if skip_folds is not None and fold_n in skip_folds and phase_mark in skipped_mark:
            # Restore this fold's artifacts from cache instead of retraining.
            # NOTE: the cached oof array replaces the accumulator wholesale.
            print(f'Fold {fold_n + 1} is skipped!!! at {time.ctime()}')
            oof = unpickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", )
            y_pred = unpickle(
                mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", )
            model = unpickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", )
            fold_importance = unpickle(
                mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", )
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
            prediction += y_pred
            model_list += [model]
            continue
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        # ndarrays use plain indexing; DataFrames need .iloc
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params,
                                      n_estimators=n_estimators,
                                      n_jobs=-1,
                                      importance_type='gain')
            print(model)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid,
                                     feature_names=X.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            params["objective"] = "reg:linear"
            # XGBoost reuses the LGBM metric name string here ('mae'/'mse').
            params["eval_metric"] = metrics_dict[eval_metric][
                'lgb_metric_name']
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]
                ['catboost_metric_name'])
            model.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
        else:
            # group_mae additionally needs the molecule 'type' column.
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance, one frame per fold
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            try:
                fold_importance.to_csv(mid_path / f"importance_cv_{fold_n}.csv")
            except Exception as e:
                # Best-effort persistence: training continues if saving fails.
                print("failed to save importance...")
                print(e)
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
        model_list += [model]
        try:
            # Checkpoint fold artifacts so later runs can skip this fold.
            to_pickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", oof)
            to_pickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl",
                      y_pred)
            to_pickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", model)
            to_pickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl",
                      fold_importance)
        except Exception as e:
            print("failed to save intermediate data...")
            print(e)

    if model_type == 'lgb' and plot_feature_importance:
        result_dict['importance'] = feature_importance

    prediction /= folds.n_splits

    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + ' CV mean score: {0:.4f}, std: {1:.4f}.'.format(
            np.mean(scores), np.std(scores))
        print(cv_score_msg)
        # Best-effort notification (e.g. chat webhook) — failures are ignored.
        send_message(cv_score_msg)
    except Exception as e:
        print(e)
        pass

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    return result_dict
def predict(self, X, **kwargs):
    """Score rows of X with the fitted CatBoost model.

    Returns class-1 probabilities for binary classification, the full
    probability matrix for multiclass, raw predictions for regression, or
    SHAP values when ``pred_contribs`` is passed in kwargs.

    :param X: feature frame/array (dt.Frame is converted to a contiguous
        numpy array when there are no categorical features)
    :param kwargs: supports ``pred_contribs`` (return SHAP values) and
        ``fast_approx`` (cap the number of trees used for scoring)
    """
    model, features, importances, iterations = self.get_model_properties()
    if not self._save_by_pickle:
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
        if self.num_classes >= 2:
            from_file = CatBoostClassifier()
        else:
            from_file = CatBoostRegressor()
        # Round-trip the raw model bytes through disk because load_model
        # only accepts a file path.
        with open(self.model_path, mode='wb') as f:
            f.write(model)
        model = from_file.load_model(self.model_path)
    # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> lightgbm internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy(
        )  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(X,
                                 dtype=np.float32 if config.data_precision
                                 == "float32" else np.float64)
    X, eval_set = self.process_cats(X, None, self.feature_names_fitted)

    pred_contribs = kwargs.get('pred_contribs', None)
    output_margin = kwargs.get('output_margin', None)
    fast_approx = kwargs.pop('fast_approx', False)
    if fast_approx:
        # BUG FIX: previously this branch only wrote kwargs['ntree_limit']
        # and kwargs['approx_contribs'], but kwargs is never forwarded to
        # the catboost calls below, so fast_approx silently had no effect.
        # Cap the iteration count directly instead, matching the newer
        # pickle-based variant of this method.
        iterations = min(config.fast_approx_num_trees, iterations)

    # implicit import
    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
    n_jobs = max(1, physical_cores_count)
    if not pred_contribs:
        if self.num_classes >= 2:
            preds = model.predict_proba(
                data=X,
                ntree_start=0,
                ntree_end=iterations - 1,
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported
            )
            # Binary: return the positive-class column only.
            if preds.shape[1] == 2:
                return preds[:, 1]
            else:
                return preds
        else:
            return model.predict(
                data=X,
                ntree_start=0,
                ntree_end=iterations - 1,
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported
            )
    else:
        # For Shapley, doesn't come from predict, instead:
        return model.get_feature_importance(
            data=X,
            ntree_start=0,
            ntree_end=iterations - 1,
            thread_count=self.params_base.get(
                'n_jobs', n_jobs),  # -1 is not supported,
            type=EFstrType.ShapValues)
X_train = train_df[train_features] y_train = train_df.logerror print(X_train.shape, y_train.shape) test_df['transactiondate'] = pd.Timestamp('2017-12-01') test_df = add_date_features(test_df) X_test = test_df[train_features] print(X_test.shape) num_ensembles = 5 y_pred = 0.0 for i in tqdm(range(num_ensembles)): model = CatBoostRegressor(iterations=630, learning_rate=0.03, depth=6, l2_leaf_reg=3, loss_function='MAE', eval_metric='MAE', random_seed=i) model.fit(X_train, y_train, cat_features=cat_feature_inds) y_pred += model.predict(X_test) y_pred /= num_ensembles submission = pd.DataFrame({ 'ParcelId': test_df['ParcelId'], }) test_dates = { '201610': pd.Timestamp('2016-09-30'), '201611': pd.Timestamp('2016-10-31'), '201612': pd.Timestamp('2016-11-30'), '201710': pd.Timestamp('2017-09-30'),
This script will be my submission script
"""
# Importing some libraries
import pandas as pd
from catboost import CatBoostRegressor
import numpy as np

# Getting the submission set
raw_sub = pd.read_csv('/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                      '/Data/Raw-Data/test.csv')
ids = raw_sub['Id']
sub_prep = pd.read_csv('/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                       '/Data/Prepared Data/prepared-submission-data.csv')

# Loading the Model
model = CatBoostRegressor()
model.load_model('/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                 '/Models/tuned_catboost1.cbm')

# Making predictions
pred = model.predict(sub_prep)

# Building a dataframe
final_sub = pd.DataFrame()
final_sub['Id'] = ids
# expm1 inverts the log1p transform applied to SalePrice during training.
final_sub['SalePrice'] = np.expm1(pred)

# Putting Submission into a CSV file
final_sub.to_csv(path_or_buf='/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                 '/Submissions/submission10.csv', index=False)
def main():
    """End-to-end income-prediction pipeline: load, preprocess, train, submit.

    Reads the train/test CSVs, cleans and imputes them, aligns categorical
    levels across train+test, one-hot encodes, trains a CatBoost MAE model
    on GPU with early stopping, reports validation MAE (targets are in log
    space upstream) and writes the submission file.
    """
    # Loading Data
    # NOTE(review): error_bad_lines is deprecated in pandas>=1.3 and removed
    # in 2.0 (use on_bad_lines='skip'); left as-is for the pinned environment.
    training_data = pd.read_csv(
        r'../input/tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-train.csv',
        sep=',',
        error_bad_lines=False,
        index_col=False,
        low_memory=False).drop_duplicates()
    predict_data = pd.read_csv(
        r'../input/tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-test.csv',
        sep=',',
        error_bad_lines=False,
        index_col=False,
        low_memory=False)
    # Preprocessing
    training_data, predict_data = preprocess(training_data, predict_data)
    # Renaming columns and making all categorical feature values to lowercase
    training_data = rename_and_lower(training_data)
    predict_data = rename_and_lower(predict_data)
    # Handling null values
    training_data = impute(training_data)
    predict_data = impute(predict_data)
    y = training_data['total_yearly_income']
    # Combining Training and Test data sets to get all possible values of a
    # categorical feature
    train_plus_test = pd.concat(objs=[training_data, predict_data],
                                axis=0,
                                sort=True)
    # Making non numeric columns CategoricalDtype with the union of levels.
    # FIX: np.object was removed in NumPy 1.24; the string alias 'object'
    # selects the same dtype on every NumPy/pandas version.
    for column in train_plus_test.select_dtypes(include=['object']).columns:
        # Compute the category union once per column (was computed twice).
        dtype = CategoricalDtype(categories=train_plus_test[column].unique())
        training_data[column] = training_data[column].astype(dtype)
        predict_data[column] = predict_data[column].astype(dtype)
    X = training_data.drop(columns=['total_yearly_income'])
    # Split data into train and validate datasets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)
    # One Hot Encode Categorical features
    X_train = pd.get_dummies(X_train, prefix_sep='_', drop_first=True)
    X_test = pd.get_dummies(X_test, prefix_sep='_', drop_first=True)
    # Initialize Model parameters
    #categorical_features_indices = np.where(x_train.dtypes != np.float)[0]
    model = CatBoostRegressor(iterations=7000,
                              depth=4,
                              learning_rate=0.03,
                              loss_function='MAE',
                              verbose=1000,
                              od_type="Iter",
                              od_wait=500,
                              use_best_model=True,
                              task_type='GPU')
    # Train model with labelled dataset
    model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)
    # Run prediction on validation data split and check MAE.
    # np.exp undoes the log applied to incomes — TODO confirm it matches the
    # transform used in preprocess (exp vs expm1).
    j_validate = model.predict(X_test)
    print("Mean Absolute Error: ",
          mean_absolute_error(np.exp(y_test), np.exp(j_validate)))
    prediction = predictIncome(predict_data, model)
    # NOTE(review): ../input is read-only on Kaggle — confirm writeOutput's
    # target directory is writable in the deployment environment.
    output_file = '../input/tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-submission.csv'
    # Write prediction to output file
    writeOutput(prediction, output_file)
    # create a link to download the dataframe which was saved with .to_csv method
    create_download_link(filename=output_file)
def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae', columns=None, plot_feature_importance=False, model=None, verbose=10000, early_stopping_rounds=200, n_estimators=50000):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and,
    if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use ('lgb', 'xgb', 'sklearn', 'cat')
    :params: eval_metric - metric to use ('mae', 'group_mae', 'mse')
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose / early_stopping_rounds / n_estimators - booster controls
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters: per-metric names for each backend
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on train data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        # ndarrays use plain indexing; DataFrames need .iloc
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params,
                                      n_estimators=n_estimators,
                                      n_jobs=-1)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict(X_valid)
            # Predict with the early-stopped iteration count.
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid,
                                     feature_names=X.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]
                ['catboost_metric_name'])
            model.fit(X_train, y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
        else:
            # group_mae additionally needs the 'type' grouping column.
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance, one frame per fold
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    # Average the accumulated test predictions over folds.
    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            # Keep the 50 features with the highest mean importance.
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index
            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]
            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            result_dict['feature_importance'] = feature_importance
    return result_dict
def test_predict_sklearn_regress():
    """A briefly-trained regressor saved to disk must match the canonical model."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    regressor = CatBoostRegressor(iterations=2, random_seed=0)
    regressor.fit(pool)
    regressor.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
print('saving submission...')
# Timestamp the LightGBM submission file name with the validation score.
now_time = time.strftime("%m-%d %H_%M_%S", time.localtime())
lgb_sub[["uid","lgb_loan_sum"]].to_csv(
    "./submission/" +now_time+'_lightgbm_Vscore_' + str(valid_score) + '.csv',
    index=False, header=False)

from catboost import Pool, CatBoostRegressor

train_pool = Pool(train_df[features], train_df["loan_sum"])
# NOTE(review): test_pool is built but never used below — confirm intent.
test_pool = Pool(valid_df[features], valid_df["loan_sum"])
dtrain_all_pool = Pool(tr_user[features], tr_user["loan_sum"])
dtest_pool = Pool(ts_user[features])

# Fit on the train split to measure train/valid RMSE.
catb = CatBoostRegressor(iterations=300, depth=3, learning_rate=0.05,
                         loss_function='RMSE')
catb.fit(train_pool)
print('catb train rmse: %g' %
      sqrt(mean_squared_error(train_df["loan_sum"], catb.predict(
          train_pool))))
valid_score = sqrt(mean_squared_error(valid_df["loan_sum"], catb.predict(
    Pool(valid_df[features]))))
print('catb valid rmse: %g' % valid_score)

# Refit on ALL labelled users before predicting the submission set.
catb = CatBoostRegressor(iterations=300, depth=3, learning_rate=0.05,
                         loss_function='RMSE')
catb.fit(dtrain_all_pool)

## Submission file
pred = catb.predict(dtest_pool)
id_test = ts_user['uid']
catb_sub = pd.DataFrame({'uid': id_test, 'catb_loan_sum': pred})
print(catb_sub.describe())
# Loan sums cannot be negative: clamp negative predictions to zero.
catb_sub.loc[catb_sub["catb_loan_sum"] < 0,"catb_loan_sum"] = 0
print('saving submission...')
def test_regression_ctr():
    """A regressor trained with explicit CTR descriptions must match the canonical model."""
    ctr_kinds = [
        'Borders:TargetBorderCount=5:TargetBorderType=Uniform',
        'Counter',
    ]
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    regressor = CatBoostRegressor(iterations=5, random_seed=0,
                                  ctr_description=ctr_kinds)
    regressor.fit(data)
    regressor.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def predict(self, X, y=None, **kwargs):
    """Predict with a fitted CatBoost model: probabilities/values, raw margins,
    or SHAP contributions.

    Behavior is selected by kwargs:
      - default: predict_proba for classification (binary collapses to the
        positive-class column), predict for regression;
      - output_margin=True: raw formula values ("RawFormulaVal");
      - pred_contribs=True: per-feature SHAP values, with a renormalization
        workaround for catboost issue #1125 so rows sum to the raw prediction.

    ``fast_approx`` (popped from kwargs) truncates the tree count and switches
    SHAP to the "Approximate" calc type.
    """
    model, features, importances, iterations = self.get_model_properties()
    if not self._save_by_pickle:
        # Model was stored as raw bytes: round-trip it through a file so the
        # native CatBoost loader can reconstruct the estimator object.
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
        if self.num_classes >= 2:
            from_file = CatBoostClassifier()
        else:
            from_file = CatBoostRegressor()
        with open(self.model_path, mode='wb') as f:
            f.write(model)
        model = from_file.load_model(self.model_path)

    # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
    if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
        # dt -> lightgbm internally using buffer leaks, so convert here
        # assume predict is after pipeline collection or in subprocess so needs no protection
        X = X.to_numpy()  # don't assign back to X so don't damage during predict
        X = np.ascontiguousarray(X,
                                 dtype=np.float32 if config.data_precision == "float32" else np.float64)
    # NOTE(review): indentation of the next line is ambiguous in the source;
    # placed at function level (categorical handling applies to all inputs) —
    # confirm against the original file.
    X, eval_set = self.process_cats(X, None, self.feature_names_fitted)

    pred_contribs = kwargs.get('pred_contribs', False)
    output_margin = kwargs.get('output_margin', False)
    fast_approx = kwargs.pop('fast_approx', False)
    if fast_approx:
        # Use fewer trees for a cheaper (approximate) prediction.
        iterations = min(config.fast_approx_num_trees, iterations)

    # implicit import
    from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType, Pool
    n_jobs = max(1, physical_cores_count)
    if not pred_contribs and not output_margin:
        if self.num_classes >= 2:
            preds = model.predict_proba(
                X,
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
            if preds.shape[1] == 2:
                # Binary case: return only the positive-class probability.
                return preds[:, 1]
            else:
                return preds
        else:
            return model.predict(
                X,
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
    elif output_margin:
        # uses "predict" for raw for any class
        preds = model.predict(
            X,
            prediction_type="RawFormulaVal",
            ntree_start=0,
            ntree_end=iterations,  # index of first tree *not* to be used
            thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
        )
        if len(preds.shape) > 1 and preds.shape[1] == 2 and self.num_classes == 2:
            return preds[:, 1]
        else:
            return preds
    elif pred_contribs:
        # For Shapley, doesn't come from predict
        # For regression/binary, shap is shape of (rows, features + bias)
        # for multiclass, shap is shape of (rows, classes, features + bias)
        data = Pool(X, label=y, cat_features=self.params['cat_features'])
        if fast_approx:
            # https://github.com/catboost/catboost/issues/1146
            # https://github.com/catboost/catboost/issues/1535
            # can't specify trees, but they have approx version
            # Regular, Exact, or Approximate
            shap_calc_type = "Approximate"
        else:
            shap_calc_type = "Regular"
        # See also shap_mode
        # help(CatBoostClassifier.get_feature_importance)
        print_debug("shap_calc_type: %s" % shap_calc_type)

        pickle_path = None
        if config.debug_daimodel_level >= 2:
            # Debug mode: dump the model and the SHAP inputs for offline repro.
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = os.path.join(
                exp_dir(), "catboost_shappredict%s.tmp.pickle" % self.uuid)
            model.save_model(
                os.path.join(exp_dir(),
                             "catshapproblem%s.catboost.model" % self.uuid))
            # save_obj((self, self.model, model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
            save_obj((model, X, y, kwargs, shap_calc_type,
                      self.params['cat_features']), pickle_path)

        preds_shap = model.get_feature_importance(
            data=data,
            thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported,
            type=EFstrType.ShapValues,
            shap_calc_type=shap_calc_type,
        )

        # repair broken shap sum: https://github.com/catboost/catboost/issues/1125
        print_debug("shap_fix")
        preds_raw = model.predict(
            X,
            prediction_type="RawFormulaVal",
            ntree_start=0,
            ntree_end=iterations,  # index of first tree *not* to be used
            thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
        )
        # axis over which each row's SHAP values should sum to the raw score:
        # 1 for regression/binary (rows x features+bias), 2 for multiclass.
        if self.num_classes <= 2:
            axis = 1
        else:
            axis = 2
        orig_sum = np.sum(preds_shap, axis=axis)
        print_debug("shap_fix2")
        # avoid division by 0, need different trick, e.g. change baseline, to fix that case
        if axis == 1:
            orig_sum[orig_sum[:] == 0.0] = 1.0
            preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:, None]
        else:
            # each feature and each class must sum up
            orig_sum[orig_sum[:, :] == 0.0] = 1.0
            preds_shap = preds_shap * preds_raw[:, :, None] / orig_sum[:, :, None]

        if config.hard_asserts and config.debug_daimodel_level >= 2:
            # Verify the renormalized SHAP values actually sum to the raw
            # predictions; dump artifacts first so failures are reproducible.
            print_debug("shap_check")
            model.save_model(os.path.join(exp_dir(), "catshapproblem"))
            pickle.dump((X, y, self.params['cat_features']),
                        open(os.path.join(exp_dir(), "catshapproblem.pkl"), "wb"))
            preds_raw = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
            assert np.isclose(preds_raw, np.sum(preds_shap, axis=axis)).all(), \
                "catboost shapley does not sum up correctly"
        if config.debug_daimodel_level <= 2:
            # NOTE(review): pickle_path may be None here; `remove` is
            # presumably a tolerant helper — confirm it handles None.
            remove(pickle_path)

        if axis == 1:
            return preds_shap
        else:
            # DAI expects (shape rows) * (classes x (features + 1)) with "columns" as blocks of
            # feature_0_class_0 feature_0_class_0 ... feature_0_class_1 feature_1_class_1 ...
            return preds_shap.reshape(
                preds_shap.shape[0],
                preds_shap.shape[1] * preds_shap.shape[2])
    else:
        raise RuntimeError("No such case")
        # Tail of a function whose head lies outside this chunk — kept verbatim.
        flag = False
        break
    return flag


# Colab workflow: upload the pre-split dataset CSVs, fit a baseline CatBoost
# model, then seed a genetic-algorithm population for feature selection.
from google.colab import files
uploaded = files.upload()

# Load feature matrices and targets; the target CSVs keep values in column '0'.
X_train = (pd.read_csv('h_10_X_train.csv')).values
X_test = (pd.read_csv('h_10_X_test.csv')).values
X_val = (pd.read_csv('h_10_X_val.csv')).values
Y_train = (pd.read_csv('h_10_Y_train.csv')['0']).values
Y_test = (pd.read_csv('h_10_Y_test.csv')['0']).values
Y_val = (pd.read_csv('h_10_Y_val.csv')['0']).values

# Baseline: all features, default regressor; report validation R^2 score.
model = CatBoostRegressor(iterations=100)
model.fit(X_train, Y_train, verbose=False)
print(model.score(X_val, Y_val))

# Initialize the GA population: every individual starts with all features
# enabled (a vector of 1s, one flag per feature column).
num_of_individs = 12
best_indiv = [[0 for i in range(len(X_train[0]))] for j in range(num_of_individs)]
for i in range(num_of_individs):
    for j in range(len(X_train[0])):
        best_indiv[i][j] = 1

# NOTE(review): `Dataset` and `individual` are project-defined elsewhere in
# this file — presumably a data holder and a GA-chromosome wrapper; confirm.
ds = Dataset(X_train, Y_train, X_test, Y_test, X_val, Y_val)
current_set = []
for i in range(num_of_individs):
    current_set.append(individual(best_indiv[i]))
    # Score each freshly created individual on train/test/val splits.
    current_set[i].get_score(ds.X_train, ds.Y_train, ds.X_test, ds.Y_test,
                             ds.X_val, ds.Y_val)
def test_invalid_loss_regressor():
    """Fitting with an unknown loss_function name must raise CatboostError."""
    with pytest.raises(CatboostError):
        data = Pool(TRAIN_FILE, column_description=CD_FILE)
        broken = CatBoostRegressor(loss_function="fee")
        broken.fit(data)
    # Tail of a call whose opening lies outside this chunk — kept verbatim.
    fold_count,
    feature="fc",
    model_type=MODEL_TYPE,
)

# Tree depth for this run; recorded in the experiment tracker.
DEPTH = 7
update_tracking(run_id, "depth", DEPTH)

# Wrap the pre-split frames in CatBoost Pool objects.
train_dataset = Pool(data=X_train, label=y_train)
valid_dataset = Pool(data=X_valid, label=y_valid)
test_dataset = Pool(data=X_test_type)

model = CatBoostRegressor(
    iterations=N_ESTIMATORS,
    learning_rate=LEARNING_RATE,
    depth=DEPTH,
    eval_metric=EVAL_METRIC,
    verbose=VERBOSE,
    random_state=RANDOM_STATE,
    thread_count=N_THREADS,
    # loss_function=EVAL_METRIC,
    # bootstrap_type='Poisson',
    # bagging_temperature=5,
    task_type="GPU",
)  # Train on GPU
model.fit(
    train_dataset,
    eval_set=valid_dataset,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
)
now = timer()
# Head of a call whose arguments continue outside this chunk — kept verbatim.
update_tracking(
    run_id,
def fit_meta_feature(
    X_train,
    X_valid,
    X_test,
    Meta_train,
    train_idx,
    bond_type,
    base_fold,
    feature="fc",
    N_META_FOLDS=N_META_FOLDS,
    N_META_ESTIMATORS=N_META_ESTIMATORS,
    model_type="catboost",
):
    """Adds meta features to train, test and val.

    Runs a GroupKFold out-of-fold regression of ``Meta_train[feature]`` and
    writes the predictions into a new ``meta_<feature>`` column:
      - X_train gets pure out-of-fold predictions,
      - X_valid / X_test get fold-averaged predictions.
    Fold feature importances, the OOF frame, and the augmented frames are
    persisted as parquet files; CV MAE/log-MAE are logged to the tracker.

    Returns the (X_train, X_valid, X_test) frames with the meta column added.
    """
    logger.info(f"Creating meta feature {feature}")
    logger.info("X_train, X_valid and X_test are shapes {} {} {}".format(
        X_train.shape, X_valid.shape, X_test.shape))
    folds = GroupKFold(n_splits=N_META_FOLDS)
    fold_count = 1
    # Init predictions
    X_valid["meta_" + feature] = 0
    X_test["meta_" + feature] = 0
    X_train["meta_" + feature] = 0
    # Keep an OOF holder; drop the placeholder column from the training
    # features so the meta model never sees it.
    X_train_oof = X_train[["meta_" + feature]].copy()
    X_train = X_train.drop("meta_" + feature, axis=1)
    feature_importance = pd.DataFrame()
    for fold_n, (train_idx2, valid_idx2) in enumerate(
            folds.split(X_train, groups=mol_group_type.iloc[train_idx].values)):
        logger.info("Running Meta Feature Type {} - Fold {} of {}".format(
            feature, fold_count, folds.n_splits))
        update_tracking(run_id, "{}_meta_{}_est".format(bond_type, feature),
                        N_META_ESTIMATORS)
        update_tracking(run_id,
                        "{}_meta_{}_metafolds".format(bond_type, feature),
                        N_META_FOLDS)
        # Positional fold indices -> row selection via a reset index.
        X_train2 = X_train.loc[X_train.reset_index().index.isin(train_idx2)]
        X_valid2 = X_train.loc[X_train.reset_index().index.isin(valid_idx2)]
        X_train2 = X_train2.copy()
        X_valid2 = X_valid2.copy()
        y_train2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            train_idx2)][feature]
        y_valid2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            valid_idx2)][feature]
        fold_count += 1
        if model_type == "catboost":
            train_dataset = Pool(data=X_train2, label=y_train2)
            metavalid_dataset = Pool(data=X_valid2, label=y_valid2)
            # NOTE(review): unlike the xgboost branch below, X_valid/X_test are
            # pooled WITHOUT dropping the just-added "meta_" column, so these
            # pools carry one extra feature versus the training pool — confirm
            # whether this branch should also drop("meta_" + feature, axis=1).
            valid_dataset = Pool(data=X_valid)
            test_dataset = Pool(data=X_test)
            model = CatBoostRegressor(
                iterations=N_META_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=META_DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                task_type="GPU",
            )  # Train on GPU
            model.fit(
                train_dataset,
                eval_set=metavalid_dataset,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            y_pred_meta_valid = model.predict(metavalid_dataset)
            y_pred_valid = model.predict(valid_dataset)
            y_pred = model.predict(test_dataset)
            # OOF rows get the held-out predictions; valid/test accumulate
            # per-fold predictions (averaged after the loop).
            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
        elif model_type == "xgboost":
            model = xgboost.XGBRegressor(**xgb_params)
            model.fit(
                X_train2,
                y_train2,
                eval_metric=EVAL_METRIC,
                eval_set=[(X_valid2, y_valid2)],
                verbose=VERBOSE,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            y_pred_meta_valid = model.predict(X_valid2)
            # Drop the accumulating meta column before predicting so the
            # feature set matches training.
            y_pred_valid = model.predict(
                X_valid.drop("meta_" + feature, axis=1))
            y_pred = model.predict(X_test.drop("meta_" + feature, axis=1))
            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
    # CV score of the meta feature on the full out-of-fold predictions.
    oof_score = mean_absolute_error(Meta_train[feature],
                                    X_train_oof["meta_" + feature])
    log_oof_score = np.log(oof_score)
    logger.info(
        f"Meta feature {feature} has MAE {oof_score:0.4f} LMAE {log_oof_score:0.4f}"
    )
    update_tracking(
        run_id, "{}_meta_{}_mae_cv_f{}".format(bond_type, feature, base_fold),
        oof_score)
    update_tracking(
        run_id,
        "{}_meta_{}_lmae_cv_f{}".format(bond_type, feature, base_fold),
        log_oof_score,
    )
    # Persist per-fold importances, OOF predictions, and augmented frames.
    feature_importance.to_parquet(
        "type_results/{}/meta/{}_{}_{}_fi_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    X_train_oof.to_parquet(
        "type_results/{}/meta/{}_{}_{}_oof_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    X_train.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_train_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    # Fix: the original wrote this identical X_valid parquet twice in a row
    # (copy-paste duplicate); one write is sufficient.
    X_valid.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_valid_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))
    # Average the accumulated fold predictions; train gets pure OOF values.
    X_valid["meta_" + feature] = X_valid["meta_" + feature] / N_META_FOLDS
    X_test["meta_" + feature] = X_test["meta_" + feature] / N_META_FOLDS
    X_train["meta_" + feature] = X_train_oof["meta_" + feature]
    logger.info("Done creating meta features")
    logger.info("X_train, X_valid and X_test are shapes {} {} {}".format(
        X_train.shape, X_valid.shape, X_test.shape))
    return X_train, X_valid, X_test
# Tail of a call whose opening lies outside this chunk — kept verbatim.
)
# Combine the dummy variables we created with the other independent variables.
# Above: categorical variables were converted to dummies and merged back with
# the remaining independent variables.
# Below: split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)
print(df.head())
print(df.shape)
print(X_train.head())

# Model and prediction
#%%
cat = CatBoostRegressor()
cat_model = cat.fit(X_train, y_train)
print(cat_model)
y_pred = cat_model.predict(X_test)
# Report test RMSE for the untuned baseline model.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

# Model tuning
#%%
# Hyperparameter grid for a 5-fold grid search over the CatBoost regressor.
cat_params = {
    "iterations": [200, 500, 1000],
    "learning_rate": [0.01, 0.1],
    "depth": [3, 6, 8]
}
gs = GridSearchCV(cat, cat_params, cv=5, n_jobs=-1, verbose=2)