예제 #1
0
    def fit(
        self,
        X,
        y,
        sample_weight=None,
        X_validation=None,
        y_validation=None,
        sample_weight_validation=None,
        log_to_file=None,
        max_time=None,
    ):
        """Train the wrapped CatBoost model and record how many trees to use.

        Does nothing (apart from printing a notice) when ``self.is_fitted()``
        is already true.

        Args:
            X: Training features; columns are accessed through ``X.iloc``.
            y: Training target.
            sample_weight: Optional per-row training weights.
            X_validation: Optional validation features.
            y_validation: Optional validation target. An evaluation pool is
                built only when both validation arguments are given.
            sample_weight_validation: Optional weights for the validation pool.
            log_to_file: When not ``None``, the learn/validation metric history
                is written to this CSV target (no header, no index).
            max_time: Forwarded to ``self._assess_iterations`` when
                ``num_boost_round`` is not configured in ``self.params``.

        Side effects:
            May set ``self.cat_features`` and ``self.early_stopping_rounds``,
            updates ``self.model`` and sets ``self.best_ntree_limit``.
        """
        if self.is_fitted():
            print("CatBoost model already fitted. Skip fit().")
            return

        # Detect categorical columns by position when none were configured.
        if self.cat_features is None:
            self.cat_features = []
            for col_idx in range(X.shape[1]):
                column = X.iloc[:, col_idx]
                if PreprocessingUtils.is_categorical(column):
                    self.cat_features.append(col_idx)

        # The evaluation pool needs both validation features and labels.
        has_validation = not (X_validation is None or y_validation is None)
        eval_set = (Pool(
            data=X_validation,
            label=y_validation,
            cat_features=self.cat_features,
            weight=sample_weight_validation,
        ) if has_validation else None)

        requested_rounds = self.params.get("num_boost_round")
        if requested_rounds is not None:
            # Explicit iteration count: no warm-up model is involved.
            model_init = None
            self.model.set_params(iterations=requested_rounds)
            self.early_stopping_rounds = self.params.get(
                "early_stopping_rounds", 50)
        else:
            # Let the helper pick the iteration count; it also returns the
            # model that training continues from.
            model_init, new_iterations = self._assess_iterations(
                X, y, sample_weight, eval_set, max_time)
            self.model.set_params(iterations=new_iterations)

        self.model.fit(
            X,
            y,
            sample_weight=sample_weight,
            cat_features=self.cat_features,
            init_model=model_init,
            eval_set=eval_set,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=False,
        )

        best_iter = self.model.best_iteration_
        if best_iter is None:
            # No best iteration reported: keep every tree. tree_count_ is used
            # as-is here (the warm-up trees are already included, no +1).
            self.best_ntree_limit = self.model.tree_count_
        elif model_init is None:
            self.best_ntree_limit = best_iter + 1
        else:
            # Offset by the trees contributed by the warm-up model.
            self.best_ntree_limit = (best_iter + model_init.tree_count_ + 1)

        if log_to_file is None:
            return

        # Learning-curve export: the first metric listed under "learn" is used
        # for both the learn and the validation series.
        metric_name = list(self.model.evals_result_["learn"].keys())[0]
        train_scores = self.model.evals_result_["learn"][metric_name]
        validation_scores = self.model.evals_result_["validation"][metric_name]
        if model_init is not None:
            # Prepend the warm-up model's history to the continued run's.
            warmup_history = model_init.evals_result_
            train_scores = warmup_history["learn"][metric_name] + train_scores
            validation_scores = (
                warmup_history["validation"][metric_name] + validation_scores)

        curve = pd.DataFrame({
            "iteration": range(len(train_scores)),
            "train": train_scores,
            "validation": validation_scores,
        })
        curve.to_csv(log_to_file, index=False, header=False)
예제 #2
0
    'max_ctr_complexity': 1,
    'depth': 8,
    'leaf_estimation_method': 'Gradient',
    'use_best_model': True,
    'iterations': 100000,
    'early_stopping_rounds': 5000,
    'verbose': 500
}

# Names of the columns handed to CatBoost as categorical features.
cate_cols = [
    'Uid', 'Category', 'Subcategory', 'Concept', 'Mediatype', 'hour', 'day',
    'weekday', 'week_hour', 'year_weekday', 'Geoaccuracy', 'ispro', 'Ispublic',
    'img_model'
]
# Pool built from the submission features and their 'label' column.
submit_data = Pool(data=submit_feature_df,
                   label=submit_label_df['label'],
                   cat_features=cate_cols)

# Accumulators; they are not filled within the visible part of the script.
valid_ans = []
submit_proba = []
# 5-fold shuffled split with a fixed seed for reproducibility.
kfold = KFold(n_splits=5, shuffle=True, random_state=2020)
k = 0

# NOTE(review): KFold.split yields positional indices, while `.loc` below is
# label-based; this presumes both frames carry a default RangeIndex - confirm.
# NOTE(review): the loop body appears to be cut off in this excerpt.
for train_idx, valid_idx in kfold.split(train_feature_df, train_label_df):

    # Validation features/labels of the current fold.
    fold_valid_x, fold_valid_y = train_feature_df.loc[
        valid_idx], train_label_df['label'].loc[valid_idx]
    valid_data = Pool(data=fold_valid_x,
                      label=fold_valid_y,
                      cat_features=cate_cols)
예제 #3
0
# Repeated stratified cross-validation of a multiclass CatBoost model.
# n_splits, n_repeats, random_state, iterations, train and all_features are
# expected to be defined earlier in the script.
folds = RepeatedStratifiedKFold(n_splits=n_splits,
                                n_repeats=n_repeats,
                                random_state=random_state)
df = train.copy()  #
columns = all_features  #
# Every feature column is passed to CatBoost as categorical.
categoric_columns = all_features  #
X_train = df[columns]
y_train = df['target']
logloss_all = []
# One probability column per distinct target value. `_proba` is overwritten by
# each fold's predictions; `_probas` accumulates the out-of-fold predictions.
_proba = np.zeros((X_train.shape[0], y_train.nunique()))
_probas = np.zeros((X_train.shape[0], y_train.nunique()))
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    # NOTE(review): n_fold counts up to n_splits * n_repeats, so the printed
    # numerator can exceed the denominator when n_repeats > 1.
    print("Fold --> " + str(n_fold + 1) + "/" + str(n_splits))
    train_X, train_y = X_train.iloc[train_idx].copy(), y_train.iloc[train_idx]
    valid_X, valid_y = X_train.iloc[valid_idx].copy(), y_train.iloc[valid_idx]
    # Third positional Pool argument is cat_features.
    dataset = Pool(train_X, train_y, categoric_columns)
    evalset = Pool(valid_X, valid_y, categoric_columns)
    model = CatBoostClassifier(task_type="GPU",
                               depth=4,
                               iterations=iterations,
                               od_wait=1000,
                               od_type='Iter',
                               learning_rate=0.02,
                               use_best_model=True,
                               loss_function='MultiClass',
                               verbose=False)
    # verbose is given again here (500) in addition to the constructor value.
    model.fit(dataset, plot=False, verbose=500, eval_set=evalset)
    _proba = model.predict_proba(valid_X[all_features])
    logloss_of_fold = log_loss(list(valid_y), _proba)
    logloss_all.append(logloss_of_fold)
    # Each row is validated once per repeat, so dividing by n_repeats averages
    # the out-of-fold predictions across repeats.
    _probas[valid_idx, :] += _proba / n_repeats
RS=2305 # Seed for partition and model random part
TS=0.3 # Validation size
# NOTE(review): esr is not referenced in the visible part of this script.
esr=50 # Early stopping rounds (when validation does not improve in these rounds, stops)

from sklearn.model_selection import train_test_split
# Hold-out split: TS of the rows go to validation.
x_tr, x_val, y_tr, y_val = train_test_split(X_train, Y_train, test_size=TS, random_state=RS)

# Categorical positions for catboost
Pos=list()
As_Categorical=Categorical.tolist()
# 'ID' is excluded from the categorical feature list.
As_Categorical.remove('ID')
for col in As_Categorical:
    # Translate each column name into its positional index in X_train.
    Pos.append((X_train.columns.get_loc(col)))

# To Pool Class (for catboost only)
pool_tr=Pool(x_tr, y_tr,cat_features=Pos)
pool_val=Pool(x_val, y_val,cat_features=Pos)

# By-hand parameter tuning. A grid-search is expensive
# We test different combinations
# See parameter options here:
# "https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/"
model_catboost_val = CatBoostClassifier(
          eval_metric='AUC',
          iterations=20000, # Very high value, to find the optimum
          od_type='Iter', # Overfitting detector set to "iterations" or number of trees
          random_seed=RS, # Random seed for reproducibility
          verbose=100) # Shows train/test metric every "verbose" trees

# "Technical" parameters of the model:
params = {'objective': 'Logloss',
예제 #5
0
            logger.info('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
                np.mean(bond_scores), np.std(bond_scores)))
            oof[valid_idx] = y_pred_valid.reshape(-1, )
            prediction_type += y_pred
            fold_count += 1
            now = timer()
            logger.info(
                'Completed training and predicting for bond {} fold {}-of-{} in {:0.4f} seconds'
                .format(bond_type, fold_n + 1, fold_count, now - fold_start))
        elif MODEL_TYPE == 'catboost':
            fold_start = timer()
            logger.info('Running Type {} - Fold {} of {}'.format(
                bond_type, fold_count, folds.n_splits))
            X_train, X_valid = X_type.iloc[train_idx], X_type.iloc[valid_idx]
            y_train, y_valid = y_type.iloc[train_idx], y_type.iloc[valid_idx]
            train_dataset = Pool(data=X_train.drop('type', axis=1),
                                 label=y_train)
            valid_dataset = Pool(data=X_valid.drop('type', axis=1),
                                 label=y_valid)
            test_dataset = Pool(data=X_test_type.drop('type', axis=1))
            DEPTH = 4
            update_tracking(run_id, 'depth', DEPTH)
            model = CatBoostRegressor(iterations=N_ESTIMATORS,
                                      learning_rate=LEARNING_RATE,
                                      depth=DEPTH,
                                      eval_metric=EVAL_METRIC,
                                      verbose=VERBOSE,
                                      random_state=RANDOM_STATE,
                                      thread_count=N_THREADS,
                                      loss_function=EVAL_METRIC,
                                      task_type="GPU")  # Train on GPU
예제 #6
0
def test_feature_importance():
    """Fit a small classifier and compare its feature importances to the canon.

    The importances are saved to ``FIMP_PATH`` and that file is returned
    wrapped by ``local_canonical_file``.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(train_pool)

    importances = np.array(classifier.feature_importances_)
    np.save(FIMP_PATH, importances)
    return local_canonical_file(FIMP_PATH)
예제 #7
0
    def _fit(self,
             X_train,
             y_train,
             X_val=None,
             y_val=None,
             time_limit=None,
             num_gpus=0,
             **kwargs):
        """Fit a CatBoost model, optionally in two stages under a time budget.

        When ``time_limit`` is set, a short "sample" model is trained first to
        measure the time per iteration and the pickled model size. The
        remaining iteration budget is derived from those measurements and the
        final model is trained as a continuation (``init_model``) of the
        sample model. Without ``time_limit`` a single model is trained with
        the configured parameters.

        Args:
            X_train: Training features; passed through ``self.preprocess`` and
                wrapped in a ``Pool`` together with ``y_train``.
            y_train: Training labels.
            X_val: Optional validation features; when given they are
                preprocessed and used as the ``eval_set``.
            y_val: Validation labels paired with ``X_val``.
            time_limit: Optional time budget; it is compared against
                ``time.time()`` differences, i.e. seconds.
            num_gpus: When non-zero and ``task_type`` is not already in the
                params, ``task_type`` is set to ``'GPU'``.
            **kwargs: Only ``verbosity`` (default 2) is read here.

        Raises:
            NotEnoughMemoryError: If the rough memory estimate is above 1 GB
                and exceeds ``max_memory_usage_ratio`` of available memory.
            TimeLimitExceeded: If 40% or less of ``time_limit`` remains once
                preprocessing is done.

        Side effects:
            Sets ``self.model`` and ``self.params_trained['iterations']``;
            removes ``num_threads`` / ``num_gpus`` from ``self.params``.
        """
        try_import_catboost()
        from catboost import CatBoostClassifier, CatBoostRegressor, Pool
        if self.problem_type == SOFTCLASS:
            try_import_catboostdev(
            )  # Need to first import catboost then catboost_dev not vice-versa.
            from catboost_dev import CatBoostClassifier, CatBoostRegressor, Pool
            from .catboost_softclass_utils import SoftclassCustomMetric, SoftclassObjective
            self._set_default_param_value(
                'eval_metric',
                construct_custom_catboost_metric(
                    self.stopping_metric, True,
                    not self.stopping_metric.needs_pred, self.problem_type))
            # The soft-class objective and metric are then set unconditionally,
            # overriding whatever 'eval_metric' holds at this point.
            self.params[
                'loss_function'] = SoftclassObjective.SoftLogLossObjective()
            self.params[
                'eval_metric'] = SoftclassCustomMetric.SoftLogLossMetric()
            self._set_default_param_value(
                'early_stopping_rounds',
                50)  # Speeds up training with custom (non-C++) losses

        model_type = CatBoostClassifier if self.problem_type in PROBLEM_TYPES_CLASSIFICATION else CatBoostRegressor
        # Name under which the metric is looked up in get_best_score() below:
        # the string itself, or the class name of a custom metric object.
        if isinstance(self.params['eval_metric'], str):
            metric_name = self.params['eval_metric']
        else:
            metric_name = type(self.params['eval_metric']).__name__
        num_rows_train = len(X_train)
        num_cols_train = len(X_train.columns)
        if self.problem_type == MULTICLASS:
            if self.num_classes is not None:
                num_classes = self.num_classes
            else:
                num_classes = 10  # Guess if not given, can do better by looking at y_train
        elif self.problem_type == SOFTCLASS:  # TODO: delete this elif if it's unnecessary.
            num_classes = y_train.shape[1]
            self.num_classes = num_classes
        else:
            num_classes = 1

        # TODO: Add ignore_memory_limits param to disable NotEnoughMemoryError Exceptions
        max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
        approx_mem_size_req = num_rows_train * num_cols_train * num_classes / 2  # TODO: Extremely crude approximation, can be vastly improved
        if approx_mem_size_req > 1e9:  # > 1 GB
            available_mem = psutil.virtual_memory().available
            ratio = approx_mem_size_req / available_mem
            # Above the full ratio: abort. Above 20% of it: warn only.
            if ratio > (1 * max_memory_usage_ratio):
                logger.warning(
                    '\tWarning: Not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                    % (round(approx_mem_size_req / 1e9,
                             3), round(available_mem / 1e9, 3)))
                raise NotEnoughMemoryError
            elif ratio > (0.2 * max_memory_usage_ratio):
                logger.warning(
                    '\tWarning: Potentially not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                    % (round(approx_mem_size_req / 1e9,
                             3), round(available_mem / 1e9, 3)))

        # The time budget is measured from here, so preprocessing counts.
        start_time = time.time()
        X_train = self.preprocess(X_train)
        # Columns with pandas 'category' dtype are declared categorical.
        cat_features = list(X_train.select_dtypes(include='category').columns)
        X_train = Pool(data=X_train, label=y_train, cat_features=cat_features)

        if X_val is not None:
            X_val = self.preprocess(X_val)
            X_val = Pool(data=X_val, label=y_val, cat_features=cat_features)
            eval_set = X_val
            # Scale early-stopping patience and sample-model size down for
            # training sets larger than 10000 rows.
            if num_rows_train <= 10000:
                modifier = 1
            else:
                modifier = 10000 / num_rows_train
            early_stopping_rounds = max(round(modifier * 150), 10)
            num_sample_iter_max = max(round(modifier * 50), 2)
        else:
            eval_set = None
            early_stopping_rounds = None
            num_sample_iter_max = 50

        # These keys are dropped from self.params before it is handed to the
        # CatBoost constructor.
        invalid_params = ['num_threads', 'num_gpus']
        for invalid in invalid_params:
            if invalid in self.params:
                self.params.pop(invalid)
        train_dir = None
        if 'allow_writing_files' in self.params and self.params[
                'allow_writing_files']:
            if 'train_dir' not in self.params:
                try:
                    # TODO: What if path is in S3?
                    os.makedirs(os.path.dirname(self.path), exist_ok=True)
                except:
                    # Best effort: if the directory cannot be created,
                    # train_dir stays None.
                    pass
                else:
                    train_dir = self.path + 'catboost_info'
        logger.log(15, f'\tCatboost model hyperparameters: {self.params}')

        # TODO: Add more control over these params (specifically early_stopping_rounds)
        # Map the verbosity level to CatBoost's `verbose` argument.
        verbosity = kwargs.get('verbosity', 2)
        if verbosity <= 1:
            verbose = False
        elif verbosity == 2:
            verbose = False
        elif verbosity == 3:
            verbose = 20
        else:
            verbose = True

        # Bookkeeping for the optional sample model; these stay None when no
        # time_limit is given.
        init_model = None
        init_model_tree_count = None
        init_model_best_iteration = None
        init_model_best_score = None

        # Work on a copy so the adjustments below do not leak into self.params.
        params = self.params.copy()
        num_features = len(self.features)
        if num_gpus != 0:
            if 'task_type' not in params:
                params['task_type'] = 'GPU'
                # TODO: Confirm if GPU is used in HPO (Probably not)
                # TODO: Adjust max_bins to 254?

        if params.get('task_type', None) == 'GPU':
            if 'colsample_bylevel' in params:
                params.pop('colsample_bylevel')
                logger.log(
                    30,
                    f'\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
                )
            if 'rsm' in params:
                params.pop('rsm')
                logger.log(
                    30,
                    f'\t\'rsm\' is not supported on GPU, using default value (Default = 1).'
                )

        if self.problem_type == MULTICLASS and 'rsm' not in params and 'colsample_bylevel' not in params and num_features > 1000:
            if time_limit:
                # Reduce sample iterations to avoid taking unreasonable amounts of time
                num_sample_iter_max = max(round(num_sample_iter_max / 2), 2)
            # Subsample columns to speed up training
            if params.get('task_type',
                          None) != 'GPU':  # RSM does not work on GPU
                params['colsample_bylevel'] = max(
                    min(1.0, 1000 / num_features), 0.05)
                logger.log(
                    30,
                    f'\tMany features detected ({num_features}), dynamically setting \'colsample_bylevel\' to {params["colsample_bylevel"]} to speed up training (Default = 1).'
                )
                logger.log(
                    30,
                    f'\tTo disable this functionality, explicitly specify \'colsample_bylevel\' in the model hyperparameters.'
                )
            else:
                params['colsample_bylevel'] = 1.0
                logger.log(
                    30,
                    f'\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
                )

        if time_limit:
            time_left_start = time_limit - (time.time() - start_time)
            if time_left_start <= time_limit * 0.4:  # if 60% of time was spent preprocessing, likely not enough time to train model
                raise TimeLimitExceeded
            # Stage 1: train a short sample model to measure cost per iteration.
            params_init = params.copy()
            num_sample_iter = min(num_sample_iter_max,
                                  params_init['iterations'])
            params_init['iterations'] = num_sample_iter
            if train_dir is not None:
                params_init['train_dir'] = train_dir
            self.model = model_type(**params_init, )
            self.model.fit(
                X_train,
                eval_set=eval_set,
                use_best_model=True,
                verbose=verbose,
                # early_stopping_rounds=early_stopping_rounds,
            )

            init_model_tree_count = self.model.tree_count_
            init_model_best_iteration = self.model.get_best_iteration()
            # NOTE(review): reads the 'validation' entry, which presumably only
            # exists when an eval_set was supplied - confirm the behaviour for
            # time_limit with X_val=None.
            init_model_best_score = self.model.get_best_score(
            )['validation'][metric_name]

            # Extrapolate how many more iterations fit in the remaining time.
            time_left_end = time_limit - (time.time() - start_time)
            time_taken_per_iter = (time_left_start -
                                   time_left_end) / num_sample_iter
            estimated_iters_in_time = round(time_left_end /
                                            time_taken_per_iter)
            init_model = self.model

            params_final = params.copy()

            # TODO: This only handles memory with time_limits specified, but not with time_limits=None, handle when time_limits=None
            available_mem = psutil.virtual_memory().available
            if self.problem_type == SOFTCLASS:  # TODO: remove this once catboost-dev is no longer necessary and SOFTCLASS objectives can be pickled.
                model_size_bytes = 1  # skip memory check
            else:
                model_size_bytes = sys.getsizeof(pickle.dumps(self.model))

            # Cap the total iteration count so that the model size,
            # extrapolated linearly from the sample model, stays within 30% of
            # max_memory_usage_ratio of the available memory.
            max_memory_proportion = 0.3 * max_memory_usage_ratio
            mem_usage_per_iter = model_size_bytes / num_sample_iter
            max_memory_iters = math.floor(
                available_mem * max_memory_proportion / mem_usage_per_iter)

            # Remaining iterations: limited by the configured total (minus the
            # sample iterations already trained) and by the time estimate.
            params_final['iterations'] = min(
                params['iterations'] - num_sample_iter,
                estimated_iters_in_time)
            if params_final['iterations'] > max_memory_iters - num_sample_iter:
                # The warning is emitted only when the memory cap leaves at
                # most 500 further iterations.
                if max_memory_iters - num_sample_iter <= 500:
                    logger.warning(
                        '\tWarning: CatBoost will be early stopped due to lack of memory, increase memory to enable full quality models, max training iterations changed to %s from %s'
                        % (max_memory_iters,
                           params_final['iterations'] + num_sample_iter))
                params_final['iterations'] = max_memory_iters - num_sample_iter
        else:
            params_final = params.copy()

        if train_dir is not None:
            params_final['train_dir'] = train_dir
        # Stage 2 (or the only stage without time_limit). When no iterations
        # are left, self.model remains the sample model from stage 1.
        if params_final['iterations'] > 0:
            self.model = model_type(**params_final, )

            # TODO: Strangely, this performs different if clone init_model is sent in than if trained for same total number of iterations. May be able to optimize catboost models further with this
            self.model.fit(
                X_train,
                eval_set=eval_set,
                verbose=verbose,
                early_stopping_rounds=early_stopping_rounds,
                # use_best_model=True,
                init_model=init_model,
            )

            if init_model is not None:
                final_model_best_score = self.model.get_best_score(
                )['validation'][metric_name]
                # Decide whether the best iteration lies in the continued run
                # or in the sample model. If the final score is below the
                # metric's optimum, a larger score counts as better; otherwise
                # a smaller one does.
                if self.stopping_metric._optimum > final_model_best_score:
                    if final_model_best_score > init_model_best_score:
                        best_iteration = init_model_tree_count + self.model.get_best_iteration(
                        )
                    else:
                        best_iteration = init_model_best_iteration
                else:
                    if final_model_best_score < init_model_best_score:
                        best_iteration = init_model_tree_count + self.model.get_best_iteration(
                        )
                    else:
                        best_iteration = init_model_best_iteration

                # Keep only the trees up to and including the best iteration.
                self.model.shrink(ntree_start=0, ntree_end=best_iteration + 1)

        # Record the number of trees actually kept in the final model.
        self.params_trained['iterations'] = self.model.tree_count_
예제 #8
0
weight
)
# Ordered (non-shuffled) split: the first PCT_TRAIN share of the rows is used
# for training and the remainder for testing.
num_train = int(df.shape[0] * PCT_TRAIN)# DONE

# NOTE(review): DIRECTION is assigned further down in this excerpt; presumably
# the notebook cells were executed in a different order - confirm.
y = df[DIRECTION] # DONE
y_train = y[: num_train]
y_test = y[num_train:]

# Per-row weights, split at the same boundary as the labels.
w_train = w[: num_train]
w_test = w[num_train:]

# Features are every column except the first one.
X = df.iloc[:, 1:]
X_train = X.iloc[:num_train, :]
X_test = X.iloc[num_train:, :]

train_pool = Pool(X_train, y_train, weight=w_train)
test_pool = Pool(X_test, y_test, weight=w_test)

# %%
# %%time
# train_pool, test_pool =  get_train_test_pools(N,DIRECTION, WEIGHTS_QUNATILE, CACHE=True)
# best_iter = next(map(lambda x: len(x[1])-1, model.eval_metrics(train_pool,['Precision']).items()))
# model.plot_tree(best_iter)

# %%
# %%time
# Experiment settings; DIRECTION names the target column read above.
N=5
DIRECTION='is_turnpt'
WEIGHTS_QUNATILE = 0.01

model = CatBoostClassifier(iterations=10**5, # set very large number and set early stops
예제 #9
0
def test_pool_cat_features():
    """The categorical indices of a file-backed pool must equal CAT_FEATURES."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    detected = file_pool.get_cat_feature_indices()
    matches = detected == CAT_FEATURES
    assert np.all(matches)
예제 #10
0
# Features are every column except the 'Ans' target column.
x = train_df.drop('Ans', axis = 1)
y = train_df.Ans

from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score
import catboost
# Multiclass classifier that additionally tracks accuracy, with logging off.
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    logging_level='Silent',
    loss_function='MultiClass'
)

from sklearn.model_selection import train_test_split
# 75% / 25% train/validation split with a fixed seed.
x_train, x_validation, y_train, y_validation = train_test_split(x, y, train_size=0.75, random_state=27)
# NOTE(review): these pools are not used by the fit call below, which receives
# the raw frames instead.
train_pool = Pool(x_train, y_train)
validate_pool = Pool(x_validation, y_validation)

model.fit(
    x_train, y_train,
    eval_set=(x_validation, y_validation),
);

from sklearn.metrics import accuracy_score
print('Train Accuracy:', accuracy_score(y_train, model.predict(x_train)))
print('Validation Accuracy:', accuracy_score(y_validation, model.predict(x_validation)))
# Ask before continuing; only a lowercase 'n' aborts the script.
print('Save current model? Y/n')
ans = input()
# NOTE(review): `sys` is not imported in the visible part of this script.
if ans == 'n' : 
	print('Model will not save')
	sys.exit(0)
예제 #11
0
# Load the input table given on the command line.
df = pd.read_csv(args.infile)

#df=pd.read_csv('C:/TSU_GIT/MedicalDataAnalysisService/R_scripts/Arizona_informative.csv')

# Encode the class label as integers
le = LabelEncoder()
encode_feature = le.fit_transform(df.Class)

# The label column must not remain among the features.
df = df.drop(['Class'], axis=1)

# Stratified hold-out split; args.cross is used as the test-set size.
X_train, X_test, y_train, y_test = train_test_split(df,
                                                    encode_feature,
                                                    test_size=args.cross,
                                                    stratify=encode_feature)

train_pool = Pool(data=X_train, label=y_train)

test_pool = Pool(data=X_test, label=y_test)

# NOTE(review): logging_level is spelled 'verbose' here but 'Verbose' in the
# fit call below - confirm that CatBoost accepts the lowercase spelling.
model = CatBoostClassifier(iterations=args.itera,
                           depth=6,
                           learning_rate=args.learn,
                           loss_function='MultiClass',
                           eval_metric='Accuracy',
                           logging_level='verbose')

# The held-out split doubles as the evaluation set during training.
model.fit(train_pool, eval_set=test_pool, logging_level='Verbose')

pred = model.predict(data=test_pool, prediction_type='Class')

# Accuracy on the same held-out split.
acc_test = accuracy_score(y_test, pred)
예제 #12
0
def catboost_bootstrap(dir_,
                       learn_name,
                       test_name,
                       cd_file,
                       classes,
                       learning_rate=None,
                       border_count=32,
                       cnt_values=20,
                       file_result_to=sys.stdout,
                       file_info_to=sys.stdout,
                       iterations=1500):
    """Compare preprocessing wrappers via bootstrapped CatBoost test metrics.

    For every wrapper class in ``classes`` the learn pool is loaded from
    ``dir_``, transformed with the wrapper's ``handle_learn_pool`` and used to
    train a ``CatBoostClassifier``. The test set is then resampled with
    replacement ``cnt_values`` times; for each resample Logloss and AUC are
    evaluated per iteration and the iteration with the lowest Logloss is
    recorded. Learning-curve plots are written next to the data, the AUC
    samples are dumped to ``<dir_>/boot.txt`` and a Wilcoxon test is printed
    for every ordered pair of wrappers.

    Args:
        dir_: Directory holding the data files; outputs are written here too.
        learn_name: File name of the learn set inside ``dir_``.
        test_name: File name of the test set inside ``dir_``.
        cd_file: File name of the column description inside ``dir_``.
        classes: Iterable of wrapper classes exposing ``WRAPPER_NAME``,
            ``handle_learn_pool`` and ``handle_test_matrix``; instances expose
            ``prior`` and ``score``.
        learning_rate: Mapping from wrapper class to learning rate. ``None``
            (the default) leaves the learning rate to CatBoost.
        border_count: Forwarded to ``CatBoostClassifier``.
        cnt_values: Number of bootstrap resamples per wrapper.
        file_result_to: Stream receiving the result lines.
        file_info_to: Stream receiving timing and prior information.
        iterations: Number of boosting iterations.
    """
    logloss = {}
    auc = {}
    cd_path = os.path.join(dir_, cd_file)
    for clazz in classes:
        name = clazz.WRAPPER_NAME
        # The default ``learning_rate=None`` used to be subscripted and raised
        # TypeError; it now means "let CatBoost pick the learning rate".
        step = None if learning_rate is None else learning_rate[clazz]
        n_threads = multiprocessing.cpu_count()

        print('class={}'.format(name))
        print('class={}; step={}'.format(name, step), file=file_result_to)
        file_result_to.flush()
        auc[name] = []
        logloss[name] = []
        tree_counts = []
        logloss_curves = []
        auc_curves = []

        cl = clazz()
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=cd_path)
        # Time only the wrapper's preprocessing of the learn pool.
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()

        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=cd_path)
        source_test_label = np.array(source_test_pool.get_label())
        source_test_features = np.array(source_test_pool.get_features())

        cat = CatBoostClassifier(max_ctr_complexity=1,
                                 custom_metric='AUC',
                                 boosting_type='Plain',
                                 random_seed=0,
                                 border_count=border_count,
                                 iterations=iterations,
                                 learning_rate=step,
                                 thread_count=n_threads)
        # beg/end are reused: from here on they hold the training time, which
        # is reported as learn_time below.
        beg = time.time()
        cat.fit(learn_pool, use_best_model=True)
        end = time.time()

        # Row positions to resample from; identical for every bootstrap round.
        idx = list(range(source_test_features.shape[0]))
        for seed in range(cnt_values):
            # Deterministic, distinct seed per bootstrap round.
            np.random.seed(seed * 10 + 300)
            boot_idx = np.random.choice(idx, len(idx), replace=True)
            boot_test_features = source_test_features[boot_idx]
            boot_test_label = source_test_label[boot_idx]
            X, y = cl.handle_test_matrix(boot_test_features, boot_test_label,
                                         False)
            metrics = cat.eval_metrics(
                Pool(X, y), ['Logloss', 'AUC'],
                eval_period=1,
                thread_count=n_threads)
            for num, loss in enumerate(metrics['Logloss']):
                print('iter={:10}:     loss={:.10}'.format(num + 1, loss))
            # Zero-based index of the iteration with the lowest Logloss.
            cnt_trees = np.argmin(metrics['Logloss'])
            print('choose cnt_trees={}'.format(cnt_trees))
            print('overfit={}; AUC={}; logloss={}'.format(
                cnt_trees, metrics['AUC'][cnt_trees],
                metrics['Logloss'][cnt_trees]),
                  file=file_result_to)
            tree_counts.append(cnt_trees)
            file_result_to.flush()
            logloss_curves.append(metrics['Logloss'])
            auc_curves.append(metrics['AUC'])
            auc[name].append(metrics['AUC'][cnt_trees])
            logloss[name].append(metrics['Logloss'][cnt_trees])

        print('class={}, learn_time={}, mean_tree_count={}'.format(
            name, end - beg,
            sum(tree_counts) / len(tree_counts)),
              file=file_result_to)
        print('mean_AUC={}, mean_logloss={}'.format(
            sum(auc[name]) / len(auc[name]),
            sum(logloss[name]) / len(logloss[name])),
              file=file_result_to)
        file_result_to.flush()

        logloss_fig = create_learning_curves_plot(
            logloss_curves, 'logloss {}'.format(name))
        auc_fig = create_learning_curves_plot(
            auc_curves, 'AUC {}'.format(name))
        logloss_file = os.path.join(
            dir_, 'fig_{}_{}'.format('Logloss', name))
        AUC_file = os.path.join(dir_, 'fig_{}_{}'.format('AUC', name))
        plot(logloss_fig, filename=logloss_file, auto_open=False)
        plot(auc_fig, filename=AUC_file, auto_open=False)

    file_name = os.path.join(dir_, 'boot.txt')
    with open(file_name, 'w') as file_to:
        json.dump(auc, file_to)

    # NOTE(review): every ordered pair is tested, including a wrapper against
    # itself, where all differences are zero; how wilcoxon treats that case
    # may depend on the SciPy version - confirm.
    for cl1 in classes:
        for cl2 in classes:
            stat, p_value = wilcoxon(auc[cl1.WRAPPER_NAME],
                                     auc[cl2.WRAPPER_NAME],
                                     zero_method="pratt")
            print('for {} & {}: stat: {}, p_value: {}'.format(
                cl1.WRAPPER_NAME, cl2.WRAPPER_NAME, stat, p_value),
                  file=file_result_to)
예제 #13
0
import sys
from pathlib import Path
import json

import pandas as pd
from catboost import CatBoostClassifier, Pool

# Repository root: two directory levels above the folder containing this file.
project_dir = Path(__file__).resolve().parents[2]

# Read the shared parameter file; the context manager closes the handle, which
# the previous `json.load(open(...))` form left open.
with open(f"{project_dir}/src/models/params.json") as params_file:
    params = json.load(params_file)

model = CatBoostClassifier()

# Restore the trained model from its binary CatBoost file.
model.load_model(f"{project_dir}/models/heart.cbm")

test_df = pd.read_csv(f"{project_dir}/data/processed/test.csv")

# Split the target column off; what remains in the frame are the features.
target = test_df.pop(params['data_params']['target'])
X = test_df

# Third positional Pool argument is cat_features.
test_pool = Pool(X, target, params['data_params']['cat_features'])

print(model.predict(test_pool))

sys.exit(0)
# Every column whose dtype is not float is treated as categorical.
# `np.float` was only an alias of the builtin `float` and has been removed
# from NumPy, so the builtin is used directly; the comparison is unchanged.
categorical_features_indices = np.where(train_data.dtypes != float)[0]

# CatBoost settings: F1 as the evaluation metric, and keep the iteration that
# scores best on the evaluation set (use_best_model).
params = {
    'iterations': 1000,
    'learning_rate': 0.16129990013229004,
    'eval_metric': 'F1',
    'random_seed': 42,
    'logging_level': 'Silent',
    'l2_leaf_reg': 1.0,
    'depth' : 5,
    'random_strength' : 1,
    'use_best_model': True
}
#train_pool = Pool(train_data, train_label, cat_features=categorical_features_indices)
# NOTE(review): the pool is built from `data` / `data_label`, while the
# categorical indices come from `train_data` - confirm the columns line up.
train_pool = Pool(data, data_label, cat_features=categorical_features_indices)
validate_pool = Pool(test_data, test_label, cat_features=categorical_features_indices)

model = CatBoostClassifier(**params)

# logging_level is passed again here ('Verbose') in addition to the 'Silent'
# value in params.
model.fit(train_pool,eval_set=validate_pool,
    logging_level='Verbose',
    plot=False
)


##cv_params = model.get_params()
##cv_params.update({
##    'loss_function': 'Logloss'
##})
##cv_data = cv(
예제 #15
0
def test_real_numbers_cat_features():
    """Declaring real-valued columns as categorical must raise CatboostError."""
    features = np.random.rand(100, 10)
    targets = np.random.randint(2, size=100)
    with pytest.raises(CatboostError):
        Pool(data=features, label=targets, cat_features=[1, 2])
예제 #16
0
def test_predict_sklearn_regress():
    """Train a two-iteration regressor and compare the saved model to the canon."""
    regressor = CatBoostRegressor(iterations=2, random_seed=0)
    regressor.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    regressor.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
예제 #17
0
def test_load_file():
    """A pool loaded from the train file must pass the shape check."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_shape(file_pool)
예제 #18
0
def test_invalid_loss_base():
    """An unknown ``loss_function`` must make CatBoost raise CatboostError."""
    # Load the data outside the ``raises`` block so that a failure to read the
    # pool cannot be mistaken for the expected loss-validation error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        # Construction and fit both stay inside: either may reject the loss.
        model = CatBoost({"loss_function": "abcdef"})
        model.fit(pool)
예제 #19
0
def test_load_ndarray():
    """A pool rebuilt from ndarray features and labels must pass the shape check."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_idx = file_pool.get_cat_feature_indices()

    # Convert the file-backed pool's contents into plain ndarrays.
    raw_features = file_pool.get_features()
    features = np.array(map_cat_features(raw_features, cat_idx))
    labels = np.array(file_pool.get_label())

    array_pool = Pool(features, labels, cat_idx)
    assert _check_shape(array_pool)
예제 #20
0
def test_invalid_loss_classifier():
    """An unknown ``loss_function`` must make the classifier raise CatboostError."""
    # Load the data outside the ``raises`` block so that a failure to read the
    # pool cannot be mistaken for the expected loss-validation error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        # Construction and fit both stay inside: either may reject the loss.
        model = CatBoostClassifier(loss_function="abcdef")
        model.fit(pool)
예제 #21
0
def train_features(models_dict, pools_dict, features, x_train, x_test, y_train,
                   y_test):
    """
    Train one CatBoost regressor per target feature and report test MAPE.

    For every name in ``features`` a regressor is fitted on ``x_train``
    against the matching target column, stored in ``models_dict``, and the
    corresponding test pool is stored in ``pools_dict``. Both dictionaries
    are updated in place and also returned.
    """
    # Per-iteration metric curves, one column per target feature.
    mape_train = pd.DataFrame()
    rmse_train = pd.DataFrame()
    mape_test = pd.DataFrame()
    rmse_test = pd.DataFrame()
    cat_indices = []

    for feature in features:

        # Label the target columns with the feature names (repeated on every
        # pass; after the first pass the inputs are already DataFrames).
        y_train = pd.DataFrame(data=y_train, columns=features)
        y_test = pd.DataFrame(data=y_test, columns=features)

        train_pool = Pool(data=x_train,
                          label=y_train[feature],
                          cat_features=cat_indices)

        # Default hyper-parameters; 'ssim' targets switch the loss to MAE and
        # use a larger learning rate.
        hyperparams = {
            'depth': 1,
            'num_trees': 500,
            'l2_leaf_reg': 0.2,
            'learning_rate': 0.005,
            'loss_function': 'MAPE',
        }
        if 'ssim' in feature:
            hyperparams['loss_function'] = 'MAE'
            hyperparams['learning_rate'] = 0.05

        regressor = CatBoostRegressor(**hyperparams)
        models_dict[feature] = regressor
        #train the model
        print('Training QoE model:', feature)
        regressor.fit(train_pool)

        test_pool = Pool(data=x_test,
                         label=y_test[feature],
                         cat_features=cat_indices)
        pools_dict[feature] = test_pool

        mape_train[feature] = regressor.eval_metrics(train_pool,
                                                     ['MAPE'])['MAPE']
        mape_test[feature] = regressor.eval_metrics(test_pool,
                                                    ['MAPE'])['MAPE']

        rmse_train[feature] = regressor.eval_metrics(train_pool,
                                                     ['RMSE'])['RMSE']
        rmse_test[feature] = regressor.eval_metrics(test_pool,
                                                    ['RMSE'])['RMSE']

    # Only the test-set MAPE is reported; the other curves are computed but
    # not displayed.
    best_test_mape = mape_test.min()
    st.write('QoE model test set MAPE:')
    st.write(best_test_mape)
    st.write(mape_test.min().describe())

    return models_dict, pools_dict
예제 #22
0
def test_invalid_loss_regressor():
    """An unknown ``loss_function`` must make CatBoostRegressor fail."""
    # Load the data outside the ``raises`` block: a data-loading failure must
    # surface as a test error instead of satisfying the expectation.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        # Construction stays inside because the bad loss name may be rejected
        # either by the constructor or by fit().
        model = CatBoostRegressor(loss_function="fee")
        model.fit(pool)
예제 #23
0
def Model_cv(MODEL, k, X_train, X_test, y, RE, makepred=True, CatPos=None):
    """Run stratified k-fold CV for MODEL and collect out-of-fold predictions.

    Args:
        MODEL: estimator exposing ``fit`` and ``predict_proba``; it is refit
            on every fold.
        k: number of folds.
        X_train: training features. Fold rows are selected with ``.loc`` using
            the positional indices from the splitter, so a default 0..n-1
            index is assumed.
        X_test: features scored by every fold model when ``makepred`` is true.
        y: target aligned with ``X_train``; used for stratification, fitting
            and AUC scoring.
        RE: random seed given to ``StratifiedKFold``.
        makepred: when true, also predict ``X_test`` with each fold model and
            average the k predictions.
        CatPos: categorical feature positions; when truthy the training fold
            is wrapped in a ``Pool`` with these positions before fitting.

    Returns:
        ``(Level_1_train, test_frame, score_total)`` when ``makepred`` is
        true, where ``Level_1_train`` holds the out-of-fold predictions in
        column ``train_yhat`` and ``test_frame`` holds the fold-averaged test
        prediction in column ``test_yhat``; otherwise only ``score_total``
        (the AUC of the concatenated out-of-fold predictions).
    """
    # Create the k folds
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=RE)

    # First-level train frame (out-of-fold predictions) and test frame
    Level_1_train = pd.DataFrame(np.zeros((X_train.shape[0], 1)),
                                 columns=['train_yhat'])
    if makepred:
        Level_1_test = pd.DataFrame()

    # Main loop, one pass per fold. Stratify on the `y` argument: the
    # previous code referenced an undefined name `Y_train` here.
    for count, (train_index, test_index) in enumerate(kf.split(X_train, y),
                                                      start=1):
        train_rows = train_index.tolist()
        test_rows = test_index.tolist()

        # Define train and test depending on the current fold
        fold_train = X_train.loc[train_rows, :]
        fold_test = X_train.loc[test_rows, :]
        fold_ytrain = y[train_rows]
        fold_ytest = y[test_rows]

        # (k-1)-folds model fitting
        if CatPos:
            # Use the `CatPos` argument: the previous code referenced an
            # undefined name `Pos` here.
            pool_train = Pool(fold_train, fold_ytrain, cat_features=CatPos)
            MODEL.fit(X=pool_train)
        else:
            MODEL.fit(fold_train, fold_ytrain)

        # Predict on the held-out fold to evaluate the metric, and on the
        # training fold to report the train AUC next to it
        p_fold = MODEL.predict_proba(fold_test)[:, 1]
        p_fold_train = MODEL.predict_proba(fold_train)[:, 1]

        # Score on the held-out fold and on the training fold
        score = roc_auc_score(fold_ytest, p_fold)
        score_train = roc_auc_score(fold_ytrain, p_fold_train)
        print(k, '-cv, Fold ', count, '\t --test AUC: ', round(score,4), '\t--train AUC: ', round(score_train,4),sep='')

        # Store the held-out predictions at their original row positions
        Level_1_train.loc[test_rows, 'train_yhat'] = p_fold

        # Predict the real test set with this fold's model; the column is
        # named p_<fold number> and averaged after the loop
        if makepred:
            name = 'p_' + str(count)
            real_pred = MODEL.predict_proba(X_test)[:, 1]
            real_pred = pd.DataFrame({name: real_pred}, columns=[name])
            Level_1_test = pd.concat((Level_1_test, real_pred), axis=1)

    # Metric of the concatenated out-of-fold predictions on train
    score_total = roc_auc_score(y, Level_1_train['train_yhat'])
    print('\n',k, '- cv, TOTAL AUC:', round((score_total)*100,4),'%')

    if not makepred:
        return score_total

    # Mean of the k predictions on test
    Level_1_test['model'] = Level_1_test.mean(axis=1)
    return (Level_1_train,
            pd.DataFrame({'test_yhat': Level_1_test['model']}),
            score_total)
예제 #24
0
def test_no_eval_set():
    """fit() with ``use_best_model=True`` and no eval set must fail."""
    # Setup lives outside the ``raises`` block so that only the fit() call is
    # allowed to produce the expected error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    with pytest.raises(CatboostError):
        model.fit(pool, use_best_model=True)
    # NOTE(review): these statements look unrelated to the test directly above
    # them (probably a separate snippet whose header was lost). `train_data`,
    # `label`, `cat_features` and `model` are defined outside the visible lines.
    # `details`, `answers`, `mean_f1` and `cnt` are initialised here but not
    # used in the visible part of the snippet.
    details = []
    answers = []
    mean_f1 = 0
    n_splits = 5
    # Shuffled, stratified 5-fold split with a fixed seed.
    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
    cnt = 0
    for train, test in sk.split(train_data, label):
        # Positional row selection for the current fold.
        x_train = train_data.iloc[train]
        y_train = label.iloc[train]
        x_test = train_data.iloc[test]
        y_test = label.iloc[test]



        # Training pool for this fold.
        train_dataset = Pool(data=x_train,
                            label=y_train,
                            cat_features=cat_features)

        # Held-out fold, passed to fit() as the evaluation set.
        eval_dataset = Pool(data=x_test,
                            label=y_test,
                            cat_features=cat_features)


        # The same `model` object is refit on every fold.
        model.fit(train_dataset,
                use_best_model=True,
                eval_set=eval_dataset)


        # Feature-importance table for this fold; rebuilt from scratch on each
        # pass, so only the last fold's table remains after the loop.
        importance_df = pd.DataFrame()
        importance_df["feature"] = x_train.columns.tolist()      
        importance_df["importance"] = model.feature_importances_
예제 #26
0
def test_fit_no_label():
    """fit() on bare features without a label must fail."""
    # Setup lives outside the ``raises`` block so that only the fit() call is
    # allowed to produce the expected error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    features = pool.get_features()
    with pytest.raises(CatboostError):
        model.fit(features)
예제 #27
0
def calc_attributes(name, city):
    """Return the top-ranked context labels for spotting ``name`` in ``city``.

    Reads ``data/final.csv``, keeps the rows of the given city, encodes them
    with a ``DataFrameMapper`` and fits a CatBoost classifier on the binary
    target "the row's ``name`` equals ``name``". The encoded columns are then
    ranked by the model's feature importances, and for each of the groups
    location (``level_one``), weather, day, density and time, the labels of up
    to two best-ranked columns are collected.

    Args:
        name: value compared against the ``name`` column to build the target.
        city: value compared against the ``city`` column to filter the rows.

    Returns:
        DataFrame with columns ``loc``, ``weather``, ``day``, ``density`` and
        ``time``. The group lists are zipped together, so the number of rows
        equals the length of the shortest group.
    """
    data = pd.read_csv('data/final.csv')
    # Copy the filtered rows so the column assignments below act on an
    # independent frame instead of a slice of `data`.
    pokemon = data[data['city'] == city].copy()
    pokemon['temp_new'] = pokemon['temperature'].apply(temperature_changer)
    weather_classes = [
        'Foggy', 'Clear', 'PartlyCloudy', 'MostlyCloudy', 'Overcast', 'Rain',
        'BreezyandOvercast', 'LightRain', 'Drizzle', 'BreezyandPartlyCloudy',
        'HeavyRain', 'BreezyandMostlyCloudy', 'Breezy', 'Windy',
        'WindyandFoggy', 'Humid', 'Dry', 'WindyandPartlyCloudy',
        'DangerouslyWindy', 'DryandMostlyCloudy', 'DryandPartlyCloudy',
        'DrizzleandBreezy', 'LightRainandBreezy', 'HumidandPartlyCloudy',
        'HumidandOvercast', 'RainandWindy'
    ]
    day_of_week = [
        "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday",
        "Sunday"
    ]
    time_of_day = ["Morning", "Afternoon", "Evening", "Night"]
    mapper = DataFrameMapper([
        ('close_to_water', LabelEncoder()),
        ('weather', MultiLabelBinarizer(classes=weather_classes)),
        ('temp_new', MultiLabelBinarizer(classes=['Cool', 'Mild', 'Hot'])),
        ('day', MultiLabelBinarizer(classes=day_of_week)),
        ('time', MultiLabelBinarizer(classes=time_of_day)),
        # NOTE(review): this fills missing values with the literal string
        # 'most_frequent'; strategy='most_frequent' may have been intended --
        # confirm before changing.
        (['level_one'], [
            SimpleImputer(strategy='constant', fill_value='most_frequent'),
            LabelBinarizer()
        ]),
        ('population_density',
         MultiLabelBinarizer(classes=['Low', 'Medium', 'High'])),
    ],
                             df_out=True)

    # Binary target: 1 where the row's name matches the requested one.
    target = 'target'
    pokemon[target] = pokemon['name'].apply(lambda x: 1 if name == x else 0)
    y = pokemon[target]
    X = mapper.fit_transform(pokemon.drop(target, axis=1))

    model = cb.CatBoostClassifier(iterations=150,
                                  logging_level='Silent',
                                  custom_loss=['AUC'],
                                  depth=None,
                                  l2_leaf_reg=7)
    model.fit(X, y, plot=False, verbose=False)

    # Encoded columns ordered from most to least important.
    ranking = pd.DataFrame(zip(X.columns, model.feature_importances_),
                           columns=['Feature',
                                    'Score']).sort_values(by='Score',
                                                          ascending=False)

    loc = []
    weather = []
    day = []
    density = []
    time = []
    # Checked in this order; a column goes to the first group whose keyword it
    # contains and which still holds fewer than two labels.
    groups = [
        ("level_one", loc),
        ("weather", weather),
        ("time", time),
        ("day", day),
        ("density", density),
    ]
    for column in ranking.Feature.values:
        for keyword, bucket in groups:
            if keyword in column and len(bucket) <= 1:
                # The label is the last underscore-separated token.
                bucket.append(column.split('_')[-1])
                break

    return pd.DataFrame(zip(loc, weather, day, density, time),
                        columns=[
                            'loc',
                            'weather',
                            'day',
                            'density',
                            'time',
                        ])
예제 #28
0
def test_predict_without_fit():
    """predict() on a model that was never fitted must fail."""
    # Setup lives outside the ``raises`` block so that only the predict() call
    # is allowed to produce the expected error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    with pytest.raises(CatboostError):
        model.predict(pool)
예제 #29
0
        # 'nthread':12
    }
    # NOTE(review): interior of a function whose header is not visible here;
    # `params`, `xgb_train`, `xgb_eval`, `xgb_test`, `train_feat`, `test_feat`,
    # `predictors`, `train_index`, `test_index` and the two `*_model_pred`
    # frames are defined outside these lines.
    params['silent'] = 1
    # Datasets evaluated during XGBoost training.
    watchlist = [(xgb_train, 'train'), (xgb_eval, 'eval')]
    # Up to 5000 boosting rounds with early stopping after 40 rounds.
    xgb_model = xgb.train(params,
                          xgb_train,
                          5000,
                          watchlist,
                          early_stopping_rounds=40,
                          verbose_eval=40)
    # Add this model's predictions to the held-out rows of the training frame
    # and to the running test prediction.
    train_model_pred['xgb_pred'].iloc[test_index] += xgb_model.predict(
        xgb_eval)
    test_model_pred['xgb_pred'] += xgb_model.predict(xgb_test)

    print('开始cb训练...')
    # CatBoost pools for the same split; the label column is 'loan_sum'.
    train_pool = Pool(train_feat[predictors].iloc[train_index],
                      train_feat['loan_sum'].iloc[train_index])
    eval_pool = Pool(train_feat[predictors].iloc[test_index],
                     train_feat['loan_sum'].iloc[test_index])
    test_pool = Pool(test_feat[predictors])
    cb_model = cb.CatBoostRegressor(iterations=400,
                                    depth=7,
                                    learning_rate=0.06,
                                    eval_metric='RMSE',
                                    od_type='Iter',
                                    od_wait=20,
                                    random_seed=42,
                                    thread_count=7,
                                    bagging_temperature=0.85,
                                    rsm=0.85,
                                    verbose=False)
    # NOTE(review): `eval_pool` is built above but not passed to fit() in the
    # visible lines, although od_type/od_wait are configured -- confirm
    # whether an eval_set was intended here.
    cb_model.fit(train_pool)
예제 #30
0
# DIVIDE TRAINDATA AND PREDDATA

# DX: rows dated before 2015; DY: rows dated from 2015 up to (not including)
# 2016. Both are copies of the selected rows of `D`.
DX = D[D['DateTime'] < pd.to_datetime('2015')].copy()
DY = D[D['DateTime'] >= pd.to_datetime('2015')].copy()
DY = DY[DY['DateTime'] < pd.to_datetime('2016')].copy()

# HOURLY CONCENTRATION ESTIMATION ##############################################

# Fit the regressor only on DX rows that have an observed concentration.
RDX = DX[DX['ConcentrationObs'].notnull()].copy()

m = CatBoostRegressor(learning_rate=LR_R,
                      iterations=NT_R,
                      logging_level='Silent')
m.fit(RDX[FEAT], y=RDX['ConcentrationObs'], cat_features=CATS)

# Store the regressor's prediction for every row of both frames.
DX['ConcentrationPred'] = m.predict(Pool(DX[FEAT], cat_features=CATS))
DY['ConcentrationPred'] = m.predict(Pool(DY[FEAT], cat_features=CATS))

# Extend the feature list in place so the classifier below also sees the
# predicted concentration. This must stay after the predictions above.
FEAT += ['ConcentrationPred']

# HOURLY HIGH-CONCENTRATION PROBABILITY ESTIMATION #############################

# `m` is rebound: from here on it is the classifier, not the regressor.
m = CatBoostClassifier(learning_rate=LR_C,
                       iterations=NT_C,
                       logging_level='Silent')
m.fit(DX[FEAT], y=DX['HourTarget'], cat_features=CATS)

# DAILY HIGH-CONCENTRATION PROBABILITY ESTIMATION ##############################

# 1 minus the product of (1 - v) over the five largest values of x.
# NOTE(review): presumably combines hourly probabilities into a daily
# "at least one event" probability -- confirm against the code that uses it.
fday = lambda x: 1 - np.prod(1 - x.nlargest(5))