def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):

        best = CatBoostClassifier(loss_function='MultiClassOneVsAll',
                                  learning_rate=0.07940735491731761,
                                  depth=8)
        best.fit(kfold_X_train, y_train)

        # Predict on the validation set
        pred = best.predict_proba(kfold_X_valid)
        results = best.predict_proba(test)

        return pred, results, best
def cleaning_comments(raw_comments, path='.') -> str:
    print('start cleaning of comments...')

    raw = pd.read_csv(raw_comments)
    cleaned_comments = os.path.join(path, 'cleaned_comments.csv')
    bad_comments = os.path.join(path, 'bad_comments.csv')
    model = CatBoostClassifier().load_model(os.path.join(path, 'trash_model'))
    vectorizer = joblib.load(os.path.join(path, 'trash_vectorizer'))

    hyp = model.predict_proba(vectorizer.transform(raw.text).toarray())
    with open(cleaned_comments, 'w') as cleaned, open(bad_comments, 'w') as bad:
        bad_file = 'likes,status,text\n'
        cleaned_file = 'likes,status,text\n'
        for i in range(len(hyp)):
            if hyp[i][0] < 0.6:
                bad_file += str(raw.likes[i]) + ',1,"' + raw.text[i] + '"\n'
            else:
                cleaned_file += str(raw.likes[i]) + ',0,"' + raw.text[i] + '"\n'
        cleaned.write(cleaned_file)
        bad.write(bad_file)

    os.remove(raw_comments)

    print('end cleaning of comments...')
    return cleaned_comments
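
# The string-concatenation CSV writing above breaks if a comment contains
# quotes or newlines; a sketch of a more robust variant using the csv module
# (same columns; hypothetical, not the source's implementation):
# import csv
# with open(cleaned_comments, 'w', newline='') as cleaned, \
#         open(bad_comments, 'w', newline='') as bad:
#     cw, bw = csv.writer(cleaned), csv.writer(bad)
#     cw.writerow(['likes', 'status', 'text'])
#     bw.writerow(['likes', 'status', 'text'])
#     for i in range(len(hyp)):
#         if hyp[i][0] < 0.6:
#             bw.writerow([raw.likes[i], 1, raw.text[i]])
#         else:
#             cw.writerow([raw.likes[i], 0, raw.text[i]])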
Example #3
def test_ntree_limit():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=100, random_seed=0)
    model.fit(train_pool)
    pred = model.predict_proba(test_pool, ntree_end=10)
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
Example #4
def test_multiclass():
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    classifier = CatBoostClassifier(iterations=2, random_seed=0, loss_function='MultiClass', thread_count=8)
    classifier.fit(pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    new_classifier = CatBoostClassifier()
    new_classifier.load_model(OUTPUT_MODEL_PATH)
    pred = new_classifier.predict_proba(pool)
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
Example #5
def model_1(X, y, test):
    '''
    A CatBoost model for which we do not need to encode categorical
    variables: CatBoost handles them automatically via cat_features.
    '''
    categorical_features_indices = np.where(X.dtypes != float)[0]
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=1234)
    # Build the model
    cboost = CatBoostClassifier(iterations=500, learning_rate=0.01, depth=6, loss_function='MultiClass', eval_metric='Accuracy')
    cboost.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_validation, y_validation), plot=True)
    # Class-wise prediction probabilities of the cboost model
    pred_prob = cboost.predict_proba(test)
    return pred_prob
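
# A hypothetical call to model_1 (train_df/test_df are placeholder frames with
# a 'target' column; not from the source):
# probs = model_1(train_df.drop('target', axis=1), train_df['target'], test_df)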
class BesCatBoost:
    """Example parameter set:

    catboost_params = {
        'iterations': 500,
        'depth': 3,
        'learning_rate': 0.1,
        'eval_metric': 'AUC',
        'random_seed': 42,
        'logging_level': 'Verbose',
        'l2_leaf_reg': 15.0,
        'bagging_temperature': 0.75,
        'allow_writing_files': False,
        'metric_period': 50
    }
    """

    def __init__(self, params, metric='AUC', maximize=True, verbose=True, model=None):
        self.params = params
        self.metric = metric
        self.maximize = maximize
        self.verbose = verbose
        self.model = model

    def fit(self, X_train, y_train):

        bst = cv(
            Pool(X_train, y_train),
            self.params
        )

        # Take the CV iteration with the best mean test metric (idxmax assumes
        # the metric is to be maximized), scaled up 1.5x as a heuristic since
        # the final model is refit on the full training set
        best_rounds = int(bst['test-{}-mean'.format(self.metric)].idxmax() * 1.5) + 1
        print('Best Iteration: {}'.format(best_rounds))

        self.params['iterations'] = best_rounds
        self.model = CatBoostClassifier(**self.params)

        self.model.fit(
            X_train, y_train
        )

    def predict(self, X_test):
        pred_prob = self.model.predict_proba(X_test)[:, -1]
        return pred_prob

    def feature_importance(self):
        pass

    @staticmethod
    def find_best_params(kag):
        pass
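
# A minimal usage sketch of BesCatBoost on synthetic data (assumes the
# snippet's own imports: cv, Pool, CatBoostClassifier from catboost). The
# parameters mirror the docstring above; 'loss_function' is added here
# because cv() requires it:
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, random_state=42)
demo_params = {'iterations': 100, 'depth': 3, 'learning_rate': 0.1,
               'loss_function': 'Logloss', 'eval_metric': 'AUC',
               'random_seed': 42, 'allow_writing_files': False,
               'metric_period': 50}
bes = BesCatBoost(demo_params, metric='AUC')
bes.fit(X_demo, y_demo)       # cross-validates, then refits at the chosen round count
scores = bes.predict(X_demo)  # positive-class probabilities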
Example #7
#print(log)
#pred2 = model2_val.predict(val_df_x)
#acc2 = accuracy_score(val_df_y, pred2)
#print('Accuracy: ', acc2)

model2 = CatBoostClassifier(depth=8,
                            iterations=1000,
                            learning_rate=0.02,
                            eval_metric='MultiClass',
                            loss_function='MultiClass',
                            bootstrap_type='Bernoulli',
                            leaf_estimation_method='Gradient',
                            random_state=123)

model2.fit(train_x, train_y, verbose=100)
pred2 = model2.predict_proba(test_x)

# Submission
sub2 = sample_df.copy()

sub2.iloc[:, 1:] = pred2  # predict_proba already returns an ndarray
sub2.to_csv("submission2_T06.csv",index=False)

def generate(main, support, coeff):
    g = main.copy()
    for i in main.columns[1:]:
        res = []
        lm, ls = [], []
        lm = main[i].tolist()
        ls = support[i].tolist()
Example #8
def make_cat_oof_prediction(train,
                            y,
                            test,
                            features,
                            categorical_features=None,
                            model_params=None,
                            folds=10,
                            is_optuna=False):
    x_train = train[features]
    x_test = test[features]

    # Array to hold the (fold-averaged) test-set predictions
    test_preds = np.zeros(x_test.shape[0])

    # Array to hold the out-of-fold validation predictions
    y_oof = np.zeros(x_train.shape[0])

    # Accumulator for the mean validation score across folds
    score = 0

    # DataFrame to hold per-fold feature importances
    fi = pd.DataFrame()
    fi['feature'] = features

    # Set up the stratified K-fold splitter
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y)):
        # Split the training data into train/validation folds by index
        x_tr, x_val = x_train.loc[tr_idx, features], x_train.loc[val_idx, features]
        y_tr, y_val = y[tr_idx], y[val_idx]

        print(
            f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}'
        )

        # Train the CatBoost model
        clf = CatBoostClassifier(**model_params)
        clf.fit(
            x_tr,
            y_tr,
            eval_set=(x_val, y_val),  # track validation performance during training
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True)

        # Predict on the validation fold
        val_preds = clf.predict_proba(x_val)[:, 1]

        # Store predictions at the validation indices
        y_oof[val_idx] = val_preds

        # Print the per-fold validation score
        print(f"Fold {fold + 1} | AUC: {roc_auc_score(y_val, val_preds)}")
        print('-' * 80)

        # Accumulate the per-fold validation score into the running mean
        score += roc_auc_score(y_val, val_preds) / folds

        # Predict on the test data and average over the folds
        test_preds += clf.predict_proba(x_test)[:, 1] / folds

        # Store this fold's feature importances
        fi[f'fold_{fold+1}'] = clf.feature_importances_

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    print(f"\nMean AUC = {score}")  # 폴드별 평균 Validation 스코어 출력
    print(f"OOF AUC = {roc_auc_score(y, y_oof)}"
          )  # Out Of Fold Validation 스코어 출력

    # Log to Weights & Biases
    if not is_optuna:
        wandb.log({'[cat] prediction Mean AUC': score})
        wandb.log({'[cat] prediction OOF AUC': roc_auc_score(y, y_oof)})

    # Average the per-fold feature importances
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)

    return y_oof, test_preds, fi
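
# A minimal sketch calling make_cat_oof_prediction on synthetic data. It
# assumes the snippet's own imports (numpy/pandas, CatBoostClassifier,
# StratifiedKFold, roc_auc_score, gc) and defines the module-level SEED the
# function reads; is_optuna=True skips the wandb logging:
SEED = 42
rng = np.random.RandomState(SEED)
toy_train = pd.DataFrame(rng.rand(200, 3), columns=['f1', 'f2', 'f3'])
toy_test = pd.DataFrame(rng.rand(50, 3), columns=['f1', 'f2', 'f3'])
toy_y = rng.randint(0, 2, 200)

y_oof, test_preds, fi = make_cat_oof_prediction(
    toy_train, toy_y, toy_test,
    features=['f1', 'f2', 'f3'],
    model_params={'iterations': 50, 'random_seed': SEED},
    folds=3,
    is_optuna=True)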
cb_model = CatBoostClassifier(iterations=1000,
                              learning_rate=0.1,
                              depth=7,
                              l2_leaf_reg=40,
                              bootstrap_type='Bernoulli',
                              subsample=0.7,
                              scale_pos_weight=5,
                              eval_metric='AUC',
                              metric_period=50,
                              od_type='Iter',
                              od_wait=45,
                              random_seed=17,
                              allow_writing_files=False)

cb_model.fit(X_train,
             y_train,
             eval_set=(X_valid, y_valid),
             cat_features=cat_features_inds,
             use_best_model=True,
             verbose=True)

fea_imp = pd.DataFrame({'imp': cb_model.feature_importances_, 'col': cols})
fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False]).iloc[-30:]
_ = fea_imp.plot(kind='barh', x='col', y='imp', figsize=(20, 10))
plt.savefig('catboost_feature_importance.png')

print('AUC:', roc_auc_score(y_valid, cb_model.predict_proba(X_valid)[:, 1]))
y_preds = cb_model.predict_proba(data_test)[:, 1]
subm['TARGET'] = y_preds
subm.to_csv('submission.csv', index=False)
    lr1_pred_p = lr1.predict_proba(X_test)
    y_pred = lr1.predict(X_test)

    ### CatBoost ###

    cat = CatBoostClassifier(iterations=2,
                             depth=2,
                             learning_rate=1,
                             loss_function='Logloss',
                             verbose=True)

    cat.fit(X_train, y_train)

    preds_class = cat.predict(X_test)
    preds_proba = cat.predict_proba(X_test)

    a_scores = {}
    f_scores = {}
    models = {'lr1': lr1, 'lr2': lr2, 'rf': rf, 'cat': cat}

    for model_str, model in models.items():
        a_scores[model_str] = model.score(X_test, y_test)
        if 'l' in model_str:  # logistic-regression models expose coef_
            f_scores[model_str] = model.coef_
        else:  # tree-based models expose feature_importances_
            f_scores[model_str] = model.feature_importances_

    df_f_scores = pd.DataFrame([arr.flatten() for arr in f_scores.values()],
                               index=f_scores.keys(),
                               columns=X_cols)
Example #11
# (Leading arguments truncated in the source; the arguments below end a
# CatBoostClassifier(...) call assigned to cat_model, which is fit below)
cat_model = CatBoostClassifier(
    l2_leaf_reg=3,
    depth=4,
    loss_function="Logloss",
    verbose=False,
    random_state=random_state,
)

# %%

xgb_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict_proba(X_validation)
rf_pred = rf_model.predict_proba(X_validation)
cat_pred = cat_model.predict_proba(X_validation)

# %% ROC

fig, ax = plt.subplots()
skplt.metrics.plot_roc(
    y_validation,
    xgb_pred,
    plot_micro=False,
    plot_macro=False,
    classes_to_plot=[1],
    ax=ax,
    cmap="Blues",
)
skplt.metrics.plot_roc(
    y_validation,
Example #12
def main():
    # load data
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')
    df_test = pd.read_csv('data/test_data.csv')

    feature_cols = [f for f in list(df_train) if "feature" in f]
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values

    X_valid = df_valid[feature_cols].values
    y_valid = df_valid[target_col].values

    X_test = df_test[feature_cols].values

    tsne_3d_50p = np.load('data/tsne_3d_50p.npz')

    tsne_3d_50p_train = tsne_3d_50p['train']
    tsne_3d_50p_valid = tsne_3d_50p['valid']
    tsne_3d_50p_test = tsne_3d_50p['test']

    X_train_concat = np.concatenate((X_train, tsne_3d_50p_train), axis=1)
    X_valid_concat = np.concatenate((X_valid, tsne_3d_50p_valid), axis=1)
    X_test_concat = np.concatenate((X_test, tsne_3d_50p_test), axis=1)

    parameter_grid = [
        {
            'learning_rate': [0.005, 0.008, 0.01, 0.03, 0.05],
            'iterations': [500, 1000, 1500, 2000],
            'depth': [6, 7, 8, 9, 10]
        },
        {
            'learning_rate': [0.01, 0.02, 0.03],
            'border_count': range(32, 40),
            'l2_leaf_reg': range(3, 5)
        },
    ]

    X_search = np.concatenate([
        np.concatenate([X_train, tsne_3d_50p_train], axis=1),
        np.concatenate([X_valid, tsne_3d_50p_valid], axis=1),
    ],
                              axis=0)
    y_search = np.concatenate([y_train, y_valid], axis=0)

    classifier = CatBoostClassifier(thread_count=1)  # CatBoost uses thread_count, not sklearn's n_jobs

    search = GridSearchCV(classifier, parameter_grid, verbose=1)

    print('Tuning hyperparameters...')
    search.fit(X_search, y_search)

    print('Found best parameters:')
    print(search.best_score_)
    print(search.best_params_)

    # NOTE: this assumes the winning parameters come from the first grid,
    # which defines iterations and depth (the second grid does not)
    classifier = CatBoostClassifier(
        learning_rate=search.best_params_['learning_rate'],
        iterations=search.best_params_['iterations'],
        depth=search.best_params_['depth'])

    print('Fitting...')
    start_time = time.time()
    classifier.fit(X_train_concat, y_train)
    print('Fit: {}s'.format(time.time() - start_time))

    p_valid = classifier.predict_proba(X_valid_concat)
    loss = log_loss(y_valid, p_valid)
    print('Loss: {}'.format(loss))

    p_test = classifier.predict_proba(X_test_concat)
    df_pred = pd.DataFrame({'id': df_test['id'], 'probability': p_test[:, 1]})
    csv_path = 'predictions/predictions_{}_{}.csv'.format(
        int(time.time()), loss)
    df_pred.to_csv(csv_path, columns=('id', 'probability'), index=None)
    print('Saved: {}'.format(csv_path))
class CXDetector:  # "sklearn estimator"
    def __init__(self, sample_freq, low_freq, high_freq, window_size,
                 window_shift, shap_window_size, shap_window_shift,
                 read_signal_fn, read_clin_fn, proc_clin_fn):

        self.sample_freq = sample_freq
        self.low_freq = low_freq
        self.high_freq = high_freq

        self.window_size = window_size
        self.window_shift = window_shift
        self.shap_window_size = shap_window_size
        self.shap_window_shift = shap_window_shift

        self.read_signal_fn = read_signal_fn
        self.read_clin_fn = read_clin_fn
        self.proc_clin_fn = proc_clin_fn

        # Features to apply on the shorter normalized windows
        self.short_features = [
            GENDISFeatures,
        ]

        # Features using 1 channel to apply on the normal windows
        self.features = [
            # Spectral features
            BOSSFeatures,
            BasicFrequencyFeatures,

            # Temporal features
            BasicFeatures,
            RMSFeatures,
            #TSFRESHFeatures,
        ]

        # Features using multiple channels
        self.multi_channel_features = [
            # Correlations
            CorrFeatures()
        ]

    def prep_data(self, files, train=True):
        # Read in our data
        signals = []
        intervals = []
        for file in files:
            *_signals, _intervals = self.read_signal_fn(
                file, self.low_freq, self.high_freq, self.sample_freq)
            signals.append(_signals)
            intervals.append(_intervals)

        # Extract windows from the data
        window_data = WindowData()
        shap_window_data = WindowData()
        # NOTE: the loop variables deliberately shadow the signals/intervals
        # lists above; zip() has already captured references to those lists,
        # so the iteration itself is unaffected
        for file, signals, intervals in zip(files, signals, intervals):
            if train:
                for ann1, ann2 in zip(intervals[::2], intervals[1::2]):

                    if (ann1[1][-1] not in ['C', 'D']
                            or ann2[0] >= len(signals[0]) or ann1[0] < 0):
                        continue

                    label = int(ann1[1][-1] == 'C')
                    _windows, idx = extract_windows(signals, ann1[0], ann2[0],
                                                    self.window_size,
                                                    self.window_shift)

                    window_data.windows.extend(_windows)
                    window_data.indices.extend(idx)
                    window_data.files.extend([file] * len(_windows))
                    window_data.labels.extend([label] * len(_windows))

                    shap_windows, shap_idx = extract_windows(
                        signals, ann1[0], ann2[0], self.shap_window_size,
                        self.shap_window_shift)

                    shap_window_data.windows.extend(shap_windows)
                    shap_window_data.indices.extend(shap_idx)
                    shap_window_data.files.extend([file] * len(shap_windows))
                    shap_window_data.labels.extend([label] * len(shap_windows))
            else:
                _windows, idx = extract_windows(signals, 0, len(signals[0]),
                                                self.window_size,
                                                self.window_shift)

                window_data.windows.extend(_windows)
                window_data.indices.extend(idx)
                window_data.files.extend([file] * len(_windows))
                window_data.labels.extend([None] * len(_windows))

                shap_windows, shap_idx = extract_windows(
                    signals, 0, len(signals[0]), self.shap_window_size,
                    self.shap_window_shift)

                shap_window_data.windows.extend(shap_windows)
                shap_window_data.indices.extend(shap_idx)
                shap_window_data.files.extend([file] * len(shap_windows))
                shap_window_data.labels.extend([None] * len(shap_windows))

        window_data.windows = np.array(window_data.windows)
        window_data.files = np.array(window_data.files)
        window_data.indices = np.array(window_data.indices)
        shap_window_data.windows = np.array(shap_window_data.windows)
        shap_window_data.files = np.array(shap_window_data.files)
        shap_window_data.indices = np.array(shap_window_data.indices)

        return window_data, shap_window_data

    def get_corr_features(self, X):
        """Get all coordinates in the X-matrix with correlation value equals 1
        (columns with equal values), excluding elements on the diagonal.

        Parameters:
        -----------
        - train_df: pd.DataFrame
            the feature matrix where correlated features need to be removed

        Returns
        -------
        - correlated_feature_pairs: list of tuples
            coordinates (row, col) where correlated features can be found
        """
        row_idx, col_idx = np.where(np.abs(X.corr()) > 0.99)
        self_corr = set([(i, i) for i in range(X.shape[1])])
        correlated_feature_pairs = set(list(zip(row_idx, col_idx))) - self_corr
        return correlated_feature_pairs

    def get_uncorr_features(self, data):
        """Remove clusters of these correlated features, until only one feature 
        per cluster remains.

        Parameters:
        -----------
        - data: pd.DataFrame
            the feature matrix where correlated features need to be removed

        Returns
        -------
        - data_uncorr_cols: list of string
            the column names that are completely uncorrelated to each other
        """
        X_train_corr = data.copy()
        correlated_features = self.get_corr_features(X_train_corr)

        corr_cols = set()
        for row_idx, col_idx in correlated_features:
            corr_cols.add(row_idx)
            corr_cols.add(col_idx)

        uncorr_cols = list(
            set(X_train_corr.columns) -
            set(X_train_corr.columns[list(corr_cols)]))

        col_mask = [False] * X_train_corr.shape[1]
        for col in corr_cols:
            col_mask[col] = True
        X_train_corr = X_train_corr.loc[:, col_mask]

        correlated_features = self.get_corr_features(X_train_corr)
        to_remove = set()
        for corr_row, corr_col in correlated_features:
            if corr_row in to_remove:
                continue

            for corr_row2, corr_col2 in correlated_features:
                if corr_row == corr_row2:
                    to_remove.add(corr_col2)
                elif corr_row == corr_col2:
                    to_remove.add(corr_row2)

        col_mask = [True] * X_train_corr.shape[1]
        for ix in to_remove:
            col_mask[ix] = False

        X_train_corr = X_train_corr.loc[:, col_mask]

        data_uncorr_cols = list(set(list(X_train_corr.columns) + uncorr_cols))

        return data_uncorr_cols

    def remove_features(self, data):
        """Remove all correlated features and columns with only a single value.

        Parameters:
        -----------
        - data: pd.DataFrame
            the feature matrix where correlated features need to be removed

        Returns
        -------
        - useless_cols: list of string
            list of column names that have no predictive value
        """
        single_cols = list(data.columns[data.nunique() == 1])

        uncorr_cols = self.get_uncorr_features(data)
        corr_cols = list(set(data.columns) - set(uncorr_cols))

        useless_cols = list(set(single_cols + corr_cols))

        print('Removing {} features'.format(len(useless_cols)))

        return useless_cols

    def fit(self, train_files):
        window_data, shap_window_data = self.prep_data(train_files)

        # Extract clinical variables
        clin_features = []
        for file in train_files:
            names, values = self.read_clin_fn(file)
            clin_features.append([file] + values)
        clin_df = pd.DataFrame(clin_features, columns=['file'] + names)
        clin_df = self.proc_clin_fn(clin_df)

        # Extract features for each channel separately
        features_per_channel = []
        self.feature_extractors_per_channel = {}
        for ch in range(window_data.windows.shape[1]):
            self.feature_extractors_per_channel[ch] = []
            for feature_extractor in self.features:
                self.feature_extractors_per_channel[ch].append(
                    feature_extractor())

            channel_features = []
            for f in self.feature_extractors_per_channel[ch]:
                features = f.fit_transform(window_data.windows[:, ch, :],
                                           window_data.labels)
                features = pd.DataFrame(
                    features,
                    columns=['{}_ch{}'.format(x, ch) for x in f.names_])
                channel_features.append(features)
            features_per_channel.append(pd.concat(channel_features, axis=1))

        short_features_per_channel = []
        self.short_feature_extractors_per_channel = {}
        for ch in range(window_data.windows.shape[1]):
            self.short_feature_extractors_per_channel[ch] = []
            for feature_extractor in self.short_features:
                self.short_feature_extractors_per_channel[ch].append(
                    feature_extractor())

            channel_features = []
            for f in self.short_feature_extractors_per_channel[ch]:
                f.fit(shap_window_data.windows[:, ch, :],
                      shap_window_data.labels)
                features = f.transform(window_data.windows[:, ch, :],
                                       window_data.labels)
                features = pd.DataFrame(
                    features,
                    columns=['{}_ch{}'.format(x, ch) for x in f.names_])
                channel_features.append(features)
            short_features_per_channel.append(
                pd.concat(channel_features, axis=1))

        features_multi_channel = []
        for f in self.multi_channel_features:
            features = f.fit_transform(window_data.windows, window_data.labels)
            features = pd.DataFrame(
                features, columns=['{}_ch{}'.format(x, ch) for x in f.names_])
            features_multi_channel.append(features)

        # Concatenate the features of different channels together
        train_features = pd.concat(features_per_channel +
                                   short_features_per_channel +
                                   features_multi_channel,
                                   axis=1)
        train_features['file'] = window_data.files
        train_features = train_features.merge(clin_df, on='file')

        # Create our X and y
        X_train = train_features
        y_train = np.array(window_data.labels)
        for col in ['ID', 'file']:
            if col in X_train.columns:
                X_train = X_train.drop(col, axis=1)

        X_train = X_train.astype(float)

        # useless_features = self.remove_features(X_train)
        # X_train = X_train.drop(useless_features, axis=1)

        # Now apply hypothesis testing on remaining features
        rel_table = calculate_relevance_table(X_train, pd.Series(y_train))
        self.rel_features = list(rel_table[rel_table['p_value'] <= 0.05].index)

        X_train = X_train[self.rel_features]

        # Create validation set for early stopping
        val_files = np.random.choice(train_files,
                                     size=int(0.1 * len(train_files)),
                                     replace=False)
        all_files = np.array(window_data.files)
        X_val = X_train.loc[np.isin(window_data.files, val_files), :]
        y_val = y_train[np.isin(window_data.files, val_files)]
        X_train = X_train.loc[~np.isin(window_data.files, val_files), :]
        y_train = y_train[~np.isin(window_data.files, val_files)]

        # Fit our gradient boosting classifier
        self.clf = CatBoostClassifier(
            iterations=10000,
            od_type='Iter',
            od_wait=50,
            objective='CrossEntropy',
            random_seed=2018,
            #eval_metric='AUC',
            use_best_model=True,
            task_type='CPU')

        self.clf.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

        return train_features

    def predict(self, test_files):
        # TODO: Take means of all predictions on same timepoint
        window_data, shap_window_data = self.prep_data(test_files, train=False)

        # Extract clinical variables
        clin_features = []
        for file in test_files:
            names, values = self.read_clin_fn(file)
            clin_features.append([file] + values)
        clin_df = pd.DataFrame(clin_features, columns=['file'] + names)
        clin_df = self.proc_clin_fn(clin_df)

        # Extract features for each channel separately
        features_per_channel = []
        for ch in range(window_data.windows.shape[1]):
            channel_features = []
            for f in self.feature_extractors_per_channel[ch]:
                features = f.transform(window_data.windows[:, ch, :])
                features = pd.DataFrame(
                    features,
                    columns=['{}_ch{}'.format(x, ch) for x in f.names_])
                channel_features.append(features)
            features_per_channel.append(pd.concat(channel_features, axis=1))

        short_features_per_channel = []
        for ch in range(window_data.windows.shape[1]):
            channel_features = []
            for f in self.short_feature_extractors_per_channel[ch]:
                features = f.transform(window_data.windows[:, ch, :])
                features = pd.DataFrame(
                    features,
                    columns=['{}_ch{}'.format(x, ch) for x in f.names_])
                channel_features.append(features)
            short_features_per_channel.append(
                pd.concat(channel_features, axis=1))

        features_multi_channel = []
        for f in self.multi_channel_features:
            features = f.transform(window_data.windows)
            features = pd.DataFrame(
                features, columns=['{}_ch{}'.format(x, ch) for x in f.names_])
            features_multi_channel.append(features)

        # Concatenate the features of different channels together
        test_features = pd.concat(features_per_channel +
                                  short_features_per_channel +
                                  features_multi_channel,
                                  axis=1)
        test_features['file'] = window_data.files
        test_features = test_features.merge(clin_df, on='file')

        all_preds = []
        for file in test_files:
            X_test = test_features[test_features['file'] == file]
            test_ix = window_data.indices[window_data.files == file].flatten()
            for col in ['ID', 'file']:
                if col in X_test.columns:
                    X_test = X_test.drop(col, axis=1)

            X_test = X_test[self.rel_features]
            preds = self.clf.predict_proba(X_test)[:, 1]  #.reshape(-1, 1)

            pred_df = pd.DataFrame(list(range(max(test_ix) +
                                              self.window_size)),
                                   columns=['index'])
            pred_df['file'] = file
            pred_df['pred'] = np.NaN
            pred_df = pred_df.set_index('index', drop=True)
            pred_df.loc[test_ix, 'pred'] = preds
            pred_df = pred_df.ffill().reset_index()
            all_preds.append(pred_df)

        return pd.concat(all_preds).reset_index(drop=True)
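
# A small self-contained sketch of the correlated-feature detection idea used
# by CXDetector.get_corr_features above (toy data, not from the source):
import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [2, 4, 6, 8], 'c': [4, 1, 3, 2]})
row_idx, col_idx = np.where(np.abs(toy.corr()) > 0.99)
pairs = set(zip(row_idx, col_idx)) - {(i, i) for i in range(toy.shape[1])}
print(pairs)  # {(0, 1), (1, 0)}: 'a' and 'b' are perfectly correlated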
Example #14
# temp = pd.DataFrame()
# temp = score[:60]
# color = cm.jet(temp['fea']/temp['fea'].max())
# plt.figure(figsize=(25, 17))
# plt.barh(temp['fea_name'],temp['fea'],height =0.8,color=color,alpha=0.8)
# # plt.show()
# plt.savefig('huawei/feature_weight.jpg')



# Train the model for the online submission
clf1 = CatBoostClassifier(iterations=clf.best_iteration_, depth=6, learning_rate=0.1,
                          loss_function='Logloss', eval_metric='AUC', task_type='GPU',
                          metric_period=50)
clf1.fit(
    train_df[feature], train_df['label'].astype('int32'),
    verbose=True
)
y_pre = clf1.predict_proba(test_df[feature])[:, 1]

import pickle
with open('/data/mengyuan/huawei/model/catbase.pkl', 'wb') as f:
    pickle.dump(clf1, f)
    print('saved catbase model to /data/mengyuan/huawei/model/catbase.pkl')


res = pd.DataFrame()
res['id'] = test_df['id'].astype('int32')
res['probability'] = y_pre
res.to_csv('/data/mengyuan/huawei/ensemble/submission_catbase.csv', index=False)
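
# (Sketch: reloading the pickled model later, mirroring the dump above)
# with open('/data/mengyuan/huawei/model/catbase.pkl', 'rb') as f:
#     clf1 = pickle.load(f)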

Example #15
        # (Leading arguments truncated in the source; the arguments below end
        # a CatBoostClassifier(...) call assigned to cbt_model)
        cbt_model = CatBoostClassifier(
            depth=12,
        )
        cbt_model.fit(train_pool, eval_set=eval_pool, verbose=100)
#         with open('./models/fold%d_cbt_v1.mdl' % index, 'wb') as file:
#             pickle.dump(cbt_model, file)
    else:
        with open('./models/fold%d_cbt_v1.mdl' % index, 'rb') as file:
            cbt_model = pickle.load(file)

    imp['score%d' % (index + 1)] = cbt_model.feature_importances_

    score = cbt_model.best_score_['validation']['AUC']
    scores.append(score)
    print('fold %d round %d : score: %.6f | mean score %.6f' %
          (index + 1, cbt_model.best_iteration_, score, np.mean(scores)))
    preds += cbt_model.predict_proba(test_x)

    del cbt_model, train_pool, eval_pool
    del X_train, y_train, X_valid, y_valid
    import gc
    gc.collect()

#     mdls.append(cbt_model)

# In[ ]:

imp.sort_values(by='score1', ascending=False)

# In[ ]:

result = invite_info_evaluate[['question_id', 'author_id', 'invite_time']]
                t1 = datetime.datetime.now()
                model = CatBoostClassifier(iterations=100,
                                           rsm=rsm,
                                           learning_rate=lrn_rt,
                                           depth=dep,
                                           l2_leaf_reg=l2_reg,
                                           random_seed=2)

                model.fit(X_train,
                          y_train,
                          cat_indices,
                          use_best_model=True,
                          eval_set=(X_val, y_val),
                          logging_level='Silent')
                # Predicting and calculating performance on test data
                predict_prob = model.predict_proba(X_test)[:, 1]

                pred_list = [1 if i > 0.5 else 0 for i in predict_prob.tolist()]

                y_list = y_test.tolist()

                counter = 0
                for i in range(len(pred_list)):
                    if pred_list[i] == y_list[i]:
                        counter = counter + 1

                accuracy = counter / len(pred_list)

                result_df_temp = pd.DataFrame(data=None,
Example #17
eval_dataset = Pool(data=X_test, label=y_test, cat_features=cat_features)

# Initialize CatBoostClassifier

model = CatBoostClassifier(iterations=100,
                           cat_features=cat_features,
                           depth=2,
                           loss_function='MultiClassOneVsAll')

parameters = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50]
}

# Tune with the public grid_search API (by default it also refits on the best
# parameters; the explicit fit below then retrains with the eval set)
model.grid_search(parameters, train_dataset)

model.fit(train_dataset, eval_set=eval_dataset)

preds_class = model.predict(eval_dataset)
preds_proba = model.predict_proba(eval_dataset)

# Raw formula values are the untransformed margins, before the link function
preds_raw = model.predict(eval_dataset, prediction_type='RawFormulaVal')

model.predict_proba(test_x)

submission = sample.copy()
submission['Crop_Damage'] = model.predict(test_x)
submission.to_csv('cat3.csv', index=False)
skf = StratifiedKFold(n_splits=5, random_state=seeds[4], shuffle=True)
for train, test in skf.split(X, Y):
    #print(index)
    train_x, test_x, train_y, test_y = X[train], X[test], Y[train], Y[test]
    cbt_model = CatBoostClassifier(iterations=3000,
                                   learning_rate=0.01,
                                   max_depth=7,
                                   verbose=100,
                                   loss_function='MultiClass',
                                   early_stopping_rounds=500,
                                   task_type='CPU',
                                   eval_metric='Accuracy',
                                   max_ctr_complexity=4)
    cbt_model.fit(train_x, train_y, eval_set=(test_x, test_y))
    del train_x, test_x, train_y, test_y
    prediction_cat += cbt_model.predict_proba(result_test) / 5  # average over the 5 folds

#def print_best_score(gsearch, param_test):
#    # Print the best score
#    print("Best score: %0.3f" % gsearch.best_score_)
#    print("Best parameters set:")
#    # Print the parameters the best classifier actually used
#    best_parameters = gsearch.best_estimator_.get_params()
#    for param_name in sorted(param_test.keys()):
#        print("\t%s: %r" % (param_name, best_parameters[param_name]))
#params = {'depth': [4, 7, 10],
#          'learning_rate' : [0.03, 0.1, 0.15,0.5],
#         'l2_leaf_reg': [1,4,9],
#         'iterations': [3000]}

#estimator =CatBoostClassifier(iterations=2000,verbose=400,early_stopping_rounds=400,
vals[:, 0] = np.linspace(1, 0, N)
vals[:, 1] = np.linspace(0, 1, N)
vals[:, 2] = np.linspace(1, 1, N)
newcmp = ListedColormap(vals)

# calculate coordinates grids for surface and frame plotting
n1, n2 = features.index('hp'), features.index('ch!')
vals1, vals2 = np.linspace(0, 3, 40), np.linspace(0, 1, 20)
N1, N2 = np.meshgrid(vals1, vals2)
LO = np.zeros(shape=N1.shape)

X_tmp = X.copy()
for i in range(N1.shape[0]):
    for j in range(N1.shape[1]):
        X_tmp[:, n1], X_tmp[:, n2] = N1[i, j], N2[i, j]
        # average the predicted class probabilities over the data, then take the log-odds
        pr = np.mean(cb_clf.predict_proba(X_tmp), axis=0)
        LO[i, j] = np.log(pr[1] / pr[0])

# PAGE 355. FIGURE 10.8. Partial dependence of the log-odds of spam vs. email
#            as a function of the joint frequencies of hp and the character !.
fig = plt.figure(figsize=(6, 3.75), dpi=150)
ax = fig.add_subplot(111, projection='3d')
ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.set_xlabel('hp', fontsize=8)
ax.set_ylabel('ch!', fontsize=8)
ax.xaxis.line.set_color(GRAY7)  # ax.w_xaxis is deprecated in recent Matplotlib
ax.yaxis.line.set_color(GRAY7)
ax.zaxis.line.set_color(GRAY7)
ax.view_init(22, 81)
Example #20
def main(cfg):

    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')
    pre_feat = [c for c in train.columns
                if c not in ['ID_code', 'target']]  #basic features
    target = train['target'].values

    params = {
        'num_rounds': 6000000,
        'verbose_eval': 5000,
        'early_stop': 4000,
    }
    print(params)

    if True:  # filter out the synthetic ("no use") test samples
        freq_cols = []
        for col in pre_feat:
            test[col + 'freq'] = test[col].map(
                test[col].value_counts(sort=False))
            freq_cols.append(col + 'freq')
        test['num_unique'] = (test[freq_cols] >= 2).sum(axis=1)
        real_idx = test['num_unique'] < 200
        real_test = test.loc[real_idx, pre_feat]
        # real_test = test.copy()

        assert len(real_test) == 100000
        all_data = train[pre_feat].append(real_test).reset_index(drop=True)
        test.drop(freq_cols, axis=1, inplace=True)

    train, test = get_features(train, test, all_data, pre_feat)
    train['allfreq'] = train[freq_cols].sum(axis=1)
    train['num_unique'] = (train[freq_cols] >= 2).sum(axis=1)

    test['allfreq'] = test[freq_cols].sum(axis=1)
    test['num_unique'] = (test[freq_cols] >= 2).sum(axis=1)

    new_stat = ['freq'] + feat_stat
    features = pre_feat + ['allfreq', 'num_unique'] + \
               [col + 'bin2' for col in two_count_peak_cols] + \
               [col + 'bin3' for col in three_count_peak_cols]
    for s in new_stat:
        features += [col + s for col in pre_feat]

    # random_state is omitted here: it has no effect when shuffle=False, and
    # newer scikit-learn raises an error if it is passed in that case
    folds = StratifiedKFold(n_splits=cfg['n_splits'],
                            shuffle=False).split(train.values, target)
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))

    # feat_score = pd.read_csv('feat_importance.csv')['name'].values[:2000].tolist()
    # features = list(set(features) & set(feat_score))
    for fold_, (trn_idx, val_idx) in enumerate(folds):

        if fold_ not in cfg['folds']:
            continue

        val_x, val_y = train.iloc[val_idx], target[val_idx]
        tr_x, tr_y = train.iloc[trn_idx], target[trn_idx]
        # tr_x,val_x,te_x = cal_freq_TE(tr_x,val_x,test,all_data,pre_feat)

        tr_x, val_x, te_x = tr_x[features], val_x[features], test[features]

        tr_x, tr_y = augment(tr_x, tr_y, pre_feat, cfg['t1'], cfg['t2'])
        tr_x['allfreq'] = tr_x[freq_cols].sum(axis=1)
        tr_x['num_unique'] = (tr_x[freq_cols] >= 2).sum(axis=1)

        print("Fold idx:{}".format(fold_ + 1))

        d_train = Pool(tr_x, label=tr_y)
        d_valid = Pool(val_x, label=val_y)

        model = CatBoostClassifier(
            iterations=params['num_rounds'],
            learning_rate=0.003,
            od_type='Iter',
            od_wait=params['early_stop'],
            loss_function="Logloss",
            eval_metric='AUC',
            #         depth=3,
            bagging_temperature=0.7,
            random_seed=2019,
            task_type='GPU')
        model.fit(d_train,
                  eval_set=d_valid,
                  use_best_model=True,
                  verbose=params['verbose_eval'])

        oof[val_idx] = model.predict_proba(val_x)[:, 1]
        pred = model.predict_proba(te_x)[:, 1]
        predictions += pred / cfg['n_splits']
        threshold_search(target[val_idx], oof[val_idx])
        np.save('../submit/' + cfg['name'] + str(fold_), pred)
        np.save(
            '../oof/' + cfg['name'] +
            ''.join([str(fold) for fold in cfg['folds']]), oof)
    print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
    # np.save('../input/oof',oof)
    sub = pd.DataFrame({"ID_code": test.ID_code.values})
    sub["target"] = predictions
    sub.to_csv(cfg['name'] + "submission.csv", index=False)
Example #21
# In[10]:

ctb = CatBoostClassifier(random_seed=17)

# **Train CatBoost without any parameter tuning, passing only the indices of the categorical features.**

# In[11]:

get_ipython().run_cell_magic(
    'time', '',
    'ctb.fit(X_train_part, y_train_part,\n        cat_features=categ_feat_idx)'
)

# In[12]:

ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

# **We get almost 0.75 ROC AUC on the hold-out set.**

# In[13]:

roc_auc_score(y_valid, ctb_valid_pred)

# **Training on the full training set and predicting on the test set gives a competition score of 0.73008.**

# In[14]:

get_ipython().run_cell_magic(
    'time', '',
    'ctb.fit(X_train, y_train,\n        cat_features=categ_feat_idx)')
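
# In[15]:

# (Sketch of the submission step the note above describes; X_test is assumed
# to be the competition test matrix, which this excerpt does not show)
# ctb_test_pred = ctb.predict_proba(X_test)[:, 1]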
Example #22
arrays[0] = numpy.column_stack((enhancer_vec, promoter_vec))
distance[0] = float(dis)

X_train = numpy.column_stack((arrays, distance))
print(X_train.shape[0], X_train.shape[1])

estimator = CatBoostClassifier(iterations=1000,
                               depth=10,
                               learning_rate=0.1,
                               logging_level=None,
                               scale_pos_weight=45)
estimator.load_model('{}{}/best_model{}'.format(model_filepath, cellline,
                                                kvalue))

y_pred = estimator.predict(X_train)
y_proba_pred = estimator.predict_proba(X_train)[:, 1]
if enchrome != prchrome:
    print(
        'The two elements are not on the same chromosome, please recheck your input!'
    )
else:
    print('For Enhancer ' + enoldname + ', Promoter ' + proldname +
          ' in cell line ' + cellline + ':')
    if y_pred[0] == 0:
        print(
            'The two elements are predicted not to interact by EPBoost; the interaction prediction score is %.4f.'
            % y_proba_pred[0])
    else:
        print(
            'The two elements are predicted to interact by EPBoost; the interaction prediction score is %.4f.'
            % y_proba_pred[0])
def main():
    #############################################################################################################
    seed = 42  # For random numbers generation
    iterations = 10  # Max number of iterations at every run of gradient boosting (max number of trees built)
    hyper_iterations = 3  # Number of iterations required during each Bayesian optimization of hyper-parameters
    log_regs_hyper_iterations = 2  # Number of iterations for hyper-parameters optimization for logistic regression
    cv_folds = 4  # Number of folds used for k-folds cross-validation
    logs_dir = Path('catboost_logs')  # Relative to the directory where the program is running
    task_type = 'GPU'  # Can be 'CPU' or 'GPU'
    early_stopping_iters = 10000  # Effectively disabled, as there is an issue with displaying the charts see https://github.com/catboost/catboost/issues/1468
    #############################################################################################################

    start_time = time()

    # Make the logs directory, if it doesn't exist already, and ensure it is empty
    logs_dir.mkdir(exist_ok=True)
    for item in logs_dir.iterdir():
        if item.is_dir():
            shutil.rmtree(str(item))
    ''' Load the NHANES I epidemiology dataset. The dataset is already partitioned into a dev set and a test set.
    Here below, the dev set will be further partitioned into a training set and a validation set.'''
    X_dev, X_test, y_dev, y_test = load_data(10)

    # Convert categorical features from float to int, as that is what CatBoost expects
    X_dev = X_dev.astype({'Sex': int, 'Race': int})
    y_dev = y_dev.astype(int)
    X_test = X_test.astype({'Sex': int, 'Race': int})
    y_test = y_test.astype(int)

    # Count and present how many samples with missing data (variable values) in the dev and test set respectively
    dev_missing_count = count_samples_with_missing_data(X_dev)
    test_missing_count = count_samples_with_missing_data(X_test)
    print('\nDev. set missing data in', dev_missing_count, 'samples out of',
          len(X_dev))
    print('Test set missing data in', test_missing_count, 'samples out of',
          len(X_test))

    # Split the dev set into training and validation. The latter will be used for hyper-parameters tuning.
    X_train, X_val, y_train, y_val = train_test_split(X_dev,
                                                      y_dev,
                                                      test_size=0.2,
                                                      random_state=seed)

    # Make a dataset after dropping samples with missing data (note, there are no samples with missing data in test set)

    X_train_dropped = X_train.dropna(axis='rows')
    y_train_dropped = y_train.loc[X_train_dropped.index]
    X_val_dropped = X_val.dropna(axis='rows')
    y_val_dropped = y_val.loc[X_val_dropped.index]
    ''' Prepare two imputers that will be used to impute missing values in the dataset. One is a mean imputer
    and the other an iterative imputer '''

    mean_imputer = SimpleImputer(strategy='mean')
    mean_imputer.fit(X_train)

    iter_imputer = IterativeImputer(random_state=seed,
                                    sample_posterior=False,
                                    max_iter=10,
                                    min_value=0,
                                    verbose=0)
    iter_imputer.fit(X_train)

    # Run a logistic regression
    ''' Fill in hyper-parameters for the logistic regression with their values, or with a probability distribution from 
    where the value must be sampled. '''
    log_regr_params = {
        'penalty': 'elasticnet',
        'C': hp.uniform('C', .25, 4),
        'class_weight': None,
        'random_state': seed,
        'solver': 'saga',
        'max_iter': 10000,
        'multi_class': 'ovr',
        'n_jobs': -1,
        'l1_ratio': hp.uniform('l1_ratio', .0, 1)
    }

    run_exp_log_regr(X_train,
                     y_train,
                     X_val,
                     y_val,
                     param_space=log_regr_params,
                     max_evals=log_regs_hyper_iterations,
                     imputer=iter_imputer,
                     seed=seed)

    # Run gradient boosting (boosted trees) models

    cat_features = [3, 11]  # Categorical features are race and sex
    '''Note: passing a CatBoost Pool() instance in the param_space values here below doesn't work, because hyperopt would
    throw an exception during optimization.'''

    ############################################################################################################
    param_space = {
        'learning_rate': hp.loguniform('learning_rate', np.log(.001),
                                       np.log(.2)),
        'depth': hp.quniform('depth', 4, 12, 1),
        'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 9),
        'bagging_temperature': hp.uniform('bagging_temperature', 0, 2),
        'seed': seed,
        'iterations': iterations,
        'task_type': task_type,
        # 'early_stopping_rounds': True,
        'od_type': 'Iter',
        'od_wait': early_stopping_iters
    }
    ############################################################################################################
    ''' First try with no imputation, but instead dropping all samples from the train/val set that have missing data '''
    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, after dropping samples with missing data'
    )

    run_exp_bayes_hyperparams_opt(X_train_dropped,
                                  y_train_dropped,
                                  X_val_dropped,
                                  y_val_dropped,
                                  cat_features=cat_features,
                                  param_space=param_space,
                                  max_evals=hyper_iterations,
                                  imputer=None,
                                  train_dir=str(logs_dir /
                                                'catboost_logs_drop'),
                                  seed=seed)
    ''' Next solve the same model, but with missing data imputed by the mean imputer (no samples are dropped)'''

    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, with missing data replaced with mean imputer'
    )

    run_exp_bayes_hyperparams_opt(X_train,
                                  y_train,
                                  X_val,
                                  y_val,
                                  cat_features=cat_features,
                                  param_space=param_space,
                                  max_evals=hyper_iterations,
                                  imputer=mean_imputer,
                                  train_dir=str(logs_dir /
                                                'catboost_logs_mean_imputer'),
                                  seed=seed)

    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, with missing data replaced with iterative imputer'
    )
    ''' Now do it with missing data imputed by the iterative imputer (no samples are dropped)'''

    run_exp_bayes_hyperparams_opt(X_train,
                                  y_train,
                                  X_val,
                                  y_val,
                                  cat_features=cat_features,
                                  param_space=param_space,
                                  max_evals=hyper_iterations,
                                  imputer=iter_imputer,
                                  train_dir=str(logs_dir /
                                                'catboost_logs_iter_imputer'),
                                  seed=seed)
    ''' Solve the same model again, but this time neither drop samples with missing data nor use an imputer. Leave
    the missing data in the dataset the way they are, and let CatBoost deal with them. '''

    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, without replacement of missing data'
    )

    selected_model = run_exp_bayes_hyperparams_opt(
        X_train,
        y_train,
        X_val,
        y_val,
        cat_features=cat_features,
        param_space=param_space,
        max_evals=hyper_iterations,
        imputer=None,
        train_dir=str(logs_dir / 'catboost_logs_keep_nan'),
        seed=seed)
    ''' Solve the model still leaving missing data in the dataset, but this time use a weighted loss function,
    to keep into account that the dataset is imbalanced (positive cases are under-represented) '''

    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, without replacement and with weights'
    )
    ''' Compute the number of positive and negative samples in the training set, and the respective weights to be 
    used '''
    w, stats = compute_weights(y_train)
    print('Computed weights')
    print('For', stats['total_pos'], 'positive samples:', stats['pos_weight'])
    print('For', stats['total_neg'], 'negative samples:', stats['neg_weight'])

    run_exp_bayes_hyperparams_opt(X_train,
                                  y_train,
                                  X_val,
                                  y_val,
                                  cat_features=cat_features,
                                  param_space=param_space,
                                  max_evals=hyper_iterations,
                                  imputer=None,
                                  weights=w,
                                  train_dir=str(logs_dir /
                                                'catboost_logs_weights'),
                                  seed=seed)
    ''' A note on CatBoost grid-search (not used here). It would be done on the dev. set, as the grid-search takes care 
    of splitting it into training and validation. If `search_by_train_test_split` is set to True, every combination of 
    values of the hyper-parameters is evaluated with a basic training/val. split of the dataset; if set to False, then 
    every combination is evaluated with x-evaluation. Once method grid_search() has selected the best combination of 
    hyper-parameters, we could fit a model with it. The final model can be evaluated with x-evaluation by setting 
    parameter `calc_cv_statistics` to True (which is the default). '''
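
    # A minimal sketch of the grid-search flow described in the note above
    # (hypothetical grid; grid_search() splits the data internally and, with
    # calc_cv_statistics=True, cross-validates the selected combination):
    # grid = {'depth': [4, 6, 8], 'learning_rate': [0.03, 0.1]}
    # gs_model = CatBoostClassifier(loss_function='Logloss', task_type=task_type)
    # gs_results = gs_model.grid_search(grid, dev_pool,
    #                                   search_by_train_test_split=True,
    #                                   calc_cv_statistics=True, verbose=False)
    # ('dev_pool' here stands for a Pool built over the dev set)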
    """
    print('\nTuning hyper-parameters for NN')
    p = Process(target=run_exp_nn,
                args=(X_train, y_train, X_val, y_val, params, hyper_iterations,
                      iter_imputer,
                      str(logs_dir / 'tensorflow_logs_nn'),
                      seed))
    p.start()
    p.join()
    sleep(2.)
    """
    ''' Cross-validate the selected model, and test it on the test set '''

    model = selected_model
    imputer = None  # The selected model retains missing data, doesn't impute nor discard them

    print('\nCross-validating the selected model.')
    params = model.get_params()
    params['loss_function'] = 'Logloss'
    params['eval_metric'] = 'AUC:hints=skip_train~false'
    params['train_dir'] = str(logs_dir / 'catboost_logs_cv_selected')
    params['task_type'] = task_type
    # params['early_stopping_rounds'] = True
    params['od_type'] = 'Iter'
    params['od_wait'] = early_stopping_iters
    ''' Make a new imputer for cross-validation over the dev set. '''
    X_pool, _ = make_imputed_pool(X_dev,
                                  y_dev,
                                  imputer=imputer,
                                  cat_features=cat_features,
                                  weight=None)
    cv_results = cv(pool=X_pool,
                    params=params,
                    iterations=iterations,
                    fold_count=cv_folds,
                    partition_random_seed=seed,
                    stratified=True,
                    verbose=False)
    # Find the iteration with the best test AUC, the value of its AUC and other train and test stats.
    best_cv_iter = np.argmax(
        cv_results['test-AUC-mean']
    )  # All the stats retrieved will refer to this same iteration
    best_cv_val_AUC = cv_results['test-AUC-mean'][best_cv_iter]
    best_cv_val_Logloss = cv_results['test-Logloss-mean'][best_cv_iter]
    best_cv_train_AUC = cv_results['train-AUC-mean'][best_cv_iter]
    best_cv_train_Logloss = cv_results['train-Logloss-mean'][best_cv_iter]
    print('Parameters:')
    for key, value in sorted(params.items()):
        print(f'   {key}={value}')
    print('Best cross-validation achieved at iteration', best_cv_iter)
    print(
        f'Training: Logloss {best_cv_train_Logloss}   ROC AUC {best_cv_train_AUC}'
    )
    print(
        f'Validation: Logloss {best_cv_val_Logloss}   ROC AUC {best_cv_val_AUC}'
    )

    print('Re-fitting the model on the dev. set and testing it')
    params['iterations'] = best_cv_iter + 1
    params['train_dir'] = None
    cv_model = CatBoostClassifier(**params)
    training_res = cv_model.fit(X_pool, verbose=False)
    # print('Iteration:', training_res.best_iteration_)
    y_test_preds = cv_model.predict_proba(X_test)[:, 1]
    test_AUC = roc_auc_score(y_test, y_test_preds)
    test_Logloss = log_loss(y_test, y_test_preds)
    print(f"Test on test set: Log loss={test_Logloss}   ROC AUC={test_AUC}")

    print(
        f'Overall train, validation and test run time: {round(time() - start_time)}s'
    )

    print('\nFeature importance based on prediction value change (%)')
    feature_importances = cv_model.get_feature_importance(X_pool)
    feature_names = X_dev.columns
    for score, name in sorted(zip(feature_importances, feature_names),
                              reverse=True):
        print('{}: {}'.format(name, score))

    print('\nFeature importance based on loss (ROC AUC) value change')
    feature_importances_loss = cv_model.get_feature_importance(
        X_pool, type=EFstrType.LossFunctionChange)
    for score, name in sorted(zip(feature_importances_loss, feature_names),
                              reverse=True):
        print('{}: {}'.format(name, score))

    # Plot a ROC curve for the x-validated model over the test set
    y_test_preds = cv_model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_test_preds)
    fig, ax = plt.subplots()
    ax.set_title('ROC Curve')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True Positive rate')
    ax.set_ylim((0, 1))
    ax.set_xlim((0, 1))
    ax.grid(True)
    plt.gca().set_aspect('equal', adjustable='box')
    ax.plot([0, 1], [0, 1], color='blue', ls='--', lw=.5)
    ax.plot(fpr, tpr, color='blue', label='ROC')
    # ax.legend(loc='lower center')
    plt.show()
    # (Leading code truncated in the source; the arguments below belong to a
    # CatBoostClassifier(...) call assigned to m, reconstructed here)
    m = CatBoostClassifier(eval_metric='AUC',
                           max_depth=6,
                           learning_rate=0.01,
                           od_wait=50,
                           l2_leaf_reg=10,
                           task_type="GPU",
                           cat_features=categorical_features_indices,
                           bagging_temperature=0.80,
                           random_strength=100,
                           use_best_model=True)
    m.fit(X_train,
          y_train,
          eval_set=[(X_test, y_test)],
          early_stopping_rounds=200,
          verbose=1000)
    oofcat[test_index] = m.predict_proba(X_test)[:, -1]
    p = m.predict_proba(test[usecols])[:, -1]
    y_pred_totcb += p / N_SPLITS

#%%
np.save('oof-onielg-catb', oofcat)
np.save('tst-onielg-catb', y_pred_totcb)

# y_pred_totcb = y_pred_totcb/5

# sample_sub['target'] = pd.DataFrame(predictionslgb).rank(pct=True)
# sample_sub.to_csv('submission.csv',index=False)

#sample_sub['target'] = y_pred_totcb
sample_sub['target'] = pd.DataFrame(y_pred_totcb).rank(
    pct=True) * 0.60 + pd.DataFrame(predictionslgb).rank(pct=True) * 0.40
Example #25
# Impute missing values with each column's most frequent value
for col in X.columns[X.isnull().sum() > 0]:
    X[col] = X[col].fillna(X[col].value_counts().idxmax())

# Split the concatenated frame back into train and test rows
X_train_ = X.iloc[:X_train.shape[0],:]
X_test_ = X.iloc[X_train.shape[0]:,:]

# Categorical features
cat_features_list = []
for i in range(X.shape[1]):
    if 'cat' in X.columns[i]:
        cat_features_list.append(i)

# Initialize CatBoostClassifier
model = CatBoostClassifier(verbose=True)
# Fit model
model.fit(X_train_, y_train, cat_features = cat_features_list, verbose=True)
# Get predicted classes
preds_class = model.predict(X_test_)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(X_test_)
# Saving
a = preds_proba[:, 1].reshape(-1, 1)
df_pred = pd.read_csv("E:/Kaggle/Seguro/sample_submission.csv", header=0)
df_pred['target']=a
df_pred.to_csv("E:/Kaggle/Seguro/prediction.csv",index=False)

Example #26
t1 = time.time()
clf = CatBoostClassifier(
                         learning_rate=0.05,
                         depth=9,
                         boosting_type='Ordered',
                         bagging_temperature=0.4,
                         task_type='GPU',
                         silent=True)  # 0.8578
# clf = GaussianNB()
# clf = MLPClassifier(max_iter=300)
# clf = svm.SVC(kernel='linear')
t2 = time.time()
elapsed = t2 - t1
print(f"Model created --- {elapsed:.2f}s" if elapsed < 60 else f"Model created --- {elapsed/60:.2f} min")

clf.fit(X_t,
        y_t,
        cat_features=cat_col,
        eval_set=(X_tt, y_tt),
        plot=True,
        early_stopping_rounds=30,
        verbose=100)
t3 = time.time()
print(f"Model Trained --- {(t3-t2)/60} Minutes")

sample = clf.predict_proba(test_x)

# print(sample[:, 1])

submit = pd.DataFrame(({"id": test.id, "Response": sample[:, 1]}))
submit.to_csv('submission.csv', index=False)
t4 = time.time()
print(f"Process Finished ---{(t4-t3)}sec")
Example #27
fig = plt.figure()
plt.hist(acc_set2, bins=50, color = 'g')
plt.axvline(x=final_accuracy,color = 'r')
plt.show()
fig.savefig(dest+'fig2.png', dpi=fig.dpi)
print("The accuracy from the model is ",scipy.stats.percentileofscore(acc_set2, final_accuracy, kind='rank')," percentile in the permutation test using shuffled values of test data labels")


# ### AUC and Confusion Matrix

# In[22]:


fig = plt.figure()
y_pred_proba = model.predict_proba(dfv.drop(columns='TimeCycle'))[:, 1]
fpr, tpr, _ = metrics.roc_curve(dfv['TimeCycle'], y_pred_proba)
auc = metrics.roc_auc_score(dfv['TimeCycle'], y_pred_proba)
plt.plot(fpr, tpr, label=f"ROC, AUC={auc:.4f}")
plt.legend(loc=4)
plt.show()
fig.savefig(dest+'fig3.png', dpi=fig.dpi)

cm = metrics.confusion_matrix(dfv['TimeCycle'],result)
# labels = ['No Default', 'Default']
fig = plt.figure(figsize=(8,6))
sns.heatmap(cm, annot = True, fmt='d', cmap="Blues", vmin = 0.2);
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()
Example #28
        model = CatBoostClassifier(iterations=N_ESTIMATORS,
                                   learning_rate=LEARNING_RATE,
                                   depth=DEPTH,
                                   eval_metric=EVAL_METRIC,
                                   verbose=VERBOSE,
                                   random_state=RANDOM_STATE,
                                   thread_count=N_THREADS,
                                   task_type="GPU")

        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        )
        y_pred_valid = model.predict_proba(valid_dataset)[:, 1]
        y_pred = model.predict_proba(test_dataset)[:, 1]

        fold_importance = pd.DataFrame()
        fold_importance["feature"] = model.feature_names_
        fold_importance["importance"] = model.get_feature_importance()
        fold_importance["fold"] = fold_n + 1
        feature_importance = pd.concat([feature_importance, fold_importance],
                                       axis=0)
        best_iteration = model.best_iteration_
    best_iterations.append(best_iteration)

    fold_score = roc_auc_score(y_valid, y_pred_valid)
    scores.append(fold_score)

    update_tracking(
Example #29
def train_model_classification(X, X_test, y, params, num_classes=2,
                               folds=None, model_type='lgb',
                               eval_metric='logloss', columns=None,
                               plot_feature_importance=False,
                               model=None, verbose=10000,
                               early_stopping_rounds=200,
                               splits=None, n_folds=3):
    """
    分类模型函数
    返回字典,包括: oof predictions, test predictions, scores and, if necessary, feature importances.
    :params: X - 训练数据, pd.DataFrame
    :params: X_test - 测试数据,pd.DataFrame
    :params: y - 目标
    :params: folds - folds to split data
    :params: model_type - 模型
    :params: eval_metric - 评价指标
    :params: columns - 特征列
    :params: plot_feature_importance - 是否展示特征重要性
    :params: model - sklearn model, works only for "sklearn" model type
    """
    start_time = time.time()
    global y_pred_valid, y_pred

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]
    if splits is None:
        splits = folds.split(X, y)
        n_splits = folds.n_splits
    else:
        n_splits = n_folds

    # to set up scoring parameters
    metrics_dict = {
        'logloss': {
            'lgb_metric_name': 'logloss',
            'xgb_metric_name': 'mlogloss',
            'catboost_metric_name': 'Logloss',
            'sklearn_scoring_function': metrics.log_loss
        },
        'lb_score_method': {
            'sklearn_scoring_f1': metrics.f1_score,  # leaderboard metric
            'sklearn_scoring_accuracy': metrics.accuracy_score,  # leaderboard metric
            'sklearn_scoring_auc': metrics.roc_auc_score
        },
    }
    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(shape=(len(X), num_classes))
    # averaged predictions on train data
    prediction = np.zeros(shape=(len(X_test), num_classes))
    # list of scores on folds
    scores = []
    # feature importance
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(splits):
        if verbose:
            print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            model = xgb.XGBClassifier(**params)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['xgb_metric_name'],
                      verbose=bool(verbose),  # xgb verbose bool
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test, ntree_limit=model.best_ntree_limit)
        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict_proba(X_valid)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n + 1}. {eval_metric}: {score:.4f}.')
            y_pred = model.predict_proba(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=20000, eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                       **params,
                                       loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test)

        oof[valid_index] = y_pred_valid
        # evaluation metric
        scores.append(
            metrics_dict['lb_score_method']['sklearn_scoring_accuracy'](y_valid, np.argmax(y_pred_valid, axis=1)))
        print(scores)
        prediction += y_pred

        if model_type in ('lgb', 'xgb') and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    prediction /= n_splits
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb' or model_type == 'xgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
            plt.title('LGB Features (avg over folds)')
            plt.show()
            result_dict['feature_importance'] = feature_importance
    end_time = time.time()

    print("train_model_classification cost time:{}".format(end_time - start_time))
    return result_dict
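
# A minimal usage sketch for train_model_classification; the data, params and
# fold object below are assumptions, not part of the original snippet:
# folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# params = {'learning_rate': 0.05, 'n_estimators': 2000}
# result = train_model_classification(X, X_test, y, params, num_classes=2,
#                                     folds=folds, model_type='lgb',
#                                     eval_metric='logloss')
# oof, test_preds = result['oof'], result['prediction']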
Example #30
    def feature_importance(self, n_rows, n_cols, X_train, y_train, X_valid,
                           y_valid):
        '''Calculate feature importance from Logistic, Random Forest, CatBoost, XGB, LGBM'''
        # train classifiers
        lr = LogisticRegression(max_iter=100, random_state=42)
        lr.fit(X_train, y_train)
        lr_prob = lr.predict_proba(X_valid)
        rfc = RandomForestClassifier(n_jobs=2, random_state=42)
        rfc.fit(X_train, y_train)
        rfc_prob = rfc.predict_proba(X_valid)
        brfc = BalancedRandomForestClassifier(random_state=42)
        brfc.fit(X_train, y_train)
        brfc_prob = brfc.predict_proba(X_valid)
        cb = CatBoostClassifier(random_state=42, verbose=False)
        cb.fit(X_train, y_train)
        cb_prob = cb.predict_proba(X_valid)
        xgb = XGBClassifier(random_state=42)
        xgb.fit(X_train, y_train)
        xgb_prob = xgb.predict_proba(X_valid)
        lgbm = LGBMClassifier(random_state=42, n_jobs=-1)
        lgbm.fit(X_train, y_train)
        lgbm_prob = lgbm.predict_proba(X_valid)

        feat_importance_list = [
            lr.coef_[0], rfc.feature_importances_, brfc.feature_importances_,
            cb.feature_importances_, xgb.feature_importances_,
            lgbm.feature_importances_
        ]
        model_name = [
            'Logistic Regression', 'Random Forest Classifier',
            'Balanced Random Forest Classifier', 'CatBoost Classifier',
            'XGB Classifier', 'LGBM Classifier'
        ]

        # generate feature importance plots
        fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 20))
        sns.set(font_scale=1.5)
        for feature, name, n, ax in zip(feat_importance_list, model_name,
                                        list(range(n_rows * n_cols)),
                                        ax.flatten()):
            # get feature importance
            importance = feature

            # create dataframe
            df_imp = pd.DataFrame()

            # calculate importance of each variable
            df_imp['importance'] = pd.Series(importance,
                                             index=list(X_train.columns))

            # transform dataframe
            long_df = pd.melt(df_imp.T)

            # plot barplot
            plt.subplot(n_rows, n_cols, n + 1)
            sns.barplot(y=long_df.variable,
                        x=long_df.value,
                        order=long_df.sort_values(
                            'value', ascending=False)['variable'].to_list())
            plt.title(f'{name}')

        # adjusts subplot
        plt.tight_layout()

        # displays the plot
        plt.show()
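
    # Hypothetical call (the enclosing class is not shown in this snippet);
    # the subplot grid needs at least 6 cells, one per trained model:
    # self.feature_importance(n_rows=3, n_cols=2,
    #                         X_train=X_train, y_train=y_train,
    #                         X_valid=X_valid, y_valid=y_valid)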
Example #31
# NOTE: this snippet is truncated; the opening of the fit call below is a plausible reconstruction
model.fit(X_train, y_train,
          eval_set=(X_eval, y_eval),
          cat_features=categorical_features_indices,
          plot=True)

# Print each feature's importance
feature_names = X_train.columns
print(
    pd.DataFrame({
        'column': feature_names,
        'importance': model.get_feature_importance(),
    }).sort_values(by='importance', ascending=False))

print(model.get_best_iteration())

# Predictions on the train and eval sets
y_train_prob = model.predict_proba(X_train,
                                   ntree_end=model.get_best_iteration())[:, 1]
y_eval_prob = model.predict_proba(X_eval,
                                  ntree_end=model.get_best_iteration())[:, 1]

# AUC metric
print('AUC')
print(roc_auc_score(y_train, y_train_prob))
print(roc_auc_score(y_eval, y_eval_prob))

# Metric used in this competition
print('Metric used in this competition')
print(cal_metric(y_train, y_train_prob))
print(cal_metric(y_eval, y_eval_prob))

# Predict and write to csv (remember to shrink the earlier eval split df_eval to 0,
# train on the full df_train, and set the iteration count to best_iteration)
y_prob = model.predict_proba(X_test, ntree_end=model.get_best_iteration())[:, 1]
Example #32
# NOTE: this snippet is truncated; the earlier inputs to this pd.concat are not shown
df_test_concat = pd.concat([
    d_id_ce_hash.transform(d_test_account[d_id_cols]),
], axis=1).fillna(0)

X_new = df_train_concat.reset_index(drop=True)
X_test_new = df_test_concat.copy().reset_index(drop=True)

lr = 0.09552 / 3.3

cat = CatBoostClassifier(random_state=17,
                         iterations=2321,
                         learning_rate=lr,
                         verbose=100,
                         custom_metric='AUC',
                         eval_metric='AUC',
                         use_best_model=False,
                         early_stopping_rounds=200)

cat.fit(X_new, y, verbose=False, plot=False)

y_test_pred = cat.predict_proba(X_test_new)[:, 1]

df_submission = pd.DataFrame({'radiant_win_prob': y_test_pred},
                             index=df_test_concat.index)

submission_filename = 'submission_{}.csv'.format(
    datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S_cat'))

df_submission.to_csv(submission_filename)
print('Submission saved to {}'.format(submission_filename))
Example #33
    cat_col = [
        i for i in data_test.select_dtypes(object).columns
        if i not in ['ncodpers', 'fecha_dato']
    ]
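    # NOTE: the LabelEncoder is re-fit on each split below, so the integer codes
    # are not guaranteed to match across train/val/test; fitting once on the
    # training data and reusing the encoder would be safer.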
    for i in cat_col:
        data_train[i] = lbe.fit_transform(data_train[i].astype(str))
        data_val[i] = lbe.fit_transform(data_val[i].astype(str))
        data_test[i] = lbe.fit_transform(data_test[i].astype(str))

    exp_var = data_test.columns.tolist()[2:]
    x_train = data_train[exp_var]
    y_train = data_train[product]
    x_val = data_val[exp_var]
    y_val = data_val[product]
    x_test = data_test[exp_var]
    model = CatBoostClassifier(learning_rate=0.05,
                               n_estimators=1000,
                               random_state=2019)
    model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)])
    data_val[product] = model.predict_proba(x_val)[:, 1]
    new_val = data_val[['ncodpers', product]]
    new_val = new_val.sort_values(by='ncodpers')
    new_val.to_csv("catboost_validation_{}_{}.csv".format(product, train_date),
                   index=False)
    data_test[product] = model.predict_proba(x_test)[:, 1]
    new_test = data_test[['ncodpers', product]]
    new_test = new_test.sort_values(by='ncodpers')
    new_test.to_csv("catboost_submission_{}_{}.csv".format(
        product, train_date),
                    index=False)
Example #34
            model.fit(self.training_data.drop(["TARGET"], axis = 1), self.training_data['TARGET'],
                     cat_features = cat_features, 
                     eval_set = (self.validation_data.drop(['TARGET'], axis = 1), self.validation_data['TARGET']))

            preds = model.predict(self.validation_data.drop(['TARGET'], axis = 1))

            return model, preds, self.validation_data['TARGET']

        elif self.problem_type == 'classification':
            model = CatBoostClassifier(**params)

            model.fit(self.training_data.drop(["TARGET"], axis = 1), self.training_data['TARGET'],
                     cat_features = cat_features, 
                     eval_set = (self.validation_data.drop(['TARGET'], axis = 1), self.validation_data['TARGET']))

            preds = model.predict_proba(self.validation_data.drop(['TARGET'], axis = 1))[:, 1]

            return model, preds, self.validation_data['TARGET']

        else:
            raise Exception("Problem Type not supported")
        

    def train_and_validate(self):

        if self.model = "xgboost":
            return self._xgboost()
        elif self.model = "lightgbm":
            return self._lightgbm()
        elif self.model = "catboost":
            return self._catboost()
Example #35
class modelCatBoost(object):
    def __init__(self, name="CBT", random_state=99, *args, **kwargs):

        self.name = name
        self.train_dir = "model_" + str(self.name) + "/"
        self.random_state = random_state

        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["CatBoost"]
        self.params.update({
            'train_dir': self.train_dir,
            "random_state": self.random_state
        })

        self.model = CatBoostClassifier(**self.params)

    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwargs):

        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)

        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state)

        self.train_data = catboost.Pool(
            data=self.X_train.values,
            label=self.y_train.values,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test.values,
            label=self.y_test.values,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X.values,
            label=self.y.values,
            cat_features=self.categorical_columns_indices)

    def replace_multiclass(self, targets):

        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic
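        # e.g. a target Series ['cat', 'dog', 'cat'] comes back as [0, 1, 0],
        # together with the label list ['cat', 'dog'] for decoding predictions later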

    def fit(self,
            X,
            y,
            use_best_model=True,
            plot=True,
            save_snapshot=False,
            verbose=0,
            *args,
            **kwargs):

        self.dataset(X, y)
        _params = self.model.get_params()

        if verbose:
            _verbose = verbose
        else:
            _verbose = _params["verbose"]

        self.model.fit(self.train_data,
                       verbose=_verbose,
                       eval_set=self.eval_data,
                       use_best_model=use_best_model,
                       plot=plot,
                       save_snapshot=save_snapshot,
                       **kwargs)

        # score both splits (assumes a binary target)
        _preds = self.model.predict(self.eval_data)
        preds_test = np.where(_preds > 0.5, 1, 0)
        score_test = accuracy_score(self.y_test, preds_test)

        _preds = self.model.predict(self.train_data)
        preds_train = np.where(_preds > 0.5, 1, 0)
        score_train = accuracy_score(self.y_train, preds_train)

        if not verbose == 0:
            print("Accuracy on the training set ---> {:.2f}%".format(
                score_train * 100))
            print("Accuracy on the validation set --> {:.2f}%".format(
                score_test * 100))

        return self.model

    def fit_cv(self,
               X,
               y,
               fold_count=4,
               shuffle=True,
               stratified=True,
               plot=True,
               verbose=100):

        self.dataset(X, y)

        _params = self.model.get_params()
        _params.update({'verbose': verbose})

        _scores = catboost.cv(pool=self.all_train_data,
                              params=_params,
                              fold_count=fold_count,
                              seed=self.random_state,
                              shuffle=shuffle,
                              verbose=verbose,
                              plot=plot)
        if not verbose == 0:
            print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.
                  format(
                      np.max(_scores['test-Accuracy-mean']),
                      _scores['test-Accuracy-std'][np.argmax(
                          _scores['test-Accuracy-mean'])],
                      np.argmax(_scores['test-Accuracy-mean'])))

        return _scores
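
    # Minimal usage sketch (assumed pandas inputs; not from the original snippet):
    # clf = modelCatBoost(name="CBT")
    # cv_scores = clf.fit_cv(X, y, fold_count=5, plot=False)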

    def copy(self, *args, **kwargs):
        returned_classifier = modelCatBoost(name=self.name,
                                            random_state=self.random_state)
        returned_classifier.model = self.model.copy()
        returned_classifier.columns = self.columns
        return returned_classifier

    def update_model(self, **kwargs):
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="catboost_model"):

        if not os.path.isdir(direct):
            try:
                os.mkdir(direct)
                print("Created directory: " + direct)
            except OSError:
                raise NameError("Failed to create the directory")
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        filename = direct + "/" + name + "_" + current_time + ".dump"
        self.model.save_model(filename)
        print("Model saved to: " + filename)

    def load_model(self, direct="./checkpoints", name="catboost_model"):

        if not os.path.isdir(direct):
            print("The specified directory does not exist")
        filename = direct + "/" + name + ".dump"
        self.model.load_model(filename)
        print("Model loaded from: " + filename)

    def predict(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy.values, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy.values, *args, **kwargs)

    def add_cat_features(self, index_features):

        self.categorical_columns_indices = index_features
        print(self.categorical_columns_indices)

        self.train_data = catboost.Pool(
            data=self.X_train,
            label=self.y_train,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test,
            label=self.y_test,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X,
            label=self.y,
            cat_features=self.categorical_columns_indices)

    def index_features(self, features):

        _index = []
        for i in features:
            _index.append(self.X.columns.get_loc(i))
        if _index == []:
            raise NameError("No coincide ninguna de las features introducidas")
        return _index

    def get_important_features(self, display=True):

        _feature_importance_df = self.model.get_feature_importance(
            prettified=True)

        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances",
                        y="Feature Id",
                        data=_feature_importance_df)
            plt.title('CatBoost feature importance')

        return _feature_importance_df

    def Visualizer_Models(self, directs=None, visu_model=True):

        directorios = []
        if len(directs) < 0:
            if visu_model:
                directorios.append(self.train_dir)
            else:
                raise NameError("No se ha seleccionado ningun directorio")
        else:
            if visu_model:
                directorios.append(self.train_dir)
            for i in directs:
                directorios.append(i)
        print(directorios)
        widget = MetricVisualizer(directorios)
        widget.start()

    def hyperopt_objective(self, params):

        _model = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            bagging_temperature=params["bagging_temperature"],
            iterations=500,
            eval_metric='AUC',
            random_seed=99,
            verbose=False,
            loss_function='Logloss')
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        best_auc = np.max(_cv_data['test-AUC-mean'])

        return 1 - best_auc

    def FineTune_hyperopt(self, X, y, mute=False):

        self.dataset(X, y)

        params_space = {
            'l2_leaf_reg':
            hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
            'learning_rate':
            hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
            'bagging_temperature':
            hyperopt.hp.uniform("bagging_temperature", 0, 0.3)
        }
        trials = hyperopt.Trials()
        best = hyperopt.fmin(self.hyperopt_objective,
                             space=params_space,
                             algo=hyperopt.tpe.suggest,
                             max_evals=2,
                             trials=trials,
                             rstate=RandomState(self.random_state))
        if not mute:
            print("\nBest parameters:")
            print(best)
            print("\n")

        _parameters = self.params
        _parameters.update(best)

        _model = CatBoostClassifier(**_parameters)
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())

        if not mute:
            print('\nPrecise validation accuracy score: {}'.format(
                np.max(_cv_data['test-Accuracy-mean'])))
        return best

    def FineTune_sklearn(self, X, y, mute=False, n_splits=10, n_iter=2):
        """
        https://www.kaggle.com/ksaaskil/pets-definitive-catboost-tuning
        """
        self.dataset(X, y)

        def build_search(modelo,
                         param_distributions,
                         cv=5,
                         n_iter=10,
                         verbose=1,
                         random_state=99):
            """
            Builder function for RandomizedSearch.
            """
            QWS = make_scorer(cohen_kappa_score, weights='quadratic')
            return RandomizedSearchCV(modelo,
                                      param_distributions=param_distributions,
                                      cv=cv,
                                      return_train_score=True,
                                      refit='cohen_kappa_quadratic',
                                      n_iter=n_iter,
                                      n_jobs=None,
                                      scoring={
                                          'accuracy':
                                          make_scorer(accuracy_score),
                                          'cohen_kappa_quadratic': QWS
                                      },
                                      verbose=verbose,
                                      random_state=random_state)

        def pretty_cv_results(cv_results,
                              sort_by='rank_test_cohen_kappa_quadratic',
                              sort_ascending=True,
                              n_rows=30):
            """
            Return pretty Pandas dataframe from the `cv_results_` attribute of finished parameter search,
            ranking by test performance and only keeping the columns of interest.
            """
            df = pd.DataFrame(cv_results)
            cols_of_interest = [
                key for key in df.keys() if key.startswith('param_')
                or key.startswith("mean_train") or key.startswith("std_train")
                or key.startswith("mean_test") or key.startswith("std_test")
                or key.startswith('mean_fit_time') or key.startswith('rank')
            ]
            return df.loc[:, cols_of_interest].sort_values(
                by=sort_by, ascending=sort_ascending).head(n_rows)

        def run_search(X_train, y_train, search, mute=False):
            search.fit(X_train, y_train)
            print('Best score is:', search.best_score_)
            return pretty_cv_results(search.cv_results_)

        param_distributions = {
            'iterations': [100, 200],
            'learning_rate': scipy.stats.uniform(0.01, 0.3),
            'max_depth': scipy.stats.randint(3, 10),
            'one_hot_max_size': [30],
            'l2_leaf_reg': scipy.stats.reciprocal(a=1e-2, b=1e1),
        }

        if mute:
            _verbose = 0
        else:
            _verbose = 1

        self.params.update({'use_best_model': False})
        _model = CatBoostClassifier(**self.params)

        catboost_search = build_search(_model,
                                       param_distributions=param_distributions,
                                       n_iter=n_iter,
                                       verbose=_verbose,
                                       cv=RepeatedStratifiedKFold(
                                           n_splits=n_splits,
                                           n_repeats=1,
                                           random_state=self.random_state))
        catboost_cv_results = run_search(self.X,
                                         self.y,
                                         search=catboost_search,
                                         mute=mute)
        best_estimator = catboost_search.best_estimator_
        if not mute:
            print(best_estimator.get_params())

        return catboost_cv_results, best_estimator

    def __getattr__(self, attr):
        """
        Pass all other method calls to self.model.
        """
        return getattr(self.model, attr)
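
    # e.g. clf.get_params() or clf.best_score_ on a modelCatBoost instance are
    # resolved on the wrapped CatBoostClassifier through this delegation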
Example #36
class CBModel(MetaModel):
    def __init__(self):
        super(CBModel, self).__init__()
        self.max_run = 2
        self.all_data_round = 1
        self.explore_params_round = 0

        self.not_gain_threshold = 3

        self.patience = 3

        self.is_init = False

        self.name = 'cb'
        self.type = 'tree'

        self._model = None

        self.params = {
            'task_type': 'GPU',
            "loss_function": "MultiClass",
            "random_seed": CONSTANT.SEED,
            'verbose': False
        }

        self.hyperparams = {
            "learning_rate": 0.02,
            'iterations': 1200,
        }

        self.is_multi_label = None

        self.num_class = None

        self.models = {}

    def init_model(self, num_class, **kwargs):
        self.is_init = True
        self.num_class = num_class

    #@timeit
    def epoch_train(self,
                    dataloader,
                    run_num,
                    is_multi_label=None,
                    info=None,
                    time_remain=None):
        self.is_multi_label = is_multi_label
        X, y, train_idxs, cat = dataloader['X'], dataloader['y'], dataloader[
            'train_idxs'], dataloader['cat_cols']
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

        if info['mode'] == 'bagging':
            self.hyperparams = info['cb'].copy()
            self.hyperparams['random_seed'] = np.random.randint(0, 2020)
            run_num = self.explore_params_round

        if run_num == self.explore_params_round:
            print('cb explore_params_round')
            train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)

            self.import_cols = info['imp_cols']

            if train_x.shape[1] > 300 and train_x.shape[0] > 20000:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round sample 300 cols')
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round sample 2w samples')

            elif train_x.shape[0] > 20000:
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round sample 2w samples')

            elif train_x.shape[1] > 300:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round sample 300 cols')

            self.bayes_opt(train_x, val_x, train_y, val_y, cat)
            self.early_stop_opt(train_x, val_x, train_y, val_y, cat)

            info['cb'] = self.hyperparams.copy()

        train_x, train_y = X.loc[train_idxs], y[train_idxs]
        if run_num == self.all_data_round:
            print('cb all data round')
            all_train_idxs = dataloader['all_train_idxs']
            train_x = X.loc[all_train_idxs]
            train_y = y[all_train_idxs]

        if self.is_multi_label:
            for cls in range(self.num_class):
                cls_y = train_y[:, cls]
                self.models[cls] = CatBoostClassifier(**{
                    **self.params,
                    **self.hyperparams
                })
                self.models[cls].fit(train_x, cls_y)
        else:
            self._model = CatBoostClassifier(**{
                **self.params,
                **self.hyperparams
            })
            self._model.fit(train_x, ohe2cat(train_y))

    #@timeit
    def epoch_valid(self, dataloader):
        X, y, val_idxs = dataloader['X'], dataloader['y'], dataloader[
            'val_idxs']
        val_x, val_y = X.loc[val_idxs], y[val_idxs]
        if not self.is_multi_label:
            preds = self._model.predict_proba(val_x)
        else:
            all_preds = []
            for cls in range(y.shape[1]):
                preds = self.models[cls].predict_proba(val_x)
                all_preds.append(preds[:, 1])
            preds = np.stack(all_preds, axis=1)
        valid_auc = roc_auc_score(val_y, preds)
        return valid_auc

    #@timeit
    def predict(self, dataloader):
        X, test_idxs = dataloader['X'], dataloader['test_idxs']
        test_x = X.loc[test_idxs]
        if not self.is_multi_label:
            return self._model.predict_proba(test_x)
        else:
            all_preds = []
            for cls in range(self.num_class):
                preds = self.models[cls].predict_proba(test_x)
                all_preds.append(preds[:, 1])
            return np.stack(all_preds, axis=1)

    #@timeit
    def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories):
        if self.is_multi_label:
            y_train = y_train[:, 1]
            y_eval = y_eval[:, 1]
        else:
            y_train = ohe2cat(y_train)

        space = {
            "learning_rate":
            hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
            "depth":
            hp.choice("depth", [4, 6, 8, 10, 12]),
            "l2_leaf_reg":
            hp.uniform('l2_leaf_reg', 0.1, 2),
        }

        def objective(hyperparams):
            # merge the sampled search-space values into the model defaults
            _hyperparams = self.hyperparams.copy()
            _hyperparams.update(hyperparams)
            _hyperparams['iterations'] = 300
            model = CatBoostClassifier(**{**self.params, **_hyperparams})
            model.fit(X_train, y_train)
            pred = model.predict_proba(X_eval)

            if self.is_multi_label:
                score = roc_auc_score(y_eval, pred[:, 1])
            else:
                score = roc_auc_score(y_eval, pred)

            return {'loss': -score, 'status': STATUS_OK}

        trials = Trials()
        best = hyperopt.fmin(fn=objective,
                             space=space,
                             trials=trials,
                             algo=tpe.suggest,
                             max_evals=15,
                             verbose=1,
                             rstate=np.random.RandomState(1))

        self.hyperparams.update(space_eval(space, best))
        log("auc = {}, hyperparams: {}".format(
            -trials.best_trial['result']['loss'], self.hyperparams))

    def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
        if self.is_multi_label:
            y_train = y_train[:, 1]
            y_eval = y_eval[:, 1]
        else:
            y_train = ohe2cat(y_train)
            y_eval = ohe2cat(y_eval)

        model = CatBoostClassifier(**{**self.params, **self.hyperparams})
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_eval, y_eval)],
                  use_best_model=True,
                  verbose=10,
                  early_stopping_rounds=20)

        self.hyperparams['iterations'] = model.best_iteration_
        log('best iterations: {}'.format(model.best_iteration_))

    def split_data(self, x, y):
        new_x = x.copy()
        new_x.reset_index(drop=True, inplace=True)
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
        self.splits = {}
        i = 0
        for train_idxs, val_idxs in sss.split(new_x, y):
            self.splits[i] = [train_idxs, val_idxs]
            i += 1
        new_train_x = new_x.loc[self.splits[0][0]]
        new_train_y = y[self.splits[0][0]]

        new_val_x = new_x.loc[self.splits[0][1]]
        new_val_y = y[self.splits[0][1]]

        return new_train_x, new_train_y, new_val_x, new_val_y
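
# For reference, epoch_train / epoch_valid / predict above read their inputs from
# a dataloader dict; the keys below are taken from those lookups, while the
# example values are assumptions:
# dataloader = {
#     'X': features_df,            # pd.DataFrame of features
#     'y': onehot_labels,          # one-hot np.ndarray of targets
#     'train_idxs': train_idxs,    # row indices used for training
#     'all_train_idxs': all_idxs,  # train + validation rows
#     'val_idxs': val_idxs,        # validation rows
#     'test_idxs': test_idxs,      # test rows
#     'cat_cols': cat_cols,        # categorical column names
# }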