def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):
    """Fit a one-vs-all CatBoost classifier on one fold and score two sets.

    Parameters:
        kfold_X_train: training features of the current fold.
        y_train: training labels of the current fold.
        kfold_X_valid: validation features of the current fold.
        y_test: accepted for interface compatibility; not used here.
        test: features of the hold-out / submission set.

    Returns:
        Tuple ``(validation probabilities, test probabilities, fitted model)``.
    """
    hyperparams = {
        'loss_function': 'MultiClassOneVsAll',
        'learning_rate': 0.07940735491731761,
        'depth': 8,
    }
    classifier = CatBoostClassifier(**hyperparams)
    classifier.fit(kfold_X_train, y_train)

    # Predict on the validation fold first, then on the test set.
    valid_proba = classifier.predict_proba(kfold_X_valid)
    test_proba = classifier.predict_proba(test)
    return valid_proba, test_proba, classifier
def cleaning_comments(raw_comments, path='.') -> str:
    """Split a raw comments CSV into "cleaned" and "bad" CSV files.

    The raw file is read with pandas, its ``text`` column is vectorised with
    the stored vectorizer (``<path>/trash_vectorizer``) and scored with the
    stored CatBoost model (``<path>/trash_model``). A row whose first
    predicted probability is below 0.6 goes to ``bad_comments.csv`` with
    ``status`` 1; every other row goes to ``cleaned_comments.csv`` with
    ``status`` 0. Both files have the columns ``likes,status,text``.

    The files are written through ``DataFrame.to_csv`` so that quotes,
    commas and newlines inside ``text`` are escaped correctly (the previous
    hand-built strings produced malformed CSV for such rows).

    Parameters:
        raw_comments: path of the raw CSV; it must contain ``likes`` and
            ``text`` columns. The file is deleted after processing.
        path: directory holding the model/vectorizer and receiving the
            two output files.

    Returns:
        Path of the cleaned comments file.
    """
    print('start cleaning of comments...')
    raw = pd.read_csv(raw_comments)
    cleaned_comments = os.path.join(path, 'cleaned_comments.csv')
    bad_comments = os.path.join(path, 'bad_comments.csv')

    model = CatBoostClassifier().load_model(os.path.join(path, 'trash_model'))
    vectorizer = joblib.load(os.path.join(path, 'trash_vectorizer'))
    hyp = model.predict_proba(vectorizer.transform(raw.text).toarray())

    # One flag per input row, aligned on the frame's own index.
    is_bad = pd.Series([row[0] < 0.6 for row in hyp],
                       index=raw.index, dtype=bool)
    columns = ['likes', 'status', 'text']
    raw[is_bad].assign(status=1)[columns].to_csv(bad_comments, index=False)
    raw[~is_bad].assign(status=0)[columns].to_csv(cleaned_comments,
                                                  index=False)

    # The raw input is consumed by this step.
    os.remove(raw_comments)
    print('end cleaning of comments...')
    return cleaned_comments
def test_ntree_limit():
    """Check probabilities predicted with only the first 10 trees.

    Fits a 100-iteration classifier on the training pool, predicts the test
    pool with ``ntree_end=10``, saves the result to ``PREDS_PATH`` and
    returns it as a canonical file for comparison.
    """
    fit_data = Pool(TRAIN_FILE, column_description=CD_FILE)
    eval_data = Pool(TEST_FILE, column_description=CD_FILE)

    clf = CatBoostClassifier(iterations=100, random_seed=0)
    clf.fit(fit_data)

    truncated_proba = np.array(clf.predict_proba(eval_data, ntree_end=10))
    np.save(PREDS_PATH, truncated_proba)
    return local_canonical_file(PREDS_PATH)
def test_multiclass():
    """Check that a saved and reloaded MultiClass model still predicts.

    A 2-iteration MultiClass classifier is fitted, saved to
    ``OUTPUT_MODEL_PATH`` and loaded into a fresh classifier; the reloaded
    model's probabilities on the training pool are saved to ``PREDS_PATH``
    and returned as a canonical file for comparison.
    """
    data = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)

    settings = dict(iterations=2, random_seed=0,
                    loss_function='MultiClass', thread_count=8)
    trained = CatBoostClassifier(**settings)
    trained.fit(data)
    trained.save_model(OUTPUT_MODEL_PATH)

    # Round-trip through disk and predict with the restored model only.
    restored = CatBoostClassifier()
    restored.load_model(OUTPUT_MODEL_PATH)
    proba = np.array(restored.predict_proba(data))
    np.save(PREDS_PATH, proba)
    return local_canonical_file(PREDS_PATH)
def model_1(X,y,test):
    '''
    Train a CatBoost MultiClass model and return class probabilities.

    CatBoost handles categorical variables itself, so they are not encoded
    here; every column of ``X`` whose dtype is not ``float`` (float64) is
    passed to CatBoost as a categorical feature.

    Parameters:
        X: training feature DataFrame.
        y: training labels.
        test: feature DataFrame to predict on.

    Returns:
        Class-wise prediction probabilities for ``test``.
    '''
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from NumPy; the builtin gives the identical comparison.
    categorical_features_indices = np.where(X.dtypes != float)[0]
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.7, random_state=1234)

    # Build the model. The learning-rate keyword is ``learning_rate``;
    # the former ``learning=0.01`` is not a CatBoost parameter.
    cboost = CatBoostClassifier(iterations=500,
                                learning_rate=0.01,
                                depth=6,
                                loss_function='MultiClass',
                                eval_metric='Accuracy')
    cboost.fit(X_train, y_train,
               cat_features=categorical_features_indices,
               eval_set=(X_validation, y_validation),
               plot=True)

    # Class-wise prediction probabilities of the fitted model.
    pred_prob = cboost.predict_proba(test)
    return pred_prob
class BesCatBoost:
    """CatBoost classifier whose iteration count is chosen by cross-validation.

    Example parameters::

        catboost_params = {
            'iterations': 500,
            'depth': 3,
            'learning_rate': 0.1,
            'eval_metric': 'AUC',
            'random_seed': 42,
            'logging_level': 'Verbose',
            'l2_leaf_reg': 15.0,
            'bagging_temperature': 0.75,
            'allow_writing_files': False,
            'metric_period': 50
        }
    """

    def __init__(self, params, metric='AUC', maximize=True, verbose=True,
                 model=None):
        """Store the configuration.

        Parameters:
            params: CatBoost parameter dict; ``fit`` overwrites its
                ``'iterations'`` entry in place.
            metric: metric name used to look up the ``test-<metric>-mean``
                column of the cross-validation results.
            maximize: True when a larger metric value is better.
            verbose: stored on the instance; not read by this class.
            model: optional already-fitted model.
        """
        self.params = params
        self.metric = metric
        self.maximize = maximize
        self.verbose = verbose
        self.model = model

    def _best_rounds(self, cv_results):
        """Return the iteration count derived from cross-validation results.

        The row with the best mean test metric is located (largest value
        when ``maximize`` is true, smallest otherwise) and its index is
        scaled by 1.5, plus one.
        """
        scores = cv_results['test-{}-mean'.format(self.metric)]
        # Honour ``maximize``: loss-type metrics need the minimum.
        best_idx = scores.idxmax() if self.maximize else scores.idxmin()
        return int(best_idx * 1.5) + 1

    def fit(self, X_train, y_train):
        """Cross-validate to pick the iteration count, then fit on all data."""
        bst = cv(
            Pool(X_train, y_train),
            self.params
        )

        best_rounds = self._best_rounds(bst)
        print('Best Iteration: {}'.format(best_rounds))

        self.params['iterations'] = best_rounds
        self.model = CatBoostClassifier(**self.params)

        self.model.fit(
            X_train, y_train
        )

    def predict(self, X_test):
        """Return the probability of the last class for each row."""
        pred_prob = self.model.predict_proba(X_test)[:, -1]
        return pred_prob

    def feature_importance(self):
        """Not implemented."""
        pass

    @staticmethod
    def find_best_params(kag):
        """Not implemented."""
        pass
#print(log) #pred2 = model2_val.predict(val_df_x) #acc2 = accuracy_score(val_df_y, pred2) #print('Accuracy: ', acc2) model2 = CatBoostClassifier(depth=8, iterations=1000, learning_rate=0.02, eval_metric='MultiClass', loss_function='MultiClass', bootstrap_type= 'Bernoulli', leaf_estimation_method='Gradient', random_state=123) model2.fit(train_x, train_y, verbose=100) pred2 = model2.predict_proba(test_x) # Submission sub2 = sample_df.copy() sub2.iloc[:, 1:] = pred2.data sub2.to_csv("submission2_T06.csv",index=False) def generate(main, support, coeff): g = main.copy() for i in main.columns[1:]: res = [] lm, ls = [], [] lm = main[i].tolist() ls = support[i].tolist()
def make_cat_oof_prediction(train, y, test, features,
                            categorical_features=None, model_params=None,
                            folds=10, is_optuna=False):
    """Train CatBoost with stratified K-fold and build out-of-fold predictions.

    Parameters:
        train: training DataFrame.
        y: binary target aligned row-by-row with ``train``.
        test: test DataFrame.
        features: list of feature column names to use.
        categorical_features: passed to ``fit`` as ``cat_features``.
        model_params: keyword arguments for ``CatBoostClassifier``;
            ``None`` means CatBoost defaults.
        folds: number of stratified folds.
        is_optuna: when true, the wandb logging at the end is skipped.

    Returns:
        Tuple ``(y_oof, test_preds, fi)``: out-of-fold probabilities for
        the training rows, fold-averaged test probabilities, and a
        DataFrame of per-fold and mean feature importances.
    """
    # ``**None`` raises TypeError, so fall back to an empty parameter set.
    model_params = {} if model_params is None else model_params

    x_train = train[features]
    x_test = test[features]
    # The splitter yields positional indices; index the target by position
    # so a non-default index on ``y`` cannot misalign the folds.
    y_values = np.asarray(y)

    # Holds the averaged predictions for the test data.
    test_preds = np.zeros(x_test.shape[0])

    # Holds the out-of-fold validation predictions.
    y_oof = np.zeros(x_train.shape[0])

    # Accumulates the mean validation score across folds.
    score = 0

    # Data frame that collects the feature importances.
    fi = pd.DataFrame()
    fi['feature'] = features

    # Stratified K-fold splitter.
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)

    for fold, (tr_idx, val_idx) in enumerate(skf.split(x_train, y_values)):
        # Split the training data by position (``iloc``, not label-based
        # ``loc``) into this fold's train and validation parts.
        x_tr, x_val = x_train.iloc[tr_idx], x_train.iloc[val_idx]
        y_tr, y_val = y_values[tr_idx], y_values[val_idx]

        print(
            f'fold: {fold+1}, x_tr.shape: {x_tr.shape}, x_val.shape: {x_val.shape}'
        )

        # Train the CatBoost model.
        clf = CatBoostClassifier(**model_params)
        clf.fit(
            x_tr,
            y_tr,
            eval_set=(x_val, y_val),  # lets CatBoost measure validation performance
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True)

        # Predict the validation part and store it at its positions.
        val_preds = clf.predict_proba(x_val)[:, 1]
        y_oof[val_idx] = val_preds

        # Per-fold validation score, computed once and reused.
        fold_auc = roc_auc_score(y_val, val_preds)
        print(f"Fold {fold + 1} | AUC: {fold_auc}")
        print('-' * 80)

        # Accumulate the fold-averaged validation score.
        score += fold_auc / folds

        # Predict the test data and accumulate the fold average.
        test_preds += clf.predict_proba(x_test)[:, 1] / folds

        # Store this fold's feature importances.
        fi[f'fold_{fold+1}'] = clf.feature_importances_

        del x_tr, x_val, y_tr, y_val
        gc.collect()

    oof_auc = roc_auc_score(y_values, y_oof)
    print(f"\nMean AUC = {score}")  # mean of the per-fold validation scores
    print(f"OOF AUC = {oof_auc}")  # score of the out-of-fold predictions

    # wandb
    if not is_optuna:
        wandb.log({'[cat]perdictiona Mean AUC': score})
        wandb.log({'[cat]perdictiona OOF AUC': oof_auc})

    # Average the per-fold feature importances.
    fi_cols = [col for col in fi.columns if 'fold_' in col]
    fi['importance'] = fi[fi_cols].mean(axis=1)

    return y_oof, test_preds, fi
# Train a CatBoost model with early stopping on the validation set, plot the
# 30 most important features, report validation AUC and write a submission.
cb_params = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=7,
    l2_leaf_reg=40,
    bootstrap_type='Bernoulli',
    subsample=0.7,
    scale_pos_weight=5,
    eval_metric='AUC',
    metric_period=50,
    od_type='Iter',
    od_wait=45,
    random_seed=17,
    allow_writing_files=False,
)
cb_model = CatBoostClassifier(**cb_params)
cb_model.fit(X_train,
             y_train,
             eval_set=(X_valid, y_valid),
             cat_features=cat_features_inds,
             use_best_model=True,
             verbose=True)

# Sort ascending so the largest importances end up last, then keep those 30.
fea_imp = pd.DataFrame({'imp': cb_model.feature_importances_, 'col': cols})
fea_imp = fea_imp.sort_values(['imp', 'col'], ascending=[True, False])
fea_imp = fea_imp.iloc[-30:]
_ = fea_imp.plot(kind='barh', x='col', y='imp', figsize=(20, 10))
plt.savefig('catboost_feature_importance.png')

valid_proba = cb_model.predict_proba(X_valid)[:, 1]
print('AUC:', roc_auc_score(y_valid, valid_proba))

# Positive-class probabilities for the test data go into the submission.
y_preds = cb_model.predict_proba(data_test)[:, 1]
subm['TARGET'] = y_preds
subm.to_csv('submission.csv', index=False)
# Predictions of the first logistic-regression model.
lr1_pred_p = lr1.predict_proba(X_test)
y_pred = lr1.predict(X_test)

### CatBoost ###
cat = CatBoostClassifier(iterations=2,
                         depth=2,
                         learning_rate=1,
                         loss_function='Logloss',
                         verbose=True)
cat.fit(X_train, y_train)
preds_class = cat.predict(X_test)
preds_proba = cat.predict_proba(X_test)

# Collect the test score and a per-feature weight vector for every model.
a_scores = {}
f_scores = {}
models = {'lr1': lr1, 'lr2': lr2, 'rf': rf, 'cat': cat}
for model_str, model in models.items():
    a_scores[model_str] = model.score(X_test, y_test)
    # Keys containing 'l' (lr1, lr2) expose coefficients; the remaining
    # models expose feature importances.
    f_scores[model_str] = (model.coef_ if 'l' in model_str
                           else model.feature_importances_)

# One row per model, one column per feature.
flat_scores = [arr.flatten() for arr in f_scores.values()]
df_f_scores = pd.DataFrame(flat_scores,
                           index=f_scores.keys(),
                           columns=X_cols)
l2_leaf_reg=3, depth=4, loss_function="Logloss", verbose=False, random_state=random_state, ) # %% xgb_model.fit(X_train, y_train) rf_model.fit(X_train, y_train) cat_model.fit(X_train, y_train) xgb_pred = xgb_model.predict_proba(X_validation) rf_pred = rf_model.predict_proba(X_validation) cat_pred = cat_model.predict_proba(X_validation) # %% ROC fig, ax = plt.subplots() skplt.metrics.plot_roc( y_validation, xgb_pred, plot_micro=False, plot_macro=False, classes_to_plot=[1], ax=ax, cmap="Blues", ) skplt.metrics.plot_roc( y_validation,
def main():
    """Grid-search CatBoost hyper-parameters, refit, and write predictions.

    Loads the train/valid/test CSVs and the precomputed t-SNE features,
    tunes a ``CatBoostClassifier`` with ``GridSearchCV`` on train+valid,
    refits the best configuration on the training split, prints the
    validation log-loss and saves the test probabilities as a CSV under
    ``predictions/``.
    """
    import os

    # load data
    df_train = pd.read_csv('data/train_data.csv')
    df_valid = pd.read_csv('data/valid_data.csv')
    df_test = pd.read_csv('data/test_data.csv')

    feature_cols = [f for f in list(df_train) if "feature" in f]
    target_col = df_train.columns[-1]

    X_train = df_train[feature_cols].values
    y_train = df_train[target_col].values

    X_valid = df_valid[feature_cols].values
    y_valid = df_valid[target_col].values

    X_test = df_test[feature_cols].values

    tsne_3d_50p = np.load('data/tsne_3d_50p.npz')
    tsne_3d_50p_train = tsne_3d_50p['train']
    tsne_3d_50p_valid = tsne_3d_50p['valid']
    tsne_3d_50p_test = tsne_3d_50p['test']

    X_train_concat = np.concatenate((X_train, tsne_3d_50p_train), axis=1)
    X_valid_concat = np.concatenate((X_valid, tsne_3d_50p_valid), axis=1)
    X_test_concat = np.concatenate((X_test, tsne_3d_50p_test), axis=1)

    parameter_grid = [
        {
            'learning_rate': [0.005, 0.008, 0.01, 0.03, 0.05],
            'iterations': [500, 1000, 1500, 2000],
            'depth': [6, 7, 8, 9, 10]
        },
        {
            'learning_rate': [0.01, 0.02, 0.03],
            'border_count': range(32, 40),
            'l2_leaf_reg': range(3, 5)
        },
    ]

    # The search runs on train + valid; reuse the matrices built above
    # instead of concatenating the same arrays a second time.
    X_search = np.concatenate([X_train_concat, X_valid_concat], axis=0)
    y_search = np.concatenate([y_train, y_valid], axis=0)

    classifier = CatBoostClassifier(n_jobs=1)
    search = GridSearchCV(classifier, parameter_grid, verbose=1)

    print('Tuning hyperparameters...')
    search.fit(X_search, y_search)
    print('Found best parameters:')
    print(search.best_score_)
    print(search.best_params_)

    # The two grids use different parameter names, so pass whatever the
    # winning combination contains; indexing 'iterations'/'depth' directly
    # raised KeyError whenever the second grid won.
    classifier = CatBoostClassifier(**search.best_params_)

    print('Fitting...')
    start_time = time.time()
    classifier.fit(X_train_concat, y_train)
    print('Fit: {}s'.format(time.time() - start_time))

    p_valid = classifier.predict_proba(X_valid_concat)
    loss = log_loss(y_valid, p_valid)
    print('Loss: {}'.format(loss))

    p_test = classifier.predict_proba(X_test_concat)
    df_pred = pd.DataFrame({'id': df_test['id'], 'probability': p_test[:, 1]})

    # Do not lose the result of a long search to a missing output directory.
    os.makedirs('predictions', exist_ok=True)
    csv_path = 'predictions/predictions_{}_{}.csv'.format(
        int(time.time()), loss)
    df_pred.to_csv(csv_path, columns=('id', 'probability'), index=None)
    print('Saved: {}'.format(csv_path))
class CXDetector:
    """Window-based detector that classifies signal windows with CatBoost.

    Signals are cut into fixed-size windows, per-channel and multi-channel
    features are extracted from every window, merged with per-file clinical
    variables, filtered with a relevance table and fed to a
    ``CatBoostClassifier``. The class follows the usual ``fit`` / ``predict``
    estimator shape but works on lists of files rather than matrices.
    """

    # "sklearn estimator"
    def __init__(self, sample_freq, low_freq, high_freq, window_size,
                 window_shift, shap_window_size, shap_window_shift,
                 read_signal_fn, read_clin_fn, proc_clin_fn):
        """Store the configuration and the feature extractors to use.

        Parameters:
            sample_freq, low_freq, high_freq: forwarded to ``read_signal_fn``.
            window_size, window_shift: size and shift of the regular windows.
            shap_window_size, shap_window_shift: size and shift of the
                shorter windows the ``short_features`` extractors are
                fitted on.
            read_signal_fn: ``fn(file, low_freq, high_freq, sample_freq)``
                returning the signals followed by the annotation intervals.
            read_clin_fn: ``fn(file)`` returning ``(names, values)`` of the
                clinical variables.
            proc_clin_fn: ``fn(DataFrame)`` post-processing the clinical
                table.
        """
        self.sample_freq = sample_freq
        self.low_freq = low_freq
        self.high_freq = high_freq
        self.window_size = window_size
        self.window_shift = window_shift
        self.shap_window_size = shap_window_size
        self.shap_window_shift = shap_window_shift
        self.read_signal_fn = read_signal_fn
        self.read_clin_fn = read_clin_fn
        self.proc_clin_fn = proc_clin_fn

        # Features to apply on the shorter normalized windows
        self.short_features = [
            GENDISFeatures,
        ]

        # Features using 1 channel to apply on the normal windows
        self.features = [
            # Spectral features
            BOSSFeatures,
            BasicFrequencyFeatures,

            # Temporal features
            BasicFeatures,
            RMSFeatures,
            #TSFRESHFeatures,
        ]

        # Features using multiple channels
        self.multi_channel_features = [
            # Correlations
            CorrFeatures()
        ]

    @staticmethod
    def _append_windows(target, windows, indices, file, label):
        """Append extracted windows and their bookkeeping to ``target``."""
        target.windows.extend(windows)
        target.indices.extend(indices)
        target.files.extend([file] * len(windows))
        target.labels.extend([label] * len(windows))

    def _collect_windows(self, signals, start, stop, file, label,
                         window_data, shap_window_data):
        """Extract regular and short windows of one segment and store both."""
        windows, idx = extract_windows(signals, start, stop,
                                       self.window_size, self.window_shift)
        self._append_windows(window_data, windows, idx, file, label)

        shap_windows, shap_idx = extract_windows(signals, start, stop,
                                                 self.shap_window_size,
                                                 self.shap_window_shift)
        self._append_windows(shap_window_data, shap_windows, shap_idx,
                             file, label)

    def _read_clinical(self, files):
        """Read the clinical variables of ``files`` into a processed table."""
        clin_features = []
        for file in files:
            names, values = self.read_clin_fn(file)
            clin_features.append([file] + values)
        clin_df = pd.DataFrame(clin_features, columns=['file'] + names)
        return self.proc_clin_fn(clin_df)

    @staticmethod
    def _to_frame(features, names, ch):
        """Wrap a feature matrix in a DataFrame with ``<name>_ch<ch>`` columns."""
        return pd.DataFrame(
            features, columns=['{}_ch{}'.format(x, ch) for x in names])

    def prep_data(self, files, train=True):
        """Read the signals of ``files`` and cut them into windows.

        In training mode the annotation intervals are consumed pairwise;
        a pair is used only when the first annotation text ends in 'C' or
        'D' and both positions lie inside the signal. The label is 1 for
        'C' and 0 otherwise. Outside training mode the whole signal is
        windowed and the labels are ``None``.

        Returns:
            Tuple ``(window_data, shap_window_data)`` of ``WindowData``
            objects whose ``windows``, ``files`` and ``indices`` have been
            converted to numpy arrays.
        """
        # Read in our data
        all_signals = []
        all_intervals = []
        for file in files:
            *_signals, _intervals = self.read_signal_fn(
                file, self.low_freq, self.high_freq, self.sample_freq)
            all_signals.append(_signals)
            all_intervals.append(_intervals)

        # Extract windows from the data
        window_data = WindowData()
        shap_window_data = WindowData()
        for file, signals, intervals in zip(files, all_signals,
                                            all_intervals):
            if train:
                for ann1, ann2 in zip(intervals[::2], intervals[1::2]):
                    if (ann1[1][-1] not in ['C', 'D']
                            or ann2[0] >= len(signals[0]) or ann1[0] < 0):
                        continue

                    label = int(ann1[1][-1] == 'C')
                    self._collect_windows(signals, ann1[0], ann2[0], file,
                                          label, window_data,
                                          shap_window_data)
            else:
                self._collect_windows(signals, 0, len(signals[0]), file,
                                      None, window_data, shap_window_data)

        window_data.windows = np.array(window_data.windows)
        window_data.files = np.array(window_data.files)
        window_data.indices = np.array(window_data.indices)

        shap_window_data.windows = np.array(shap_window_data.windows)
        shap_window_data.files = np.array(shap_window_data.files)
        shap_window_data.indices = np.array(shap_window_data.indices)

        return window_data, shap_window_data

    def get_corr_features(self, X):
        """Get all coordinates in the correlation matrix of X whose absolute
        value exceeds 0.99, excluding elements on the diagonal.

        Parameters:
        -----------
        - X: pd.DataFrame
            the feature matrix where correlated features need to be removed

        Returns
        -------
        - correlated_feature_pairs: set of tuples
            coordinates (row, col) where correlated features can be found
        """
        row_idx, col_idx = np.where(np.abs(X.corr()) > 0.99)
        self_corr = {(i, i) for i in range(X.shape[1])}
        correlated_feature_pairs = set(zip(row_idx, col_idx)) - self_corr
        return correlated_feature_pairs

    def get_uncorr_features(self, data):
        """Remove clusters of correlated features, until only one feature
        per cluster remains.

        Parameters:
        -----------
        - data: pd.DataFrame
            the feature matrix where correlated features need to be removed

        Returns
        -------
        - data_uncorr_cols: list of string
            the column names that are kept
        """
        X_train_corr = data.copy()
        correlated_features = self.get_corr_features(X_train_corr)

        # Positions of every column that takes part in a correlated pair.
        corr_cols = set()
        for row_idx, col_idx in correlated_features:
            corr_cols.add(row_idx)
            corr_cols.add(col_idx)

        uncorr_cols = list(
            set(X_train_corr.columns) -
            set(X_train_corr.columns[list(corr_cols)]))

        # Restrict the frame to the correlated columns; the pairs are
        # recomputed so their positions refer to the reduced frame.
        col_mask = [False] * X_train_corr.shape[1]
        for col in corr_cols:
            col_mask[col] = True
        X_train_corr = X_train_corr.loc[:, col_mask]

        correlated_features = self.get_corr_features(X_train_corr)

        # For every column that is still kept, drop all its partners.
        to_remove = set()
        for corr_row, corr_col in correlated_features:
            if corr_row in to_remove:
                continue

            for corr_row2, corr_col2 in correlated_features:
                if corr_row == corr_row2:
                    to_remove.add(corr_col2)
                elif corr_row == corr_col2:
                    to_remove.add(corr_row2)

        col_mask = [True] * X_train_corr.shape[1]
        for ix in to_remove:
            col_mask[ix] = False

        X_train_corr = X_train_corr.loc[:, col_mask]

        data_uncorr_cols = list(set(list(X_train_corr.columns) + uncorr_cols))

        return data_uncorr_cols

    def remove_features(self, data):
        """Find all correlated features and columns with only a single value.

        Parameters:
        -----------
        - data: pd.DataFrame
            the feature matrix where correlated features need to be removed

        Returns
        -------
        - useless_cols: list of string
            list of column names that have no predictive value
        """
        single_cols = list(data.columns[data.nunique() == 1])

        uncorr_cols = self.get_uncorr_features(data)
        corr_cols = list(set(data.columns) - set(uncorr_cols))

        useless_cols = list(set(single_cols + corr_cols))

        print('Removing {} features'.format(len(useless_cols)))

        return useless_cols

    def fit(self, train_files):
        """Extract features from ``train_files`` and fit the classifier.

        Sets ``feature_extractors_per_channel``,
        ``short_feature_extractors_per_channel``, ``rel_features`` and
        ``clf`` on the instance.

        Returns:
            The full training feature table (including the ``file`` column
            and the merged clinical variables).
        """
        window_data, shap_window_data = self.prep_data(train_files)

        # Extract clinical variables
        clin_df = self._read_clinical(train_files)

        n_channels = window_data.windows.shape[1]

        # Extract features for each channel separately
        features_per_channel = []
        self.feature_extractors_per_channel = {}
        for ch in range(n_channels):
            self.feature_extractors_per_channel[ch] = [
                feature_extractor() for feature_extractor in self.features
            ]

            channel_features = []
            for f in self.feature_extractors_per_channel[ch]:
                features = f.fit_transform(window_data.windows[:, ch, :],
                                           window_data.labels)
                channel_features.append(
                    self._to_frame(features, f.names_, ch))
            features_per_channel.append(pd.concat(channel_features, axis=1))

        short_features_per_channel = []
        self.short_feature_extractors_per_channel = {}
        for ch in range(n_channels):
            self.short_feature_extractors_per_channel[ch] = [
                feature_extractor()
                for feature_extractor in self.short_features
            ]

            channel_features = []
            for f in self.short_feature_extractors_per_channel[ch]:
                # Fitted on the short windows, applied to the regular ones.
                f.fit(shap_window_data.windows[:, ch, :],
                      shap_window_data.labels)
                features = f.transform(window_data.windows[:, ch, :],
                                       window_data.labels)
                channel_features.append(
                    self._to_frame(features, f.names_, ch))
            short_features_per_channel.append(
                pd.concat(channel_features, axis=1))

        # NOTE(review): multi-channel columns have always been suffixed
        # with the index of the last channel (the loop variable leaked from
        # the loops above). The suffix is kept, now explicitly, so column
        # names stay identical between fit and predict.
        last_ch = n_channels - 1
        features_multi_channel = []
        for f in self.multi_channel_features:
            features = f.fit_transform(window_data.windows,
                                       window_data.labels)
            features_multi_channel.append(
                self._to_frame(features, f.names_, last_ch))

        # Concatenate the features of different channels together
        train_features = pd.concat(features_per_channel +
                                   short_features_per_channel +
                                   features_multi_channel,
                                   axis=1)
        train_features['file'] = window_data.files
        train_features = train_features.merge(clin_df, on='file')

        # Create our X and y
        X_train = train_features
        y_train = np.array(window_data.labels)
        for col in ['ID', 'file']:
            if col in X_train.columns:
                X_train = X_train.drop(col, axis=1)
        X_train = X_train.astype(float)

        # useless_features = self.remove_features(X_train)
        # X_train = X_train.drop(useless_features, axis=1)

        # Now apply hypothesis testing on remaining features
        rel_table = calculate_relevance_table(X_train, pd.Series(y_train))
        self.rel_features = list(rel_table[rel_table['p_value'] <= 0.05].index)
        X_train = X_train[self.rel_features]

        # Create validation set for early stopping: hold out whole files
        # (10% of them) rather than individual windows.
        val_files = np.random.choice(train_files,
                                     size=int(0.1 * len(train_files)),
                                     replace=False)
        val_mask = np.isin(window_data.files, val_files)
        X_val = X_train.loc[val_mask, :]
        y_val = y_train[val_mask]
        X_train = X_train.loc[~val_mask, :]
        y_train = y_train[~val_mask]

        # Fit our gradient boosting classifier
        self.clf = CatBoostClassifier(
            iterations=10000,
            od_type='Iter',
            od_wait=50,
            objective='CrossEntropy',
            random_seed=2018,
            #eval_metric='AUC',
            use_best_model=True,
            task_type='CPU')
        self.clf.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=100)

        return train_features

    def predict(self, test_files):
        """Predict a positive-class probability per time index and file.

        Returns:
            DataFrame with the columns ``index``, ``file`` and ``pred``.
            Within each file the window predictions are written at the
            window indices and forward-filled over the following indices.
        """
        # TODO: Take means of all predictions on same timepoint
        window_data, shap_window_data = self.prep_data(test_files,
                                                       train=False)

        # Extract clinical variables
        clin_df = self._read_clinical(test_files)

        n_channels = window_data.windows.shape[1]

        # Extract features for each channel separately
        features_per_channel = []
        for ch in range(n_channels):
            channel_features = []
            for f in self.feature_extractors_per_channel[ch]:
                features = f.transform(window_data.windows[:, ch, :])
                channel_features.append(
                    self._to_frame(features, f.names_, ch))
            features_per_channel.append(pd.concat(channel_features, axis=1))

        short_features_per_channel = []
        for ch in range(n_channels):
            channel_features = []
            for f in self.short_feature_extractors_per_channel[ch]:
                features = f.transform(window_data.windows[:, ch, :])
                channel_features.append(
                    self._to_frame(features, f.names_, ch))
            short_features_per_channel.append(
                pd.concat(channel_features, axis=1))

        # Same last-channel suffix as in ``fit`` so the names line up.
        last_ch = n_channels - 1
        features_multi_channel = []
        for f in self.multi_channel_features:
            features = f.transform(window_data.windows)
            features_multi_channel.append(
                self._to_frame(features, f.names_, last_ch))

        # Concatenate the features of different channels together
        test_features = pd.concat(features_per_channel +
                                  short_features_per_channel +
                                  features_multi_channel,
                                  axis=1)
        test_features['file'] = window_data.files
        test_features = test_features.merge(clin_df, on='file')

        all_preds = []
        for file in test_files:
            X_test = test_features[test_features['file'] == file]
            test_ix = window_data.indices[window_data.files == file].flatten()
            for col in ['ID', 'file']:
                if col in X_test.columns:
                    X_test = X_test.drop(col, axis=1)
            X_test = X_test[self.rel_features]

            preds = self.clf.predict_proba(X_test)[:, 1]  #.reshape(-1, 1)

            pred_df = pd.DataFrame(
                list(range(max(test_ix) + self.window_size)),
                columns=['index'])
            pred_df['file'] = file
            # ``np.NaN`` was removed in NumPy 2.0; ``np.nan`` is the
            # canonical spelling on every NumPy version.
            pred_df['pred'] = np.nan
            pred_df = pred_df.set_index('index', drop=True)
            pred_df.loc[test_ix, 'pred'] = preds
            pred_df = pred_df.ffill().reset_index()
            all_preds.append(pred_df)

        return pd.concat(all_preds).reset_index(drop=True)
# temp = pd.DataFrame()
# temp = score[:60]
# color = cm.jet(temp['fea']/temp['fea'].max())
# plt.figure(figsize=(25, 17))
# plt.barh(temp['fea_name'],temp['fea'],height =0.8,color=color,alpha=0.8)
# # plt.show()
# plt.savefig('huawei/feature_weight.jpg')

# Train the model used for the online submission: same settings, with the
# iteration count taken from the previously fitted ``clf``.
final_params = dict(
    iterations=clf.best_iteration_,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    eval_metric='AUC',
    task_type='GPU',
    metric_period=50,
)
clf1 = CatBoostClassifier(**final_params)
clf1.fit(train_df[feature], train_df['label'].astype('int32'), verbose=True)
y_pre = clf1.predict_proba(test_df[feature])[:, 1]

# Persist the fitted model.
import pickle
with open('/data/mengyuan/huawei/model/catbase.pkl', 'wb') as f:
    pickle.dump(clf1, f)
print('save catebase model to /data/mengyuan/huawei/model/catbase.pkl !!')

# Write the submission: one positive-class probability per test id.
res = pd.DataFrame()
res['id'] = test_df['id'].astype('int32')
res['probability'] = y_pre
res.to_csv('/data/mengyuan/huawei/ensemble/submission_catbase.csv',
           index=False)
depth=12, ) cbt_model.fit(train_pool, eval_set=eval_pool, verbose=100) # with open('./models/fold%d_cbt_v1.mdl' % index, 'wb') as file: # pickle.dump(cbt_model, file) else: with open('./models/fold%d_cbt_v1.mdl' % index, 'rb') as file: cbt_model = pickle.load(file) imp['score%d' % (index + 1)] = cbt_model.feature_importances_ score = cbt_model.best_score_['validation']['AUC'] scores.append(score) print('fold %d round %d : score: %.6f | mean score %.6f' % (index + 1, cbt_model.best_iteration_, score, np.mean(scores))) preds += cbt_model.predict_proba(test_x) del cbt_model, train_pool, eval_pool del X_train, y_train, X_valid, y_valid import gc gc.collect() # mdls.append(cbt_model) # In[ ]: imp.sort_values(by='score1', ascending=False) # In[ ]: result = invite_info_evaluate[['question_id', 'author_id', 'invite_time']]
t1 = datetime.datetime.now() model = CatBoostClassifier(iterations=100, rsm=rsm, learning_rate=lrn_rt, depth=dep, l2_leaf_reg=l2_reg, random_seed=2) model.fit(X_train, y_train, cat_indices, use_best_model=True, eval_set=(X_val, y_val), logging_level='Silent') # Predicitng and calculating performance on test data predict_prob = model.predict_proba(X_test)[:, 1] pred_list = [ 1 if i > 0.5 else 0 for i in predict_prob.tolist() ] y_list = y_test.tolist() counter = 0 for i in range(len(pred_list)): if pred_list[i] == y_list[i]: counter = counter + 1 accuracy = counter / len(pred_list) result_df_temp = pd.DataFrame(data=None,
eval_dataset = Pool(data=X_test, label=y_test, cat_features=cat_features)

# Initialize CatBoostClassifier
base_settings = {
    'iterations': 100,
    'cat_features': cat_features,
    'depth': 2,
    'loss_function': 'MultiClassOneVsAll',
}
model = CatBoostClassifier(**base_settings)

# Candidate values tried by the hyper-parameter tuning step.
parameters = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50]
}
model._tune_hyperparams(parameters, train_dataset)
model.fit(train_dataset, eval_set=eval_dataset)

# Class labels, probabilities and raw scores on the evaluation pool.
preds_class = model.predict(eval_dataset)
preds_proba = model.predict_proba(eval_dataset)
preds_raw = model.predict(eval_dataset, prediction_type='RawFormulaVal')

model.predict_proba(test_x)  # result is not stored

# The submission carries the predicted class for every test row.
submission = sample.copy()
submission['Crop_Damage'] = model.predict(test_x)
submission.to_csv('cat3.csv', index=False)
# 5-fold stratified training; the test-set probabilities of the fold models
# are averaged into ``prediction_cat``.
skf = StratifiedKFold(n_splits=5, random_state=seeds[4], shuffle=True)
for train, test in skf.split(X, Y):
    #print(index)
    train_x, train_y = X[train], Y[train]
    test_x, test_y = X[test], Y[test]

    cbt_model = CatBoostClassifier(
        iterations=3000,
        learning_rate=0.01,
        max_depth=7,
        verbose=100,
        loss_function='MultiClass',
        early_stopping_rounds=500,
        task_type='CPU',
        eval_metric='Accuracy',
        max_ctr_complexity=4,
    )
    cbt_model.fit(train_x, train_y, eval_set=(test_x, test_y))

    del train_x, test_x, train_y, test_y

    # Each fold contributes one fifth of the final prediction.
    fold_proba = cbt_model.predict_proba(result_test)
    prediction_cat += fold_proba / 5

#def print_best_score(gsearch,param_test):
#    # print the best score
#    print("Best score: %0.3f" % gsearch.best_score_)
#    print("Best parameters set:")
#    # print which parameters the best classifier actually used
#    best_parameters = gsearch.best_estimator_.get_params()
#    for param_name in sorted(param_test.keys()):
#        print("\t%s: %r" % (param_name, best_parameters[param_name]))
#params = {'depth': [4, 7, 10],
#          'learning_rate' : [0.03, 0.1, 0.15,0.5],
#          'l2_leaf_reg': [1,4,9],
#          'iterations': [3000]}
#estimator =CatBoostClassifier(iterations=2000,verbose=400,early_stopping_rounds=400,
# Colour map: the first channel runs 1 -> 0, the second 0 -> 1 and the third
# stays at 1 over the N entries.
vals[:, 0] = np.linspace(1, 0, N)
vals[:, 1] = np.linspace(0, 1, N)
vals[:, 2] = np.linspace(1, 1, N)
newcmp = ListedColormap(vals)

# calculate coordinates grids for surface and frame plotting
n1, n2 = features.index('hp'), features.index('ch!')
vals1, vals2 = np.linspace(0, 3, 40), np.linspace(0, 1, 20)
N1, N2 = np.meshgrid(vals1, vals2)
LO = np.zeros(shape=N1.shape)
X_tmp = X.copy()
for i in range(N1.shape[0]):
    for j in range(N1.shape[1]):
        # Fix both features to the grid point for every row, then take the
        # log-odds of the mean predicted probabilities.
        X_tmp[:, n1], X_tmp[:, n2] = N1[i, j], N2[i, j]
        pr = np.mean(cb_clf.predict_proba(X_tmp), axis=0)
        LO[i, j] = np.log(pr[1] / pr[0])

# PAGE 355. FIGURE 10.8. Partial dependence of the log-odds of spam vs. email
# as a function of joint frequences of hp and the character !.
fig = plt.figure(figsize=(6, 3.75), dpi=150)
ax = fig.add_subplot(111, projection='3d')
ax.xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.set_xlabel('hp', fontsize=8)
ax.set_ylabel('ch!', fontsize=8)
# ``w_xaxis``/``w_yaxis``/``w_zaxis`` were deprecated aliases of the plain
# axis attributes and are gone from recent Matplotlib releases; the plain
# attributes are already used for the pane colours above.
ax.xaxis.line.set_color(GRAY7)
ax.yaxis.line.set_color(GRAY7)
ax.zaxis.line.set_color(GRAY7)
ax.view_init(22, 81)
def main(cfg):
    """Train CatBoost per fold and write OOF, fold and submission files.

    Parameters:
        cfg: mapping with the keys ``'n_splits'`` (number of folds),
            ``'folds'`` (fold indices to actually train), ``'name'``
            (prefix of the output files) and ``'t1'`` / ``'t2'``
            (forwarded to ``augment``).

    Side effects:
        Reads ``../input/train.csv`` and ``../input/test.csv``; writes one
        ``.npy`` per trained fold under ``../submit/``, the out-of-fold
        predictions under ``../oof/`` and ``<name>submission.csv``.
    """
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')

    pre_feat = [c for c in train.columns
                if c not in ['ID_code', 'target']]  # basic features
    target = train['target'].values

    params = {
        'num_rounds': 6000000,
        'verbose_eval': 5000,
        'early_stop': 4000,
    }
    print(params)

    # Filter out unusable test samples: count, per test row, how many
    # features hold a value that occurs at least twice in the test set, and
    # keep only the rows where that count is below 200.
    freq_cols = []
    for col in pre_feat:
        test[col + 'freq'] = test[col].map(
            test[col].value_counts(sort=False))
        freq_cols.append(col + 'freq')
    test['num_unique'] = (test[freq_cols] >= 2).sum(axis=1)
    real_idx = test['num_unique'] < 200
    real_test = test.loc[real_idx, pre_feat]
    # real_test = test.copy()
    assert len(real_test) == 100000
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` stacks
    # the two frames the same way.
    all_data = pd.concat([train[pre_feat], real_test]).reset_index(drop=True)
    test.drop(freq_cols, axis=1, inplace=True)

    train, test = get_features(train, test, all_data, pre_feat)

    train['allfreq'] = train[freq_cols].sum(axis=1)
    train['num_unique'] = (train[freq_cols] >= 2).sum(axis=1)
    test['allfreq'] = test[freq_cols].sum(axis=1)
    test['num_unique'] = (test[freq_cols] >= 2).sum(axis=1)

    new_stat = ['freq'] + feat_stat
    features = pre_feat + ['allfreq', 'num_unique'] + \
        [col + 'bin2' for col in two_count_peak_cols] + \
        [col + 'bin3' for col in three_count_peak_cols]
    for s in new_stat:
        features += [col + s for col in pre_feat]

    # Without shuffling the split is deterministic, and scikit-learn raises
    # ValueError when ``random_state`` is combined with ``shuffle=False``,
    # so the seed is not passed.
    folds = StratifiedKFold(n_splits=cfg['n_splits'],
                            shuffle=False).split(train.values, target)
    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))

    # feat_score = pd.read_csv('feat_importance.csv')['name'].values[:2000].tolist()
    # features = list(set(features) & set(feat_score))
    for fold_, (trn_idx, val_idx) in enumerate(folds):
        if fold_ not in cfg['folds']:
            continue
        val_x, val_y = train.iloc[val_idx], target[val_idx]
        tr_x, tr_y = train.iloc[trn_idx], target[trn_idx]

        # tr_x,val_x,te_x = cal_freq_TE(tr_x,val_x,test,all_data,pre_feat)
        tr_x, val_x, te_x = tr_x[features], val_x[features], test[features]

        tr_x, tr_y = augment(tr_x, tr_y, pre_feat, cfg['t1'], cfg['t2'])
        # Recompute the aggregates on the augmented training rows.
        tr_x['allfreq'] = tr_x[freq_cols].sum(axis=1)
        tr_x['num_unique'] = (tr_x[freq_cols] >= 2).sum(axis=1)

        print("Fold idx:{}".format(fold_ + 1))
        d_train = Pool(tr_x, label=tr_y)
        d_valid = Pool(val_x, label=val_y)

        model = CatBoostClassifier(
            iterations=params['num_rounds'],
            learning_rate=0.003,
            od_type='Iter',
            od_wait=params['early_stop'],
            loss_function="Logloss",
            eval_metric='AUC',
            # depth=3,
            bagging_temperature=0.7,
            random_seed=2019,
            task_type='GPU')
        model.fit(d_train,
                  eval_set=d_valid,
                  use_best_model=True,
                  verbose=params['verbose_eval'])
        oof[val_idx] = model.predict_proba(val_x)[:, 1]
        pred = model.predict_proba(te_x)[:, 1]
        predictions += pred / cfg['n_splits']
        threshold_search(target[val_idx], oof[val_idx])
        np.save('../submit/' + cfg['name'] + str(fold_), pred)

    np.save(
        '../oof/' + cfg['name'] + ''.join([str(fold)
                                           for fold in cfg['folds']]), oof)
    print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
    # np.save('../input/oof',oof)
    sub = pd.DataFrame({"ID_code": test.ID_code.values})
    sub["target"] = predictions
    sub.to_csv(cfg['name'] + "submission.csv", index=False)
# In[10]:

ctb = CatBoostClassifier(random_seed=17)

# **Train CatBoost without tuning any parameters, passing only the indices
# of the categorical features.**

# In[11]:

_fit_part_cell = 'ctb.fit(X_train_part, y_train_part,\n cat_features=categ_feat_idx)'
get_ipython().run_cell_magic('time', '', _fit_part_cell)

# In[12]:

ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

# **We get almost 0.75 ROC AUC on the hold-out set.**

# In[13]:

roc_auc_score(y_valid, ctb_valid_pred)

# **Train on the whole training set and predict on the test set; in the
# competition this scores 0.73008.**

# In[14]:

_fit_full_cell = 'ctb.fit(X_train, y_train,\n cat_features=categ_feat_idx)'
get_ipython().run_cell_magic('time', '', _fit_full_cell)
# Assemble the single-row feature matrix: the enhancer and promoter vectors
# followed by the distance.
arrays[0] = numpy.column_stack((enhancer_vec, promoter_vec))
distance[0] = float(dis)
X_train = numpy.column_stack((arrays, distance))
n_rows, n_cols = X_train.shape[0], X_train.shape[1]
print(n_rows, n_cols)

# Load the stored model for this cell line and predict.
estimator = CatBoostClassifier(iterations=1000,
                               depth=10,
                               learning_rate=0.1,
                               logging_level=None,
                               scale_pos_weight=45)
model_file = '{}{}/best_model{}'.format(model_filepath, cellline, kvalue)
estimator.load_model(model_file)
y_pred = estimator.predict(X_train)
y_proba_pred = estimator.predict_proba(X_train)[:, 1]

if enchrome != prchrome:
    print(
        'The two elements are not in the same chrosome, please recheck your input!'
    )
else:
    header = ''.join(['For Promoter ', enoldname, ', Enhancer ', proldname,
                      ' in cell line ', cellline, ' :'])
    print(header)
    # Pick the sentence by predicted class; both report the same score.
    verdict = (
        'The two elements are predicted not to be interacted by EPBoost, the interaction prediction score is %.4f.'
        if y_pred[0] == 0 else
        'The two elements are predicted interacted by EPBoost, the interaction prediction score is %.4f.'
    )
    print(verdict % y_proba_pred[0])
def main():
    """Run the full experiment: tuning, cross-validation and final test.

    Steps, in order:

    1. Load the dev/test data and split dev into training and validation.
    2. Tune a logistic regression on iteratively-imputed data.
    3. Tune CatBoost under five treatments of missing data (drop rows, mean
       imputer, iterative imputer, keep NaN, keep NaN with class weights).
    4. Cross-validate the "keep NaN" model on the dev set, re-fit it with the
       best iteration count and evaluate it on the test set.
    5. Print feature importances and plot the ROC curve.
    """
    #############################################################################################################
    seed = 42  # For random numbers generation
    iterations = 10  # Max number of iterations at every run of gradient boosting (max number of trees built)
    hyper_iterations = 3  # Number of iterations required during each Bayesian optimization of hyper-parameters
    log_regs_hyper_iterations = 2  # Number of iterations for hyper-parameters optimization for logistic regression
    cv_folds = 4  # Number of folds used for k-folds cross-validation
    logs_dir = Path('catboost_logs')  # Relative to the directory where the program is running
    task_type = 'GPU'  # Can be 'CPU' or 'GPU'
    # Effectively disabled, as there is an issue with displaying the charts,
    # see https://github.com/catboost/catboost/issues/1468
    early_stopping_iters = 10000
    #############################################################################################################

    start_time = time()

    # Make the logs directory if it doesn't exist already, and remove any
    # sub-directories left over from a previous run (plain files are kept).
    logs_dir.mkdir(exist_ok=True)
    for item in logs_dir.iterdir():
        if item.is_dir():
            shutil.rmtree(str(item))

    # Load the NHANES I epidemiology dataset. The dataset is already
    # partitioned into a dev set and a test set. Here below, the dev set will
    # be further partitioned into a training set and a validation set.
    X_dev, X_test, y_dev, y_test = load_data(10)

    # Convert categorical features from float to int, as that is what CatBoost expects
    X_dev = X_dev.astype({'Sex': int, 'Race': int})
    y_dev = y_dev.astype(int)
    X_test = X_test.astype({'Sex': int, 'Race': int})
    y_test = y_test.astype(int)

    # Count and present how many samples with missing data (variable values)
    # are in the dev and test set respectively
    dev_missing_count = count_samples_with_missing_data(X_dev)
    test_missing_count = count_samples_with_missing_data(X_test)
    print('\nDev. set missing data in', dev_missing_count, 'samples out of',
          len(X_dev))
    print('Test set missing data in', test_missing_count, 'samples out of',
          len(X_test))

    # Split the dev set into training and validation. The latter will be used
    # for hyper-parameters tuning.
    X_train, X_val, y_train, y_val = train_test_split(X_dev,
                                                      y_dev,
                                                      test_size=0.2,
                                                      random_state=seed)

    # Make a dataset after dropping samples with missing data (note, there are
    # no samples with missing data in test set)
    X_train_dropped = X_train.dropna(axis='rows')
    y_train_dropped = y_train.loc[X_train_dropped.index]
    X_val_dropped = X_val.dropna(axis='rows')
    y_val_dropped = y_val.loc[X_val_dropped.index]

    # Prepare two imputers that will be used to impute missing values in the
    # dataset. One is a mean imputer and the other an iterative imputer.
    # FIX: `verbose` was dropped from the SimpleImputer call; the keyword was
    # deprecated in scikit-learn 1.1 and removed in 1.3, and 0 was its default.
    mean_imputer = SimpleImputer(strategy='mean')
    mean_imputer.fit(X_train)
    iter_imputer = IterativeImputer(random_state=seed,
                                    sample_posterior=False,
                                    max_iter=10,
                                    min_value=0,
                                    verbose=0)
    iter_imputer.fit(X_train)

    # Run a logistic regression.
    # Fill in hyper-parameters for the logistic regression with their values,
    # or with a probability distribution from where the value must be sampled.
    log_regr_params = {
        'penalty': 'elasticnet',
        'C': hp.uniform('C', .25, 4),
        'class_weight': None,
        'random_state': seed,
        'solver': 'saga',
        'max_iter': 10000,
        'multi_class': 'ovr',
        'n_jobs': -1,
        'l1_ratio': hp.uniform('l1_ratio', .0, 1)
    }
    run_exp_log_regr(X_train,
                     y_train,
                     X_val,
                     y_val,
                     param_space=log_regr_params,
                     max_evals=log_regs_hyper_iterations,
                     imputer=iter_imputer,
                     seed=seed)

    # Run gradient boosting (boosted trees) models
    cat_features = [3, 11]  # Categorical features are race and sex

    # Note: passing a CatBoost Pool() instance in the param_space values here
    # below doesn't work, because hyperopt would throw an exception during
    # optimization.
    ############################################################################################################
    param_space = {
        'learning_rate': hp.loguniform('learning_rate', np.log(.001),
                                       np.log(.2)),
        'depth': hp.quniform('depth', 4, 12, 1),
        'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 9),
        'bagging_temperature': hp.uniform('bagging_temperature', 0, 2),
        'seed': seed,
        'iterations': iterations,
        'task_type': task_type,
        # 'early_stopping_rounds': True,
        'od_type': 'Iter',
        'od_wait': early_stopping_iters
    }
    ############################################################################################################

    # First try with no imputation, but instead dropping all samples from the
    # train/val set that have missing data
    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, after dropping samples with missing data'
    )
    run_exp_bayes_hyperparams_opt(X_train_dropped,
                                  y_train_dropped,
                                  X_val_dropped,
                                  y_val_dropped,
                                  cat_features=cat_features,
                                  param_space=param_space,
                                  max_evals=hyper_iterations,
                                  imputer=None,
                                  train_dir=str(logs_dir /
                                                'catboost_logs_drop'),
                                  seed=seed)

    # Next solve the same model, but with missing data imputed by the mean
    # imputer (no samples are dropped)
    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, with missing data replaced with mean imputer'
    )
    run_exp_bayes_hyperparams_opt(X_train,
                                  y_train,
                                  X_val,
                                  y_val,
                                  cat_features=cat_features,
                                  param_space=param_space,
                                  max_evals=hyper_iterations,
                                  imputer=mean_imputer,
                                  train_dir=str(logs_dir /
                                                'catboost_logs_mean_imputer'),
                                  seed=seed)

    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, with missing data replaced with iterative imputer'
    )
    # Now do it with missing data imputed by the iterative imputer (no samples
    # are dropped)
    run_exp_bayes_hyperparams_opt(X_train,
                                  y_train,
                                  X_val,
                                  y_val,
                                  cat_features=cat_features,
                                  param_space=param_space,
                                  max_evals=hyper_iterations,
                                  imputer=iter_imputer,
                                  train_dir=str(logs_dir /
                                                'catboost_logs_iter_imputer'),
                                  seed=seed)

    # Solve the same model again, but this time neither drop samples with
    # missing data nor use an imputer. Leave the missing data in the dataset
    # the way they are, and let CatBoost deal with them.
    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, without replacement of missing data'
    )
    selected_model = run_exp_bayes_hyperparams_opt(
        X_train,
        y_train,
        X_val,
        y_val,
        cat_features=cat_features,
        param_space=param_space,
        max_evals=hyper_iterations,
        imputer=None,
        train_dir=str(logs_dir / 'catboost_logs_keep_nan'),
        seed=seed)

    # Solve the model still leaving missing data in the dataset, but this time
    # use a weighted loss function, to keep into account that the dataset is
    # imbalanced (positive cases are under-represented)
    print(
        '\nPerforming Bayesian search for hyper-parameters optimization, without replacement and with weights'
    )
    # Compute the number of positive and negative samples in the training set,
    # and the respective weights to be used
    w, stats = compute_weights(y_train)
    print('Computed weights')
    print('For', stats['total_pos'], 'positive samples:', stats['pos_weight'])
    print('For', stats['total_neg'], 'negative samples:', stats['neg_weight'])
    run_exp_bayes_hyperparams_opt(X_train,
                                  y_train,
                                  X_val,
                                  y_val,
                                  cat_features=cat_features,
                                  param_space=param_space,
                                  max_evals=hyper_iterations,
                                  imputer=None,
                                  weights=w,
                                  train_dir=str(logs_dir /
                                                'catboost_logs_weights'),
                                  seed=seed)

    # A note on CatBoost grid-search (not used here). It would be done on the
    # dev. set, as the grid-search takes care of splitting it into training
    # and validation. If `search_by_train_test_split` is set to True, every
    # combination of values of the hyper-parameters is evaluated with a basic
    # training/val. split of the dataset; if set to False, then every
    # combination is evaluated with x-evaluation. Once method grid_search()
    # has selected the best combination of hyper-parameters, we could fit a
    # model with it. The final model can be evaluated with x-evaluation by
    # setting parameter `calc_cv_statistics` to True (which is the default).

    # Disabled neural-network experiment, kept for reference:
    # print('\nTuning hyper-parameters for NN')
    # p = Process(target=run_exp_nn, args=(X_train, y_train, X_val, y_val, params, hyper_iterations, iter_imputer, str(logs_dir / 'tensorflow_logs_nn'), seed))
    # p.start()
    # p.join()
    # sleep(2.)

    # Cross-validate the selected model, and test it on the test set
    model = selected_model
    imputer = None  # The selected model retains missing data, doesn't impute nor discard them

    print('\nCross-validating selected model.')
    params = model.get_params()
    params['loss_function'] = 'Logloss'
    params['eval_metric'] = 'AUC:hints=skip_train~false'
    params['train_dir'] = str(logs_dir / ('catboost_logs_cv_selected'))
    params['task_type'] = task_type
    # params['early_stopping_rounds'] = True
    params['od_type'] = 'Iter'
    params['od_wait'] = early_stopping_iters

    # Make a new imputer for cross-validation over the dev set.
    X_pool, _ = make_imputed_pool(X_dev,
                                  y_dev,
                                  imputer=imputer,
                                  cat_features=cat_features,
                                  weight=None)
    cv_results = cv(pool=X_pool,
                    params=params,
                    iterations=iterations,
                    fold_count=cv_folds,
                    partition_random_seed=seed,
                    stratified=True,
                    verbose=False)

    # Find the iteration with the best test AUC, the value of its AUC and
    # other train and test stats. All the stats retrieved will refer to this
    # same iteration.
    best_cv_iter = np.argmax(cv_results['test-AUC-mean'])
    best_cv_val_AUC = cv_results['test-AUC-mean'][best_cv_iter]
    best_cv_val_Logloss = cv_results['test-Logloss-mean'][best_cv_iter]
    best_cv_train_AUC = cv_results['train-AUC-mean'][best_cv_iter]
    best_cv_train_Logloss = cv_results['train-Logloss-mean'][best_cv_iter]

    print('Parameters:')
    for key, value in sorted(params.items()):
        print(f'   {key}={value}')
    print('Best cross-validation achieved at iteration', best_cv_iter)
    print(
        f'Training: Logloss {best_cv_train_Logloss}   ROC AUC {best_cv_train_AUC}'
    )
    print(
        f'Validation: Logloss {best_cv_val_Logloss}   ROC AUC {best_cv_val_AUC}'
    )

    print('Re-fitting the model on the dev. set and testing it')
    # best_cv_iter is a 0-based index, so the tree count is one more.
    params['iterations'] = best_cv_iter + 1
    params['train_dir'] = None
    cv_model = CatBoostClassifier(**params)
    cv_model.fit(X_pool, verbose=False)

    # Probabilities of the positive class on the test set; computed once and
    # reused for the metrics and for the ROC curve below.
    y_test_preds = cv_model.predict_proba(X_test)[:, 1]
    test_AUC = roc_auc_score(y_test, y_test_preds)
    test_Logloss = log_loss(y_test, y_test_preds)
    print(f"Test on test set: Log loss={test_Logloss}   ROC AUC={test_AUC}")
    print(
        f'Overall train, validation and test run time: {round(time() - start_time)}s'
    )

    print('\nFetaures importance based on prediction values change (%)')
    feature_importances = cv_model.get_feature_importance(X_pool)
    feature_names = X_dev.columns
    for score, name in sorted(zip(feature_importances, feature_names),
                              reverse=True):
        print('{}: {}'.format(name, score))

    print('\nFetaures importance based on loss (ROC AUC) values change')
    feature_importances_loss = cv_model.get_feature_importance(
        X_pool, type=EFstrType.LossFunctionChange)
    for score, name in sorted(zip(feature_importances_loss, feature_names),
                              reverse=True):
        print('{}: {}'.format(name, score))

    # Plot a ROC curve for the x-validated model over the test set
    fpr, tpr, thresholds = roc_curve(y_test, y_test_preds)
    fig, ax = plt.subplots()
    ax.set_title('ROC Curve')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True Positive rate')
    ax.set_ylim((0, 1))
    ax.set_xlim((0, 1))
    ax.grid(True)
    plt.gca().set_aspect('equal', adjustable='box')
    # Dashed diagonal: the ROC of a random classifier, for reference.
    ax.plot([0, 1], [0, 1], color='blue', ls='--', lw=.5)
    ax.plot(fpr, tpr, color='blue', label='ROC')
    # ax.legend(loc='lower center')
    plt.show()
eval_metric='AUC', max_depth=6, learning_rate=0.01, od_wait=50, l2_leaf_reg=10, task_type="GPU", cat_features=categorical_features_indices, bagging_temperature=0.80, random_strength=100, use_best_model=True) m.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=200, verbose=1000) oofcat[test_index] = m.predict_proba(X_test)[:, -1] p = m.predict_proba(test[usecols])[:, -1] y_pred_totcb += p / N_SPLITS #%% np.save('oof-onielg-catb', oofcat) np.save('tst-onielg-catb', y_pred_totcb) # y_pred_totcb = y_pred_totcb/5 # sample_sub['target'] = pd.DataFrame(predictionslgb).rank(pct=True) # sample_sub.to_csv('submission.csv',index=False) #sample_sub['target'] = y_pred_totcb sample_sub['target'] = pd.DataFrame(y_pred_totcb).rank( pct=True) * 0.60 + pd.DataFrame(predictionslgb).rank(pct=True) * 0.40
# Impute every column that has missing values with its most frequent value.
for col in X.columns[X.isnull().sum() > 0]:
    X[col] = X[col].transform(lambda x: x.fillna(x.value_counts().idxmax()))

# Split the imputed frame back by row position: the first X_train.shape[0]
# rows are used for training, the remainder for prediction.
X_train_ = X.iloc[:X_train.shape[0], :]
X_test_ = X.iloc[X_train.shape[0]:, :]

# Categorical features: positional indices of columns whose name contains 'cat'.
cat_features_list = [
    idx for idx, col_name in enumerate(X.columns) if 'cat' in col_name
]

# Initialize CatBoostClassifier
model = CatBoostClassifier(verbose=True)
# Fit model
model.fit(X_train_, y_train, cat_features=cat_features_list, verbose=True)

# FIX: predict on the imputed slice `X_test_`. The original predicted on
# `X_test`, which bypasses the imputation above and leaves `X_test_` unused.
# Get predicted classes
preds_class = model.predict(X_test_)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(X_test_)

# Saving: second probability column as an (n, 1) array.
a = np.reshape((preds_proba[:, 1]), (preds_proba.shape[0], 1))
df_pred = pd.read_csv("E:/Kaggle/Seguro/sample_submission.csv", header=0)
df_pred['target'] = a
df_pred.to_csv("E:/Kaggle/Seguro/prediction.csv", index=False)
learning_rate=0.05, depth=9, boosting_type='Ordered', bagging_temperature=0.4, task_type='GPU', silent=True) # 0.8578 # clf = GaussianNB() # clf = MLPClassifier(max_iter=300) # clf = svm.SVC(kernel='linear') t2 = time.time() print(f"Model Created --- {(t2-t1) if (t2-t1)<60 else (t2-t1)/60}") clf.fit(X_t, y_t, cat_features=cat_col, eval_set=(X_tt, y_tt), plot=True, early_stopping_rounds=30, verbose=100) t3 = time.time() print(f"Model Trained --- {(t3-t2)/60} Minutes") sample = clf.predict_proba(test_x) # print(sample[:, 1]) submit = pd.DataFrame(({"id": test.id, "Response": sample[:, 1]})) submit.to_csv('submission.csv', index=False) t4 = time.time() print(f"Process Finished ---{(t4-t3)}sec")
# Histogram of the accuracies in `acc_set2`, with the model's accuracy marked
# by a red vertical line.
fig = plt.figure()
plt.hist(acc_set2, bins=50, color = 'g')
plt.axvline(x=final_accuracy,color = 'r')
plt.show()
fig.savefig(dest+'fig2.png', dpi=fig.dpi)
# Percentile rank of the model's accuracy within `acc_set2`.
print("The accuracy from the model is ",scipy.stats.percentileofscore(acc_set2, final_accuracy, kind='rank')," percentile in the permutation test using shuffled values of test data labels")

# ### AUC and Confusion Matrix

# In[22]:
fig = plt.figure()
# Second probability column, predicted on every column except 'TimeCycle'.
y_pred_proba = model.predict_proba(dfv.drop(columns = 'TimeCycle'))[::,1]
fpr, tpr, _ = metrics.roc_curve(dfv['TimeCycle'],y_pred_proba)
auc = metrics.roc_auc_score(dfv['TimeCycle'], y_pred_proba)
plt.plot(fpr,tpr,label="ROC, auc="+str(auc))
plt.legend(loc=4)
plt.show()
fig.savefig(dest+'fig3.png', dpi=fig.dpi)

# NOTE(review): `result` is defined outside this snippet -- presumably the
# predicted class labels for `dfv`; confirm against the earlier cells.
cm = metrics.confusion_matrix(dfv['TimeCycle'],result)
# labels = ['No Default', 'Default']
fig = plt.figure(figsize=(8,6))
sns.heatmap(cm, annot = True, fmt='d', cmap="Blues", vmin = 0.2);
plt.title('Confusion Matrix')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()
model = CatBoostClassifier(iterations=N_ESTIMATORS, learning_rate=LEARNING_RATE, depth=DEPTH, eval_metric=EVAL_METRIC, verbose=VERBOSE, random_state=RANDOM_STATE, thread_count=N_THREADS, task_type="GPU") model.fit( train_dataset, eval_set=valid_dataset, early_stopping_rounds=EARLY_STOPPING_ROUNDS, ) y_pred_valid = model.predict_proba(valid_dataset)[:, 1] y_pred = model.predict_proba(test_dataset)[:, 1] fold_importance = pd.DataFrame() fold_importance["feature"] = model.feature_names_ fold_importance["importance"] = model.get_feature_importance() fold_importance["fold"] = fold_n + 1 feature_importance = pd.concat([feature_importance, fold_importance], axis=0) best_iteration = model.best_iteration_ best_iterations.append(best_iteration) fold_score = roc_auc_score(y_valid, y_pred_valid) scores.append(fold_score) update_tracking(
def train_model_classification(X,
                               X_test,
                               y,
                               params,
                               num_classes=2,
                               folds=None,
                               model_type='lgb',
                               eval_metric='logloss',
                               columns=None,
                               plot_feature_importance=False,
                               model=None,
                               verbose=10000,
                               early_stopping_rounds=200,
                               splits=None,
                               n_folds=3):
    """Train a classifier over CV folds and average its test predictions.

    Returns a dict with out-of-fold predictions (``'oof'``), fold-averaged
    test predictions (``'prediction'``), per-fold accuracy (``'scores'``)
    and, for lgb/xgb with ``plot_feature_importance``, ``'feature_importance'``.

    :params: X - training data, pd.DataFrame (or np.ndarray)
    :params: X_test - test data, pd.DataFrame
    :params: y - target
    :params: params - keyword arguments passed to the model constructor
    :params: num_classes - number of probability columns per prediction
    :params: folds - splitter object; used only when ``splits`` is None
    :params: model_type - one of 'lgb', 'xgb', 'sklearn', 'cat'
    :params: eval_metric - key into the metric table (only 'logloss' is defined)
    :params: columns - feature columns; defaults to ``X.columns``
    :params: plot_feature_importance - whether to plot feature importance
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - verbosity passed to the model's ``fit``
    :params: early_stopping_rounds - passed to lgb/xgb ``fit``
    :params: splits - precomputed iterable of (train_index, valid_index)
    :params: n_folds - number of folds contained in ``splits``
    :raises ValueError: if ``model_type`` is not one of the supported values
    """
    start_time = time.time()
    # Kept for backward compatibility: the last fold's predictions stay
    # visible at module level, as in the original implementation.
    global y_pred_valid, y_pred

    if model_type not in ('lgb', 'xgb', 'sklearn', 'cat'):
        # Without this guard an unknown type would silently reuse stale
        # global predictions (or fail with a NameError on the first call).
        raise ValueError(f"Unsupported model_type: {model_type!r}")

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # FIX: decide the divisor BEFORE `splits` is replaced. The original tested
    # `splits is None` after the reassignment, so `folds.n_splits` was never
    # used and predictions were always divided by `n_folds`.
    n_splits = folds.n_splits if splits is None else n_folds
    splits = folds.split(X, y) if splits is None else splits

    # to set up scoring parameters
    metrics_dict = {
        'logloss': {
            'lgb_metric_name': 'logloss',
            'xgb_metric_name': 'mlogloss',
            'catboost_metric_name': 'Logloss',
            'sklearn_scoring_function': metrics.log_loss
        },
        'lb_score_method': {
            'sklearn_scoring_f1': metrics.f1_score,  # leaderboard metric
            'sklearn_scoring_accuracy': metrics.accuracy_score,  # leaderboard metric
            'sklearn_scoring_auc': metrics.roc_auc_score
        },
    }
    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(shape=(len(X), num_classes))
    # averaged predictions on test data
    prediction = np.zeros(shape=(len(X_test), num_classes))
    # list of scores on folds
    scores = []
    # feature importance
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(splits):
        if verbose:
            print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if isinstance(X, np.ndarray):
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test,
                                         num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            model = xgb.XGBClassifier(**params)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['xgb_metric_name'],
                      verbose=bool(verbose),  # xgb verbose bool
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test,
                                         ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            # The caller-supplied estimator is re-fitted in place on each fold.
            model.fit(X_train, y_train)
            y_pred_valid = model.predict_proba(X_valid)
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            y_pred = model.predict_proba(X_test)

        if model_type == 'cat':
            model = CatBoostClassifier(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)
            y_pred_valid = model.predict_proba(X_valid)
            y_pred = model.predict_proba(X_test)

        oof[valid_index] = y_pred_valid
        # evaluation metric: accuracy of the arg-max class on the fold
        scores.append(
            metrics_dict['lb_score_method']['sklearn_scoring_accuracy'](
                y_valid, np.argmax(y_pred_valid, axis=1)))
        print(scores)
        prediction += y_pred

        if model_type in ('lgb', 'xgb') and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    prediction /= n_splits
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb' or model_type == 'xgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            # Keep the 50 features with the highest mean importance.
            cols = feature_importance[["feature", "importance"]].groupby(
                "feature").mean().sort_values(by="importance",
                                              ascending=False)[:50].index
            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]
            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            plt.show()
            result_dict['feature_importance'] = feature_importance

    end_time = time.time()
    print("train_model_classification cost time:{}".format(end_time -
                                                           start_time))
    return result_dict
def feature_importance(self, n_rows, n_cols, X_train, y_train, X_valid,
                       y_valid):
    '''Plot feature importance from Logistic, Random Forest, CatBoost, XGB, LGBM.

    Six classifiers are fitted on ``(X_train, y_train)`` and one bar plot per
    model is drawn on an ``n_rows`` x ``n_cols`` grid. If the grid has fewer
    than six cells, only the first ``n_rows * n_cols`` models are plotted.

    ``X_valid`` and ``y_valid`` stay in the signature for compatibility but
    are not used: the original computed validation probabilities from them
    and discarded the results, so those calls were removed.
    '''
    # train classifiers
    lr = LogisticRegression(max_iter=100, random_state=42)
    lr.fit(X_train, y_train)

    rfc = RandomForestClassifier(n_jobs=2, random_state=42)
    rfc.fit(X_train, y_train)

    brfc = BalancedRandomForestClassifier(random_state=42)
    brfc.fit(X_train, y_train)

    cb = CatBoostClassifier(random_state=42, verbose=False)
    cb.fit(X_train, y_train)

    # Local names avoid shadowing any module-level `xgb` alias.
    xgb_clf = XGBClassifier(random_state=42)
    xgb_clf.fit(X_train, y_train)

    lgbm_clf = LGBMClassifier(random_state=42, n_jobs=-1)
    lgbm_clf.fit(X_train, y_train)

    feat_importance_list = [
        lr.coef_[0],  # coefficients of the first (only, if binary) class row
        rfc.feature_importances_,
        brfc.feature_importances_,
        cb.feature_importances_,
        xgb_clf.feature_importances_,
        lgbm_clf.feature_importances_
    ]
    model_name = [
        'Logistic Regression', 'Random Forest Classifier',
        'Balanced Random Forest Classifier', 'CatBoost Classifier',
        'XGB Classifier', 'LGBM Classifier'
    ]

    # generate feature importance plots
    plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 20))
    sns.set(font_scale=1.5)
    # FIX: the original also zipped over `ax.flatten()`, which shadowed `ax`
    # and failed on a 1x1 grid (a single Axes has no `flatten`). The range
    # already limits the loop to the number of grid cells.
    for importance, name, n in zip(feat_importance_list, model_name,
                                   range(n_rows * n_cols)):
        # one-column frame: importance of each variable, indexed by feature
        df_imp = pd.DataFrame()
        df_imp['importance'] = pd.Series(importance,
                                         index=list(X_train.columns))
        # long format with columns `variable` / `value`
        long_df = pd.melt(df_imp.T)
        # plot barplot, most important feature first
        plt.subplot(n_rows, n_cols, n + 1)
        sns.barplot(y=long_df.variable,
                    x=long_df.value,
                    order=long_df.sort_values(
                        'value', ascending=False)['variable'].to_list())
        plt.title(f'{name}')
    # adjusts subplot
    plt.tight_layout()
    # displays the plot
    plt.show()
eval_set=(X_eval, y_eval), cat_features=categorical_features_indices, plot=True) # 输出各特征重要度 feature_names = X_train.columns print( pd.DataFrame({ 'column': feature_names, 'importance': model.get_feature_importance(), }).sort_values(by='importance', ascending=False)) print(model.get_best_iteration()) # 训练集和验证集的预测 y_train_prob = model.predict_proba(X_train, ntree_end=model.get_best_iteration())[:, 1] y_eval_prob = model.predict_proba(X_eval, ntree_end=model.get_best_iteration())[:, 1] # AUC指标 print('AUC') print(roc_auc_score(y_train, y_train_prob)) print(roc_auc_score(y_eval, y_eval_prob)) # 本题采用的指标 print('本题采用的指标') print(cal_metric(y_train, y_train_prob)) print(cal_metric(y_eval, y_eval_prob)) # 模型预测并写入csv(此处记得将前面的验证集df_eval改为0, 用全量的df_train,并设置迭代次数为best_iteration) y_prob = model.predict_proba(X_test, ntree_end=model.get_best_iteration())[:,
d_id_ce_hash.transform(d_test_account[d_id_cols]), ], axis=1).fillna(0) X_new = df_train_concat.reset_index(drop=True) X_test_new = df_test_concat.copy().reset_index(drop=True) lr = 0.09552 / 3.3 cat = CatBoostClassifier(random_state=17, iterations=2321, learning_rate=lr, verbose=100, custom_metric='AUC', eval_metric='AUC', use_best_model=False, early_stopping_rounds=200) cat.fit(X_new, y, verbose=False, plot=False) y_test_pred = cat.predict_proba(X_test_new)[:, 1] df_submission = pd.DataFrame({'radiant_win_prob': y_test_pred}, index=df_test_concat.index) submission_filename = 'submission_{}.csv'.format( datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S_cat')) df_submission.to_csv(submission_filename) print('Submission saved to {}'.format(submission_filename))
# Object-dtype columns of the test frame, excluding the id and date columns.
cat_col = [
    i for i in data_test.select_dtypes(object).columns
    if i not in ['ncodpers', 'fecha_dato']
]
for i in cat_col:
    train_str = data_train[i].astype(str)
    val_str = data_val[i].astype(str)
    test_str = data_test[i].astype(str)
    # FIX: fit the encoder ONCE on the values of all three frames, then only
    # transform. The original called fit_transform on each frame separately,
    # so a category could get different codes in train, validation and test
    # whenever the frames did not contain the same set of values.
    lbe.fit(list(train_str) + list(val_str) + list(test_str))
    data_train[i] = lbe.transform(train_str)
    data_val[i] = lbe.transform(val_str)
    data_test[i] = lbe.transform(test_str)

# Explanatory variables: every test column after the first two.
exp_var = data_test.columns.tolist()[2:]
x_train = data_train[exp_var]
y_train = data_train[product]
x_val = data_val[exp_var]
y_val = data_val[product]
x_test = data_test[exp_var]

model = CatBoostClassifier(learning_rate=0.05,
                           n_estimators=1000,
                           random_state=2019)
model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_val, y_val)])

# Overwrite the product column with the second predicted probability column
# and export it per customer id, sorted by id.
data_val[product] = model.predict_proba(x_val)[:, 1]
new_val = data_val[['ncodpers', product]]
new_val = new_val.sort_values(by='ncodpers')
new_val.to_csv("catboost_validation_{}_{}.csv".format(product, train_date),
               index=False)

data_test[product] = model.predict_proba(x_test)[:, 1]
new_test = data_test[['ncodpers', product]]
new_test = new_test.sort_values(by='ncodpers')
new_test.to_csv("catboost_submission_{}_{}.csv".format(
    product, train_date),
                index=False)
model.fit(self.training_data.drop(["TARGET"], axis = 1), self.training_data['TARGET'], cat_features = cat_features, eval_set = (self.validation_data.drop(['TARGET'], axis = 1), self.validation_data['TARGET'])) preds = model.predict(self.validation_data.drop(['TARGET'], axis = 1)) return model, preds, self.validation_data['TARGET'] elif self.problem_type == 'classification': model = CatBoostClassifier(**params) model.fit(self.training_data.drop(["TARGET"], axis = 1), self.training_data['TARGET'], cat_features = cat_features, eval_set = (self.validation_data.drop(['TARGET'], axis = 1), self.validation_data['TARGET'])) preds = model.predict_proba(self.validation_data.drop(['TARGET'], axis = 1))[0] return model, preds, self.validation_data['TARGET'] else: raise Exception("Problem Type not supported") def train_and_validate(self): if self.model = "xgboost": return self._xgboost() elif self.model = "lightgbm": return self._lightgbm() elif self.model = "catboost": return self._catboost()
class modelCatBoost(object):
    """Wrapper around ``CatBoostClassifier`` with helpers for pools and tuning.

    The wrapped estimator lives in ``self.model``; any attribute not defined
    here is forwarded to it through ``__getattr__``. Constructor parameters
    come from the "CatBoost" entry of the project's parameter file.
    """

    def __init__(self, name="CBT", random_state=99, *args, **kwargs):
        """Load the CatBoost parameters and build the wrapped classifier."""
        self.name = name
        self.train_dir = "model_" + str(self.name) + "/"
        self.random_state = random_state
        self.manager_models = ParamsManager(param_file, key_read="Models")
        self.params = self.manager_models.get_params()["CatBoost"]
        self.params.update({
            'train_dir': self.train_dir,
            "random_state": self.random_state
        })
        self.model = CatBoostClassifier(**self.params)

    def dataset(self,
                X,
                y,
                categorical_columns_indices=None,
                test_size=0.2,
                *args,
                **kwargs):
        """Store X/y, split them, and build train / eval / full pools."""
        self.categorical_columns_indices = categorical_columns_indices
        self.X = X
        self.columns = list(X)
        self.y, self.cat_replace = self.replace_multiclass(y)
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X,
            self.y,
            test_size=test_size,
            random_state=self.random_state)
        self.train_data = catboost.Pool(
            data=self.X_train.values,
            label=self.y_train.values,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test.values,
            label=self.y_test.values,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X.values,
            label=self.y.values,
            cat_features=self.categorical_columns_indices)

    def replace_multiclass(self, targets):
        """Map each distinct target to 0..k-1 in order of first appearance.

        Returns the recoded targets and the list of original values, whose
        positions are the new codes.
        """
        _unic = targets.unique().tolist()
        _remp = np.arange(0, len(_unic)).tolist()
        return targets.replace(_unic, _remp), _unic

    def fit(self,
            X,
            y,
            use_best_model=True,
            plot=True,
            save_snapshot=False,
            verbose=0,
            *args,
            **kwargs):
        """Rebuild the pools from X/y and fit the wrapped model.

        Returns the value of ``self.model.fit``.
        """
        self.dataset(X, y)
        _params = self.model.get_params()
        # NOTE(review): a truthy `verbose` argument SILENCES training and a
        # falsy one falls back to the configured value. This looks inverted
        # but is kept as-is; confirm the intent.
        if verbose:
            _verbose = 0
        else:
            # FIX: `.get` avoids a KeyError when the configured parameters do
            # not define 'verbose' (get_params only reports explicit ones).
            _verbose = _params.get("verbose")
        # FIX: the original had an accuracy report after this `return`; it
        # was unreachable and referenced attributes (`dvalid`, `dtrain`) that
        # are never set, so it was removed.
        return self.model.fit(self.train_data,
                              verbose=_verbose,
                              eval_set=self.eval_data,
                              use_best_model=use_best_model,
                              plot=plot,
                              save_snapshot=save_snapshot,
                              **kwargs)

    def fit_cv(self,
               X,
               y,
               fold_count=4,
               shuffle=True,
               stratified=True,
               plot=True,
               verbose=100):
        """Cross-validate the current parameters on all data; return scores."""
        self.dataset(X, y)
        _params = self.model.get_params()
        _params.update({'verbose': verbose})
        _scores = catboost.cv(pool=self.all_train_data,
                              params=_params,
                              fold_count=fold_count,
                              seed=self.random_state,
                              shuffle=shuffle,
                              verbose=verbose,
                              plot=plot)
        if not verbose == 0:
            _best_step = np.argmax(_scores['test-Accuracy-mean'])
            print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.
                  format(np.max(_scores['test-Accuracy-mean']),
                         _scores['test-Accuracy-std'][_best_step],
                         _best_step))
        return _scores

    def copy(self, *args, **kwargs):
        """Return a fresh ``CatBoostClassifier`` carrying a copy of the model.

        The copy is attached as ``catboost_classifier`` and the feature
        columns as ``columns`` on the returned object.
        """
        returned_classifier = CatBoostClassifier()
        returned_classifier.catboost_classifier = self.model.copy()
        returned_classifier.columns = self.columns
        return returned_classifier

    def update_model(self, **kwargs):
        """Set each keyword as an attribute on the wrapped model."""
        for k, v in kwargs.items():
            setattr(self.model, k, v)

    def save_model(self, direct="./checkpoints", name="catboost_model"):
        """Save the model to ``<direct>/<name>_<timestamp>.dump``.

        Raises:
            NameError: if the target directory cannot be created.
        """
        if not os.path.isdir(direct):
            try:
                # makedirs also creates missing parent directories.
                os.makedirs(direct)
                print("Directorio creado: " + direct)
            except OSError as e:
                raise NameError("Error al crear el directorio") from e
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        filename = direct + "/" + name + "_" + current_time + ".dump"
        self.model.save_model(filename)
        print("Modelo guardado en la ruta: " + filename)

    def load_model(self, direct="./checkpoints", name="catboost_model"):
        """Load ``<direct>/<name>.dump`` into the wrapped model."""
        if not os.path.isdir(direct):
            # Only a warning: the load below is still attempted.
            print("no existe el drectorio especificado")
        filename = direct + "/" + name + ".dump"
        self.model.load_model(filename)
        print("Modelo cargado de la ruta: " + filename)

    def predict(self, X, *args, **kwargs):
        """Predict on the training columns of X, passed as a plain array."""
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict(_X_copy.values, *args, **kwargs)

    def predict_proba(self, X, *args, **kwargs):
        """Predict probabilities on the training columns of X."""
        _X_copy = X.loc[:, self.columns].copy()
        return self.model.predict_proba(_X_copy.values, *args, **kwargs)

    def add_cat_features(self, index_features):
        """Rebuild the three pools with the given categorical indices."""
        self.categorical_columns_indices = index_features
        print(self.categorical_columns_indices)
        self.train_data = catboost.Pool(
            data=self.X_train,
            label=self.y_train,
            cat_features=self.categorical_columns_indices)
        self.eval_data = catboost.Pool(
            data=self.X_test,
            label=self.y_test,
            cat_features=self.categorical_columns_indices)
        self.all_train_data = catboost.Pool(
            data=self.X,
            label=self.y,
            cat_features=self.categorical_columns_indices)

    def index_features(self, features):
        """Return the positions of the named columns in ``self.X``.

        Raises:
            NameError: if ``features`` is empty.
        """
        _index = [self.X.columns.get_loc(i) for i in features]
        if not _index:
            raise NameError("No coincide ninguna de las features introducidas")
        return _index

    def get_important_features(self, display=True):
        """Return the prettified importance table, optionally plotting it."""
        # The original computed this table twice and discarded the first one.
        _feature_importance_df = self.model.get_feature_importance(
            prettified=True)
        if display:
            plt.figure(figsize=(12, 6))
            sns.barplot(x="Importances",
                        y="Feature Id",
                        data=_feature_importance_df)
            plt.title('CatBoost features importance:')
        return _feature_importance_df

    def Visualizer_Models(self, directs=None, visu_model=True):
        """Start a ``MetricVisualizer`` over the selected training directories.

        Args:
            directs: extra directories to show; ``None`` or empty means none.
            visu_model: include this model's own ``train_dir``.

        Raises:
            NameError: if nothing is selected (no ``directs`` and
                ``visu_model`` is false).
        """
        # FIX: the original called len() on the default None (TypeError) and
        # tested `len(directs) < 0`, which can never be true, so the
        # "nothing selected" error was unreachable.
        _extra = list(directs) if directs else []
        if not _extra and not visu_model:
            raise NameError("No se ha seleccionado ningun directorio")
        directorios = [self.train_dir] if visu_model else []
        directorios.extend(_extra)
        print(directorios)
        widget = MetricVisualizer(directorios)
        widget.start()

    def hyperopt_objective(self, params):
        """Hyperopt loss: 1 minus the best mean test AUC of a CV run."""
        _model = CatBoostClassifier(
            l2_leaf_reg=int(params['l2_leaf_reg']),
            learning_rate=params['learning_rate'],
            bagging_temperature=params["bagging_temperature"],
            iterations=500,
            eval_metric='AUC',
            random_seed=99,
            verbose=False,
            loss_function='Logloss')
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        best_accuracy = np.max(_cv_data['test-AUC-mean'])
        return 1 - best_accuracy

    def FineTune_hyperopt(self, X, y, mute=False):
        """Tune three hyper-parameters with hyperopt TPE; return the best.

        Note: the best values are merged into ``self.params`` in place.
        """
        self.dataset(X, y)
        params_space = {
            'l2_leaf_reg':
            hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
            'learning_rate':
            hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
            'bagging_temperature':
            hyperopt.hp.uniform("bagging_temperature", 0, 0.3)
        }
        trials = hyperopt.Trials()
        best = hyperopt.fmin(self.hyperopt_objective,
                             space=params_space,
                             algo=hyperopt.tpe.suggest,
                             max_evals=2,
                             trials=trials,
                             rstate=RandomState(self.random_state))
        if not mute:
            print("\nBest parameters:")
            print(best)
            print("\n")
        # `_parameters` is the same dict object as self.params, so the
        # update below also changes the stored configuration.
        _parameters = self.params
        _parameters.update(best)
        _model = CatBoostClassifier(**_parameters)
        _cv_data = catboost.cv(self.all_train_data, _model.get_params())
        if not mute:
            print('\nPrecise validation accuracy score: {}'.format(
                np.max(_cv_data['test-Accuracy-mean'])))
        return best

    def FineTune_sklearn(self, X, y, mute=False, n_splits=10, n_iter=2):
        """Randomized search ranked by quadratic Cohen's kappa.

        See https://www.kaggle.com/ksaaskil/pets-definitive-catboost-tuning

        Returns the trimmed CV results table and the best estimator.
        Note: sets ``use_best_model`` to False in ``self.params`` in place.
        """
        self.dataset(X, y)

        def build_search(modelo,
                         param_distributions,
                         cv=5,
                         n_iter=10,
                         verbose=1,
                         random_state=99):
            """ Builder function for RandomizedSearch. """
            QWS = make_scorer(cohen_kappa_score, weights='quadratic')
            return RandomizedSearchCV(modelo,
                                      param_distributions=param_distributions,
                                      cv=cv,
                                      return_train_score=True,
                                      refit='cohen_kappa_quadratic',
                                      n_iter=n_iter,
                                      n_jobs=None,
                                      scoring={
                                          'accuracy':
                                          make_scorer(accuracy_score),
                                          'cohen_kappa_quadratic': QWS
                                      },
                                      verbose=verbose,
                                      random_state=random_state)

        def pretty_cv_results(cv_results,
                              sort_by='rank_test_cohen_kappa_quadratic',
                              sort_ascending=True,
                              n_rows=30):
            """ Return pretty Pandas dataframe from the `cv_results_`
            attribute of finished parameter search, ranking by test
            performance and only keeping the columns of interest. """
            df = pd.DataFrame(cv_results)
            # startswith accepts a tuple: one call instead of an `or` chain.
            _prefixes = ('param_', "mean_train", "std_train", "mean_test",
                         "std_test", 'mean_fit_time', 'rank')
            cols_of_interest = [
                key for key in df.keys() if key.startswith(_prefixes)
            ]
            return df.loc[:, cols_of_interest].sort_values(
                by=sort_by, ascending=sort_ascending).head(n_rows)

        def run_search(X_train, y_train, search, mute=False):
            """Fit the search, print its best score, return the pretty table."""
            search.fit(X_train, y_train)
            print('Best score is:', search.best_score_)
            return pretty_cv_results(search.cv_results_)

        param_distributions = {
            'iterations': [100, 200],
            'learning_rate': scipy.stats.uniform(0.01, 0.3),
            'max_depth': scipy.stats.randint(3, 10),
            'one_hot_max_size': [30],
            'l2_leaf_reg': scipy.stats.reciprocal(a=1e-2, b=1e1),
        }
        _verbose = 0 if mute else 1
        self.params.update({'use_best_model': False})
        _model = CatBoostClassifier(**self.params)
        catboost_search = build_search(_model,
                                       param_distributions=param_distributions,
                                       n_iter=n_iter,
                                       verbose=_verbose,
                                       cv=RepeatedStratifiedKFold(
                                           n_splits=n_splits,
                                           n_repeats=1,
                                           random_state=self.random_state))
        catboost_cv_results = run_search(self.X,
                                         self.y,
                                         search=catboost_search,
                                         mute=mute)
        best_estimator = catboost_search.best_estimator_
        if not mute:
            print(best_estimator.get_params())
        return catboost_cv_results, best_estimator

    def __getattr__(self, attr):
        """ Pass all other method calls to self.model. """
        # FIX: __getattr__ runs only when normal lookup fails. If `model`
        # itself is missing (before __init__ completes, or on an instance
        # made with __new__/copy/pickle), `self.model` would re-enter this
        # method forever and end in RecursionError.
        if attr == "model":
            raise AttributeError(attr)
        return getattr(self.model, attr)
class CBModel(MetaModel):
    """CatBoost classifier wrapper with hyperopt tuning and early stopping.

    In the exploration round the model tunes ``learning_rate``, ``depth``
    and ``l2_leaf_reg`` with hyperopt, then picks the number of boosting
    iterations by early stopping on a held-out split. The resulting
    ``self.hyperparams`` are used for all subsequent fits. For multi-label
    problems one binary classifier per class is kept in ``self.models``;
    otherwise a single classifier is kept in ``self._model``.
    """

    def __init__(self):
        super(CBModel, self).__init__()
        self.max_run = 2
        self.all_data_round = 1
        self.explore_params_round = 0
        self.not_gain_threhlod = 3
        self.patience = 3

        self.is_init = False

        self.name = 'cb'
        self.type = 'tree'

        self._model = None

        # Upper bound on boosting iterations used while searching for the
        # early-stopping point; kept separate from hyperparams so that the
        # tuned value stored there never shrinks the search ceiling.
        self.max_iterations = 1200

        # Fixed constructor arguments shared by every CatBoostClassifier.
        self.params = {
            'task_type': 'GPU',
            "loss_function": "MultiClass",
            "random_seed": CONSTANT.SEED,
            'verbose': False
        }

        # Tunable arguments; these override ``self.params`` on key clashes.
        self.hyperparams = {
            "learning_rate": 0.02,
            'iterations': self.max_iterations,
        }

        self.is_multi_label = None

        self.num_class = None

        self.models = {}

    def init_model(self, num_class, **kwargs):
        """Record the number of classes and mark the model as initialised."""
        self.is_init = True
        self.num_class = num_class

    #@timeit
    def epoch_train(self,
                    dataloader,
                    run_num,
                    is_multi_label=None,
                    info=None,
                    time_remain=None):
        """Run one training round.

        Args:
            dataloader: Mapping with keys ``'X'``, ``'y'``, ``'train_idxs'``
                and ``'cat_cols'`` (plus ``'all_train_idxs'`` in the
                all-data round).
            run_num: Round index; compared against
                ``self.explore_params_round`` and ``self.all_data_round``.
            is_multi_label: Whether to train one binary model per class.
            info: Shared state dict. Keys ``'mode'`` and ``'imp_cols'`` are
                read and ``'cb'`` is read/written, so it must be provided
                despite the ``None`` default.
            time_remain: Unused.
        """
        self.is_multi_label = is_multi_label
        X, y = dataloader['X'], dataloader['y']
        train_idxs, cat = dataloader['train_idxs'], dataloader['cat_cols']
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

        if info['mode'] == 'bagging':
            # Re-use the previously tuned hyperparams with a fresh seed and
            # force the exploration branch below.
            self.hyperparams = info['cb'].copy()
            self.hyperparams['random_seed'] = np.random.randint(0, 2020)
            run_num = self.explore_params_round

        if run_num == self.explore_params_round:
            print('cb explore_params_round')
            train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)

            self.import_cols = info['imp_cols']

            # Keep the search cheap: cap the width at 300 columns and the
            # height at 20000 rows. Column capping does not change the row
            # count, so the two checks are independent.
            if train_x.shape[1] > 300:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round sample 300 cols')
            if train_x.shape[0] > 20000:
                # Reset the index so sampled labels line up positionally
                # with ``train_y``.
                train_x = train_x.reset_index(drop=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round sample 2w samples')

            self.bayes_opt(train_x, val_x, train_y, val_y, cat)
            self.early_stop_opt(train_x, val_x, train_y, val_y, cat)

            info['cb'] = self.hyperparams.copy()

            # Discard the down-sampled split; the final fit uses all rows
            # selected by ``train_idxs``.
            train_x, train_y = X.loc[train_idxs], y[train_idxs]

        if run_num == self.all_data_round:
            print('cb all data round')
            all_train_idxs = dataloader['all_train_idxs']
            train_x = X.loc[all_train_idxs]
            train_y = y[all_train_idxs]

        fit_params = {**self.params, **self.hyperparams}
        if self.is_multi_label:
            for cls in range(self.num_class):
                cls_y = train_y[:, cls]
                self.models[cls] = CatBoostClassifier(**fit_params)
                self.models[cls].fit(train_x, cls_y)
        else:
            self._model = CatBoostClassifier(**fit_params)
            self._model.fit(train_x, ohe2cat(train_y))

    def _predict_per_class(self, features, n_classes):
        """Stack the positive-class probability of each per-class model."""
        columns = [
            self.models[cls].predict_proba(features)[:, 1]
            for cls in range(n_classes)
        ]
        return np.stack(columns, axis=1)

    #@timeit
    def epoch_valid(self, dataloader):
        """Return the ROC AUC on the rows selected by ``'val_idxs'``."""
        X, y, val_idxs = dataloader['X'], dataloader['y'], dataloader[
            'val_idxs']
        val_x, val_y = X.loc[val_idxs], y[val_idxs]
        if not self.is_multi_label:
            preds = self._model.predict_proba(val_x)
        else:
            preds = self._predict_per_class(val_x, y.shape[1])
        valid_auc = roc_auc_score(val_y, preds)
        return valid_auc

    #@timeit
    def predict(self, dataloader):
        """Return class probabilities for the rows selected by ``'test_idxs'``."""
        X, test_idxs = dataloader['X'], dataloader['test_idxs']
        test_x = X.loc[test_idxs]
        if not self.is_multi_label:
            return self._model.predict_proba(test_x)
        return self._predict_per_class(test_x, self.num_class)

    #@timeit
    def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories):
        """Tune ``learning_rate``, ``depth`` and ``l2_leaf_reg`` with hyperopt.

        Each trial trains for 300 iterations and is scored by ROC AUC on the
        evaluation split; the best sampled values are merged into
        ``self.hyperparams``. ``categories`` is accepted but unused.
        """
        if self.is_multi_label:
            # Tune on a single label column as a proxy for all of them.
            y_train = y_train[:, 1]
            y_eval = y_eval[:, 1]
        else:
            # y_eval is left as-is and scored against the full probability
            # matrix below.
            y_train = ohe2cat(y_train)

        space = {
            "learning_rate":
            hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
            "depth":
            hp.choice("depth", [4, 6, 8, 10, 12]),
            "l2_leaf_reg":
            hp.uniform('l2_leaf_reg', 0.1, 2),
        }

        def objective(hyperparams):
            # Overlay the sampled point on the current hyperparams. The
            # original discarded ``hyperparams`` here, so every trial
            # trained the same configuration.
            trial_params = {
                **self.hyperparams,
                **hyperparams,
                'iterations': 300,
            }
            model = CatBoostClassifier(**{**self.params, **trial_params})
            model.fit(X_train, y_train)
            pred = model.predict_proba(X_eval)

            if self.is_multi_label:
                score = roc_auc_score(y_eval, pred[:, 1])
            else:
                score = roc_auc_score(y_eval, pred)

            return {'loss': -score, 'status': STATUS_OK}

        trials = Trials()
        best = hyperopt.fmin(fn=objective,
                             space=space,
                             trials=trials,
                             algo=tpe.suggest,
                             max_evals=15,
                             verbose=1,
                             rstate=np.random.RandomState(1))

        self.hyperparams.update(space_eval(space, best))
        log("auc = {}, hyperparams: {}".format(
            -trials.best_trial['result']['loss'], self.hyperparams))

    def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
        """Choose the number of boosting iterations by early stopping.

        Trains with up to ``self.max_iterations`` iterations, stopping after
        20 rounds without improvement on the evaluation split, and stores
        the resulting tree count in ``self.hyperparams['iterations']``.
        ``categories`` is accepted but unused.
        """
        if self.is_multi_label:
            y_train = y_train[:, 1]
            y_eval = y_eval[:, 1]
        else:
            y_train = ohe2cat(y_train)
            y_eval = ohe2cat(y_eval)

        search_params = {
            **self.params,
            **self.hyperparams,
            'iterations': self.max_iterations,
        }
        model = CatBoostClassifier(**search_params)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_eval, y_eval)],
                  use_best_model=True,
                  verbose=10,
                  early_stopping_rounds=20)

        best_iteration = model.best_iteration_
        if best_iteration is not None:
            # Must live in ``hyperparams``: a value in ``self.params`` is
            # overridden by ``hyperparams['iterations']`` at fit time.
            # ``best_iteration_`` is a zero-based index, hence the +1.
            self.hyperparams['iterations'] = best_iteration + 1
        log('best iterations: {}'.format(best_iteration))

    def split_data(self, x, y):
        """Split ``x``/``y`` into a stratified 80/20 train/validation pair.

        Five shuffled splits are generated and kept in ``self.splits``; the
        first one is returned as
        ``(train_x, train_y, val_x, val_y)``.
        """
        # Work on a re-indexed copy so positional split indices can be used
        # as labels with ``.loc``.
        new_x = x.reset_index(drop=True)
        sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
        self.splits = {
            i: [train_idxs, val_idxs]
            for i, (train_idxs, val_idxs) in enumerate(sss.split(new_x, y))
        }
        first_train, first_val = self.splits[0]
        new_train_x = new_x.loc[first_train]
        new_train_y = y[first_train]

        new_val_x = new_x.loc[first_val]
        new_val_y = y[first_val]

        return new_train_x, new_train_y, new_val_x, new_val_y