def main():
    """Fit (or reload) the SC02 CatBoost classifier and apply it to SC03.

    Files written under ``CUR_DIR``:

    * ``sc02-best-model.cbm`` -- only when it does not exist yet; if it is
      already on disk the stored model is loaded and no training happens.
    * ``sc03-preds.csv`` -- result of ``predict`` on the SC03 data.
    * ``sc02-model-features.csv`` -- features whose importance is strictly
      positive, ordered from most to least important.
    """
    X, y = utils.load_10x(path('SC02'), 'SC02')

    hyper_params = dict(
        l2_leaf_reg=3,
        learning_rate=0.445,
        depth=10,
        iterations=200,
        random_seed=42,
        logging_level='Silent',
        loss_function='MultiClass',
        eval_metric='TotalF1',
        thread_count=20,
    )
    best_model = catboost.CatBoostClassifier(**hyper_params)

    model_path = os.path.join(CUR_DIR, 'sc02-best-model.cbm')
    if not os.path.exists(model_path):
        # Third positional argument marks the last column as categorical.
        # NOTE(review): presumably that column is 'Batch' -- confirm
        # against utils.load_10x.
        last_column = X.shape[1] - 1
        best_model.fit(X, y, [last_column])
        best_model.save_model(model_path)
    else:
        best_model.load_model(model_path)

    # Label the SC03 cells using the SC02 feature set.
    sc03, _ = utils.load_10x(path('SC03'), 'SC03')
    sc03_preds = predict(best_model, X.columns, sc03)
    sc03_preds.to_csv(os.path.join(CUR_DIR, 'sc03-preds.csv'))

    # One row per feature, single column named 0 holding its importance.
    importances = pd.DataFrame(best_model._feature_importance, index=X.columns)
    informative = importances[importances[0] > 0]
    ranked = informative.sort_values(0, ascending=False)
    ranked.to_csv(os.path.join(CUR_DIR, 'sc02-model-features.csv'))
def process():
    """Train a regressor that predicts the top class score for SC01 cells.

    The target is the row-wise maximum of the scores stored in
    ``sc01-best-preds.csv``. Hyper-parameters are chosen by
    ``find_best_params`` and the final model is fit on all of ``X``.

    Files written under ``CUR_DIR``:

    * ``sc01-pred-score.cbm`` -- the fitted CatBoost regressor.
    * ``sc01-pred-score-features.csv`` -- features with strictly positive
      importance, sorted in descending order.

    The best score and parameters are printed to stdout.
    """
    X, _ = utils.load_10x(os.path.join(ROOT, 'SC01'), 'SC01v2')
    X.drop(columns=['Batch'], inplace=True)

    predictions_file = os.path.join(CUR_DIR, 'sc01-best-preds.csv')
    predictions = pd.read_csv(predictions_file, index_col=0)
    # Remove the '-1' marker so the index can be looked up with X.index.
    # NOTE(review): presumably this is the 10x barcode suffix -- confirm.
    predictions.index = predictions.index.str.replace('-1', '')

    # Regression target: highest score per cell, aligned to X's row order.
    aligned = predictions.loc[X.index, :]
    y = aligned.max(axis=1)

    params_space = {
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'learning_rate': np.linspace(1e-3, 8e-1, num=10),
        'depth': [6, 8, 10],
    }
    score, best_params = find_best_params(X, y, params_space)
    print('Best params score: {}'.format(score))
    print('Best params: {}'.format(repr(best_params)))

    regressor_settings = dict(
        iterations=500,
        learning_rate=best_params['learning_rate'],
        depth=best_params['depth'],
        l2_leaf_reg=best_params['l2_leaf_reg'],
        random_seed=42,
        logging_level='Silent',
        thread_count=20,
        loss_function='MAE',
    )
    model = catboost.CatBoostRegressor(**regressor_settings)
    model.fit(X, y)
    model.save_model(os.path.join(CUR_DIR, 'sc01-pred-score.cbm'))

    # One row per feature, single column named 0 holding its importance.
    importances = pd.DataFrame(model._feature_importance, index=X.columns)
    informative = importances[importances[0] > 0]
    ranked = informative.sort_values(0, ascending=False)
    ranked.to_csv(os.path.join(CUR_DIR, 'sc01-pred-score-features.csv'))
def main():
    """Load the stored SC02v2 classifier and write its SC01 predictions.

    The model must already exist at ``CUR_DIR/sc02v2-best-model.cbm``; no
    training is performed here. SC02v2 is loaded only to obtain the feature
    columns handed to ``predict``. Output goes to ``CUR_DIR/sc01-preds.csv``.
    """
    reference, _ = utils.load_10x(path('SC02'), 'SC02v2')
    feature_columns = reference.columns

    best_model = catboost.CatBoostClassifier(
        **dict(
            l2_leaf_reg=7,
            learning_rate=0.622,
            depth=10,
            iterations=200,
            random_seed=42,
            logging_level='Silent',
            loss_function='MultiClass',
            eval_metric='TotalF1',
            thread_count=20,
        )
    )
    best_model.load_model(os.path.join(CUR_DIR, 'sc02v2-best-model.cbm'))

    sc01, _ = utils.load_10x(path('SC01'), 'SC01v2')
    sc01_preds = predict(best_model, feature_columns, sc01)
    sc01_preds.to_csv(os.path.join(CUR_DIR, 'sc01-preds.csv'))
def main():
    """Build the SC03 classifier from the best grid-search run and apply it.

    Steps:

    1. Read ``search-1-of-5.csv`` .. ``search-5-of-5.csv`` from ``CUR_DIR``
       and take the parameters of the row with the highest ``score``.
    2. Fit a CatBoost classifier on SC03 with those parameters, or load
       ``sc03-model.cbm`` if it already exists.
    3. Write ``sc03-features.csv`` (features with importance > 0, sorted
       descending), then ``sc01-preds.csv`` and ``sc02-preds.csv``.
    """
    search_files = [
        os.path.join(CUR_DIR, './search-{}-of-5.csv'.format(i))
        for i in range(1, 6)
    ]
    runs = pd.concat(
        [pd.read_csv(csv_file, index_col=0) for csv_file in search_files],
        ignore_index=True,
    )
    best_row = runs['score'].idxmax(axis=0)
    # NOTE(review): eval() executes whatever text is stored in the 'params'
    # column. That is acceptable only while these CSVs come from our own
    # search runs; never point this at files from an untrusted source.
    best_params = eval(runs['params'][best_row])

    X, y = utils.load_10x(path('SC03'), 'SC03')
    best_model = catboost.CatBoostClassifier(
        l2_leaf_reg=best_params['l2_leaf_reg'],
        learning_rate=best_params['learning_rate'],
        depth=best_params['depth'],
        iterations=200,
        random_seed=42,
        logging_level='Silent',
        loss_function='MultiClass',
        eval_metric='TotalF1',
        thread_count=20,
    )

    model_path = os.path.join(CUR_DIR, 'sc03-model.cbm')
    if not os.path.exists(model_path):
        # Third positional argument marks the last column as categorical.
        last_column = X.shape[1] - 1
        best_model.fit(X, y, [last_column])
        best_model.save_model(model_path)
    else:
        best_model.load_model(model_path)

    # One row per feature, single column named 0 holding its importance.
    importances = pd.DataFrame(best_model._feature_importance, index=X.columns)
    informative = importances[importances[0] > 0]
    informative.sort_values(0, ascending=False).to_csv(
        os.path.join(CUR_DIR, 'sc03-features.csv'))

    # Label the other two datasets, in the same order as before: SC01, SC02.
    targets = (
        ('SC01', 'SC01v2', 'sc01-preds.csv'),
        ('SC02', 'SC02v2', 'sc02-preds.csv'),
    )
    for dataset, version, output_name in targets:
        cells, _ = utils.load_10x(path(dataset), version)
        preds = predict(best_model, X.columns, cells)
        preds.to_csv(os.path.join(CUR_DIR, output_name))
def main(splits, current_split):
    """Run one shard of the CatBoost grid search on SC02v2 and print the result.

    The data is split 90/10 (stratified on ``y``, ``random_state=42``) and
    the search runs with 5-fold CV on the 90% part only.

    Args:
        splits: Forwarded unchanged to ``catboost_GridSearchCV`` as
            ``splits``. NOTE(review): presumably the total number of shards
            the grid is divided into -- confirm against that helper.
        current_split: Forwarded unchanged as ``current_split``; presumably
            the shard this invocation handles.

    Prints the best score followed by the best parameters.
    """
    X, y = utils.load_10x(path('SC02'), 'SC02v2')

    # The 10% hold-out is never evaluated in this function, so it is
    # discarded here and not bound to unused names. The split itself must
    # stay: it determines which rows the search sees.
    X_train, _, y_train, _ = train_test_split(
        X, y,
        test_size=0.1,
        stratify=y,
        random_state=42,
    )

    params_space = {
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'learning_rate': np.linspace(1e-3, 8e-1, num=10),
        'depth': [6, 8, 10],
    }
    score, best_params = catboost_GridSearchCV(
        X_train, y_train, params_space,
        cv=5, splits=splits, current_split=current_split)
    print(score)
    print(best_params)
def main(splits, current_split):
    """Run one shard of the CatBoost grid search on SC03 and save its log.

    Args:
        splits: Forwarded to ``catboost_GridSearchCV`` as ``splits`` and
            used in the output file name.
        current_split: Forwarded as ``current_split`` and used in the
            output file name.

    Writes ``CUR_DIR/search-<current_split>-of-<splits>.csv`` with the
    columns ``params``, ``score`` and ``time``.
    """
    X, y = utils.load_10x(path('SC03'), 'SC03')

    params_space = {
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'learning_rate': np.linspace(1e-3, 8e-1, num=10),
        'depth': [6, 8, 10],
    }

    # NOTE(review): presumably catboost_GridSearchCV appends one
    # (params, score, time) entry per evaluated combination to this list --
    # confirm against that helper.
    record = []
    # The returned best score and parameters are not used here; only the
    # per-combination record is persisted.
    _best_score, _best_params = catboost_GridSearchCV(
        X, y, params_space, record,
        cv=5, splits=splits, current_split=current_split)

    output_name = 'search-{}-of-{}.csv'.format(current_split, splits)
    search_log = pd.DataFrame(record, columns=['params', 'score', 'time'])
    search_log.to_csv(os.path.join(CUR_DIR, output_name))
def get_query():
    """Return the SC03 dataset, loading it on first use only.

    The result of ``utils.load_10x`` is stored in the module-level global
    ``_query``; later calls return that stored object without reloading.
    """
    global _query
    # Already loaded by an earlier call: hand back the cached object.
    if '_query' in globals():
        return _query
    _query = utils.load_10x(os.path.join(ROOT, 'SC03'), 'SC03')
    return _query
def get_reference():
    """Return the SC02v2 dataset, loading it on first use only.

    The result of ``utils.load_10x`` is stored in the module-level global
    ``_reference``; later calls return that stored object without reloading.
    """
    global _reference
    # Already loaded by an earlier call: hand back the cached object.
    if '_reference' in globals():
        return _reference
    _reference = utils.load_10x(os.path.join(ROOT, 'SC02'), 'SC02v2')
    return _reference