def balance_classes(X, X_y, X_strat, target, train_cols, method):
    """Rebalance the classes of ``X`` using the strategy named by ``method``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame to resample.
    X_y : pandas.Series
        Target values aligned with ``X`` (re-derived from the resampled
        frame before returning).
    X_strat : pandas.Series
        Stratification column; its string form is used as the resampling
        label, and its name is written back as a column of the result.
    target : str
        Name of the column extracted as the returned target.
    train_cols : list
        Unused here; kept for interface compatibility.
    method : str
        One of ``"over"``, ``"under"`` or ``"smote"``. Any other value
        leaves ``X`` unresampled.

    Returns
    -------
    tuple(pandas.DataFrame, pandas.Series)
        The resampled frame and the ``target`` column taken from it.
    """
    y = X_strat.astype(str)
    x = X.copy()
    if method == "over":
        from imblearn.over_sampling import RandomOverSampler
        # FIX: fit_sample was removed in imbalanced-learn 0.8;
        # fit_resample is the supported API and preserves DataFrames.
        x, y = RandomOverSampler().fit_resample(x, y)
    elif method == "under":
        from imblearn.under_sampling import RandomUnderSampler
        x, y = RandomUnderSampler().fit_resample(x, y)
    elif method == "smote":
        from imblearn.over_sampling import SMOTE
        # SMOTE interpolates numeric columns only; keep the rest aside
        # and re-attach them after resampling.
        numeric_cols = x.select_dtypes(include=[
            "uint8", "int16", "int32", "int64",
            "float16", "float32", "float64",
        ]).columns
        # FIX: the original used set(X_strat.name), which builds a set of
        # the *characters* of the column name; {X_strat.name} excludes the
        # whole stratification column as intended.
        features = list(set(numeric_cols) - {X_strat.name})
        no_action = list(set(x.columns) - set(features))
        x_no_action = x[no_action].copy()
        x, y = SMOTE().fit_resample(x[features].copy(), y)
        # NOTE(review): the resampled frame is longer than x_no_action and
        # has a reset index — rows beyond the original length get NaN in
        # these columns; confirm this matches callers' expectations.
        x[no_action] = x_no_action[no_action]
    x[X_strat.name] = y
    X = x.copy()
    # NOTE(review): assumes `target` exists in the resampled frame (e.g.
    # target == X_strat.name, or target was among the non-numeric
    # columns) — confirm against callers.
    X_y = x[target]
    gc.collect()  # free the intermediate copies eagerly
    return X, X_y
def train(self):
    """Tune, fit and persist a LightGBM classifier on this instance's data.

    Splits ``self.df_train_features`` / ``self.df_train_label``, random-
    oversamples the training fold, runs a randomized hyper-parameter
    search, refits a final model with a decaying learning rate, saves it
    under ``MODELS_DIR``, and returns evaluation metrics.

    Returns
    -------
    dict
        ``accuracy``, ``roc``, ``cm`` (row-normalized confusion matrix)
        and ``feature_importance`` (DataFrame of Value/Feature pairs).
    """
    # NOTE(review): early stopping evaluates on the held-out test fold
    # (eval_set below), which leaks test data into model selection —
    # consider a separate validation split; left unchanged here.
    fit_params = {
        'early_stopping_rounds': 50,
        'eval_metric': 'auc',
        'eval_names': ['valid'],
        'verbose': 100,
        'categorical_feature': 'auto',
    }
    # Search space for the randomized hyper-parameter search.
    param_test = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
    }
    # Split data.
    X_train, X_test, y_train, y_test = train_test_split(
        self.df_train_features,
        self.df_train_label,
        test_size=self.test_size)
    # FIX: the original built a seeded sampler but resampled with a
    # fresh unseeded one, discarding the reproducibility seed; use the
    # seeded instance.
    ros = RandomOverSampler(random_state=0)
    X_train, y_train = ros.fit_resample(X_train, y_train)
    clf = lgb.LGBMClassifier(max_depth=-1,
                             random_state=314,
                             silent=True,
                             metric='None',
                             n_jobs=4,
                             n_estimators=900)
    # Set randomized-search optimization params.
    random_search = RandomizedSearchCV(estimator=clf,
                                       param_distributions=param_test,
                                       scoring='roc_auc',
                                       cv=5,
                                       refit=True,
                                       random_state=314,
                                       verbose=False)
    fit_params = {**fit_params, **{'eval_set': [(X_test, y_test)]}}
    # Fit random search.
    random_search.fit(X_train.copy(), y_train.copy(), **fit_params)
    # Build final classifier from the best found configuration, refit
    # with a decaying learning-rate schedule.
    clf_final = lgb.LGBMClassifier(
        **random_search.best_estimator_.get_params())
    clf_final.fit(
        X_train,
        y_train,
        **fit_params,
        callbacks=[
            lgb.reset_parameter(
                learning_rate=self.learning_rate_010_decay_power_0995)
        ])
    # Save model.
    joblib.dump(clf_final, os.path.join(MODELS_DIR, "{}.pkl".format("ML")))
    predicted = clf_final.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, predicted),
        "roc": roc_auc_score(y_test, predicted),
        "cm": confusion_matrix(y_test, predicted, normalize="true"),
        "feature_importance": pd.DataFrame(sorted(
            zip(clf_final.feature_importances_, X_test.columns)),
            columns=['Value', 'Feature'])
    }
def train(self):
    """Tune, fit and persist a LightGBM classifier using ``config`` settings.

    Same pipeline as the inline-parameter variant, but the search space
    (``config.param_test``), fit parameters (``config.fit_params``) and
    model directory (``config.MODELS_DIR``) come from the project config,
    and the base estimator gets a larger n_estimators budget (early
    stopping trims it).

    Returns
    -------
    dict
        ``accuracy``, ``roc``, ``cm`` (row-normalized confusion matrix)
        and ``feature_importance`` (DataFrame of Value/Feature pairs).
    """
    # Split data.
    X_train, X_test, y_train, y_test = train_test_split(
        self.df_train_features,
        self.df_train_label,
        test_size=self.test_size)
    # FIX: the original built a seeded sampler but resampled with a
    # fresh unseeded one, discarding the reproducibility seed; use the
    # seeded instance.
    ros = RandomOverSampler(random_state=0)
    X_train, y_train = ros.fit_resample(X_train, y_train)
    clf = lgb.LGBMClassifier(max_depth=-1,
                             random_state=314,
                             silent=True,
                             metric='None',
                             n_jobs=4,
                             n_estimators=10000)
    # Set randomized-search optimization params.
    random_search = RandomizedSearchCV(estimator=clf,
                                       param_distributions=config.param_test,
                                       scoring='roc_auc',
                                       cv=5,
                                       refit=True,
                                       random_state=314,
                                       verbose=False)
    # NOTE(review): early stopping evaluates on the held-out test fold,
    # which leaks test data into model selection — consider a separate
    # validation split; left unchanged here.
    fit_params = {**config.fit_params, **{'eval_set': [(X_test, y_test)]}}
    # Fit random search.
    random_search.fit(X_train.copy(), y_train.copy(), **fit_params)
    # Build final classifier from the best found configuration, refit
    # with a decaying learning-rate schedule.
    clf_final = lgb.LGBMClassifier(
        **random_search.best_estimator_.get_params())
    clf_final.fit(
        X_train,
        y_train,
        **fit_params,
        callbacks=[
            lgb.reset_parameter(
                learning_rate=self.learning_rate_010_decay_power_0995)
        ])
    # Save model.
    joblib.dump(clf_final,
                os.path.join(config.MODELS_DIR, "{}.pkl".format("ML")))
    predicted = clf_final.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, predicted),
        "roc": roc_auc_score(y_test, predicted),
        "cm": confusion_matrix(y_test, predicted, normalize="true"),
        "feature_importance": pd.DataFrame(sorted(
            zip(clf_final.feature_importances_, X_test.columns)),
            columns=['Value', 'Feature'])
    }