from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score

# Hold out 25% of the data for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25)

xgb = XGBClassifier()
# Baseline without hyper-parameter search:
# xgb.fit(X_train, y_train)
# print('Accuracy ', accuracy_score(y_test, xgb.predict(X_test)))

# Grid search over tree count and depth with stratified 5-fold CV.
cv = StratifiedKFold(n_splits=5, shuffle=True)
param = {
    'n_estimators': list(range(10, 201, 20)),
    'max_depth': list(range(1, 5))
}
grid = GridSearchCV(xgb, param_grid=param, cv=cv, scoring='accuracy', verbose=1)
grid.fit(X_train, y_train)

model = grid.best_estimator_
accuracy = accuracy_score(y_test, grid.predict(X_test))
print(accuracy)
save_model(model_name='xgb_new', model=model, accuracy=accuracy, features=features)
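# save_model is a project helper that is not defined in this section. A minimal
# sketch of what it might look like, assuming it simply pickles the estimator and
# writes the accuracy and feature list next to it; the '<name>.pkl' /
# '<name>_meta.txt' file layout is an assumption, not the project's actual scheme.
import pickle

def save_model(model_name, model, accuracy, features):
    with open(model_name + '.pkl', 'wb') as f:
        pickle.dump(model, f)
    with open(model_name + '_meta.txt', 'w') as f:
        f.write('accuracy: {}\n'.format(accuracy))
        f.write('features: {}\n'.format(', '.join(features)))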
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
# stacking is assumed to be vecstack's functional API.
from vecstack import stacking

# Load the previously saved base models from the 'full/' directory.
path_to_models = ['full/' + name for name in path_to_models]
models = []
for name in path_to_models:
    with open(name, 'rb') as f:
        models.append(pickle.load(f))

# Keep only the selected features and rebuild a single frame for training.
df_base = df_base.loc[:, features]
df_base.reset_index(inplace=True, drop=True)
df_non = df_non.loc[:, features]
df_label_0 = df_label_0.loc[:, features]
df_label_0.reset_index(inplace=True, drop=True)
df = pd.concat([df_base, df_label_0, df_non], ignore_index=True)

X, y = df.drop('y', axis=1).values, df['y'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)  # not used by stacking, which builds its own folds

# Out-of-fold predictions of the base models become the meta-features.
S_train, S_test = stacking(models, X_train, y_train, X_test,
                           regression=False, mode='oof_pred_bag', needs_proba=False,
                           save_dir=None, metric=accuracy_score, n_folds=5,
                           stratified=True, shuffle=True, verbose=2)

# Logistic regression as the meta-model on top of the base-model predictions.
model = LogisticRegression()
model.fit(S_train, y_train)
accuracy = accuracy_score(y_test, model.predict(S_test))
print(accuracy)

meta_feat = ['rf_predict', 'xgb_predict', 'xgb_new_predict', 'xgb_new_model_predict']
save_model('Meta_model', model, accuracy, meta_feat)
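# How the saved ensemble would be applied to new data: each base model contributes
# one column of predictions (the same layout as S_train / S_test above), and the
# logistic-regression meta-model runs on top of those columns. The file paths and
# the helper names below are assumptions for illustration, not part of the project.
import pickle
import numpy as np

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def predict_with_stack(X_new, base_model_paths, meta_model_path):
    base_models = [load_pickle(p) for p in base_model_paths]
    meta_model = load_pickle(meta_model_path)
    # One prediction column per base model, in the same order as during stacking.
    S_new = np.column_stack([m.predict(X_new) for m in base_models])
    return meta_model.predict(S_new)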
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

df = pd.concat([df_base, df_label_0, df_non], ignore_index=True)
X, y = df.drop('y', axis=1).values, df['y'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# Randomized search over tree shape; the forest size itself stays fixed at 1000.
forest = RandomForestClassifier(n_estimators=1000)
params = {
    'max_depth': list(range(3, 12)),
    'min_samples_split': list(range(2, 6)),
    'min_samples_leaf': list(range(1, 4))
}
search = RandomizedSearchCV(estimator=forest, param_distributions=params,
                            n_iter=50, scoring='accuracy', cv=cv)
print("TRAINING")
search.fit(X_train, y_train)

model = search.best_estimator_
print("BEST ESTIMATOR {}".format(model))
accuracy = accuracy_score(y_test, model.predict(X_test))
print(accuracy)
save_model(model_name='rf', model=model, accuracy=accuracy, features=features)
from sklearn.metrics import recall_score

# Same setup as above, but with the extra dataframes (df4 to df9) added and the
# search optimised for recall instead of accuracy.
df_base = df_base.loc[:, features]
df_base.reset_index(inplace=True, drop=True)
df_non = df_non.loc[:, features]
df_label_0 = df_label_0.loc[:, features]
df_label_0.reset_index(inplace=True, drop=True)
df = pd.concat([df_base, df_label_0, df_non, df4, df5, df6, df7, df8, df9], ignore_index=True)

X, y = df.drop('y', axis=1).values, df['y'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.25)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

forest = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
params = {
    'max_depth': list(range(3, 12)),
    'min_samples_split': list(range(2, 6)),
    'min_samples_leaf': list(range(1, 4))
}
search = RandomizedSearchCV(estimator=forest, param_distributions=params,
                            n_iter=50, scoring='recall', cv=cv)
print("TRAINING")
search.fit(X_train, y_train)

model = search.best_estimator_
print("BEST ESTIMATOR {}".format(model))
# Per-class recall on the hold-out set (average=None returns one value per class).
recall = recall_score(y_test, model.predict(X_test), average=None)
print(recall)
save_model(model_name='rf_fake_without_cw', model=model, accuracy=None, features=features)
import numpy as np

df_non = df_non.loc[:, features]
df_label_0 = df_label_0.loc[:, features]
df = pd.concat([df_base, df_label_0, df_non, df4, df5, df6, df7, df8, df9], ignore_index=True)
X, y = df.drop('y', axis=1).values, df['y'].values

# Up-weight the positive class by a factor of 1.5 so the recall-oriented model
# pays more attention to it during training.
weights = np.ones(X.shape[0])
weights[y == 1] = 1.5
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
    X, y, weights, shuffle=True, test_size=0.25)

xgb = XGBClassifier()
# Baseline without hyper-parameter search:
# xgb.fit(X_train, y_train)
# print('Accuracy ', accuracy_score(y_test, xgb.predict(X_test)))

cv = StratifiedKFold(n_splits=5, shuffle=True)
param = {
    'n_estimators': list(range(10, 51, 5)),
    'max_depth': list(range(1, 5))
}
grid = GridSearchCV(xgb, param_grid=param, cv=cv, scoring='recall', verbose=1)
# sample_weight is forwarded to XGBClassifier.fit during the CV fits.
grid.fit(X_train, y_train, sample_weight=weights_train)

model = grid.best_estimator_
# Per-class recall on the hold-out set.
recall = recall_score(y_test, grid.predict(X_test), average=None)
print(recall)
save_model('xgb_recall', model, None, features)
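# Design note: an alternative to per-sample weights here is XGBoost's built-in
# scale_pos_weight parameter, which scales the weight of the positive class in the
# loss. A rough equivalent of the 1.5 factor above (sketch, not from the original code):
#
#     xgb = XGBClassifier(scale_pos_weight=1.5)
#     grid = GridSearchCV(xgb, param_grid=param, cv=cv, scoring='recall', verbose=1)
#     grid.fit(X_train, y_train)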