# Split into training and testing sets
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import numpy as np

seed = 7
test_size = 0.3
X = credit_data.loc[:, credit_data.columns != 'default payment next month']
y = credit_data[['default payment next month']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=seed)

# flat label array, taken from the last column of the frame
colnames = credit_data.columns
y = np.array(credit_data[colnames[-1]])

# XGBoost
# fit model on training data
model = XGBClassifier()
model.fit(X_train, y_train)

# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# tune parameters
# use Ctrl+1 to comment/uncomment the selection
# from pandas.core.categorical import Categorical
# from scipy.sparse import csr_matrix
############################################################
# filename = r'C:\Users\Admin\Downloads\bestRFmodelV1.sav'
# loadedModel = pickle.load(open(filename, 'rb'))
# loadedModel
############################################################

#############################################################################
## BASELINE TESTS ##
#############################################################################
Xtrain, Xtest, ytrain, ytest = preProcessingPipeline(
    set(allFeatures) - set(['Weight', 'Height']))

# Compute baseline performance (accuracy on the test set) for each model type:
acc_rf = compute_performance_Array(
    RandomForestClassifier(random_state=1).fit(Xtrain, ytrain).predict(Xtest),
    ytest)  # RandomForestClassifier
acc_lg = compute_performance_Array(
    LogisticRegression(random_state=1).fit(Xtrain, ytrain).predict(Xtest),
    ytest)  # LogisticRegression
acc_nn = compute_performance_Array(
    KNeighborsClassifier().fit(Xtrain, ytrain).predict(Xtest),
    ytest)  # KNeighborsClassifier
acc_gb = compute_performance_Array(
    XGBClassifier(random_state=1).fit(Xtrain, ytrain).predict(Xtest),
    ytest)  # XGBClassifier
acc_nb = compute_performance_Array(
    GaussianNB().fit(Xtrain, ytrain).predict(Xtest),
    ytest)  # GaussianNB

print("Random Forest Accuracy:", acc_rf)
print("Logistic Regression Accuracy:", acc_lg)
print("k-Nearest Neighbours Accuracy:", acc_nn)
print("XGBoost Accuracy:", acc_gb)
print("Naive Bayes Accuracy:", acc_nb)

#############################################################################
## FEATURE TUNING ##
#############################################################################
feature_results_rf = recursiveFeatureSearch(
    RandomForestClassifier(random_state=1),
    list(set(allFeatures) - set(['Weight', 'Height'])))
feature_results_gb = recursiveFeatureSearch(
    XGBClassifier(random_state=1),
    list(set(allFeatures) - set(['Weight', 'Height'])))
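# compute_performance_Array is called above but not defined in this snippet;
# a minimal sketch, assuming it simply returns test-set accuracy:
from sklearn.metrics import accuracy_score

def compute_performance_Array(predictions, ytest):
    # hypothetical stand-in: fraction of correct predictions on the test set
    return accuracy_score(ytest, predictions)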
return float("{0:.2f}".format( (((np.array(x)[:-1] * np.array(x)[1:]) < 0).sum()) / len(x))) grouped = train[features].groupby('id') X_train = grouped.agg(['max', 'min', 'mean', 'mad', q1, q3, IQR, RMS, ZCR]) X_test = test[features].groupby('id').agg( ['max', 'min', 'mean', 'mad', q1, q3, IQR, RMS, ZCR]) y_train = train_label['label'] from xgboost import XGBClassifier xgb_wrapper = XGBClassifier(n_estimators=400, learning_rate=0.3, max_depth=3, min_child_weight=5, gamma=0.3, subsample=0.9, colsample_bytree=0.4) xgb_wrapper.fit(X_train, y_train) w_preds = xgb_wrapper.predict(X_test) w_pred_proba = xgb_wrapper.predict_proba(X_test)[:, 1] y_pred = xgb_wrapper.predict_proba(X_test) submission.iloc[:, 1:] = y_pred submission submission.to_csv('xgboost_q1q3_iqrrmszcr.csv', index=False)
def train():
    print("Starting writing classifier training...")
    if USE_POS_TAG:
        df = pd.read_csv(
            path.join(path.dirname(__file__), 'data/scrapeResultPOS.csv'))
    else:
        df = pd.read_csv(
            path.join(path.dirname(__file__), 'data/scrapeResultCleaned.csv'))

    # missing_rows = []
    # for i in range(len(df)):
    #     if df.loc[i, 'text'] != df.loc[i, 'text']:
    #         missing_rows.append(i)
    # df = df.drop(missing_rows).reset_index().drop(['index', 'id'], axis=1)
    # df = df.drop_duplicates(subset='text', keep='first')
    # df = df.drop_duplicates(subset='link', keep='first')

    count_fake = 0
    count_real = 0
    # cap the number of fake articles at 5000; df.drop() without reassignment
    # is a no-op, so surplus rows are collected first and dropped afterwards
    surplus_fake = []
    for index, row in df.iterrows():
        if row['label'] == 1:
            if count_fake > 5000:
                surplus_fake.append(df.index[index])
                continue
            count_fake += 1
        else:
            count_real += 1
    df = df.drop(surplus_fake)
    print("Number of fake articles is ", count_fake)
    print("Number of real articles is ", count_real)

    # Set `y`
    y = df.label

    # Drop the `label` column (drop() is not in-place, so reassign)
    df = df.drop("label", axis=1)

    # Make training and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(df['text'],
                                                        y,
                                                        test_size=0.2,
                                                        random_state=53)

    # Initialize the `tfidf_vectorizer`
    if USE_POS_TAG:
        vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                     stop_words='english',
                                     min_df=2,
                                     norm='l2',
                                     strip_accents='unicode',
                                     lowercase=True)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True,
                                     ngram_range=(1, 2),
                                     stop_words='english',
                                     max_df=0.8,
                                     min_df=0.01,
                                     max_features=5000,
                                     strip_accents='unicode')

    # Fit and transform the training data
    X_train = vectorizer.fit_transform(X_train)
    # Transform the test set
    X_test = vectorizer.transform(X_test)

    clf = XGBClassifier()
    clf.fit(X_train, Y_train)
    Y_predicted = clf.predict(X_test)
    print("Classification Report Writing")
    print(metrics.classification_report(Y_test, Y_predicted))

    if USE_POS_TAG:
        modelFile = path.join(path.dirname(__file__), "model-POS.xgb")
    else:
        modelFile = path.join(path.dirname(__file__), "model.xgb")
    with open(modelFile, 'wb') as outfile:
        pickle.dump(clf, outfile)

    if USE_POS_TAG:
        vectorizerFile = path.join(path.dirname(__file__),
                                   "vectorizer-POS.tfidf")
    else:
        vectorizerFile = path.join(path.dirname(__file__), "vectorizer.tfidf")
    with open(vectorizerFile, 'wb') as outfile:
        pickle.dump(vectorizer, outfile)

    return clf, vectorizer
mm = MinMaxScaler()
X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)

print('y_train class distribution')
print(y_train.value_counts(normalize=True))
print('y_test class distribution')
print(y_test.value_counts(normalize=True))

scorers = {
    'precision_score': make_scorer(precision_score),
    'f1_score': make_scorer(f1_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}

lgbm = LGBMClassifier()
knn = KNeighborsClassifier()
catb = CatBoostClassifier()
xgb = XGBClassifier()
et = ExtraTreesClassifier()

# ExtraTrees parameters
n_estimators = np.arange(50, 350)
max_depth = np.arange(5, 350)
max_features = ['sqrt', 'log2']
param_grid = dict(n_estimators=n_estimators,
                  max_features=max_features,
                  max_depth=max_depth)

randomized = RandomizedSearchCV(et, param_grid, scoring='f1', cv=2, n_iter=10)
randomized.fit(X_train, y_train)
randomized.best_estimator_

# knn parameters
# k_range = np.arange(1, 100)
# weights = ["uniform", "distance"]
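# A sketch completing the commented-out KNN search above in the same pattern
# (the k_range/weights grid is taken from those comments):
knn_param_grid = dict(n_neighbors=np.arange(1, 100),
                      weights=["uniform", "distance"])
knn_randomized = RandomizedSearchCV(knn, knn_param_grid,
                                    scoring='f1', cv=2, n_iter=10)
knn_randomized.fit(X_train, y_train)
print(knn_randomized.best_estimator_)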
# # print(model)
# preds = model.predict(test[:, 0:(n2 - 1)])
# accuracy = accuracy_score(test[:, (n2 - 1)], preds)

xtrain = train[:, 0:(n2 - 1)]
ytrain = train[:, (n2 - 1)]
xtest = test[:, 0:(n2 - 1)]
ytest = test[:, (n2 - 1)]

model = XGBClassifier(learning_rate=0.1,
                      n_estimators=1000,
                      max_depth=10,
                      min_child_weight=3,
                      gamma=0,
                      subsample=0.8,
                      colsample_bytree=0.8,
                      objective='binary:logistic',
                      nthread=4,
                      scale_pos_weight=1,
                      seed=27)
# note: the model is fitted on the full `data` array, not on xtrain/ytrain
model.fit(data[:, 0:(n2 - 1)], data[:, (n2 - 1)])
preds = model.predict(xtest)
accuracy = accuracy_score(ytest, preds)

df = pd.read_csv('test.csv', delimiter=',')
Embarked_map = {'S': 0, 'C': 1, 'Q': 2}
sex_map = {'male': 1, 'female': 0}
# Visualize the class counts
sns.countplot(df['status'])

# Get the data types
df.dtypes

# Create the feature data set
X = df.drop(['name'], axis=1)
X = np.array(X.drop(['status'], axis=1))
y = np.array(df['status'])

# Split the data into 80% training and 20% testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Scale the feature data to values between 0 and 1
sc = MinMaxScaler(feature_range=(0, 1))
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Create the XGBClassifier
model = XGBClassifier().fit(x_train, y_train)

# Get the model's predictions
predictions = model.predict(x_test)
predictions
y_test

# Get the model's accuracy, precision, recall and F1 score
print(classification_report(y_test, predictions))
print(' ')
print(' ')

from sklearn.model_selection import cross_val_score

# train model with CV of 10
cv_scores = cross_val_score(model, X, y, cv=10)

# Display the results
print('List of Cross-Validation Scores:', cv_scores)
print('Mean of Cross-Validation Scores:{}'.format(np.mean(cv_scores)))

"""# Model 1: XGBoost Classification (Parallel Tree Gradient Boosting)"""

Model = "XGBClassifier()"  # Adds to title in viz
model = XGBClassifier()  # Create the model

train_test_ml_model(X_train, y_train, X_test, Model)
cross_val(X, y, Model)

"""# Model 2: K-Nearest Neighbors Classification (KNN)"""

# Attempt 1: Out of the box
# n_neighbors=5 out of the box
Model = "KNeighborsClassifier"
model = KNeighborsClassifier()

train_test_ml_model(X_train, y_train, X_test, Model)
cross_val(X, y, Model)
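# train_test_ml_model and cross_val are project helpers not shown here; a
# minimal sketch matching the call signatures above, assuming `model` and
# `y_test` are read from the enclosing scope and `Model` only labels output:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

def train_test_ml_model(X_train, y_train, X_test, Model):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(Model, 'test accuracy:', accuracy_score(y_test, preds))

def cross_val(X, y, Model):
    scores = cross_val_score(model, X, y, cv=10)
    print(Model, 'mean CV accuracy:', scores.mean())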
# print(td[col])
X_train = td[[x for x in td.columns if 'class' not in x]]
# print(X_train)
Y_train = td['class']
# print(Y_train)

# label-encode the test frame in the same way as the training frame
le = preprocessing.LabelEncoder()
for col in td1.columns:
    td1[col] = le.fit_transform(td1[col])
X_test = td1[[x for x in td.columns if 'class' not in x]]
# print(X_test)
Y_test = td1['class']
# print(Y_test)

# import xgboost as xgb
from xgboost import XGBClassifier

xgb_data = XGBClassifier().fit(X_train, Y_train)
# print(xgb_data)
xgb_predictions = xgb_data.predict(X_test)
# print(xgb_predictions)

# model accuracy for X_test
acc_train = xgb_data.score(X_train, Y_train)
# print(acc_train)
accuracy = xgb_data.score(X_test, Y_test)
print(accuracy)

# creating a confusion matrix
cm = confusion_matrix(Y_test, xgb_predictions)
# print(cm)
print(classification_report(Y_test, xgb_predictions))
prepare_data(train_values_df, test_values_df, train_labels_df)

# pipeline to place median for NaNs and normalize data
# prepared_X_train_values = feature_pipeline(train_values_df, num_attrib, cat_attrib)
# prepared_X_test_values = feature_pipeline(test_values_df, num_attrib, cat_attrib)
prepared_X_train_values, prepared_test_values = \
    target_encode_multiclass(train_values_df, train_labels_df, test_values_df)

# generating stratified training and validation data sets from sparse matrices
prepared_X_strat_train, y_strat_train_df, prepared_X_strat_val, y_strat_val_df = \
    stratified_shuffle_data_split(prepared_X_train_values, train_labels_df)

# classifiers employed for training
classifier_dict = {
    'xgb_clf': XGBClassifier(n_estimators=500,
                             learning_rate=0.3,
                             colsample_bytree=0.3,
                             subsample=0.3,
                             early_stopping_rounds=50,
                             verbosity=0),
    'sgd_clf': SGDClassifier(loss='modified_huber',
                             n_jobs=-1,
                             early_stopping=True),
    # 'rf_clf': RandomForestClassifier(n_estimators=500, n_jobs=-1),
    'cat_clf': CatBoostClassifier(iterations=2000,
                                  allow_writing_files=False,
                                  learning_rate=0.3,
                                  loss_function='MultiClass',
                                  custom_metric=['Accuracy', 'AUC', 'TotalF1'],
                                  verbose=100),
    'ada_clf': AdaBoostClassifier(n_estimators=100, learning_rate=0.3),
}

# creates list of named classifier tuples for training
clf_list = clf_func(classifier_dict)

# runs actual training on classifiers and outputs results to screen
run_clf(prepared_X_strat_train, prepared_X_strat_val, y_strat_train_df,
        y_strat_val_df, clf_list, model_dir)
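# clf_func is a project helper defined elsewhere; a minimal sketch, assuming
# it just turns the dict into the (name, estimator) tuples that run_clf expects:
def clf_func(classifier_dict):
    return [(name, clf) for name, clf in classifier_dict.items()]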
def get_top_n_features(titanic_train_data_X, titanic_train_data_Y,
                       top_n_features):
    # random forest
    rf_est = RandomForestClassifier(random_state=0)
    rf_param_grid = {
        'n_estimators': [500],
        'min_samples_split': [2, 3],
        'max_depth': [20]
    }
    rf_grid = model_selection.GridSearchCV(rf_est,
                                           rf_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    rf_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best RF Params:' + str(rf_grid.best_params_))
    print('Top N Features Best RF Score:' + str(rf_grid.best_score_))
    print('Top N Features RF Train Score:' +
          str(rf_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_rf = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': rf_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_rf = feature_imp_sorted_rf.head(top_n_features)['feature']
    print('Sample 10 Features from RF Classifier:')
    print(str(features_top_n_rf[:10]))

    # AdaBoost
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param_grid = {'n_estimators': [500], 'learning_rate': [0.01, 0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est,
                                            ada_param_grid,
                                            n_jobs=25,
                                            cv=10,
                                            verbose=1)
    ada_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best Ada Params:' + str(ada_grid.best_params_))
    print('Top N Features Best Ada Score:' + str(ada_grid.best_score_))
    print('Top N Features Ada Train Score:' +
          str(ada_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_ada = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': ada_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_ada = feature_imp_sorted_ada.head(top_n_features)['feature']
    print('Sample 10 Features from Ada Classifier:')
    print(str(features_top_n_ada[:10]))

    # ExtraTrees
    et_est = ExtraTreesClassifier(random_state=0)
    et_param_grid = {
        'n_estimators': [500],
        'min_samples_split': [3, 4],
        'max_depth': [20]
    }
    et_grid = model_selection.GridSearchCV(et_est,
                                           et_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    et_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best ET Params:' + str(et_grid.best_params_))
    print('Top N Features Best ET Score:' + str(et_grid.best_score_))
    print('Top N Features ET Train Score:' +
          str(et_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_et = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': et_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_et = feature_imp_sorted_et.head(top_n_features)['feature']
    print('Sample 10 Features from ET Classifier:')
    print(str(features_top_n_et[:10]))

    # GradientBoosting
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param_grid = {
        'n_estimators': [500],
        'learning_rate': [0.01, 0.1],
        'max_depth': [20]
    }
    gb_grid = model_selection.GridSearchCV(gb_est,
                                           gb_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    gb_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best GB Params:' + str(gb_grid.best_params_))
    print('Top N Features Best GB Score:' + str(gb_grid.best_score_))
    print('Top N Features GB Train Score:' +
          str(gb_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_gb = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': gb_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_gb = feature_imp_sorted_gb.head(top_n_features)['feature']
    print('Sample 10 Features from GB Classifier:')
    print(str(features_top_n_gb[:10]))

    # DecisionTree
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param_grid = {'min_samples_split': [2, 4], 'max_depth': [20]}
    dt_grid = model_selection.GridSearchCV(dt_est,
                                           dt_param_grid,
                                           n_jobs=25,
                                           cv=10,
                                           verbose=1)
    dt_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best DT Params:' + str(dt_grid.best_params_))
    print('Top N Features Best DT Score:' + str(dt_grid.best_score_))
    print('Top N Features DT Train Score:' +
          str(dt_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_dt = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': dt_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_dt = feature_imp_sorted_dt.head(top_n_features)['feature']
    print('Sample 10 Features from DT Classifier:')
    print(str(features_top_n_dt[:10]))

    # XGBClassifier
    XGB_est = XGBClassifier(random_state=0)
    XGB_param_grid = {'n_estimators': [60], 'max_depth': [9]}
    XGB_grid = model_selection.GridSearchCV(XGB_est,
                                            XGB_param_grid,
                                            n_jobs=25,
                                            cv=10,
                                            verbose=1)
    XGB_grid.fit(titanic_train_data_X, titanic_train_data_Y)
    print('Top N Features Best XGB Params:' + str(XGB_grid.best_params_))
    print('Top N Features Best XGB Score:' + str(XGB_grid.best_score_))
    print('Top N Features XGB Train Score:' +
          str(XGB_grid.score(titanic_train_data_X, titanic_train_data_Y)))
    feature_imp_sorted_XGB = pd.DataFrame({
        'feature': list(titanic_train_data_X),
        'importance': XGB_grid.best_estimator_.feature_importances_
    }).sort_values('importance', ascending=False)
    features_top_n_XGB = feature_imp_sorted_XGB.head(top_n_features)['feature']
    print('Sample 10 Features from XGB Classifier:')
    print(str(features_top_n_XGB[:10]))

    # merge the features selected by the 6 models
    features_top_n = pd.concat([
        features_top_n_rf, features_top_n_ada, features_top_n_et,
        features_top_n_gb, features_top_n_dt, features_top_n_XGB
    ], ignore_index=True).drop_duplicates()
    # bug fix: the importance table concatenates feature_imp_sorted_XGB,
    # not the top-n series features_top_n_XGB
    features_importance = pd.concat([
        feature_imp_sorted_rf, feature_imp_sorted_ada, feature_imp_sorted_et,
        feature_imp_sorted_gb, feature_imp_sorted_dt, feature_imp_sorted_XGB
    ], ignore_index=True)

    # features_top_n = pd.concat([features_top_n_dt], ignore_index=True).drop_duplicates()
    # features_importance = pd.concat([feature_imp_sorted_dt], ignore_index=True)
    # features_top_n = FeatureUnion([('randomforest', RandomForestClassifier()), ('AdaBoost', AdaBoostClassifier()), ('ExtraTree', ExtraTreesClassifier()), ('GradientBoosting', GradientBoostingClassifier()), ('DecisionTree', DecisionTreeClassifier()), ('XGBClassifier', XGBClassifier())])
    # features_importance = FeatureUnion([('randomforest', RandomForestClassifier()), ('AdaBoost', AdaBoostClassifier()), ('ExtraTree', ExtraTreesClassifier()), ('GradientBoosting', GradientBoostingClassifier()), ('DecisionTree', DecisionTreeClassifier()), ('XGBClassifier', XGBClassifier())])

    return features_top_n, features_importance
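# Typical usage of the helper above (a sketch; the variable names come from
# the surrounding Titanic pipeline):
feature_top_n, feature_importance = get_top_n_features(
    titanic_train_data_X, titanic_train_data_Y, top_n_features=10)
titanic_train_data_X = pd.DataFrame(titanic_train_data_X[feature_top_n])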
x = frame[features]
y = frame[[Dependent]]

# short split: first 108 rows for training, the rest for testing
X_train = frame[features].iloc[:108, :]
y_train = frame[Dependent].iloc[:108]
X_test = frame[features].iloc[108:, :]
y_test = frame[Dependent].iloc[108:]

sc = StandardScaler()
sc.fit(X_train[features])
X_train_std = sc.transform(X_train[features])
X_test_std = sc.transform(X_test[features])

ml = XGBClassifier(n_estimators=100, min_child_weight=1, max_depth=6, gamma=0)
ml.fit(X_train_std, y_train)
y_pred = ml.predict(X_test_std)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print('accuracy : %.3f' % accuracy)
print('precision : %.3f' % precision)
print('recall : %.3f' % recall)

X_test['LM3DN'] = y_test
X_test['LM3DN_pred'] = y_pred
X_test.to_csv("XGB_Kospi_LM3DN_result.csv", encoding='cp949')
x_test = dict_vec.transform(x_test.to_dict(orient='records'))

# x_train['Sex'] = (x_train['Sex'] == 'male').astype('int')
# x_test['Sex'] = (x_test['Sex'] == 'male').astype('int')
# all_Embarked = x_train['Embarked'].unique().tolist()
# x_train['Embarked'] = x_train['Embarked'].apply(lambda x: all_Embarked.index(x))
# x_test['Embarked'] = x_test['Embarked'].apply(lambda x: all_Embarked.index(x))
'''
3. Model training (random forest, XGBoost), cross-validation,
   learning-curve plotting, and CSV generation
'''
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

rfc = RandomForestClassifier()
xgbc = XGBClassifier()

from sklearn.model_selection import cross_val_score, ShuffleSplit

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=78)
rfc_scores = cross_val_score(rfc, x_train, y_train, cv=cv)
xgbc_scores = cross_val_score(xgbc, x_train, y_train, cv=cv)

# plot learning curves to inspect over-/under-fitting
from learning_curve import plot_learning_curve
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 6))
plt.subplot(121)
plot_learning_curve(xgbc, 'xgbc', x_train, y_train, cv=cv)
plt.subplot(122)
# ABT
abt = pd.concat([df[target], df_numerical_imputed, df_categorical_encoded],
                axis=1)

# Model Training
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    abt[abt.columns.difference(target)],
    abt[target],
    test_size=0.33,
    random_state=27513)

# Build XGBoost classifier with early stopping on the eval set
xgb = XGBClassifier(max_depth=4,
                    subsample=0.9,
                    objective='binary:logistic',
                    n_estimators=100,
                    learning_rate=0.1)
eval_set = [(X_train, y_train), (X_test, y_test)]
xgb.fit(X_train,
        y_train.values.ravel(),
        early_stopping_rounds=10,
        eval_metric=["error", "logloss"],
        eval_set=eval_set,
        verbose=True)

output = open('./experiment_xgboost/xgboost.pickle', 'wb')
joblib.dump(xgb, output)
output.close()
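# Reloading the persisted model later (a sketch; joblib reads back the same
# pickle it wrote above):
with open('./experiment_xgboost/xgboost.pickle', 'rb') as f:
    xgb_loaded = joblib.load(f)
print(xgb_loaded.score(X_test, y_test))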
    # tail of the confusion-matrix plotting helper
    plt.savefig('../img/gradient_boost_confusionmatrix.png')
    return ax


df = pd.read_csv('../data/clean_train.csv')
X = df.drop(['churn', 'Unnamed: 0'], axis=1)
y = df['churn']
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.33)

model = XGBClassifier(booster='gbtree',
                      learning_rate=0.7,
                      max_depth=3,
                      n_estimators=30,
                      nthread=-1,
                      gamma=0.7,
                      max_delta_step=3,
                      min_child_weight=5,
                      subsample=1)
model.fit(X_train, y_train)
pred = model.predict(X_test)

# print(grid_search(X_train, y_train, model))
graph()
# print(cmatrix(y_test, pred))
plot_confusion_matrix(y_test, pred)
print('Precision:', prec(y_test, pred))
print('Recall:', rec(y_test, pred))
print('Accuracy:', acc(y_test, pred))
# Define Variables
###############################################################################
print("Defining variables...")

# DATASETS TO BE GENERATED
DATASET_NAMES = ["5k_95_5_6d", "5k_85_15_6d", "5k_70_30_6d",
                 "10k_95_5_7d", "10k_85_15_7d", "10k_70_30_7d",
                 "15k_95_5_9d", "15k_85_15_9d", "15k_70_30_9d"]

# DATASET PARAMETERS
n = [5000, 5000, 5000, 10000, 10000, 10000, 15000, 15000, 15000]
# n = [300, 300, 300, 400, 400, 400, 500, 500, 500]
d = [6, 6, 6, 7, 7, 7, 9, 9, 9]
w = [[.95, .05], [.85, .15], [.70, .30],
     [.95, .05], [.85, .15], [.70, .30],
     [.95, .05], [.85, .15], [.70, .30]]

# MODELS TO BE FITTED
MODELS = [SVC(), RandomForestClassifier(), XGBClassifier()]
# one fresh (untrained) instance per dataset and model type
TRAINED_MODELS = [[SVC() for _ in range(9)],
                  [RandomForestClassifier() for _ in range(9)],
                  [XGBClassifier() for _ in range(9)]]

# MODEL NAMES
MODEL_NAME = ['SVM', 'RF', 'GB_DT']

# MODEL PARAMETERS
M_PARAMS = [{'kernel': ['linear'],
             'gamma': [0.1, 0.01, 0.001, 1, 1.5, 5, 10]},
            {'max_depth': [3, 5, 6, 7],
             'min_samples_split': [3, 5, 6, 7],
             'n_estimators': [10, 50, 100]},
            {'max_depth': [5, 6, 7, 8],
             'gamma': [0.1, 0.01, 0.001],
             'learning_rate': [0.05, 0.1, 0.2, 0.3]}]

print("Variables successfully defined")

###############################################################################
# Generate and save the various datasets
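# A sketch of the generation loop the section above announces, assuming the
# datasets come from sklearn's make_classification driven by the n/d/w lists:
from sklearn.datasets import make_classification

DATASETS = {}
for name, n_i, d_i, w_i in zip(DATASET_NAMES, n, d, w):
    X_i, y_i = make_classification(n_samples=n_i, n_features=d_i,
                                   weights=w_i, random_state=0)
    DATASETS[name] = (X_i, y_i)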
dataset['Embarked'] = label.fit_transform(dataset['Embarked'])
dataset['Title'] = dataset['Name'].str.split(
    ", ", expand=True)[1].str.split(".", expand=True)[0]
dataset['Title'] = dataset['Title'].map({
    'Mr': 0,
    'Mrs': 1,
    'Miss': 2,
    'Master': 3
})
dataset['Title'] = dataset['Title'].fillna(4)
dataset['Sex'] = label.fit_transform(dataset['Sex'])

features = train_set[[
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'
]]
survival = train_set[['Survived']]
test_X = test_set[[
    'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Title'
]]

# Training Model
XGB_Model = XGBClassifier()
XGB_Model.fit(features, survival.values.ravel())

# Testing Model and Submitting
XGBy_pred = XGB_Model.predict(test_X)  # Predict on the test data
submit = [test_set['PassengerId'], XGBy_pred]
submit = DataFrame(submit, index=['PassengerId', 'Survived']).T
submit = submit.set_index('PassengerId')
submit.to_csv('predictions.csv')
sc = StandardScaler()
sc.fit(food_X)
food_X = sc.transform(food_X)

XGB_cv = []
KNN_cv = []
SVM_cv = []
RFC_cv = []
LR_cv = []

# X_train, X_test, y_train, y_test = train_test_split(food_X, food_y, test_size=0.3, shuffle=False)

# XGBoost
XGB_model = XGBClassifier(min_child_weight=0.1, max_depth=7)
# KNN
KNN_model = KNeighborsClassifier()
# SVM
SVM_model = SVC(kernel='linear', probability=True)
# Random forest
RFC_model = RandomForestClassifier(n_estimators=100, n_jobs=5)
# Logistic regression
LR_model = LogisticRegression()

scores_x = cross_val_score(XGB_model, food_X, food_y, cv=5,
                           scoring='neg_root_mean_squared_error')
scores_k = cross_val_score(KNN_model, food_X, food_y, cv=5,
                           scoring='neg_root_mean_squared_error')
y_pred = classifier.predict(x_test)
y_pred = scPrice.inverse_transform(y_pred)
y_pred = np.squeeze(y_pred)

output = pd.DataFrame({'Id': ids, 'SalePrice': y_pred})
output.to_csv('house_prediction_NN.csv', index=False)

############################ Xgboost ######################################
# note: SalePrice is a continuous target, so XGBRegressor would normally be
# the appropriate estimator; XGBClassifier is kept as in the original
from xgboost import XGBClassifier

classifier = XGBClassifier()
classifier.fit(x_train, y_train.ravel())

y_pred = classifier.predict(x_test)
y_pred = scPrice.inverse_transform(y_pred)
y_pred = np.squeeze(y_pred)

output = pd.DataFrame({'Id': ids, 'SalePrice': y_pred})
output.to_csv('house_prediction_xgboost.csv', index=False)
def Retrain_Model_10_Iterates_SVMSMOTE(target,
                                       title,
                                       max_depth=3,
                                       n_esti=160,
                                       lr=0.1,
                                       withexperience=False,
                                       color='YlGnBu'):
    matrics = []
    seed(2145)
    groups = df_model_draft['HospID']
    if withexperience is False:
        X = df_model_draft.drop(
            ['SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM'], axis=1)
        y = df_model_draft[target]
    else:
        X = df_model_draft.drop([
            'SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM',
            'HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear',
            'HospID_total_cardiac_surgery', 'surgid_total_cardiac_surgery',
            'surgid_total_CABG', 'surgid_Reop_CABG'
        ], axis=1)
        y = df_model_draft[target]

    print(groups.shape)
    print(groups.unique())

    gss = GroupShuffleSplit(n_splits=10, train_size=.8, random_state=42)
    gss.get_n_splits()
    i = 1
    for train_idx, test_idx in gss.split(X, y, groups):
        print("TRAIN:", train_idx, "TEST:", test_idx)
        if i == 1:
            X = X.drop(['HospID'], axis=1)
            print(X.columns.tolist())
        X_train = X.loc[train_idx]
        y_train = y.loc[train_idx]
        X_test = X.loc[test_idx]
        y_test = y.loc[test_idx]
        print("\nTRAIN DATAFRAME\n", X_train.shape)
        print("\nTEST DATAFRAME\n", X_test.shape)

        # oversample the minority class with SVMSMOTE
        sm = SVMSMOTE()  # SVMSMOTE(random_state=21)
        # fit and apply the transform
        X_over, y_over = sm.fit_resample(X_train, y_train)
        # summarize class distribution after oversampling
        print("after oversampling")
        counter = Counter(y_over)
        print(counter)
        estimate = counter[0] / counter[1]
        print('Estimate: %.3f' % estimate)

        model = XGBClassifier(objective='binary:logistic',
                              eval_metric='logloss',
                              max_depth=max_depth,
                              learning_rate=lr,
                              n_estimators=n_esti)
        model.fit(X_over, y_over)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        mats = Make_Confusion_Matrix(cm,
                                     categories=categories,
                                     cmap=color,
                                     title=title,
                                     group_names=labels,
                                     y_pred=y_pred,
                                     y_test=y_test)
        auc = roc_auc_score(y_test, model.predict_proba(X_test.values)[:, 1])
        mats['AUROC'] = auc
        matrics.append(mats)
        i = i + 1
    return matrics
    # tail of the basicparameter dict
    'scale_pos_weight': 1,
    'max_delta_step': 5,
    'n_jobs': 1,
    'random_state': 0,
    'max_depth': 5,
    'min_child_weight': 3,
    'n_estimators': 300,
    'subsample': 1.0,  # 0.9,
    'colsample_bytree': 0.5,
    'reg_lambda': 10,
    'reg_alpha': 0.1,
    'learning_rate': 0.01,
    'gamma': 0.1
}

xgb = XGBClassifier(**basicparameter)
# xgb = XGBClassifier()  # apply the default parameters
xgb.fit(Xtrain, Ytrain)

# score the model
print('============================= XGBoost =============================')
score(xgb, Xtrain, Ytrain, Xtest, Ytest)

print('============================== SHAP ===============================')
explainer = shap.TreeExplainer(xgb)  # define the explainer
shap_values = explainer.shap_values(X)  # use all data for analysis


def gen_data(inputs, X):
    """Creates a DataFrame with inputs and X for statistics with SHAP."""
    df1 = pd.DataFrame()
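# A common next step with the SHAP values computed above (a sketch using
# shap's standard plotting API):
import shap

shap.summary_plot(shap_values, X)  # global feature-impact overview
shap.summary_plot(shap_values, X, plot_type='bar')  # mean |SHAP| per feature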
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was: 0.9080067906471255
exported_pipeline = XGBClassifier(learning_rate=0.1,
                                  max_depth=8,
                                  min_child_weight=12,
                                  n_estimators=100,
                                  nthread=1,
                                  subsample=0.9000000000000001)
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
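# Scoring the exported TPOT pipeline on the held-out split (a sketch; TPOT's
# export stops at predict, so this evaluation step is an addition):
from sklearn.metrics import accuracy_score

print('Hold-out accuracy:', accuracy_score(testing_target, results))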
    # continuation: stacking the base models' test predictions column-wise
    (logistic_Y_test_predic.reshape(-1, 1),
     svc_Y_test_predic.reshape(-1, 1),
     knn_Y_test_predic.reshape(-1, 1),
     gauss_bayes_Y_test_predic.reshape(-1, 1),
     perceptron_Y_test_predic.reshape(-1, 1),
     sgd_Y_test_predic.reshape(-1, 1),
     decision_tree_Y_test_predic.reshape(-1, 1),
     random_forest_Y_test_predic.reshape(-1, 1)),
    axis=1)

# Model 9: XGBoost
from xgboost import XGBClassifier

gbm = XGBClassifier(
    # learning_rate=0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    # gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1).fit(X_train, y_train)
predictions = gbm.predict(X_test)
print("gbm score: " + str(gbm.score(X_train, y_train)))

# Generate Submission File
StackingSubmission = pd.DataFrame({
    'PassengerId': test_PassengerId,
    'Survived': predictions
})
StackingSubmission.to_csv("./data/StackingSubmission.csv", index=False)
c = np.vstack((ID, predictions2)).transpose()
columns = ['id_num', 'is_pass']
index = range(len(predictions2))
obj1 = pd.DataFrame(c, index, columns)

# In[162]:

obj1

# ### XGBoost algorithm

# In[188]:

from xgboost import XGBClassifier

clf = XGBClassifier()
x_train = train_base[predictors]  # training features
y_train = train_base["is_pass"]  # training target
clf.fit(x_train, y_train)  # fit the model
test_predict = clf.predict(test_base[predictors])

# In[189]:

ID = test_base['id_num']
ID = np.array(ID)
c = np.vstack((ID, test_predict)).transpose()
columns = ['id_num', 'is_pass']
index = range(11684)
obj2 = pd.DataFrame(c, index, columns)
def build_model(X, y, cross=5, models=['xgb']):
    """Train and evaluate one or more model types, keeping the best.

    TODO: support more models, with the cross-validation and feature-importance
    code factored out, along the lines of:

        build_model(X, y, cross=5, model)
        if model == 'xgb': ...
        if model == 'logistic': ...
    """
    best_score = 0
    seed = 7
    test_size = 0.30
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_size,
                                                        random_state=seed)
    for model1 in models:
        if model1 == 'xgb':
            print("XGBoost Classifier: \n")
            model = XGBClassifier()
            model.fit(X_train, y_train)
            joblib.dump(model, 'xgb.pkl')
            pred = model.predict(X_test)
            pred = pred.astype(int)
            y_test = y_test.astype(int)
            print("Balanced Accuracy is ",
                  balanced_accuracy_score(y_test, pred) * 100)
            results = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=cross,
                                      scoring='balanced_accuracy')
            print("Cross Validation Balanced Accuracy: %.2f%% (%.2f%%)" %
                  (results.mean() * 100, results.std() * 100))
            acc = results.mean() * 100
            post_proc(X, model)
        if model1 == 'Logistic':
            print("\n Logistic Classifier: \n")
            model = LogisticRegression(solver='liblinear')
            model.fit(X_train, y_train)
            joblib.dump(model, 'logi.pkl')
            pred = model.predict(X_test)
            prob = model.predict_proba(X_test)
            y_test = y_test.astype(int)
            print("Balanced Accuracy is ",
                  balanced_accuracy_score(y_test, pred) * 100)
            results = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=cross,
                                      scoring='balanced_accuracy')
            print("Cross Validation Balanced Accuracy: %.2f%% (%.2f%%)" %
                  (results.mean() * 100, results.std() * 100))
            acc = results.mean() * 100
            cm = confusion_matrix(y_test, pred)
            fig, ax = plt.subplots(figsize=(8, 8))
            ax.imshow(cm)
            ax.grid(False)
            ax.xaxis.set(ticks=(0, 1),
                         ticklabels=('Predicted 0s', 'Predicted 1s'))
            ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
            ax.set_ylim(1.5, -0.5)
            for i in range(2):
                for j in range(2):
                    ax.text(j, i, cm[i, j], ha='center', va='center',
                            color='red')
            plt.show()
            Logi = pickle.dumps(model)
        if model1 == 'auto':
            print("\n Auto: \n")
            tpot = TPOTClassifier(verbosity=2,
                                  max_time_mins=2,
                                  scoring='balanced_accuracy')
            tpot.fit(X_train, y_train)
            print(tpot.score(X_test, y_test))
        if model1 == 'SVM':
            print("\n SVM: \n")
            model = svm.NuSVC(gamma='auto')
            model.fit(X_train, y_train)
            joblib.dump(model, 'svm.pkl')
            pred = model.predict(X_test)
            pred = pred.astype(int)
            y_test = y_test.astype(int)
            print("Balanced Accuracy is ",
                  balanced_accuracy_score(y_test, pred) * 100)
            results = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=cross,
                                      scoring='balanced_accuracy')
            print("Cross Validation Balanced Accuracy: %.2f%% (%.2f%%)" %
                  (results.mean() * 100, results.std() * 100))
            acc = results.mean() * 100
        if model1 == 'RandomForest':
            print("\n Random Forest: \n")
            model = RandomForestClassifier()
            model.fit(X_train, y_train)
            joblib.dump(model, 'rf.pkl')
            pred = model.predict(X_test)
            pred = pred.astype(int)
            y_test = y_test.astype(int)
            print("Balanced Accuracy is ",
                  balanced_accuracy_score(y_test, pred) * 100)
            results = cross_val_score(model,
                                      X_train,
                                      y_train,
                                      cv=cross,
                                      scoring='balanced_accuracy')
            print("Cross Validation Balanced Accuracy: %.2f%% (%.2f%%)" %
                  (results.mean() * 100, results.std() * 100))
            acc = results.mean() * 100
        if acc > best_score:
            best_score = acc
            model2 = model
            joblib.dump(model2, 'best.pkl')
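# post_proc is called above but not defined in this snippet; a minimal sketch,
# assuming X is a DataFrame and the helper just reports feature importances:
def post_proc(X, model):
    if hasattr(model, 'feature_importances_'):
        for name, imp in zip(X.columns, model.feature_importances_):
            print('%s: %.4f' % (name, imp))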
class OOF(object):
    """Out-of-fold prediction.

    # TODO: support regression

    LightGBM grows trees leaf-wise (one node at a time), while XGBoost grows
    them level by level. https://blog.csdn.net/friyal/article/details/82758532
    CatBoost always uses oblivious (symmetric) trees; it argues this helps
    avoid overfitting, improves robustness and greatly speeds up prediction.
    For categorical features it computes category frequencies plus a prior to
    generate new numerical features.
    # https://blog.csdn.net/linxid/article/details/80723811
    """
    _params = {
        'metric': 'auc',
        'learning_rate': 0.01,
        'n_estimators': 30000,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'class_weight': 'balanced',  ##
        'scale_pos_weight': 1,  ##
        'random_state': 2019,
        'verbosity': -1
    }
    lgb = LGBMClassifier(n_jobs=16, **_params)  # TODO: move common models into a separate module
    xgb = XGBClassifier()
    cat = CatBoostClassifier(n_estimators=20000,
                             learning_rate=0.05,
                             loss_function='Logloss',
                             eval_metric='AUC',
                             random_state=2019)

    def __init__(self,
                 estimator=None,
                 folds=None,
                 early_stopping_rounds=300,
                 verbose=100):
        # note: lgb uses `metric`, xgb uses `eval_metric`
        self.estimator = self.lgb if estimator is None else estimator
        # RepeatedStratifiedKFold is supported as well
        self.folds = folds if folds else StratifiedKFold(5, True, 2019)
        self.model_type = self.estimator.__repr__()
        self.early_stopping_rounds = early_stopping_rounds
        self.verbose = verbose
        # self.estimator_agrs = self.getfullargspec(self.estimator.fit).args if hasattr(self.estimator, 'fit') else None

    def fit(self,
            X,
            y,
            X_test,
            feval=None,
            cat_feats=None,
            exclude_columns=None,
            epochs=16,
            batch_size=128,
            oof2csv=False,
            plot=False):
        """
        # TODO: rank-average blending
        :param X: indices must be unique
        :param y:
        :param X_test:
        :param feval: e.g. roc_auc_score(y_true, y_score)
        :param cat_feats: indices of categorical features
        :param exclude_columns: only applies to NNs
        :param epochs:
        :param batch_size:
        :return:
        """
        # coerce the inputs into DataFrames
        if isinstance(y, pd.Series):
            y.reset_index(drop=True, inplace=True)

        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
            X_test = pd.DataFrame(X_test)  # bug fix: was pd.DataFrame(X)
        else:
            X.reset_index(drop=True, inplace=True)
            X_test.reset_index(drop=True, inplace=True)

        # OOF evaluation function
        feval = feval if feval else roc_auc_score

        # drop unneeded features
        if exclude_columns:
            feats = X.columns.difference(exclude_columns)
            X, X_test = X[feats], X_test[feats]

        # score name
        if hasattr(feval, '__repr__'):
            score_name = feval.__repr__().split()[1]
        else:
            score_name = None

        # number of CV folds
        if hasattr(self.folds, 'n_splits'):
            num_cv = self.folds.n_splits
        else:
            num_cv = self.folds.cvargs['n_splits'] * self.folds.n_repeats

        # Cross validation model
        # Create arrays and dataframes to store results
        oof_preds = np.zeros(X.shape[0])
        sub_preds = np.zeros((X_test.shape[0], num_cv))
        self.feature_importance_df = pd.DataFrame()

        for n_fold, (train_idx, valid_idx) in enumerate(
                self.folds.split(X, y), 1):
            print("\n\033[94mFold %s started at %s\033[0m" %
                  (n_fold, time.ctime()))
            X_train, y_train = X.iloc[train_idx], y[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y[valid_idx]

            if not hasattr(self.estimator, 'fit'):
                print("The estimator has no fit method")
                break
            else:
                if 'LGBMClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        categorical_feature=cat_feats if cat_feats else 'auto',
                        eval_metric='auc',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'LGBMRegressor' in self.model_type:
                    # reg_objs = ['regression_l1', 'regression_l2', 'huber', 'fair', 'poisson', 'quantile', 'mape', 'gamma', 'tweedie']
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        categorical_feature=cat_feats if cat_feats else 'auto',
                        # eval_metric='l2',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'XGBClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        eval_metric='auc',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'XGBRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        # eval_metric='rmse',
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'CatBoostClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        cat_features=cat_feats,
                        use_best_model=True,
                        plot=True,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'CatBoostRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(
                        X_train,
                        y_train,
                        eval_set=eval_set,
                        cat_features=cat_feats,
                        use_best_model=True,
                        plot=True,
                        early_stopping_rounds=self.early_stopping_rounds,
                        verbose=self.verbose)
                elif 'RGFClassifier' in self.model_type:
                    pass
                elif 'RGFRegressor' in self.model_type:
                    pass
                # https://www.cnblogs.com/flyu6/p/7691106.html
                elif 'KerasClassifier' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(X_train,
                                       y_train,
                                       epochs=epochs,
                                       batch_size=batch_size,
                                       validation_data=eval_set)
                elif 'KerasRegressor' in self.model_type:
                    eval_set = [(X_train, y_train), (X_valid, y_valid)]
                    self.estimator.fit(X_train,
                                       y_train,
                                       epochs=epochs,
                                       batch_size=batch_size,
                                       validation_data=eval_set)
                elif self.model_type == 'GLM':
                    # TODO: support for other model types
                    self.estimator = GLM(y_train,
                                         X_train,
                                         family=families.Binomial())
                    self.estimator = self.estimator.fit().predict(X)
                else:
                    # plain sklearn estimator
                    print('Sklearn Fitting ...')
                    self.estimator.fit(X_train, y_train)

                # compute and store preds
                # TODO: needs changes for multi-class problems
                if hasattr(self.estimator, 'predict_proba'):
                    oof_preds[valid_idx] = self.estimator.predict_proba(
                        X_valid)[:, 1]
                    sub_preds[:, n_fold - 1] = self.estimator.predict_proba(
                        X_test)[:, 1]
                else:
                    oof_preds[valid_idx] = self.estimator.predict(X_valid)
                    sub_preds[:, n_fold - 1] = self.estimator.predict(X_test)

                if plot and hasattr(self.estimator, 'feature_importances_'):
                    fold_importance_df = pd.DataFrame()
                    fold_importance_df["feature"] = X.columns
                    fold_importance_df["importance"] = \
                        self.estimator.feature_importances_
                    fold_importance_df["fold"] = n_fold
                    self.feature_importance_df = fold_importance_df.append(
                        self.feature_importance_df)

        # collect the outputs
        self.oof_preds = oof_preds
        self.sub_preds = sub_preds.mean(1)
        self.sub_preds_rank = pd.DataFrame(sub_preds).rank().mean(1) / \
            sub_preds.shape[0]

        # score the OOF predictions (AUC by default)
        try:
            self.score = feval(y, self.oof_preds)
        except Exception as e:
            self.score = 0
            print('Error feval:', e)

        print("\n\033[94mCV Score %s: %s ended at %s\033[0m" %
              (score_name, self.score, time.ctime()))

        # persist the plain-averaged predictions and their score
        if oof2csv:
            pd.Series(np.append(self.oof_preds, self.sub_preds), name='oof') \
                .to_csv('OOF %s %.4f.csv' % (time.ctime(), self.score),
                        index=False)

        # optionally plot the feature importances
        if plot:
            self.feature_importance_df.sort_values(['fold', 'importance'],
                                                   0,
                                                   False,
                                                   inplace=True)
            self.plot_importances(self.feature_importance_df, len(X.columns))

    def plot_importances(self, df, topk=64):
        """Display/plot feature importance."""
        assert "feature" in df.columns and "importance" in df.columns, \
            'missing ["feature", "importance"] columns'
        data = (df[["feature", "importance"]]
                .groupby("feature").mean().reset_index()
                .sort_values("importance", 0, False))[:topk]
        self.feature_importance_df_agg = data
        plt.figure(figsize=(12, topk // 4))
        sns.barplot(x="importance",
                    y="feature",
                    data=data.assign(feature='col_' +
                                     data.feature.astype(str)))
        plt.title('Features (avg over folds)')
        plt.tight_layout()
        plt.savefig('importances.png')
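# Typical usage of the OOF helper above (a sketch; X/y/X_test are assumed to
# be a binary-classification training set and test features):
oof = OOF(estimator=XGBClassifier(), early_stopping_rounds=100)
oof.fit(X, y, X_test, plot=False)
test_probs = oof.sub_preds  # fold-averaged test predictions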
# =====================================================================================================================
# define GridSearchCV parameters
cv_params = {'max_depth': [3, 5, 7], 'min_child_weight': [1, 3, 5]}
ind_params = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'seed': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic'
}

# define model
model = GridSearchCV(
    XGBClassifier(**ind_params),
    cv_params,
    scoring='accuracy',  # evaluation criterion: accuracy
    cv=5,  # 5-fold cross-validation
    n_jobs=-1)  # parallel jobs: an int count; -1 matches the CPU core count, 1 is the default

# =====================================================================================================================


def XGB_TRAIN_EVA():
    # prepare train data
    train_data = pd.read_csv(TRAIN_DATA_PATH)
    train_data.pop('index')
    Y_train = train_data.pop('income')
    X_train = train_data
from sklearn.model_selection import cross_validate

scoring = {'accuracy': 'accuracy',
           'log_loss': 'neg_log_loss',
           'auc': 'roc_auc'}

results = cross_validate(knnclassifier,
                         X_train,
                         y_train,
                         cv=10,
                         scoring=list(scoring.values()),
                         return_train_score=False)

print('K-fold cross-validation results:')
for name, scorer in scoring.items():
    scores = results['test_%s' % scorer]
    # neg_log_loss is reported negated, so flip its sign for display
    mean = -scores.mean() if scorer == 'neg_log_loss' else scores.mean()
    print("%s average %s: %.3f (+/-%.3f)" %
          (knnclassifier.__class__.__name__, name, mean, scores.std()))

# Fitting XGBoost to the Training set
from xgboost import XGBClassifier

xgclassifier = XGBClassifier()
xgclassifier.fit(X_train, y_train)

# Predicting the Test set results
xg_pred = xgclassifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, xg_pred)
print(cm)
print("Model Accuracy for XGBoost:", metrics.accuracy_score(y_test, xg_pred))
print(classification_report(y_test, xg_pred))

"""Feature importance and weight determination"""
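# A sketch for the feature-importance step announced above, using xgboost's
# built-in plotting helper on the fitted classifier:
from xgboost import plot_importance
import matplotlib.pyplot as plt

plot_importance(xgclassifier)  # F-score (split count) per feature
plt.show()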
                     # tail of the GridSearchCV construction
                     scoring='f1',
                     return_train_score=True)
gs_rf.fit(X_train, y_train)
'''
Best Estimator
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=10, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=5,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
best_f1 = .39
'''

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

xgb = XGBClassifier()
cv_xg = cross_validate(xgb, X_train, y_train, scoring=['accuracy', 'f1'])
'''
'test_accuracy': array([0.6727133 , 0.67297048, 0.67420049]),
'test_f1': array([0.15816528, 0.20663931, 0.2000755 ])}
'''

svc = SVC()
cv_svc = cross_validate(svc, X_train, y_train, scoring=['accuracy', 'f1'])
'''
'test_accuracy': array([0.667794 , 0.66774293, 0.66774293]),
'test_f1': array([0., 0., 0.])}
'''
# Instantiate the classifiers
n_estimators = config["threads"]
n_jobs = int(n_estimators / 2 + 1)

svm_classifier = svm.SVC(kernel='poly',
                         degree=3,
                         gamma='scale',
                         verbose=True,
                         max_iter=1000,
                         cache_size=5000,
                         random_state=now,
                         probability=True)
xgb_classifier = XGBClassifier(n_estimators=100,
                               verbosity=2,
                               nthread=config["threads"],
                               max_depth=4,
                               subsample=0.5)
rf_classifier = RandomForestClassifier(n_estimators=100,
                                       verbose=2,
                                       n_jobs=config["threads"],
                                       random_state=now)
mlp_classifier = MLPClassifier(hidden_layer_sizes=(50, 25),
                               max_iter=1000,
                               n_iter_no_change=50,
                               activation='relu',
                               solver='adam',
                               random_state=now,
                               verbose=True)
lda_classifier = LDA(solver='svd')
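# A sketch of fitting and comparing the classifiers instantiated above
# (X_train/X_test/y_train/y_test are assumed to exist in this script):
classifiers = {'svm': svm_classifier,
               'xgb': xgb_classifier,
               'rf': rf_classifier,
               'mlp': mlp_classifier,
               'lda': lda_classifier}
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    print(name, 'test accuracy:', clf.score(X_test, y_test))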