parser.add_argument('-n', '--n_rounds', help='Number of Boost iterations', type=int, default=2000)
parser.add_argument('-e', '--eta', help='Learning rate', type=float, default=0.01)
parser.add_argument('-r', '--r_seed', help='Set random seed', type=int, default=3)
parser.add_argument('-b', '--minbin', help='Minimum categorical bin size', type=int, default=1)
parser.add_argument('-ct', '--cat_trans', help='Category transformation method', type=str, default='std')
parser.add_argument('-cv', '--cv', action='store_true')
parser.add_argument('-codetest', '--codetest', action='store_true')
parser.add_argument('-getcached', '--getcached', action='store_true')
parser.add_argument('-extra', '--extra', action='store_true')
m_params = vars(parser.parse_args())

# Load data
X, y, X_sub, ids = data.load(m_params)

print("BNP Paribas: AdaBoost...\n")
clf = AdaBoostClassifier(n_estimators=30, learning_rate=0.001, algorithm='SAMME', random_state=1)

if m_params['cv']:
    # do cross-validation scoring (old sklearn.cross_validation KFold API)
    kf = KFold(X.shape[0], n_folds=4, shuffle=True, random_state=1)
    scr = np.zeros([len(kf)])
    oob_pred = np.zeros(X.shape[0])
    for i, (tr_ix, val_ix) in enumerate(kf):
        clf.fit(X[tr_ix], y[tr_ix])
        pred = clf.predict_proba(X[val_ix])
        oob_pred[val_ix] = np.array(pred[:, 1])
        scr[i] = log_loss(y[val_ix], np.array(pred[:, 1]))
        print('Train score is:', scr[i])
    print(log_loss(y, oob_pred))
    print(oob_pred[1:10])
X_spatial_norm = norm_scaller.fit_transform(X_spatial, Y)

# combine features
X_norm = np.concatenate([X_norm, X_spatial_norm], -1)

# save features to csv
np.savetxt("features.csv", X_norm, delimiter=",")
np.savetxt("labels.csv", Y, delimiter=",")

# find the optimal values for the AdaBoost and decision tree parameters
parameters = {'n_estimators': [15, 25, 50, 75],
              "learning_rate": [0.5, 0.75],
              "base_estimator__max_depth": [1, 3, 5, 7],
              "base_estimator__max_features": [.5],
              "base_estimator__max_leaf_nodes": [3, 5, 7]}

# base estimator for AdaBoost
base_estimator = DecisionTreeClassifier(criterion="entropy", class_weight="balanced", random_state=0)
base_model = AdaBoostClassifier(base_estimator=base_estimator, random_state=0)
clf = GridSearchCV(base_model, parameters)
clf.fit(X_norm, Y)
best_params = clf.best_params_

acc = []
roc_auc_values = []
mcc = []

# perform k-fold cross validation
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
fold = 0
for train_index, test_index in kf.split(X_norm, Y):
    X_train = X_norm[train_index]
    X_test = X_norm[test_index]
# use transform (not fit_transform) so the scaler fitted on the training data is reused
y_logistic_regression = logistic_regression.predict(standard_scale.transform(x_test))
print(accuracy_score(y_test, y_logistic_regression))
print(precision_score(y_test, y_logistic_regression))
print(recall_score(y_test, y_logistic_regression))
print(f1_score(y_test, y_logistic_regression))

# # Adaboost

# In[40]:

adaboost_classifier = AdaBoostClassifier(
    RandomForestClassifier(),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5
)
adaboost_classifier.fit(x_train_sm, y_train_sm)

# In[41]:

# AdaBoost metrics
y_train_pred_adaboost = cross_val_predict(adaboost_classifier, x_train_sm, y_train_sm, cv=5)

# In[42]:
    if data[i, -1] == 0:
        data[i, -1] = -1

train_data = data[0:5000, :]
test_data = data[5000:, :]
train_x = train_data[:, 0:-1]
train_y = train_data[:, -1]
test_x = test_data[:, 0:-1]
test_y = test_data[:, -1]

# train_index_list=[]
# test_index_list=[]
# kf=KFold(n_splits=5,shuffle=False)
# for train_index,test_index in kf.split(train_x):
#     train_index_list.append(train_index)
#     test_index_list.append(test_index)
# # print(test_index_list[3])
# # train_data=train_x[test_index_list[3],:]

# uniform sample weights (same as the default behaviour)
w = np.ones(train_y.shape)
weight = w / train_y.shape[0]
estimator = AdaBoostClassifier()
estimator.fit(train_x, train_y, sample_weight=weight)
predict_y = estimator.predict(test_x)
score = estimator.score(test_x, test_y)
print(score)
encoded_test_data[categorical_variables] = encoded_test_data[
    categorical_variables].apply(lambda x: d[x.name].transform(x))

independent_variables = [
    x for x in train_data.columns
    if x not in ['victim_id', 'datetime', 'criticality']
]

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


def scorer(estimator, X, y):
    # note: AUC is computed here on hard class predictions;
    # predict_proba would give a smoother ranking-based estimate
    y1 = np.array(estimator.predict(X))
    score = roc_auc_score(y, y1)
    return score


from sklearn.ensemble import AdaBoostClassifier

adam = AdaBoostClassifier(learning_rate=2, n_estimators=48, random_state=0)
adam.fit(encoded_train_data[independent_variables],
         encoded_train_data['criticality'])
test_predictions = adam.predict(encoded_test_data[independent_variables])

victim_id = test_data['victim_id']
submission = pd.DataFrame({
    'victim_id': victim_id,
    'criticality': test_predictions
})
submission.to_csv('dataquest_submission4.csv', index=False)
pipe1 = Pipeline([('pca', PCA()), ('classifier', GaussianNB())])
param = {'pca__n_components': [4, 5, 6]}
gsv = GridSearchCV(pipe1, param_grid=param, n_jobs=2, scoring='f1', cv=2)
gsv.fit(features_train, labels_train)
clf = gsv.best_estimator_
print("GaussianNB with PCA fitting time: %rs" % round(time() - t0, 3))

pred = clf.predict(features_test)
t0 = time()
test_classifier(clf, my_dataset, financial_features, folds=1000)
print("GaussianNB evaluation time: %rs" % round(time() - t0, 3))

'''
AdaBoost tuned for comparison with the final algorithm
'''
from sklearn.tree import DecisionTreeClassifier

abc = AdaBoostClassifier(random_state=40)
data = featureFormat(my_dataset, financial_features, sort_keys=True)
labels, features = targetFeatureSplit(data)

# candidate base estimators: decision trees of depth 1 through 6
dt = []
for i in range(6):
    dt.append(DecisionTreeClassifier(max_depth=(i + 1)))

ab_params = {'base_estimator': dt, 'n_estimators': [60, 45, 101, 10]}
t0 = time()
abt = GridSearchCV(
    abc,
    ab_params,
    scoring='f1',
)
abt = abt.fit(features_train, labels_train)
print("AdaBoost fitting time: %rs" % round(time() - t0, 3))
abc = abt.best_estimator_
n_classes = 3
n_estimators = 30
cmap = plt.cm.RdYlBu
plot_step = 0.02  # fine step width for decision surface contours
plot_step_coarser = 0.5  # step widths for coarse classifier guesses
RANDOM_SEED = 13  # fix the seed on each iteration

# Load data
iris = load_iris()

plot_idx = 1

models = [DecisionTreeClassifier(max_depth=None),
          RandomForestClassifier(n_estimators=n_estimators),
          ExtraTreesClassifier(n_estimators=n_estimators),
          AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                             n_estimators=n_estimators)]

for pair in ([0, 1], [0, 2], [2, 3]):
    for model in models:
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target

        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # Standardize
num_folds = 10
seed = 7
scoring = 'accuracy'
validation_size = 0.20
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Spot-check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('ADA', AdaBoostClassifier()))
models.append(('GBC', GradientBoostingClassifier()))
models.append(('RFC', RandomForestClassifier()))
models.append(('ETC', ExtraTreesClassifier()))
models.append(('SVM', SVC()))

results = []
names = []
for name, model in models:
    # shuffle=True is required when a random_state is set in recent scikit-learn
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    rf_train_acc, rf_test_acc))
print("Precision score: ", precision_score(Y_test, predictions))
print("Recall score: ", recall_score(Y_test, predictions))
print("F1 score : ", rf_f1_score)
confusion_matrix(Y_test, predictions)

"""Model#3: Ada Boost Classifier"""

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

tree = DecisionTreeClassifier(random_state=11,
                              max_features="auto",
                              class_weight="balanced",
                              max_depth=None)
model_ada = AdaBoostClassifier(base_estimator=tree)
model_ada = model_ada.fit(sequences_matrix, Y_train)
predictions = model_ada.predict(test_sequences_matrix)

ada_train_acc = accuracy_score(Y_train, model_ada.predict(sequences_matrix))
ada_test_acc = accuracy_score(Y_test, predictions)
ada_f1_score = f1_score(Y_test, predictions)  # (y_true, y_pred) order

print("Accuracy score: \n a) Train : {}\n b) Test : {}".format(
    ada_train_acc, ada_test_acc))
print("Precision score: ", precision_score(Y_test, predictions))
print("Recall score: ", recall_score(Y_test, predictions))
print("F1 score : ", ada_f1_score)
confusion_matrix(Y_test, predictions)

"""Model#4: Recurrent Neural Networks"""
def learn(fname):
    # note: in this function X holds the labels and Y the feature matrix
    data = pd.read_csv(fname, encoding='utf-8')
    # shuffle data
    data = data.sample(frac=1).reset_index(drop=True)
    for idx, row in data.iterrows():
        if row["content"] is np.nan:
            data.drop(idx, inplace=True)
        elif isinstance(row["content"], str):
            data.at[idx, "content"] = clean_content(row["content"])  # set_value is deprecated

    X = data["label"].to_numpy()  # as_matrix is deprecated
    del data["label"]
    Y = data.to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(Y, X, test_size=0.20, random_state=42)

    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X_train[:, 4])
    print(X_train_counts.shape)
    X_test_counts = count_vect.transform(X_test[:, 4])
    print(X_test_counts.shape)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
    print(X_train_tfidf.shape)
    print(X_test_tfidf.shape)

    # add punctuation features
    mnb_clf = MultinomialNB().fit(X_train_tfidf, y_train)
    ada_clf = AdaBoostClassifier(n_estimators=100).fit(X_train_tfidf, y_train)
    text_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                             max_iter=5, random_state=42)  # n_iter was renamed to max_iter
    svm_clf = text_clf.fit(X_train_tfidf, y_train)

    joblib.dump(tfidf_transformer, 'tfidf_transformer.pkl')
    joblib.dump(count_vect, 'count_vect.pkl')
    joblib.dump(mnb_clf, 'mnb_clf.pkl')
    joblib.dump(svm_clf, 'svm_clf.pkl')
    joblib.dump(ada_clf, 'ada_clf.pkl')

    predicted = mnb_clf.predict(X_test_tfidf)
    ada_predictions = ada_clf.predict(X_test_tfidf)
    svm_predictions = svm_clf.predict(X_test_tfidf)

    ada_score = np.mean(ada_predictions == y_test)
    mnb_score = np.mean(predicted == y_test)
    svm_score = np.mean(svm_predictions == y_test)

    print("MNB: ", mnb_score)
    print("ADA: ", ada_score)
    print("SVM: ", svm_score)

    sketchy_score = (mnb_score + ada_score + svm_score) / 3.0
    print("Sketchy score: ", sketchy_score)
Mem_Ext = memory_usage_psutil()
print("Extra Trees Memory usage: ", Mem_Ext)
Cpu_Ext = psutil.cpu_percent()
print("Extra Trees Cpu Percent: ", Cpu_Ext)

# In[16]:

# Adaboost
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
AdaB_model = AdaBoostClassifier(
    RandomForestClassifier(n_estimators=100,
                           n_jobs=-1,
                           criterion='gini',
                           class_weight='balanced'))
AdaB_model = AdaB_model.fit(Train_SVD, Y_train)
pred_Adab = AdaB_model.predict(Test_SVD)

Acc_Adab = accuracy_score(Y_test, pred_Adab)
print("Adaboost accuracy =", Acc_Adab)
F1_Adab = f1_score(Y_test, pred_Adab, average='micro')
print("Adaboost F-1 score(micro) = ", F1_Adab)
F1W_Adab = f1_score(Y_test, pred_Adab, average='weighted')
print("Adaboost F-1 score(weighted) = ", F1W_Adab)
Time_Adab = time.perf_counter() - start_time
X = Z
try:
    os.remove('.'.join(args.file) + '.' + args.model + '.predict.xml')
except OSError:
    pass

dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)
svr = SVR(kernel='poly', C=1e3, degree=2)
reg = linear_model.Lasso(alpha=0.1)
rf = RandomForestClassifier(max_depth=2, random_state=0)
ab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                        n_estimators=600,
                        learning_rate=1.5,
                        algorithm="SAMME")
gnb = MultinomialNB()  # GaussianNB()
lrg = LogisticRegression()

model = dt
if args.model == 'lrg':
    model = lrg
elif args.model == 'svr':
    model = svr
elif args.model == 'rf':
    model = rf
elif args.model == 'av':
    model = ab  # was `model == ab`, a comparison with no effect
elif args.model == 'gnb':
    model = gnb

if args.semi == 'svr' and len(args.file) == 2:
    # one labeled and one unlabeled
from sklearn.model_selection import KFold
from sklearn.ensemble import AdaBoostClassifier

cv = 5
kf = KFold(n_splits=cv, shuffle=True)

# per-fold score accumulators
prec_sum4 = rec_sum4 = f14_sum = sum_acc_train4 = sum_acc_test4 = 0

for train_index, test_index in kf.split(feature, target):
    X_train, X_test = feature[train_index], feature[test_index]
    y_train, y_test = target[train_index], target[test_index]

    clf4 = AdaBoostClassifier()
    clf4.fit(X_train, y_train)
    y_train_pred4 = clf4.predict(X_train)
    y_pred4 = clf4.predict(X_test)

    prec4, rec4, f14, acc_train4, acc_test4 = getScore(y_test, y_pred4, y_train_pred4)
    prec_sum4 = prec_sum4 + prec4
    rec_sum4 = rec_sum4 + rec4
    f14_sum = f14_sum + f14
    sum_acc_train4 = sum_acc_train4 + acc_train4
    sum_acc_test4 = sum_acc_test4 + acc_test4
"Standard Scaler", "Normal Scaler", "MinMaxScaler", "MaxAbsScaler", "Kernel Centerer" ] for preprocess, name in zip(preprocessors, preprocessors_type): print "-------------------------------------\n" print "For Preprocessor : ", preprocess print "--------------------------------------\n" data = preprocess.fit_transform(forestFrame.values) train_data, test_data, train_labels, test_labels = cross_validation.train_test_split( data, target_labels.values, test_size=0.3) rf = RandomForestClassifier(n_estimators=101) ada = AdaBoostClassifier(n_estimators=101) bagging = BaggingClassifier(n_estimators=101) gradBoost = GradientBoostingClassifier(n_estimators=101) classifiers = [rf, ada, bagging, gradBoost] classifier_names = [ "Random Forests", "Adaboost", "Bagging", "Gradient Boost" ] for classifier, classifier_name in zip(classifiers, classifier_names): classifier.fit(train_data, train_labels) predicted_labels = classifier.predict(test_data) print "----------------------------------\n" print "Accuracy for ", classifier_name, " : ", metrics.accuracy_score( test_labels, predicted_labels)
y_test = y_test[:, 0].reshape(-1)

print("BAGGING")
from sklearn.ensemble import BaggingClassifier
bg = BaggingClassifier(clf, max_samples=0.5, max_features=1.0, n_estimators=20)
bg.fit(X_train, y_train)
y_pred = bg.predict(X_test)
polat = accuracy_score(y_test, y_pred)
print("Accuracy:", polat)

# ADABOOST
# cast features and labels to integers
X_train = X_train.astype("int")
X_test = X_test.astype("int")
y_train = y_train.astype("int")
y_test = y_test.astype("int")

print("ADABOOST")
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         n_estimators=10,
                         learning_rate=0.01)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
memati = accuracy_score(y_test, y_pred)
print("Accuracy:", memati)

'''# Create Decision Tree classifier object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifier
clf = clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
# initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()

# your code here!  name your classifier object clf if you want the
# visualization code (prettyPicture) to show you the decision boundary
clf = AdaBoostClassifier(n_estimators=50)
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
acc = accuracy_score(pred, labels_test)
print(clf.score(features_test, labels_test))
print(f'Accuracy: {acc}')

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
# hf.lasso_selection(features, labels, features_list)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

random_seed = 1303
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=random_seed)

cv = StratifiedShuffleSplit(labels_train, n_iter=20, test_size=0.5, random_state=random_seed)

ada = AdaBoostClassifier(random_state=random_seed)
selector = RFE(ada, step=1)

pipe_ada = Pipeline(steps=[('RFE', selector), ('ada', ada)])
params_ada_gs = {
    "RFE__n_features_to_select": np.arange(11, 15, 2),
    "ada__learning_rate": np.arange(0.3, 0.7, 0.2),
    "ada__n_estimators": [50, 100]
}

# pipe_ada = Pipeline(steps=[('RFE', selector), ('ada', ada)])
# params_ada_gs = {"RFE__n_features_to_select": [15],
#                  "ada__learning_rate": [0.5],
#                  "ada__n_estimators": [50]
#                  }

gs = GridSearchCV(pipe_ada, params_ada_gs, scoring='f1', cv=cv)
#%%
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np

#%%
data = pd.read_csv('spambase.data').to_numpy()  # as_matrix() is deprecated
np.random.shuffle(data)

X = data[:, :48]
y = data[:, -1]

#%%
# hold out the last 100 rows as a test set
Xtrain = X[:-100, ]
ytrain = y[:-100, ]
Xtest = X[-100:, ]
ytest = y[-100:, ]

#%%
model = MultinomialNB()
model.fit(Xtrain, ytrain)
print("\nAccuracy for NB: ", model.score(Xtest, ytest))

#%%
from sklearn.ensemble import AdaBoostClassifier

#%%
model = AdaBoostClassifier()
model.fit(Xtrain, ytrain)
print("Accuracy for Adaboost is: ", model.score(Xtest, ytest))
# shuffle=True is required when a random_state is set in recent scikit-learn
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
cart = DecisionTreeClassifier()
num_trees = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=7)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# AdaBoost Classification
from sklearn.ensemble import AdaBoostClassifier

seed = 7
num_trees = 70
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Voting Ensemble for Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# create the sub models
estimators = []
model1 = LogisticRegression(solver='lbfgs', max_iter=10000)
names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
    "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
    "Naive Bayes", "QDA"
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
from sklearn.feature_selection import SelectKBest, chi2
from imblearn.pipeline import Pipeline
from customer_review_API_lib import run_test_predictions

DIRECTORY = 'C:/Users/bergj/Documents/Geroge Mason/Courses/2019-Spring/GMU- CS 584/FinalProject/data/'

run_test_predictions(
    toys_file='{}{}'.format(DIRECTORY, 'amazon_reviews_us_Apparel_v1_00.tsv'),
    apparel_file='{}{}'.format(DIRECTORY, 'amazon_reviews_us_Toys_v1_00.tsv'),
    min_words=12,
    n_reviews=200000,
    pipeline=Pipeline([('tfidf', TfidfVectorizer(norm='l2',
                                                 max_df=0.6,
                                                 min_df=75,
                                                 ngram_range=(1, 1),
                                                 stop_words='english')),
                       ('feature-extract', SelectKBest(chi2, k=20)),
                       ('clf', AdaBoostClassifier(
                           base_estimator=LogisticRegression(C=0.1,
                                                             class_weight='balanced',
                                                             solver='liblinear'),
                           n_estimators=10,
                           learning_rate=5))]),
    gridsearch_args=dict(param_grid={
        'clf__n_estimators': [2, 5, 10, 15],
        'clf__learning_rate': [0.1, 1, 5]
    },
                         scoring='f1'))
clf_mlp.fit(X_train, y_train)
end_time = time.time() - start_time
print end_time

print "Evaluation time"
start_time = time.time()
predictions = clf_mlp.predict(X_test)
end_time = time.time() - start_time
print end_time

print(classification_report(y_test, predictions))
joblib.dump(
    clf_mlp, 'datasetB_results/' + 'MLP_logistic_sgd_' + method + '-' + size +
    '.joblib.pkl')

from sklearn.ensemble import AdaBoostClassifier

print "AdaBoostClassifier"
clf_ada = AdaBoostClassifier()

print "Training classifier"
start_time = time.time()
clf_ada.fit(X_train, y_train)
end_time = time.time() - start_time
print end_time

print "Evaluation time"
start_time = time.time()
predictions = clf_ada.predict(X_test)
end_time = time.time() - start_time
print end_time

print(classification_report(y_test, predictions))
joblib.dump(
    clf_ada, 'datasetB_results/' + 'AdaBoostClassifier_' + method + '-' + size +
    '.joblib.pkl')
})

# appending our result table
result_tabulation = result_tabulation.append(Bagging_Meta_estimator,
                                             ignore_index=True)

# view the result table
result_tabulation

# In[183]:

# Adaboost
from sklearn.ensemble import AdaBoostClassifier

# build the model
adaboost = AdaBoostClassifier(random_state=10)

# fit the model
adaboost.fit(X_train, y_train)

# In[184]:

# predict the values
y_pred_adaboost = adaboost.predict(X_test)

# In[185]:

adaboost_metrics = pd.Series({
    'Model': "AdaBoost",
    'AUC Score': metrics.roc_auc_score(y_test, y_pred_adaboost),
    start_time = time.time()
    print("Starting model training..")
    model.fit(x_train, t_train)  # train the model
    end_time = time.time()
    print("Training finished! Elapsed time:", end_time - start_time, "s")
    joblib.dump(model, save_file)  # save the model


if __name__ == "__main__":
    # AdaBoost
    n_est = 20
    print("Number of weak learners: ", n_est)
    model = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2,
                                                      min_samples_split=20,
                                                      min_samples_leaf=5),
                               algorithm="SAMME",
                               n_estimators=n_est,
                               learning_rate=0.5)

    dataset_dir = os.path.abspath(os.path.join(os.getcwd(), "..")) + "/model"
    save_dir = dataset_dir + '/model.pkl'
    if os.path.isfile(save_dir):
        model = joblib.load(save_dir)  # load the saved model
    else:
        train_mnist(model, save_dir)  # train the model

    (x_train, t_train), (x_test, t_test) = load_mnist()  # load the training and test sets
    print("Test set accuracy:", model.score(x_test, t_test))
     RandomForestClassifier(criterion='entropy', min_samples_split=5,
                            random_state=24)),
    ('Random Forest 2',
     RandomForestClassifier(criterion='entropy', max_depth=20, random_state=24)),
    ('Random Forest 3',
     RandomForestClassifier(criterion='entropy', min_samples_split=20,
                            random_state=24)),
    ('Random Forest 4',
     RandomForestClassifier(criterion='entropy', min_samples_split=50,
                            random_state=24)),
    ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=24)),
    ('Perceptron',
     CalibratedClassifierCV(Perceptron(max_iter=50, tol=-np.infty, random_state=24),
                            cv=10, method='isotonic')),
    ('Perceptron 2',
     CalibratedClassifierCV(Perceptron(max_iter=100, tol=-np.infty, random_state=24),
                            cv=10, method='isotonic')),
    ('KNeighbors Classifier', KNeighborsClassifier(n_neighbors=5)),
    ('KNeighbors Classifier 2', KNeighborsClassifier(n_neighbors=2)),
    ('Multi-Layer Perceptron', MLPClassifier(random_state=24))]
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from __init__ import write_log

log_file = "test_cnn_9_1.log"

# load the iris dataset bundled with sklearn
iris = load_iris()
write_log(str(iris), file=log_file)

"""
AdaBoostClassifier parameters:
base_estimator: the weak learner; defaults to a CART tree (DecisionTreeClassifier).
algorithm: scikit-learn implements two AdaBoost classification algorithms, SAMME and SAMME.R.
    SAMME is the AdaBoost algorithm described in the theory article, i.e. Discrete AdaBoost.
    SAMME.R is Real AdaBoost: it returns not a discrete class label but a real value representing a probability (the algorithm flow is described later).
    The main difference is how the weak-learner weights are measured: SAMME weights by classification accuracy, SAMME.R by predicted probability.
    SAMME.R usually needs fewer iterations than SAMME and is the default; consequently, base_estimator must support probability prediction.
loss: only used in regression, not covered here.
n_estimators: maximum number of iterations, default 50. In practice, n_estimators is usually tuned together with the learning rate learning_rate.
learning_rate: the shrinkage factor v applied to each weak learner, f_k(x) = f_{k-1}(x) + v * a_k * G_k(x). A smaller v requires more iterations; the default is 1, i.e. no shrinkage.
The weak learner has its own parameters as well, which differ by estimator and are not detailed here.
"""

# build the model with 100 weak learners
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, iris.data, iris.target)
print(scores.mean())
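# The docstring above notes that n_estimators and learning_rate are usually tuned
# together. A minimal sketch of that joint search on the same iris data, assuming
# only standard scikit-learn; the grid values are illustrative, not from the original.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0],
}
grid = GridSearchCV(AdaBoostClassifier(), param_grid, cv=5)
grid.fit(iris.data, iris.target)
# smaller learning rates generally need more estimators to reach the same score
print(grid.best_params_, grid.best_score_)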
y = df[DEPENDENT_VARIABLE]
X = df.drop(DEPENDENT_VARIABLE, axis=1)

from sklearn.tree import DecisionTreeClassifier
basetree = DecisionTreeClassifier(criterion="entropy")

from sklearn.feature_selection import RFE
rfe = RFE(basetree)
rfe.fit(X, y)
rfe.ranking_

rankdf = pd.DataFrame({"rank": rfe.ranking_, "feature": X.columns})
rankdf[rankdf["rank"] == 1]

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

basetree = DecisionTreeClassifier(criterion="gini", min_samples_split=0.4)
clf = AdaBoostClassifier(n_estimators=50, learning_rate=0.5)
cross_val_score(clf, X, y, scoring="roc_auc")
# scale the data into [-1, 1]
scaler = MinMaxScaler(feature_range=(-1, 1))
# transform the data
rescaledX = scaler.fit_transform(traindata)
# set the print format
set_printoptions(precision=3)
print(rescaledX)

# hyperparameter tuning
num_folds = 5
seed = 7
# shuffle=True is required when a random_state is set in recent scikit-learn
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

from sklearn.model_selection import GridSearchCV

scoring = 'accuracy'
param_grid = {'n_estimators': [10, 30, 50, 70, 90, 100]}
model = AdaBoostClassifier(LogisticRegression(C=1000,
                                              multi_class='multinomial',
                                              solver='lbfgs'),
                           algorithm='SAMME.R')
# LogisticRegression(), algorithm='SAMME.R'
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=trainlabel)

print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
msg = "Taxa de acerto do vencedor entre os dois algoritmos no mundo real: {0}".format( taxa_de_acerto) print(msg) resultados = {} from sklearn.naive_bayes import MultinomialNB modeloMultinomial = MultinomialNB() resultadoMultinomial = fit_and_predict("MultinomialNB", modeloMultinomial, treino_dados, treino_marcacoes) resultados[resultadoMultinomial] = modeloMultinomial from sklearn.ensemble import AdaBoostClassifier modeloAdaBoost = AdaBoostClassifier( random_state=0) # elimina randomizacao (sempre mesmo resultado) resultadoAdaBoost = fit_and_predict("AdaBoostClassifier", modeloAdaBoost, treino_dados, treino_marcacoes) resultados[resultadoAdaBoost] = modeloAdaBoost # Algoritmo Um Contra Resto usando LinearSVC from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import LinearSVC modeloOneVsRest = OneVsRestClassifier( LinearSVC(random_state=0)) # elimina randomizacao (sempre mesmo resultado) resultadoOneVsRest = fit_and_predict("OneVsRest", modeloOneVsRest, treino_dados, treino_marcacoes) resultados[resultadoOneVsRest] = modeloOneVsRest # Algoritmo Um Contra Um (todas as categorias são testadas entre si) from sklearn.multiclass import OneVsOneClassifier
if len(inpgrade1.columns) == 2:
    grade = load_data("ml_scripts/data/" + course + "/MasterTrainingData1.csv")
    X = grade[['Homework 1', 'Quiz 1 ']].values
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    y = grade[["Grade"]].values.ravel()
    y1 = []
    for label in y:
        if label == "Good":
            y1.append(0)
        if label == "OK":
            y1.append(1)
        if label == "High-risk":
            y1.append(2)
    # model = MLPClassifier(random_state=0, hidden_layer_sizes=(7, 20), alpha=0.0001,
    #                       solver='lbfgs', max_iter=200, learning_rate='adaptive')
    model = AdaBoostClassifier(random_state=0, n_estimators=1000)
    model.fit(X, y1)
    chosenModels[0] = model
elif len(inpgrade1.columns) == 5:
    grade = load_data("ml_scripts/data/" + course + "/MasterTrainingData2.csv")
    X = grade[['Quiz 1 ', 'Quiz 2 ', 'Quiz 3', 'Homework 1', 'Homework 2']].values
    y = grade[["Grade"]].values.ravel()
    y1 = []
    for label in y:
        if label == "Good":
            y1.append(0)
        if label == "OK":
            y1.append(1)
        if label == "High-risk":
            y1.append(2)