import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

# Load the diabetes dataset
df = pd.read_csv('../datasets/diabetes.csv')
X = df.drop('diabetes', axis=1).values
y = df['diabetes'].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=21, stratify=y)

accuracy = dict()
roc_auc = dict()

tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1)
bag = BaggingClassifier(base_estimator=tree, n_estimators=500,
                        max_samples=1.0, max_features=1.0,
                        bootstrap=True, bootstrap_features=False,
                        n_jobs=1, random_state=1)

tree = tree.fit(X_train, y_train)
y_test_pred = tree.predict(X_test)
y_pred_prob = tree.predict_proba(X_test)[:, 1]
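# A minimal follow-on sketch: fit the bagging ensemble defined above and
# compare it with the single tree on held-out accuracy and ROC AUC. This
# continuation is assumed, not from the source; it reuses the accuracy and
# roc_auc dicts declared above.
from sklearn.metrics import accuracy_score

for name, model in [('tree', tree), ('bagging', bag)]:
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    accuracy[name] = accuracy_score(y_test, model.predict(X_test))
    roc_auc[name] = roc_auc_score(y_test, proba)
    print('%s: accuracy=%.3f, ROC AUC=%.3f' % (name, accuracy[name], roc_auc[name]))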
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

# read the data from a csv file
music_data = pd.read_csv('weather.csv')

# CLEAN THE TRAINING AND PREDICTING DATA
x = music_data.drop(columns=['output'])
y = music_data['output']

# create and train the model
model = DecisionTreeClassifier()
model.fit(x, y)

# persist the trained model (joblib.dump returns the list of file names
# written, not the model itself)
joblib.dump(model, 'trained_model.joblib')
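# A minimal companion sketch: load the persisted model back and make
# predictions. Assumes trained_model.joblib was written by the snippet above
# and that the new rows have the same columns as x.
loaded_model = joblib.load('trained_model.joblib')
predictions = loaded_model.predict(x.head())  # x.head() stands in for new data
print(predictions)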
import pandas as pd
import joblib  # sklearn.externals.joblib was removed; import joblib directly
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, accuracy_score

m, n = df1.shape
X = df1.iloc[:, 0:n - 1]
Y = df1["price"]
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
test = pd.concat([x_test, y_test], axis=1)
# test.to_csv("./t.csv", sep=",", index=0)

# from sklearn.preprocessing import StandardScaler
# ss = StandardScaler()
# x_train = ss.fit_transform(x_train)
# x_test = ss.transform(x_test)

# note: despite the name, this is a classifier scored with accuracy
regressor = DecisionTreeClassifier(random_state=0)
parameters = {'max_depth': range(10, 50)}
scoring_fnc = make_scorer(accuracy_score)
kfold = KFold(n_splits=10)

grid = GridSearchCV(regressor, parameters, scoring=scoring_fnc, cv=kfold)
grid = grid.fit(x_train, y_train.ravel())
reg = grid.best_estimator_

print('train score: %f' % grid.best_score_)
print('best parameters:')
for key in parameters.keys():
    print('%s: %d' % (key, reg.get_params()[key]))
print('test score: %f' % reg.score(x_test, y_test))

joblib.dump(grid, "./" + i[:-4] + ".m")  # `i` is the source file name from the enclosing loop
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pydotplus
import matplotlib.pyplot as plt
import matplotlib.image as pltimg

# label-encode the remaining categorical columns (7 through 21); the encoder
# is instantiated here, though the original creates it earlier in the script
labelencoder_X = LabelEncoder()
for col in range(7, 22):
    X[:, col] = labelencoder_X.fit_transform(X[:, col])

features = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
            'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
            'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
            'stalk-surface-below-ring', 'stalk-color-above-ring',
            'stalk-color-below-ring', 'veil-type', 'veil-color',
            'ring-number', 'ring-type', 'spore-print-color',
            'population', 'habitat']
X = df[features]
y = df['class']

dtree = DecisionTreeClassifier()
dtree = dtree.fit(X, y)

data = tree.export_graphviz(dtree, out_file=None, feature_names=features)
graph = pydotplus.graph_from_dot_data(data)
graph.write_png('mydecisiontree.png')

img = pltimg.imread('mydecisiontree.png')
imgplot = plt.imshow(img)
plt.show()
# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
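# A possible continuation sketch: fit the ensemble and measure test accuracy.
# Assumes X_train, X_test, y_train, y_test already exist in the calling context.
from sklearn.metrics import accuracy_score

bc.fit(X_train, y_train)
y_pred = bc.predict(X_test)
print('Test set accuracy of bc: {:.2f}'.format(accuracy_score(y_test, y_pred)))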
import pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


def run_demo():
    ####################################################################################################################
    # Tip 1: Use make_column_transformer to apply different preprocessing to different columns                         #
    # NOTE: I'm not sure this works                                                                                    #
    ####################################################################################################################
    # Load data (Titanic dataset)
    data = pd.read_csv(
        'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

    # Make Transformer
    preprocessing = make_column_transformer(
        (OneHotEncoder(), ['Pclass', 'Sex']),
        (SimpleImputer(), ['Age']),
        remainder='passthrough')

    # Fit-Transform data with transformer
    data = preprocessing.fit_transform(data)

    ####################################################################################################################
    # Tip 2: Use make_column_selector to apply preprocessing by column dtype                                           #
    # NOTE: I'm not sure this works                                                                                    #
    ####################################################################################################################
    # Load data (Titanic dataset)
    data = pd.read_csv(
        'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

    # Make Transformer
    preprocessing = make_column_transformer(
        (OneHotEncoder(), make_column_selector(dtype_include='object')),
        (SimpleImputer(), make_column_selector(dtype_include='int')),
        remainder='drop')

    # Fit-Transform data with transformer
    data = preprocessing.fit_transform(data)

    ####################################################################################################################
    # Tip 3: Use Pipeline. Pipeline chains together multiple preprocessing steps; the output of each step is fed as    #
    # input to the next, which makes it easy to apply the same preprocessing to Train and Test.                        #
    ####################################################################################################################
    # Load data (Titanic dataset)
    data = pd.read_csv(
        'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

    # Set X and y
    X = data.drop('Survived', axis=1)
    y = data[['Survived']]

    # Split Train and Test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Set variables
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=True)
    imputer = SimpleImputer(add_indicator=True, verbose=1)
    scaler = StandardScaler()
    clf = DecisionTreeClassifier()

    # Make Transformer
    preprocessing = make_column_transformer(
        (make_pipeline(imputer, scaler),
         ['Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']),
        (ohe, ['Pclass', 'Sex', 'Name']),
        remainder='passthrough')

    # Make pipeline
    pipe = make_pipeline(preprocessing, clf)

    # Fit model
    pipe.fit(X_train, y_train.values.ravel())
    print("Test score: %f" % pipe.score(X_test, y_test.values.ravel()))

    ####################################################################################################################
    # Tip 4: You can grid search an entire pipeline and find optimal tuning parameters                                 #
    ####################################################################################################################
    # Load data (Titanic dataset)
    data = pd.read_csv(
        'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv')

    # Set X and y
    X = data.drop('Survived', axis=1)
    y = data[['Survived']]

    # Set variables
    clf = LogisticRegression()
    ohe = OneHotEncoder()
    scaler = StandardScaler()
    imputer = SimpleImputer()

    # Make Transformer
    preprocessing = make_column_transformer(
        (make_pipeline(imputer, scaler),
         ['Age', 'Siblings/Spouses Aboard', 'Parents/Children Aboard', 'Fare']),
        (ohe, ['Sex']),
        remainder='drop')

    # Make pipeline
    pipe = make_pipeline(preprocessing, clf)

    # Set params for Grid Search
    params = {}
    params['logisticregression__C'] = [0.1, 0.2, 0.3]
    params['logisticregression__max_iter'] = [200, 500]

    # Run grid search
    grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
    grid.fit(X, y.values.ravel())
    print(grid.best_score_)
    print(grid.best_params_)  # fixed: `grid.best` is not a GridSearchCV attribute
    def fit(self, x_data, y_data):
        """Train using a decision tree."""
        self.estimator = DecisionTreeClassifier()
        self.estimator.fit(x_data, y_data)
        print("Training complete")
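    # Hedged companion sketch (hypothetical methods, mirroring the class's
    # style): prediction and scoring using the estimator fitted above.
    def predict(self, x_data):
        """Predict labels with the trained decision tree."""
        return self.estimator.predict(x_data)

    def score(self, x_data, y_data):
        """Mean accuracy on the given data."""
        return self.estimator.score(x_data, y_data)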
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold

# Linear kernel has no need for gamma
kpcas.append(('Linear K', 'lin_k', KernelPCA(n_components=2, kernel='linear')))
kpcas.append(('RBF K', 'rbf_k', KernelPCA(n_components=2, kernel='rbf', gamma=gamma)))
# kpcas.append(('Polynomial K', 'ply_k', KernelPCA(n_components=2, kernel='poly', gamma=gamma)))
# kpcas.append(('Sigmoid K', 'sig_k', KernelPCA(n_components=2, kernel='sigmoid', gamma=gamma)))
# kpcas.append(('Cosine K', 'cos_k', KernelPCA(n_components=2, kernel='cosine', gamma=gamma)))

# Initiate models with default parameters
models = []
models.append(('Linear SVM', 'lin_svc', SVC(kernel='linear', probability=True)))
models.append(('RBF Kernel SVM', 'rbf_svc', SVC(kernel='rbf', gamma=gamma, probability=True)))
models.append(('K-Nearest Neighbour', 'knn', KNeighborsClassifier()))
models.append(('Logistic Regression', 'log_reg', LogisticRegression()))
models.append(('Decision Tree', 'dec_tree', DecisionTreeClassifier()))
models.append(('Gaussian Naive Bayes', 'gnb', GaussianNB()))
models.append(('Random Forest', 'rf', RandomForestClassifier()))
models.append(('Gradient Boosting', 'gb', GradientBoostingClassifier()))
# models.append(('PLS', PLSRegression()))  # Scale=False as data already scaled.

folds = 10
# shuffle=True is required when random_state is set on StratifiedKFold
cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=10)

# Declare KPCA kernels deployed
kpca_kernels = []
for kernel, abbreviation, kpca in kpcas:
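    # A hedged sketch of this loop's body (assumed, not from the source):
    # project the already-scaled features X with each kernel PCA, then
    # cross-validate every model on the projection. X and y are assumed to be
    # defined earlier in the surrounding script.
    from sklearn.model_selection import cross_val_score
    kpca_kernels.append(kernel)
    X_kpca = kpca.fit_transform(X)
    for model_name, model_abbrev, model in models:
        scores = cross_val_score(model, X_kpca, y, cv=cv, scoring='accuracy')
        print('%s + %s: mean accuracy %.3f (+/- %.3f)'
              % (kernel, model_name, scores.mean(), scores.std() * 2))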
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

yhat = naive_prediction(testX, value)
# evaluate
score = accuracy_score(testy, yhat)
# summarize
print('Naive=%d score=%.3f' % (value, score))

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    # shuffle=True is required when random_state is set on KFold
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, trainX, trainy, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
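# A hedged follow-on sketch: summarize each model's cross-validation results
# and compare them side by side with a boxplot, a common ending for this
# spot-check pattern.
import matplotlib.pyplot as plt

for name, cv_results in zip(names, results):
    print('%s: %.3f (%.3f)' % (name, cv_results.mean(), cv_results.std()))
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()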
X = df.drop(['Outcome'], axis=1)
y = df['Outcome']

# In[4]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# In[5]:

from sklearn.tree import DecisionTreeClassifier
dec_cls = DecisionTreeClassifier(max_depth=5)

# In[6]:

dec_cls.fit(X_train, y_train)

# In[7]:

y_pred = dec_cls.predict(X_test)

# In[8]:
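# A plausible next cell (hedged sketch): evaluate the predictions made above.
from sklearn.metrics import accuracy_score, confusion_matrix
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred))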
    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results


# Import the three supervised learning models from sklearn
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# TODO: Initialize the three models
clf_A = SVC()
clf_B = DecisionTreeClassifier(min_samples_split=20)
clf_C = AdaBoostClassifier()

# Calculate the number of samples for 1%, 10%, and 100% of the training data
# HINT: samples_100 is the entire training set i.e. len(y_train)
# HINT: samples_10 is 10% of samples_100
# HINT: samples_1 is 1% of samples_100
samples_100 = len(y_train)
samples_10 = len(y_train) // 10
samples_1 = len(y_train) // 100

# Collect results on the learners
results = {}
results = train_predict(clf_A, samples_1, X_train, y_train, X_test, y_test)
from sklearn.metrics import classification_report, confusion_matrix

# summary of the model prediction
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

# accuracy score of the model
from sklearn.metrics import accuracy_score
print('accuracy score :', accuracy_score(y_test, y_pred))

"""### **Decision Tree Classifier**"""

# Decision Tree Classifier
# importing the library
from sklearn.tree import DecisionTreeClassifier

# creating local variable classifier
classifier = DecisionTreeClassifier()

# Training the model
classifier.fit(X_train, y_train)

# predicting the value of y
y_pred = classifier.predict(X_test)

# importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# summary of the model prediction
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

# accuracy score of the model
def DecisionTreeModel(feature_set):
    c = SklearnClassifier(DecisionTreeClassifier())
    accuracies = cross_validation(c, feature_set)
    print_metrics("Decision Tree", accuracies)
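# A hedged sketch of what the cross_validation helper referenced above might
# look like (hypothetical; the real helper is defined elsewhere in this
# project). It assumes feature_set is a list of (features_dict, label) pairs,
# as expected by nltk's SklearnClassifier.
def cross_validation(classifier, feature_set, n_folds=10):
    fold_size = len(feature_set) // n_folds
    accuracies = []
    for i in range(n_folds):
        test_fold = feature_set[i * fold_size:(i + 1) * fold_size]
        train_folds = feature_set[:i * fold_size] + feature_set[(i + 1) * fold_size:]
        trained = classifier.train(train_folds)
        correct = sum(1 for feats, label in test_fold if trained.classify(feats) == label)
        accuracies.append(correct / len(test_fold))
    return accuracies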
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from chapter07_Adaboost import adaColic

# see: http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
if __name__ == '__main__':
    Xtrain, ytrain = adaColic.loadDataSet('horseColicTraining2.txt')
    Xtest, ytest = adaColic.loadDataSet('horseColicTest2.txt')

    clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                             algorithm='SAMME', n_estimators=10)
    clf.fit(Xtrain, ytrain)

    predictions = clf.predict(Xtrain)
    errArr = np.mat(np.ones((len(Xtrain), 1)))
    print('training set error rate: %.3f%%'
          % (float(errArr[predictions != ytrain].sum()) / len(Xtrain) * 100.0))

    predictions = clf.predict(Xtest)
    errArr = np.mat(np.ones((len(Xtest), 1)))
    print('test set error rate: %.3f%%'
          % (float(errArr[predictions != ytest].sum()) / len(Xtest) * 100.0))
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

df = df.sample(frac=1).reset_index(drop=True)
df['lbl'] = 1.0
df.loc[df['type'] == 'R', 'lbl'] = 0.0
df.drop('type', axis=1, inplace=True)
df = df.astype(np.float32)  # astype has no inplace parameter; reassign instead

feature_names = ['c' + str(i) for i in range(60)]
label_name = ['lbl']

# section 2: prep train and test data (.get_values() was removed from pandas;
# use .to_numpy() instead)
test_x = df[:70][feature_names].to_numpy()
test_y = df[:70][label_name].to_numpy().ravel()
train_x = df[70:][feature_names].to_numpy()
train_y = df[70:][label_name].to_numpy().ravel()

# section 3: take a look at performance of sklearn decision tree and random forest
clf = DecisionTreeClassifier()
clf.fit(train_x, train_y)
print("Sklearn Decision Tree Classifier", clf.score(test_x, test_y))

rfclf = RandomForestClassifier(n_jobs=2)
rfclf.fit(train_x, train_y)
print("Sklearn Random Forest Classifier", rfclf.score(test_x, test_y))

# section 4: my first practice of random forest
m = 10
votes = [1 / m] * m
num_train = len(train_x)
num_feat = len(train_x[0])
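# A hedged sketch of how section 4 might continue, using the m, votes,
# num_train and num_feat set up above (the exact construction is an
# assumption): train m trees on bootstrap samples of the rows and a random
# subset of columns, then combine their votes.
rng = np.random.default_rng(0)
trees, feat_ids = [], []
for _ in range(m):
    rows = rng.integers(0, num_train, num_train)                         # bootstrap rows
    cols = rng.choice(num_feat, int(np.sqrt(num_feat)), replace=False)   # random features
    t = DecisionTreeClassifier().fit(train_x[rows][:, cols], train_y[rows])
    trees.append(t)
    feat_ids.append(cols)

# majority vote weighted by `votes` (uniform here)
probs = sum(w * t.predict(test_x[:, c]) for w, t, c in zip(votes, trees, feat_ids))
print("My random forest", np.mean((probs > 0.5).astype(np.float32) == test_y))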
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

raw_data_df = pd.read_csv("train_data.csv")
raw_class_df = pd.read_csv("train_class.csv")

start_time = time.time()
data_train, data_verif, class_train, class_verif = train_test_split(
    raw_data_df, raw_class_df, test_size=0.3, random_state=2, stratify=raw_class_df)

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100)
clf.fit(data_train, np.ravel(class_train))

prediction = clf.predict(data_verif)
pred = clf.predict_proba(data_verif)

tn, fp, fn, tp = confusion_matrix(class_verif, prediction).ravel()
print("tn: ", tn, "fp: ", fp, "fn: ", fn, "tp: ", tp)
print("Confusion Matrix: \n" + str(confusion_matrix(class_verif, prediction)))
print("Accuracy : " + str(accuracy_score(class_verif, prediction) * 100))
print("Report : \n" + str(classification_report(class_verif, prediction)))

# keep probabilities for the positive outcome only
pred = pred[:, 1]
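# A hedged continuation sketch: with the positive-class probabilities kept
# above, compute ROC AUC and plot the ROC curve (this step is assumed, not
# from the source).
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

print("ROC AUC: %.3f" % roc_auc_score(class_verif, pred))
fpr, tpr, _ = roc_curve(class_verif, pred)
plt.plot(fpr, tpr, label='AdaBoost')
plt.plot([0, 1], [0, 1], linestyle='--', label='no skill')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()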
print('\n4) TF-IDF vectorization')
do_smth_with_model(steps=[('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('classifier', MultinomialNB())])

# Naive Bayes, TF-IDF vectorization, fit_prior=False
print('\nExtra: TF-IDF vectorization, fit_prior=False')
do_smth_with_model(steps=[('vect', CountVectorizer()),
                          ('tfidf', TfidfTransformer()),
                          ('classifier', MultinomialNB(fit_prior=False))])

# Decision tree, TF-IDF
print('\nDecision Tree')
pipeline, label_predicted = do_smth_with_model(steps=[('vect', CountVectorizer()),
                                                      ('tfidf', TfidfTransformer()),
                                                      ('classifier', DecisionTreeClassifier())])
draw_learning_curve(pipeline)
draw_roc_curve(label_predicted)
print('The learning curve shows that with more training data the cross-validation score '
      'may improve slightly, while the training score stays flat')
print('Judging by the ROC curve, the classifier performs well, but naive Bayes was better '
      '(look at the slope of the blue line). The AUC value is worse than for Bayes, '
      'better than for the random forest')

# Random forest, TF-IDF
print('\nRandomForestClassifier')
pipeline, label_predicted = do_smth_with_model(steps=[('vect', CountVectorizer()),
                                                      ('tfidf', TfidfTransformer()),
                                                      ('classifier', RandomForestClassifier())])
draw_learning_curve(pipeline)
bl1.update (ml6)
'''

_trainfeatures, _trainlabels, _testfeatures, _testlabels = split(bf1, bl1)
# (features, labels) = adapt(bf1, bl1)
(trainfeatures, trainlabels) = adapt(_trainfeatures, _trainlabels)
(testfeatures, testlabels) = adapt(_testfeatures, _testlabels)

# Earlier runs experimented with other ensembles (ExtraTrees, AdaBoost,
# GradientBoosting, Bagging, GaussianProcess) and other feature-set mixes;
# the active configuration is kept below.
models = (RandomForestClassifier(n_estimators=128, random_state=0),
          SVC(kernel='rbf'),
          SVC(kernel='linear'),
          DecisionTreeClassifier(random_state=None),
          KNeighborsClassifier(n_neighbors=5),
          GaussianNB(),
          MultinomialNB(),
          BernoulliNB())

# fsets = (FSET_FULL, FSET_NOICC, FSET_MIN, FSET_YYY_G, FSET_FULL_TOP, FSET_YYY_TOP, FSET_FULL_TOP_G, FSET_YYY_TOP_G)
fsets = (FSET_FULL, FSET_G, FSET_ICC, FSET_SEC, FSET_Y, FSET_YYY)
# Run cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

cv = RepeatedKFold(n_splits=10, n_repeats=10)
gnbScores = cross_val_score(gnb, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Gaussian Naive Bayes Accuracy: %0.2f (+/- %0.2f)" % (gnbScores.mean(), gnbScores.std() * 2))

# In[24]:

# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion='entropy', max_depth=11, random_state=150)
dtc = dtc.fit(X, y)

# Run cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

cv = RepeatedKFold(n_splits=10, n_repeats=10)
dtcScores = cross_val_score(dtc, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("Decision Tree Classifier Accuracy: %0.2f (+/- %0.2f)" % (dtcScores.mean(), dtcScores.std() * 2))

# In[32]:

# KNN Classifier
from sklearn.neighbors import KNeighborsClassifier
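# A hedged sketch of how this cell likely continues, following the pattern of
# the cells above (the neighbor count is an assumption):
knn = KNeighborsClassifier(n_neighbors=5)
cv = RepeatedKFold(n_splits=10, n_repeats=10)
knnScores = cross_val_score(knn, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
print("KNN Classifier Accuracy: %0.2f (+/- %0.2f)" % (knnScores.mean(), knnScores.std() * 2))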
    def update(self):
        '''
        Decision Tree ML
        :return:
        '''
        # ff_happiness["Happiness.Score"]
        self.list_corr_features = pd.DataFrame([])

        # gather the checked features: the feature0..feature7 checkboxes map
        # one-to-one onto features_list (collapses the original's eight
        # copy-pasted if-blocks)
        for idx in range(8):
            checkbox = getattr(self, 'feature{}'.format(idx))
            if checkbox.isChecked():
                if len(self.list_corr_features) == 0:
                    self.list_corr_features = ff_happiness[features_list[idx]]
                else:
                    self.list_corr_features = pd.concat(
                        [self.list_corr_features, ff_happiness[features_list[idx]]], axis=1)

        vtest_per = float(self.txtPercentTest.text())
        vmax_depth = int(float(self.txtMaxDepth.text()))  # max_depth must be an integer

        self.ax1.clear()
        self.ax2.clear()
        self.ax3.clear()
        self.txtResults.clear()
        self.txtResults.setUndoRedoEnabled(False)

        vtest_per = vtest_per / 100

        X_dt = self.list_corr_features
        y_dt = ff_happiness["Happiness.Scale"]

        class_le = LabelEncoder()
        # fit and transform the class
        y_dt = class_le.fit_transform(y_dt)

        # split the dataset into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            X_dt, y_dt, test_size=vtest_per, random_state=100)

        # perform training with entropy.
        # Decision tree with entropy
        self.clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                                  max_depth=vmax_depth, min_samples_leaf=5)

        # Performing training
        self.clf_entropy.fit(X_train, y_train)

        # prediction on test using entropy
        y_pred_entropy = self.clf_entropy.predict(X_test)
        # computed once here rather than inside the annotation loop below
        y_pred_score = self.clf_entropy.predict_proba(X_test)

        # confusion matrix for entropy model
        conf_matrix = confusion_matrix(y_test, y_pred_entropy)

        # classification report
        self.ff_class_rep = classification_report(y_test, y_pred_entropy)
        self.txtResults.appendPlainText(self.ff_class_rep)

        # accuracy score
        self.ff_accuracy_score = accuracy_score(y_test, y_pred_entropy) * 100
        self.txtAccuracy.setText(str(self.ff_accuracy_score))

        self.ax1.set_xlabel('Predicted label')
        self.ax1.set_ylabel('True label')

        class_names1 = ['', 'Happy', 'Med.Happy', 'Low.Happy', 'Not.Happy']
        self.ax1.matshow(conf_matrix, cmap=plt.cm.get_cmap('Blues', 14))
        self.ax1.set_yticklabels(class_names1)
        self.ax1.set_xticklabels(class_names1, rotation=90)

        # annotate each cell with its count (the original indexed an undefined
        # `class_names`; the matrix's own shape is used instead)
        for i in range(conf_matrix.shape[0]):
            for j in range(conf_matrix.shape[1]):
                self.ax1.text(j, i, str(conf_matrix[i][j]))

        self.fig.tight_layout()
        self.fig.canvas.draw_idle()

        #####################
        # End Graph 1
        #####################

        ##########################
        # Graph 2 -- ROC
        ##########################
        y_test_bin = label_binarize(y_test, classes=[0, 1, 2, 3])
        n_classes = y_test_bin.shape[1]

        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_pred_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), y_pred_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        lw = 2
        self.ax2.plot(fpr[2], tpr[2], color='darkorange', lw=lw,
                      label='ROC curve (area = %0.2f)' % roc_auc[2])
        self.ax2.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        self.ax2.set_xlim([0.0, 1.0])
        self.ax2.set_ylim([0.0, 1.05])
        self.ax2.set_xlabel('False Positive Rate')
        self.ax2.set_ylabel('True Positive Rate')
        self.ax2.set_title('ROC Curve Decision Tree')
        self.ax2.legend(loc="lower right")
        self.fig2.tight_layout()
        self.fig2.canvas.draw_idle()

        #--------------------------------
        ### Graph 3 ROC Curve by class
        #---------------------------------
        str_classes = ['HP', 'MEH', 'LOH', 'NH']
        colors = cycle(['magenta', 'darkorange', 'green', 'blue'])
        for i, color in zip(range(n_classes), colors):
            self.ax3.plot(fpr[i], tpr[i], color=color, lw=lw,
                          label='{0} (area = {1:0.2f})'.format(str_classes[i], roc_auc[i]))

        self.ax3.plot([0, 1], [0, 1], 'k--', lw=lw)
        self.ax3.set_xlim([0.0, 1.0])
        self.ax3.set_ylim([0.0, 1.05])
        self.ax3.set_xlabel('False Positive Rate')
        self.ax3.set_ylabel('True Positive Rate')
        self.ax3.set_title('ROC Curve by Class')
        self.ax3.legend(loc="lower right")

        # show the plot
        self.fig3.tight_layout()
        self.fig3.canvas.draw_idle()
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.metrics import accuracy_score

# Loading dataset
iris = datasets.load_iris()

# dataframe
df = pd.DataFrame(iris.data, columns=iris.feature_names)
print(df.head(5))
y = iris.target
# print(y)

# decision tree algorithm
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(df, y)
print('Decision Tree Classifier Created')

# PLOT
from sklearn.tree import plot_tree
model_all_params = DecisionTreeClassifier().fit(iris.data, iris.target)
plt.figure(figsize=(20, 10))  # set size
plot_tree(model_all_params, filled=True)
plt.show()

# accuracy (measured on the training data, so it will be optimistic)
y_pred = dtree.predict(df)
print('\nAccuracy: {0:.4f}'.format(accuracy_score(y, y_pred)))
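# A hedged sketch: the accuracy above is computed on the training data, so a
# held-out split gives a more honest estimate.
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(df, y, test_size=0.3, random_state=0, stratify=y)
dtree_holdout = DecisionTreeClassifier().fit(X_tr, y_tr)
print('Held-out accuracy: {0:.4f}'.format(dtree_holdout.score(X_te, y_te)))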
# Drop Sex and Embarked
data.drop(["Sex", "Embarked"], inplace=True, axis=1)

# Merge the encoded columns back into the original data
newdata = pd.concat([data, data_Sex_df, data_Embarked_df], axis=1)
print(newdata)

# Split features and labels
X = newdata.iloc[:, newdata.columns != "Survived"]
y = newdata.iloc[:, newdata.columns == "Survived"]

# Split train and test sets
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y)

# Instantiate the model
clf = DecisionTreeClassifier(random_state=666)

# Cross-validate to find the best number of folds
cv_score = []
for i in range(2, 10):
    score = cross_val_score(clf, X, y, cv=i).mean()
    cv_score.append(score)
best_cv = cv_score.index(max(cv_score)) + 2

# Grid search for the best hyperparameters
parameters = {"splitter": ('best', 'random'),
              "max_depth": [*range(1, 5)],
              "min_samples_leaf": [*range(1, 10)],
              }
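# A hedged sketch of the natural next step (assumed, not from the source): run
# the grid search with the fold count found above and report the best
# configuration.
from sklearn.model_selection import GridSearchCV

GS = GridSearchCV(clf, parameters, cv=best_cv)
GS.fit(Xtrain, Ytrain.values.ravel())
print(GS.best_params_)
print(GS.best_score_)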
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Neural Network Classifier #
train_x, test_x, train_y, test_y = train_test_split(transformed_samples, y, test_size=0.5, random_state=42)
NN_clf = MLPClassifier(solver='lbfgs', max_iter=400, alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
NN_model = NN_clf.fit(train_x, train_y)
predicted = NN_model.predict(test_x)
print('The accuracy score for NN classifier is : ')
print(accuracy_score(test_y, predicted))
filename = 'LDA_NN_model.sav'
pickle.dump(NN_model, open(filename, 'wb'))

# Decision tree Classifier #
Tree_clf = DecisionTreeClassifier(random_state=0)
Tree_model = Tree_clf.fit(train_x, train_y)
predicted_Tree = Tree_model.predict(test_x)
print('The accuracy score for Tree classifier is : ')
print(accuracy_score(test_y, predicted_Tree))
filename1 = 'LDA_Tree_model.sav'
pickle.dump(Tree_model, open(filename1, 'wb'))

# Plotting the scatter plot of the new feature space #
class_mapping = {0: 'normal', 1: 'DOS', 2: 'U2R', 3: 'R2L', 4: 'PROBE'}
for lab, marker, color in zip(range(0, 5),
                              ('^', 's', 'o', '*', 'D'),
                              ('blue', 'red', 'green', 'black', 'yellow')):
    plt.scatter(x=transformed_samples[:, 0].real[y == lab],
                y=transformed_samples[:, 1].real[y == lab],
                marker=marker, color=color,     # call completed from the zip above
                label=class_mapping[lab])
targets = targets.astype('int64')

# Correlations
DataFrame(reduced_predictors).join(targets).corr()

# Split as training and testing
pred_train, pred_test, tar_train, tar_test = train_test_split(
    DataFrame(reduced_predictors), targets, test_size=.3)
pred_train.shape
pred_test.shape
tar_train.shape
tar_test.shape

# Build model on training data using boosted decision trees
import sklearn.metrics
from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier

# classifier = ensemble.BaggingClassifier(DecisionTreeClassifier())
classifier = ensemble.AdaBoostClassifier(DecisionTreeClassifier())
classifier = classifier.fit(pred_train, tar_train)

predictions = classifier.predict(pred_test)
sklearn.metrics.confusion_matrix(tar_test, predictions)
sklearn.metrics.accuracy_score(tar_test, predictions)
sklearn.metrics.classification_report(tar_test, predictions)
    def testGreadSearchCV(self):
        def build_param_space(config_row):
            """Build the GridSearchCV parameter space for one configuration row.

            Each hyperparameter comes either from the UI widgets (when the
            checkbox is ticked) or from the stored configuration. This helper
            collapses the original's six near-identical copy-pasted blocks.
            NOTE: the original gates every parameter except `criterion` on
            checkBoxMaxDepth; that behaviour is preserved here.
            """
            space = {}
            if self.ui.checkBoxCriterion.isChecked():
                space['criterion'] = [self.ui.lineEditCriterion_2.text().split(',')][0]
            else:
                space['criterion'] = [self.konfiguracja.loc[config_row, 'criterion']]
            if self.ui.checkBoxMaxDepth.isChecked():
                space['max_depth'] = np.arange(self.ui.spinBoxMaxDepthOd.value(),
                                               self.ui.spinBoxMaxDepthDo.value(),
                                               self.ui.spinBoxMaxDepthCo.value())
                space['random_state'] = [self.ui.spinBoxRandomState_2.value()]
                space['min_samples_leaf'] = np.arange(self.ui.spinBoxMinSamplesLeafOd.value(),
                                                      self.ui.spinBoxMinSamplesLeafDo.value(),
                                                      self.ui.spinBoxMinSaplesLeafCo.value())
                space['min_samples_split'] = np.arange(self.ui.spinBoxMinSamplesSplitOd.value(),
                                                       self.ui.spinBoxMinSaplesSplitDo.value(),
                                                       self.ui.spinBoxMinSamplesSplitCo.value())
                space['splitter'] = [self.ui.lineEditSpliter_2.text().split(',')][0]
            else:
                for param in ('max_depth', 'random_state', 'min_samples_leaf',
                              'min_samples_split', 'splitter'):
                    space[param] = [self.konfiguracja.loc[config_row, param]]
            return space

        # map the clustering method/metric labels to numeric codes (assumes
        # exact labels; the original used substring `in` tests per row)
        method_codes = {'DBScan euclidean': 11, 'DBScan cityblock': 12, 'DBScan cosine': 13,
                        'KMeans euclidean': 21, 'KMeans cityblock': 22, 'KMeans cosine': 23,
                        'Agglomerative euclidean': 31, 'Agglomerative cityblock': 32,
                        'Agglomerative cosine': 33}

        if self.ui.comboBoxKlasyfikatory.currentIndex() == 0:
            # classification: predict the method/metric label from the first 48 columns
            X = self.tabelaBazowa[self.tabelaBazowa.columns[0:48]].copy()
            y = self.tabelaBazowa[self.tabelaBazowa.columns[48]].copy()
            y = y.map(method_codes).astype(int)

            mlp = DecisionTreeClassifier()
            my_cv = LeaveOneOut()

            if self.czySilhouette:
                parametr_space = build_param_space('Silhouette_klasyfikacja')
            elif self.czyDaviesBouldin:
                parametr_space = build_param_space('DaviesBouldin_klasyfikacja')
            elif self.czyCalinskiHarabasz:
                parametr_space = build_param_space('CalinskiHarabasz_klasyfikacja')

            clf = GridSearchCV(mlp, parametr_space, n_jobs=-1, cv=my_cv, verbose=3)
            clf.fit(X, y)
            print('Best parameters:\n', clf.best_params_)

        elif self.ui.comboBoxKlasyfikatory.currentIndex() == 1:
            # regression: the method/metric column becomes an encoded feature
            X = self.tabelaBazowa[self.tabelaBazowa.columns[0:49]].copy()
            y = self.tabelaBazowa[self.tabelaBazowa.columns[49]].copy()
            X['Metoda_metryka'] = X[X.columns[-1]].map(method_codes)
            y = y.astype(int)

            mlp = DecisionTreeRegressor()
            my_cv = LeaveOneOut()

            if self.czySilhouette:
                parametr_space = build_param_space('Silhouette_regresja')
            elif self.czyDaviesBouldin:
                parametr_space = build_param_space('DaviesBouldin_regresja')
            elif self.czyCalinskiHarabasz:
                parametr_space = build_param_space('CalinskiHarabasz_regresja')

            clf = GridSearchCV(mlp, parametr_space, n_jobs=-1, cv=my_cv, verbose=3,
                               scoring='max_error')
            clf.fit(X, y)
            print('Best parameters:\n', clf.best_params_)
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

# Feature Scaling - the classifier does not do this automatically, so we scale manually
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

# fitting classifier to training set
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(x_train, y_train)

# predict test set results
y_pred = classifier.predict(x_test)

# making the confusion matrix: this shows the model's predictive power
from sklearn.metrics import confusion_matrix
# top-left and bottom-right are correct predictions; top-right and bottom-left are wrong ones
cm = confusion_matrix(y_test, y_pred)

# visualising the test set: shows the real results against the predicted regions
from matplotlib.colors import ListedColormap
X_set, y_set = x_test, y_test
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1,
              step=0.01),   # call completed; 0.01 is an assumed grid resolution
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
After that, it's not our code anymore--it's yours!
"""

import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "rb"))

### first element is our labels, any added elements are predictor
### features. Keep this the same for the mini-project, but you'll
### have a different feature list when you do the final project.
features_list = ["poi", "salary"]

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

### it's all yours from here forward!
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# sklearn.cross_validation was removed; use model_selection instead
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print('Accuracy:', accuracy_score(labels_test, pred))
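# A hedged follow-on sketch: count predicted POIs and report precision and
# recall, which matter more than accuracy on a class-imbalanced POI dataset
# (this step is assumed, not from the source).
from sklearn.metrics import precision_score, recall_score

print('Predicted POIs:', sum(pred))
print('People in test set:', len(pred))
print('Precision:', precision_score(labels_test, pred))
print('Recall:', recall_score(labels_test, pred))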
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.tree import DecisionTreeClassifier


def DT(X, y, train_size, data_name):
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size)

    # https://scikit-learn.org/stable/auto_examples/tree/plot_cost_complexity_pruning.html#sphx-glr-auto-examples-tree-plot-cost-complexity-pruning-py
    # Fit classification model
    dt = DecisionTreeClassifier()
    path = dt.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas, impurities = path.ccp_alphas, path.impurities

    fig, ax = plt.subplots()
    ax.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle="steps-post")
    ax.set_xlabel("effective alpha")
    ax.set_ylabel("total impurity of leaves")
    ax.set_title("Total Impurity vs effective alpha for training set")

    clfs = []
    for ccp_alpha in ccp_alphas:
        clf = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
        clf.fit(X_train, y_train)
        clfs.append(clf)
    print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(
        clfs[-1].tree_.node_count, ccp_alphas[-1]))

    # %%
    # For the remainder of this example, we remove the last element in
    # ``clfs`` and ``ccp_alphas``, because it is the trivial tree with only one
    # node. Here we show that the number of nodes and tree depth decrease as
    # alpha increases.
    clfs = clfs[:-1]
    ccp_alphas = ccp_alphas[:-1]

    node_counts = [clf.tree_.node_count for clf in clfs]
    depth = [clf.tree_.max_depth for clf in clfs]
    fig, ax = plt.subplots(2, 1)
    ax[0].plot(ccp_alphas, node_counts, marker='o', drawstyle="steps-post")
    ax[0].set_xlabel("alpha")
    ax[0].set_ylabel("number of nodes")
    ax[0].set_title("Number of nodes vs alpha")
    ax[1].plot(ccp_alphas, depth, marker='o', drawstyle="steps-post")
    ax[1].set_xlabel("alpha")
    ax[1].set_ylabel("depth of tree")
    ax[1].set_title("Depth vs alpha")
    fig.tight_layout()

    # %%
    # Accuracy vs alpha for training and testing sets
    # ----------------------------------------------------
    # When ``ccp_alpha`` is set to zero and keeping the other default parameters
    # of :class:`DecisionTreeClassifier`, the tree overfits, leading to
    # a 100% training accuracy and 88% testing accuracy. As alpha increases, more
    # of the tree is pruned, thus creating a decision tree that generalizes better.
    # In this example, setting ``ccp_alpha=0.015`` maximizes the testing accuracy.
    train_scores = [clf.score(X_train, y_train) for clf in clfs]
    test_scores = [clf.score(X_test, y_test) for clf in clfs]

    fig, ax = plt.subplots()
    ax.set_xlabel("alpha")
    ax.set_ylabel("accuracy")
    ax.set_title("Accuracy vs alpha for training and testing sets")
    ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
    ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
    ax.legend()
    plt.show()

    # %%
    best_alpha = 0.040790348647614105

    # %%
    # Create CV training and test scores for various training set sizes
    train_sizes, train_scores, test_scores = learning_curve(
        DecisionTreeClassifier(ccp_alpha=best_alpha),
        X,
        y,
        # Number of folds in cross-validation
        cv=5,
        # Evaluation metric
        scoring='accuracy',
        # Use all computer cores
        n_jobs=-1,
        # 50 different sizes of the training set
        train_sizes=np.linspace(0.01, 1.0, 50))
    print(train_scores)

    # Create means and standard deviations of training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)

    # Create means and standard deviations of test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    # Draw lines
    plt.plot(train_sizes, train_mean, '--', color="#111111", label="Training score")
    plt.plot(train_sizes, test_mean, color="#111111", label="Cross-validation score")

    # Draw bands
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#DDDDDD")

    # Create plot
    plt.title("DT Learning Curve - {}".format(data_name))
    plt.xlabel("Training Set Size")
    plt.ylabel("Accuracy Score")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris['data'][:, [2, 3]]
y = iris['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

# We are not doing feature scaling, but it might be helpful
tree_model = DecisionTreeClassifier(criterion='gini', max_depth=4, random_state=1)
tree_model.fit(X_train, y_train)

X_comb = np.vstack((X_train, X_test))
y_comb = np.hstack((y_train, y_test))

# plot_decision_region is a plotting helper defined elsewhere in this project
plot_decision_region(X_comb, y_comb, tree_model, test_idx=range(105, 150))
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.tight_layout()
plt.show()

from sklearn import tree
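# A hedged sketch of why `tree` is imported above (the continuation is an
# assumption): render the fitted model with sklearn's built-in tree plotting.
tree.plot_tree(tree_model, feature_names=['petal length', 'petal width'],
               class_names=iris['target_names'], filled=True)
plt.show()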
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train_sc, y_train_sc)
# NOTE: the model is fit on scaled features, so the test features should be
# the scaled ones as well (X_test here appears to be unscaled)
pred_knn = knn.predict(X_test)
print(confusion_matrix(y_test, pred_knn))
print(classification_report(y_test, pred_knn))
print(accuracy_score(y_test, pred_knn))

knn.fit(X_train_all, y_train_all)
pred_all_knn = knn.predict(X_test_all)
sub_knn = pd.DataFrame()
sub_knn['PassengerId'] = df_test['PassengerId']
sub_knn['Survived'] = pred_all_knn
# sub_knn.to_csv('knn.csv', index=False)

from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)
pred_dtree = dtree.predict(X_test)
print(classification_report(y_test, pred_dtree))
print(accuracy_score(y_test, pred_dtree))

dtree_2 = DecisionTreeClassifier(max_features=7, max_depth=6, min_samples_split=8)
dtree_2.fit(X_train, y_train)
pred_dtree_2 = dtree_2.predict(X_test)
print(classification_report(y_test, pred_dtree_2))
print(accuracy_score(y_test, pred_dtree_2))

dtree_2.fit(X_train_all, y_train_all)
pred_all_dtree2 = dtree_2.predict(X_test_all)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(max_depth=6, max_features=7)
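# A hedged continuation sketch, following the pattern of the models above:
# fit the random forest, evaluate it on the hold-out split, and build a
# submission frame (the output file name is an assumption).
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
print(classification_report(y_test, pred_rfc))
print(accuracy_score(y_test, pred_rfc))

rfc.fit(X_train_all, y_train_all)
pred_all_rfc = rfc.predict(X_test_all)
sub_rfc = pd.DataFrame()
sub_rfc['PassengerId'] = df_test['PassengerId']
sub_rfc['Survived'] = pred_all_rfc
# sub_rfc.to_csv('rfc.csv', index=False)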