def learning_curve(self, X1, Y1, dataset_name, kernel):
    """Plot a learning curve for an SVM on the given dataset and return an
    unfitted scaler+SVM pipeline built with the grid-searched C.

    Parameters
    ----------
    X1, Y1 : array-like
        Full feature matrix and labels (the curve is drawn on all of it).
    dataset_name : str
        Used in the plot title and forwarded to the grid-search helpers.
    kernel : str
        'linear' (LinearSVC) or 'rbf' (SVC with RBF kernel).

    Returns
    -------
    sklearn.pipeline.Pipeline or None
        The tuned pipeline; None for an unrecognised kernel string.
    """
    title = "Learning Curve for {} Dataset(supportVectorMachine)".format(
        dataset_name)
    # Fix: random_state has no effect (and recent scikit-learn raises a
    # ValueError) unless shuffle=True is set as well.
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # Grid search tunes C on a 70/30 training split only.
    X_train, X_test, y_train, y_test = train_test_split(X1, Y1,
                                                        test_size=0.3)
    if kernel == "linear":
        C_param1 = self.GridSearchCV1(X_train, y_train, dataset_name)
        estimator1 = Pipeline([('Scale', StandardScaler()),
                               ('clf', LinearSVC(C=C_param1))])
        plot_learning_curve(estimator1, title, X1, Y1, ylim=None, cv=cv)
        # Fix: return the pipeline already built instead of constructing an
        # identical second copy.  (Assumes plot_learning_curve clones its
        # estimator, as sklearn's learning_curve does — TODO confirm.)
        return estimator1
    elif kernel == "rbf":
        C_param2 = self.GridSearchCV2(X_train, y_train, dataset_name)
        estimator1 = Pipeline([('Scale', StandardScaler()),
                               ('clf', SVC(kernel='rbf', C=C_param2))])
        plot_learning_curve(estimator1, title, X1, Y1, ylim=None, cv=cv)
        return estimator1
def learning_curve(self, X1, Y1, param, dataset_name):
    """Grid-search a neural-network estimator, plot its learning curve over
    the full dataset, and return the tuned estimator."""
    # Tuning happens on a 70/30 training split; the curve uses all data.
    X_train, X_test, y_train, y_test = train_test_split(X1, Y1,
                                                        test_size=0.3)
    best_model = self.gridSearchCV(X_train, y_train, param, dataset_name)
    title = "Learning Curve for {} Dataset (Neural Network)".format(
        dataset_name)
    plot_learning_curve(best_model, title, X1, Y1, ylim=None, cv=5)
    plt.show()
    return best_model
def scoreModel(classifiers, X, y, testX, testy, scoring, outputDir, params,
               scoreType='baseline', dsname=''):
    """Plot learning curves for each classifier, retrain it on the full
    training set, and score it against the held-out test set.

    classifiers : iterable of classifier-name strings understood by
        A1.getClfParams.
    X, y : training features/labels; testX, testy : held-out test set.
    scoring : scoring identifier forwarded to the plotting helpers.
    outputDir : directory where all figures are saved.
    params : dict of per-classifier hyperparameters (keys carry a
        'classifier__' prefix), or None to keep the defaults from A1.
    scoreType : tag used in log lines and figure names (e.g. 'baseline').
    dsname : accepted for interface compatibility; unused in this body.

    Returns a dict mapping classifier name -> fitted Pipeline.
    """
    fitClassifiers = {}
    scores = []
    names = []
    for classifier in classifiers:
        # Fresh estimator with project defaults; tuned params applied below.
        clf, _ = A1.getClfParams(classifier)
        if params is not None:
            # Remove classifier prefix from params
            p = {k.replace('classifier__', ''): v
                 for k, v in params[classifier].items()}
            clf.set_params(**p)
        print('{}: Generating {} learning curve'
              .format(classifier, scoreType))
        print('{}: hyperparameters: '.format(classifier), clf.get_params())
        util.plot_learning_curve(classifier, clf, X, y, scoring,
                                 savedir=outputDir, scoreType=scoreType)
        # SVM and ANN need a training epoch graph
        if classifier == 'kernelSVM' or classifier == 'ann':
            util.plotValidationCurve(clf, X, y, scoring=scoring,
                                     paramName='max_iter',
                                     paramRange=range(100, 2000, 100),
                                     savedir=outputDir,
                                     clfName='{}-{}'.format(classifier,
                                                            scoreType),
                                     cv=3)
        # To score the model, fit with given parameters and predict
        print('{}: Retraining with best parameters on entire training set'
              .format(classifier))
        pipeline = Pipeline(steps=[('scaler', StandardScaler()),
                                   ('classifier', clf)])
        start_time = timeit.default_timer()
        pipeline.fit(X, y)
        total_time = timeit.default_timer() - start_time
        # NOTE(review): message says "ANN" but is printed for every
        # classifier in the loop.
        print('Training ANN took {} seconds'.format(total_time))
        ypred = pipeline.predict(testX)
        fitClassifiers[classifier] = pipeline
        # NOTE(review): f1_score with default average — assumes binary
        # labels; confirm for multi-class datasets.
        scores.append(f1_score(testy, ypred))
        names.append(classifier)
        # Generate confusion matrix
        print('{}: Scoring predictions against test set'
              .format(classifier))
        util.confusionMatrix(classifier, testy, ypred, savedir=outputDir,
                             scoreType=scoreType)
        plt.close('all')
    util.plotBarScores(scores, names, '', outputDir, phaseName=scoreType)
    plt.close('all')
    return fitClassifiers
def draw_learning_curve_2():
    """Plot a learning curve for a tuned decision tree on the optical-digit
    dataset held in the module-level X2/Y2 arrays."""
    shuffler = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
    # Hyperparameters are tuned on a 70/30 training split only.
    X_train, X_test, y_train, y_test = train_test_split(X2, Y2,
                                                        test_size=0.3)
    max_depth, min_samples_leaf = getParametersFromGridSearchCV(
        64, X_train, y_train)
    tree_clf = DecisionTreeClassifier(max_depth=max_depth,
                                      random_state=100,
                                      min_samples_leaf=min_samples_leaf)
    title = "Learning Curve for Optical Digit Recognition Dataset (Decision Tree)"
    plot_learning_curve(tree_clf, title, X2, Y2, ylim=None, cv=shuffler)
    plt.show()
def learning_curve(self, X1, Y1, dataset_name, d):
    """Plot an AdaBoost learning curve and return the unfitted tuned
    classifier.

    X1, Y1 : full feature matrix and labels.
    dataset_name : used in the plot title and grid search.
    d : max_depth of the decision-tree base estimator.

    Returns the AdaBoostClassifier built with the grid-searched
    n_estimators.
    """
    title = "Learning Curve for {} Dataset (Boosting)".format(dataset_name)
    # Fix: shuffle=True so random_state actually takes effect (recent
    # scikit-learn raises otherwise).
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # n_estimators is tuned on a 70/30 training split only.
    X_train, X_test, y_train, y_test = train_test_split(X1, Y1,
                                                        test_size=0.3)
    n_estimators = self.GridSearchCV(X_train, y_train, dataset_name, d)
    estimator = AdaBoostClassifier(
        base_estimator=DecisionTreeClassifier(max_depth=d, random_state=42),
        n_estimators=n_estimators)
    plot_learning_curve(estimator, title, X1, Y1, ylim=None, cv=cv)
    # Fix: the original called plt.show() *after* return (dead code); show
    # the figure first, then return the estimator instead of rebuilding an
    # identical one.
    plt.show()
    return estimator
def run_ada(X, y, X_train, X_test, y_train, y_test, title):
    """Plot an AdaBoost learning curve and a learning-rate validation curve.

    X, y : full dataset used for both curves (the pre-split train/test
        arguments are accepted but unused in this body).
    title : prefix for plot titles and the saved file name.

    Side effects: saves '<title>ADAvalidation.png' and leaves a fresh
    matplotlib figure open.
    """
    # Earlier sweeps over n_estimators, kept for reference:
    # param_range = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    # param_range = [1,5,10,15,20,25,30,35,40,45,50]
    # param_name = 'n_estimators'
    param_range = [.0001, .001, .01, .1, 1, 10]
    param_name = 'learning_rate'
    # data = load_digits()
    # X = data.data
    # y = data.target
    # Weak learner: a depth-10 decision tree.
    dt = tree.DecisionTreeClassifier(max_depth =10)
    # sklearn.ensemble.AdaBoostClassifier(base_estimator=None,
    #   n_estimators=50, learning_rate=1.0, algorithm='SAMME.R',
    #   random_state=None)
    ada = AdaBoostClassifier(dt)
    util.plot_learning_curve(ada, title + " ADA LC", X, y, cv=5, n_jobs=-1)
    # NOTE(review): param_name/param_range are keyword-only in newer
    # scikit-learn versions — confirm the pinned version accepts them
    # positionally.
    train_scores, test_scores = validation_curve(ada, X, y, param_name,
                                                 param_range, cv=5,
                                                 scoring="accuracy",
                                                 n_jobs =-1 )
    # Calculate mean and standard deviation for training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    # Calculate mean and standard deviation for test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.figure()
    # Plot mean accuracy scores for training and test sets
    # plt.plot(param_range, train_mean, 'o-', label="Training score", color="g")
    # plt.plot(param_range, test_mean, 'o-', label="Cross-validation score", color="r")
    lw = 2
    # Log-x axis: learning rates span several orders of magnitude.
    plt.semilogx(param_range, train_mean, label="Training score",
                 color="darkorange", lw=lw)
    # plt.plot(param_range, train_mean, 'o-', label="Training score", color="g")
    plt.semilogx(param_range, test_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    # plt.plot(param_range, test_mean, 'o-', label="Cross-validation score", color="r")
    # Create plot
    plt.title("Ada Boost "+title+ " Validation")
    plt.xlabel(param_name)
    plt.ylabel("Accuracy Score")
    plt.tight_layout()
    plt.legend(loc="best")
    plt.savefig(title+'ADAvalidation.png')
    plt.figure()
def learning_curve(self, X1, Y1, dataset_name):
    """Plot a KNN learning curve and return the unfitted tuned classifier.

    X1, Y1 : full feature matrix and labels.
    dataset_name : used in the plot title and grid search.

    Returns a KNeighborsClassifier built with the grid-searched k.
    """
    title = "Learning Curve for {} Dataset(KNN)".format(dataset_name)
    # Fix: shuffle=True so random_state actually takes effect (recent
    # scikit-learn raises otherwise).
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # k is tuned on a 70/30 training split only.
    X_train, X_test, y_train, y_test = train_test_split(X1, Y1,
                                                        test_size=0.3)
    n_neighbors = self.GridSearchCV(X_train, y_train, dataset_name)
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    plot_learning_curve(estimator, title, X1, Y1, ylim=None, cv=cv)
    plt.show()
    # Fix: return the estimator already built instead of constructing an
    # identical second copy.
    return estimator
def Boosting(X_train, X_test, y_train, y_test, data_name,
             lc_y_min=0.4, lc_y_max=1.01):
    """Grid-search an AdaBoost classifier, persist the CV results, and save
    a learning curve for the best configuration.

    data_name : tag used in titles, CSV output and the figure filename.
    lc_y_min / lc_y_max : y-axis limits for the learning-curve plot.
    """
    # Weak learner: a depth-3 gini tree (other depths were explored in
    # earlier runs).
    stump = DecisionTreeClassifier(criterion="gini", max_depth=3)
    search_space = {
        "base_estimator": [stump],
        "learning_rate": np.linspace(0.5, 10.0, 20),
        "n_estimators": range(1, 200, 20),
    }
    search = GridSearchCV(AdaBoostClassifier(), param_grid=search_space,
                          cv=5, verbose=1, n_jobs=-1)
    search.fit(X_train, y_train)
    print(search.best_params_)
    print(search.best_score_)
    best_params = search.best_params_
    print(best_params)
    save_cv(search.cv_results_, 'Boosting', data_name)
    # Learning curve over the recombined dataset with a 100-way shuffle CV.
    splitter = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))
    title = 'Learning Curves (Boosting Classifier) - {}'.format(data_name)
    estimator = AdaBoostClassifier(**best_params)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator, title, X, y,
                        ylim=(lc_y_min, lc_y_max), cv=splitter, n_jobs=4)
    plt.savefig('Figs/Boosting-learningcurve-{}'.format(data_name))
def learning_curve(self, X1, Y1, dataset_name):
    """Plot a decision-tree learning curve and return the unfitted tuned
    classifier.

    X1, Y1 : full feature matrix and labels.
    dataset_name : used in the plot title and grid search.

    Returns a DecisionTreeClassifier built with the grid-searched depth.
    """
    title = "Learning Curve for {} Dataset (Decision Tree)".format(
        dataset_name)
    # Fix: shuffle=True so random_state actually takes effect (recent
    # scikit-learn raises otherwise).
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    # max_depth is tuned on a 70/30 training split only.
    X_train, X_test, y_train, y_test = train_test_split(X1, Y1,
                                                        test_size=0.3)
    max_depth = self.GridSearchCV(X_train, y_train, dataset_name)
    estimator = DecisionTreeClassifier(max_depth=max_depth,
                                       random_state=42)
    plot_learning_curve(estimator, title, X1, Y1, ylim=None, cv=cv)
    plt.show()
    # Fix: return the estimator already built instead of constructing an
    # identical second copy.
    return estimator
def kNN(X_train, X_test, y_train, y_test, data_name):
    """Sweep K for a KNN classifier, plot accuracy vs K, persist per-K
    timings/scores, and save a learning curve for the best K.

    data_name : tag used in titles, CSV output and figure filenames.

    Side effects: saves 'Figs/KNN-param-plot-<data_name>' and
    'Figs/KNN-learningcurve-<data_name>', and calls save_cv(...).
    """
    Ks = 25
    mean_acc = np.zeros((Ks - 1))
    std_acc = np.zeros((Ks - 1))
    performance = {}
    performance['mean_fit_time'] = np.zeros((Ks - 1))
    performance['mean_score_time'] = np.zeros((Ks - 1))
    performance['mean_test_score'] = np.zeros((Ks - 1))
    for n in range(1, Ks):
        # Train Model and Predict
        train_start = time.time()
        neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
        train_end = time.time()
        yhat = neigh.predict(X_test)
        test_end = time.time()
        mean_acc[n - 1] = metrics.accuracy_score(y_test, yhat)
        # Standard error of the per-sample accuracy.
        std_acc[n - 1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
        performance['mean_fit_time'][n - 1] = train_end - train_start
        performance['mean_score_time'][n - 1] = test_end - train_end
        # Fix: the original overwrote the whole array with a scalar every
        # iteration; record this K's score in its own slot.
        performance['mean_test_score'][n - 1] = metrics.accuracy_score(
            y_test, yhat)
    plt.title('Parameter Plot - Values for K - {}'.format(data_name))
    plt.plot(range(1, Ks), mean_acc, 'g')
    plt.fill_between(range(1, Ks), mean_acc - 1 * std_acc,
                     mean_acc + 1 * std_acc, alpha=0.10)
    # Fix: label matches the +/- 1 std band actually plotted (was
    # mislabelled '+/- 3xstd').
    plt.legend(('Accuracy ', '+/- 1xstd'))
    plt.ylabel('Accuracy')
    plt.xlabel('Number of Neighbors (K)')
    plt.tight_layout()
    plt.savefig('Figs/KNN-param-plot-{}'.format(data_name))
    plt.clf()
    save_cv(performance, 'KNN', data_name)
    print("The best accuracy was with", mean_acc.max(),
          "with k=", mean_acc.argmax() + 1)
    print("The best with K<10 was", mean_acc[0:9].max(),
          "with k=", mean_acc[0:9].argmax() + 1)
    # Learning curve over the recombined dataset with the best K overall.
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))
    title = 'Learning Curves (kNN Classifier) - {}'.format(data_name)
    estimator = KNeighborsClassifier(n_neighbors=mean_acc.argmax() + 1)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator, title, X, y, ylim=(0.4, 1.01), cv=cv,
                        n_jobs=4)
    plt.savefig('Figs/KNN-learningcurve-{}'.format(data_name))
    plt.clf()
def run_knn(X, y, X_train, X_test, y_train, y_test, title, k):
    """Plot a KNN learning curve for the given k and a validation curve
    over n_neighbors in [1, 30].

    X, y : full dataset used for both curves (the pre-split train/test
        arguments are accepted but unused in this body).
    title : prefix for plot titles and the saved file name.
    k : neighbour count for the learning-curve estimator.

    Side effects: saves '<title>KNNvalidation.png' and leaves a fresh
    matplotlib figure open.
    """
    knn_learning = KNeighborsClassifier(n_neighbors=k)
    util.plot_learning_curve(knn_learning, title + " KNN LC", X, y, cv=10,
                             n_jobs=-1)
    #search for an optimal value of K for KNN
    #credit https://www.youtube.com/watch?v=6dbrR-WymjI
    param_range = range(1,31)
    # NOTE(review): k_scores is never populated — leftover from the
    # commented cross_val_score loop below.
    k_scores = []
    # for k in k_range:
    #     knn = KNeighborsClassifier(n_neighbors=k)
    #     scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    #     # print(k, scores.mean())
    #     k_scores.append(scores.mean())
    # print(k_scores)
    knn = KNeighborsClassifier()
    param_name = "n_neighbors"
    # NOTE(review): param_name/param_range are keyword-only in newer
    # scikit-learn versions — confirm the pinned version.
    train_scores, test_scores = validation_curve(knn, X, y, param_name,
                                                 param_range, cv=5,
                                                 scoring="accuracy",
                                                 n_jobs =-1 )
    # Calculate mean and standard deviation for training set scores
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    # Calculate mean and standard deviation for test set scores
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.figure()
    # Plot mean accuracy scores for training and test sets
    plt.plot(param_range, train_mean, 'o-', label="Training score",
             color="g")
    plt.plot(param_range, test_mean, 'o-', label="Cross-validation score",
             color="r")
    # Plot accurancy bands for training and test sets
    # plt.fill_between(param_range, train_mean - train_std, train_mean + train_std,alpha=.1, color="r")
    # plt.fill_between(param_range, test_mean - test_std, test_mean + test_std,alpha=.1, color="g")
    # Create plot
    plt.title("KNN Validation Curve "+title)
    plt.xlabel("Number of K Neighbors")
    plt.ylabel("Accuracy Score")
    plt.tight_layout()
    plt.legend(loc="best")
    plt.savefig(title+'KNNvalidation.png')
    plt.figure()
def SVM(X_train, X_test, y_train, y_test, data_name):
    """Randomised-search an SVC over kernel/gamma, persist the CV results,
    and save a learning curve for the best configuration.

    data_name : tag used in titles, CSV output and the figure filename.

    Side effects: calls save_cv(...) and saves
    'Figs/SVM-learningcurve-<data_name>'.
    """
    # Earlier exhaustive grid, kept for reference:
    # param_grid = {"kernel" : ["sigmoid", "poly", "rbf"],
    #               "C" : [0.1, 0.5, 1.0, 1.5]
    #              }
    param_distributions = {
        "kernel": ["sigmoid", "poly", "rbf"],
        "gamma": np.linspace(0.001, 1.0, 1000)
    }
    # clf = svm.SVC(gamma='scale')
    clf = svm.SVC()
    # run grid search on dataset
    # grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)
    # 20 random draws with 2-fold CV keeps SVC training affordable.
    grid_search = RandomizedSearchCV(clf,
                                     param_distributions=param_distributions,
                                     cv=2, n_iter=20, verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    best_params = grid_search.best_params_
    print(best_params)
    save_cv(grid_search.cv_results_, 'SVM', data_name)
    # NOTE(review): this ShuffleSplit is never used — the learning curve
    # below passes cv=2 instead.
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))
    title = 'Learning Curves (SVM Classifier) - {}'.format(data_name)
    # estimator = svm.SVC(gamma='scale', **best_params)
    estimator = svm.SVC(**best_params)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator, title, X, y, ylim=(0.4, 1.01), cv=2,
                        n_jobs=-1)
    plt.savefig('Figs/SVM-learningcurve-{}'.format(data_name))
def ANN(X_train, X_test, y_train, y_test, data_name,
        lc_y_min=0.4, lc_y_max=1.01):
    """Grid-search an SGD-trained MLP, persist the CV results, and save a
    learning curve for the best configuration.

    data_name : tag used in titles, CSV output and the figure filename.
    lc_y_min / lc_y_max : y-axis limits for the learning-curve plot.

    Side effects: calls save_cv(...) and saves
    'Figs/ANN-learningcurve-<data_name>'.
    """
    # Fix: dropped the unused `unique_vals` local and the no-op
    # `best_params = {**best_params}` self-copy from the original.
    clf = MLPClassifier(solver='sgd')
    param_grid = {
        "hidden_layer_sizes": [(10, )],
        "alpha": np.linspace(0.0001, 0.5, 50),
        "momentum": np.linspace(0.1, 1.0, 10)
    }
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, verbose=1,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    best_params = grid_search.best_params_
    save_cv(grid_search.cv_results_, 'ANN', data_name)
    # Learning curve over the recombined dataset with 100-way shuffle CV.
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))
    title = 'Learning Curves (ANN Classifier) - {}'.format(data_name)
    estimator = MLPClassifier(solver='sgd', **best_params)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator, title, X, y, ylim=(lc_y_min, lc_y_max),
                        cv=cv, n_jobs=4)
    plt.savefig('Figs/ANN-learningcurve-{}'.format(data_name))
# %% pred_Y = base_model.predict(test_X) util.print_accuracy_measures(test_Y, pred_Y, label="svm_big_base_clement") # %% util.visualize_confusion_matrix(base_model, test_X, test_Y, "svm_big_base_clement_confusion_matrix") # %% base_cv_results = cross_validate(base_model, train_X, train_Y, cv=KFold(5)) util.plot_cv_score(base_cv_results, title="svm_big_base_clement_cv_score_bar") # %% util.plot_learning_curve(base_model, "svm_big_base_clement_learning_curve", train_X, train_Y, cv=KFold(5), n_jobs=4) # %% util.plot_word_cloud(base_model, "svm_big_base_clement_word_cloud") # %% [markdown] # <h2> Adding TFIDF </h2> # %% tfidf_model = svm.TFIDFSVMModel(ngram=(1, 2)) tfidf_model.fit(train_X, train_Y) # %% pred_Y = tfidf_model.predict(test_X)
def train_svm(filename, X_train, X_test, y_train, y_test, solver='rbf',
              full_param=False, debug=False, numFolds=10, njobs=-1,
              scalar=1, make_graphs=False, pSVM={}):
    """Grid-search (or directly parameterise) an SVC, report weighted
    one-vs-rest ROC-AUC on train and test sets, and optionally emit
    learning/validation-curve graphs via the util helpers.

    filename : data-file name; filename[:-4] tags figures and CSV output.
    solver : SVC kernel name ('rbf', 'sigmoid', 'poly', 'linear').
    full_param : when True, search the large hand-tuned grid below.
    debug / numFolds / njobs : forwarded to GridSearchCV and util helpers.
    scalar : tag forwarded to util.save_gridsearch_to_csv.
    make_graphs : when True, produce learning- and validation-curve plots.
    pSVM : explicit SVC parameters; when non-empty the grid search is
        skipped.  NOTE(review): mutable default argument — safe only while
        no caller mutates it; prefer pSVM=None.

    Returns (elapsed_seconds, train_auc, test_auc), AUCs rounded to 4 dp.
    NOTE(review): `start` is reset several times, so the returned elapsed
    time covers only test scoring plus any graphing — confirm intent.
    """
    np.random.seed(1)
    algo = 'SVM'
    start = time.time()
    if len(pSVM) == 0:
        # No explicit parameters supplied — run a grid search first.
        if full_param:
            param_grid = [{
                'kernel': [solver],
                # 0.0001 - Finished for Linear
                # 'max_iter': [-1, 10000, 100000],
                # 'shrinking' : [True, False],  # Seems to just make things
                # faster/slower on larger iterations; cutting down 2x is
                # better
                # 'probability' : [True, False],
                'random_state': [1]
            }]
            if solver == 'rbf':
                param_grid[0]['C'] = [
                    0.001
                ]  #, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0, 10000, 100000]
                param_grid[0]['gamma'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0,
                    10000, 100000
                ]
            elif solver == 'sigmoid':
                param_grid[0]['gamma'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0,
                    10000
                ]
                param_grid[0]['coef0'] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
                param_grid[0]['C'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0,
                    10000, 100000
                ]
            elif solver == 'poly':
                param_grid[0]['gamma'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0,
                    10000, 100000
                ]
                param_grid[0]['degree'] = [1, 2, 3, 4, 5, 6, 7, 8]
                param_grid[0]['coef0'] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
                param_grid[0]['C'] = [
                    0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0,
                    10000, 100000
                ]
            elif solver == 'linear':
                param_grid[0]['C'] = [1.0]
        else:
            # Reduced default grid.
            param_grid = [{
                'kernel': [solver],
                'C': [0.01, 0.1, 1., 10., 100],
                'cache_size': [2000],
                'random_state': [1]
            }]
            if solver == 'poly' or solver == 'linear':
                param_grid = [{
                    'kernel': [solver],
                    'C': [0.001, 0.01, 0.1, 1., 10.],
                    'cache_size': [2000],
                    'random_state': [1]
                }]
        svm_classifier = svm.SVC(probability=True)
        grid_search = GridSearchCV(svm_classifier,
                                   param_grid,
                                   cv=numFolds,
                                   scoring='roc_auc_ovr_weighted',
                                   return_train_score=True,
                                   n_jobs=njobs,
                                   verbose=debug)
        grid_search.fit(X_train, y_train)
        cvres = grid_search.cv_results_
        best_params = grid_search.best_params_
        util.save_gridsearch_to_csv(cvres, algo, filename[:-4], scalar,
                                    solver)
        # Rebuild a fresh classifier with the winning parameters.
        # NOTE(review): this copy is created without probability=True, yet
        # predict_proba is called below — confirm this branch works with
        # the pinned scikit-learn version.
        svm_classifier = svm.SVC()
        svm_classifier.set_params(**best_params)
    else:
        # Explicit parameters supplied — skip the search entirely.
        svm_classifier = svm.SVC()
        svm_classifier.set_params(**pSVM)
    start = time.time()
    svm_classifier.fit(X_train, y_train)
    print('SVM Fit Time: ', time.time() - start)
    start = time.time()
    y_prob = svm_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr",
                                average="weighted")
    print('SVM Train Score Time: ', time.time() - start)
    start = time.time()
    y_prob = svm_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr",
                               average="weighted")
    print('SVM Test Score Time: ', time.time() - start)
    # Unfitted twin used by the validation-curve helper as a template.
    test_class = svm.SVC()
    test_class.set_params(**pSVM)
    if make_graphs:
        util.plot_learning_curve(svm_classifier, algo, filename[:-4],
                                 X_train, y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)
        util.compute_vc(algo, 'kernel',
                        ['rbf', 'sigmoid', 'poly', 'linear'], X_train,
                        y_train, X_test, y_test, svm_classifier,
                        filename[:-4], test_class, pSVM, log=False,
                        njobs=njobs, debug=debug, smalllegend=True)
        util.svm_rbf_C_Gamma_viz(X_train, y_train, pSVM, njobs,
                                 filename[:-4], train_score)
        # compute Model Complexity/Validation curves
        util.compute_vc(algo, 'kernel',
                        ['rbf', 'sigmoid', 'poly', 'linear'], X_train,
                        y_train, X_test, y_test, svm_classifier,
                        filename[:-4], test_class, pSVM, log=False,
                        njobs=njobs)
        util.compute_vc(algo, 'C',
                        [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000],
                        X_train, y_train, X_test, y_test, svm_classifier,
                        filename[:-4], test_class, pSVM, log=True,
                        njobs=njobs, debug=debug)
        if solver == 'rbf':
            util.compute_vc(algo, 'gamma',
                            [0.0001, 0.001, 0.01, 0.1, 1.0, 5.0, 10.0],
                            X_train, y_train, X_test, y_test,
                            svm_classifier, filename[:-4], test_class,
                            pSVM, log=True, njobs=njobs, debug=debug)
        elif solver == 'sigmoid':
            util.compute_vc(
                algo, 'gamma',
                [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                X_train, y_train, X_test, y_test, svm_classifier,
                filename[:-4], test_class, pSVM, log=True, njobs=njobs,
                debug=debug)
            util.compute_vc(algo, 'coef0',
                            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], X_train,
                            y_train, X_test, y_test, svm_classifier,
                            filename[:-4], test_class, pSVM, log=False,
                            njobs=njobs, debug=debug)
        elif solver == 'poly':
            util.compute_vc(
                algo, 'gamma',
                [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
                X_train, y_train, X_test, y_test, svm_classifier,
                filename[:-4], test_class, pSVM, log=True, njobs=njobs,
                debug=debug)
            util.compute_vc(algo, 'coef0',
                            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], X_train,
                            y_train, X_test, y_test, svm_classifier,
                            filename[:-4], test_class, pSVM, log=False,
                            njobs=njobs, debug=debug)
            util.compute_vc(algo, 'degree',
                            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], X_train,
                            y_train, X_test, y_test, svm_classifier,
                            filename[:-4], test_class, pSVM, log=False,
                            njobs=njobs, debug=debug)
    return time.time() - start, round(train_score, 4), round(test_score, 4)
# NOTE(review): fragment — the first line below closes a
# util.print_accuracy_measures(...) call whose opening lies outside this
# chunk.
                                  label="naive_bayes_uni_base_clement")

# %%
util.visualize_confusion_matrix(
    base_model, test_X, test_Y,
    "naive_bayes_uni_base_clement_confusion_matrix")

# %%
# 5-fold cross-validation on the training set as a stability check.
base_cv_results = cross_validate(base_model, train_X, train_Y, cv=KFold(5))
util.plot_cv_score(base_cv_results,
                   title="naive_bayes_uni_base_clement_cv_score_bar")

# %%
util.plot_learning_curve(base_model,
                         "naive_bayes_uni_base_clement_learning_curve",
                         train_X, train_Y, cv=KFold(5), n_jobs=4)

# %%
util.plot_word_cloud(base_model, "naive_bayes_uni_base_clement_word_cloud")

# %% [markdown]
# <h2> Adding TFIDF </h2>

# %%
# Same Naive Bayes pipeline but with TF-IDF weighting.
tfidf_model = nb.TFIDFNaiveBayesModel()
tfidf_model.fit(train_X, train_Y)

# %%
pred_Y = tfidf_model.predict(test_X)
# NOTE(review): fragment of a CNN training driver — train_losses,
# batch_size, weight, learning_rate, start_time etc. are defined outside
# this chunk.
train_accuracies = []
test_losses = []
test_accuracies = []
# if log_interval < 0, stop log when training
train_loader, test_loader = load_data(batch_size=batch_size)
# Abort when the configured batch size exceeds the usable slice of either
# split (test_data_part / train_data_part are fractions of each dataset).
if batch_size > (len(test_loader.dataset) * test_data_part):
    print('Error!batch_size: {} > single_test_batch_size: {}'
          .format(batch_size, len(test_loader.dataset) * test_data_part))
    print('Exit!!!')
    os._exit(0)
elif batch_size > (len(train_loader.dataset) * train_data_part):
    print('Error!batch_size: {} > single_train_batch_size: {}'
          .format(batch_size, len(train_loader.dataset) * train_data_part))
    print('Exit!!!')
    os._exit(0)
# Select the network architecture by name.
if model_name == 'CNN_add_2_28_28':
    model = CNN_add_2_28_28()
elif model_name == 'CNN_add_56_28':
    model = CNN_add_56_28()
# weight seems useless
criterion = nn.CrossEntropyLoss(weight=weight)
optimizer = optim.Adam(model.parameters(), lr= learning_rate)
init()
# Plot loss/accuracy series accumulated during training.
plot_learning_curve(train_losses, train_accuracies, test_losses,
                    test_accuracies)
over_time = time.perf_counter()
print('Time Cost: {:.1f}'.format(over_time - start_time))
# here (1, 84, 84) == (2, 84, 84) == (3, 84, 84) == (4, 84, 84) while not done: action = agent.choose_action(observation) observation_, reward, done, info = env.step(action) # step() method is overloaded in class RepeatActionAndMaxFrame(gym.Wrapper) # here observation_, reward, done, info is reached after repeating the action 4 times # and (2, 84, 84) == (3, 84, 84) == (4, 84, 84) != (1, 84, 84) for the first loop # i.e queue follows a FIFO method, observation_ is stored in (1, 84, 84) score += reward if not load_checkpoint: agent.store_transition(observation, action, reward, observation_, int(done)) # acts like experience replay agent.learn() else: env.render() observation = observation_ n_steps += 1 scores.append(score) steps_array.append(n_steps) avg_score = np.mean(scores[-100:]) print('episode ',i, ' score: ', score, 'average score %.1f best score %.1f epsilon %.2f' % (avg_score, best_score, agent.epsilon), 'steps ', n_steps) if avg_score > best_score: if not load_checkpoint: agent.save_models() best_score = avg_score eps_history.append(agent.epsilon) plot_learning_curve(steps_array, scores, eps_history, figure_file)
# Train a DQN agent on CartPole and plot score/epsilon history.
env = gym.make("CartPole-v1")
n_games = 10000
scores = []
eps_history = []
agent = Agent(lr=0.0001, input_dims=env.observation_space.shape,
              n_actions=env.action_space.n)
for i in range(n_games):
    observation = env.reset()
    done = False
    score = 0
    # One full episode: act, learn online from each transition.
    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, done, info = env.step(action)
        score += reward
        agent.learn(observation, action, reward, next_observation)
        observation = next_observation
    scores.append(score)
    eps_history.append(agent.epsilon)
    # Progress report every 100 episodes (moving average over last 100).
    if i % 100 == 0:
        avg_score = np.mean(scores[-100:])
        print(
            'episode ', i, 'score %.1f avg score %.1f epsilon %.2f' %
            (score, avg_score, agent.epsilon))
filename = 'cartpole_native_dqn.png'
x = [i + 1 for i in range(n_games)]
plot_learning_curve(x, scores, eps_history, filename)
# Cluster the digits data, then compare ANN learning curves on
# PCA/ICA-reduced breast-cancer features.
digits_data = scale(data)
n_samples, digits_n_features = data.shape
n_digits = len(np.unique(d_labels))
run_cluster(digits_data, d_labels, n_samples, digits_n_features, n_digits,
            "Digits", True)
# PCA to 7 components, clustered and fed to an MLP (timed).
b_pca = PCA(n_components=7)
b_pca2_results = b_pca.fit_transform(breast_data)
run_cluster(b_pca2_results, b_labels, n_samples, breast_n_features,
            n_digits, "Breast Cancer PCA 7")
clf = MLPClassifier(solver='adam', max_iter=1000,
                    hidden_layer_sizes=(100, 5))
timings['bc']['PCA'] = 0
start = clock()
util.plot_learning_curve(clf, "B Cancer PCA ANN LC", b_pca2_results,
                         b_labels, cv=5, n_jobs=-1)
timings['bc']['PCA'] += clock() - start
# Same comparison with ICA to 7 components.
b_ica = FastICA(n_components=7)
temp = b_ica.fit_transform(breast_data)
run_cluster(temp, b_labels, n_samples, breast_n_features, n_digits,
            "Breast Cancer ICA 7")
clf = MLPClassifier(solver='adam', max_iter=1000,
                    hidden_layer_sizes=(100, 5))
timings['bc']['ICA'] = 0
start = clock()
# NOTE(review): fragment — this call is truncated mid-argument-list; its
# remaining arguments lie outside this chunk.
util.plot_learning_curve(clf, "B Cancer ICA ANN LC", temp, b_labels,
# NOTE(review): fragment — the first line below is the final argument of a
# data-loading call whose opening lies outside this chunk.
    "D:\\project\\peixun\\ai_course_project_px\\1_intro\\4_anli_project_titanic\\Kaggle_Titanic_Chinese\\Kaggle_Titanic-master\\train.csv"
)
# (2) Feature engineering - fill missing values
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)
# (3) Feature engineering - one-hot encode the categorical features
df = one_hot_encoding(data_train)
# select specific columns
train_df = df.filter(
    regex=
    'Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*'
)
#print(train_df.describe())
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0 — works
# only on old pandas; to_numpy() is the modern equivalent.
train_np = train_df.as_matrix()
# y: the 'Survived' outcome (first column)
y = train_np[:, 0]
# X: the feature values (remaining columns)
X = train_np[:, 1:]
# (5) Build and train the model
clf = RandomForestClassifier(criterion='gini', max_depth=5,
                             n_estimators=5, verbose=2)
#clf.fit(X, y)
#print(clf.predict(y))
# (6) Plot the learning curve
plot_learning_curve(clf, u"学习曲线", X, y)
# NOTE(review): fragment — the lines below complete a Pipeline(...) list
# whose opening lies outside this chunk.
    ("scaler", StandardScaler()),
    ("lin_reg", LinearRegression()),
])
# now the augmented dataset with the polynomial expansion (**2) can be fitted
lin_reg = polynomial_regression.fit(X, y)
y_hat = lin_reg.predict(X)
# Interpolate values
n_data_points = 500
x_new = np.linspace(X.min(), X.max(), n_data_points)
# Quadratic interpolation to smooth the predicted curve for plotting.
f = interp1d(X.ravel(), y_hat, kind="quadratic", axis=0)
y_smooth = f(x_new)
# Plot values versus predicted values
plt.plot(x_new, y_smooth, linestyle="-", color="#AA00AA")
plt.scatter(X, y, c="#00AAAA")
plt.scatter(X, y_hat, c="#FFFF55")
plt.axis([-3, 3, 0, 10])
plt.show()
# Learning curve scored by negative MSE, leaving two CPU cores free.
plot_learning_curve(
    polynomial_regression,
    X,
    y,
    train_sizes=np.linspace(0.1, 1, 50),
    cv=5,
    n_jobs=multiprocessing.cpu_count() - 2,
    scoring="neg_mean_squared_error",
)
_, train_loss = session.run([train_op, total_loss], {input_tensor: batch_xs}) # Append loss to the list loss_list_train.append(train_loss) # Save the model After Completion path_prefix =saver.save(session,os.path.join(save_directory,"homework_2")) return loss_list_train ########################################################### if not (model_name == 'autoencoder'): acc_val_list,ce_val_list,acc_test,ce_test, acc_train_list, ce_train_list, conf_matrix_test, classification_report_test,actual_labels_test,pred_labels_test,best_epoch = main_fun(batch_size, epochs, kernel_size, use_early_stopping, patience_no) # Create and plot the learning curve using training and validation sets plot_learning_curve(acc_train_list,acc_val_list,ce_train_list,ce_val_list) # Save the correct test class labels as pickle file for later analysis pickle_out = open("actual_labels_test.pickle","wb") pickle.dump(actual_labels_test, pickle_out) pickle_out.close() # Save predicted test class labels as pickle file for later analysis pickle_out = open("pred_labels_test.pickle","wb") pickle.dump(pred_labels_test, pickle_out) pickle_out.close() elif model_name=='autoencoder': loss_list_train = main_fun(batch_size, epochs, kernel_size, use_early_stopping, patience_no) # Calculate average loss avg_train_loss = sum(loss_list_train) / len(loss_list_train)
def DT(X_train, X_test, y_train, y_test, data_name,
       lc_y_min=0.4, lc_y_max=1.01):
    """Grid-search a decision tree, save its learning curve, then plot
    train/test accuracy against tree depth.

    data_name : tag used in titles, CSV output and figure filenames.
    lc_y_min / lc_y_max : y-axis limits for the learning-curve plot.

    Side effects: calls save_cv(...) and saves
    'Figs/DT-learningcurve-<data_name>' and 'Figs/DT-depth-<data_name>'.
    """
    param_grid = {
        "criterion": ["gini", "entropy"],
        "max_depth": range(1, 50),
    }
    clf = DecisionTreeClassifier()
    # run grid search
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, verbose=1,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    best_params = grid_search.best_params_
    print(best_params)
    save_cv(grid_search.cv_results_, 'DT', data_name)
    cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
    X = np.concatenate((X_train, X_test))
    y = np.concatenate((y_train, y_test))
    title = 'Learning Curves (DT Classifier) - {}'.format(data_name)
    estimator = DecisionTreeClassifier(**best_params)
    print('plotting learning curve for {}'.format(estimator))
    plot_learning_curve(estimator, title, X, y, ylim=(lc_y_min, lc_y_max),
                        cv=cv, n_jobs=4)
    plt.savefig('Figs/DT-learningcurve-{}'.format(data_name))
    plt.clf()
    # Plot param tuning: accuracy vs max_depth on fixed train/test split.
    # Fix: the original reused `n` both as the sweep limit and as the loop
    # variable (and `tree` shadowed the sklearn `tree` module); renamed for
    # clarity — behaviour is unchanged.
    max_depth_limit = 26
    test_mean_acc1 = np.zeros((max_depth_limit - 1))
    test_std_acc1 = np.zeros((max_depth_limit - 1))
    train_mean_acc1 = np.zeros((max_depth_limit - 1))
    train_std_acc1 = np.zeros((max_depth_limit - 1))
    for depth in range(1, max_depth_limit):
        # Train Model and Predict
        print('Max depth: ', depth)
        depth_tree = DecisionTreeClassifier(criterion="gini",
                                            max_depth=depth)
        depth_tree.fit(X_train, y_train)
        y_hat = depth_tree.predict(X_test)
        y_hat_train = depth_tree.predict(X_train)
        test_mean_acc1[depth - 1] = metrics.accuracy_score(y_test, y_hat)
        # Standard error of the per-sample accuracy.
        test_std_acc1[depth - 1] = np.std(y_hat == y_test) / np.sqrt(
            y_hat.shape[0])
        train_mean_acc1[depth - 1] = metrics.accuracy_score(y_train,
                                                            y_hat_train)
        train_std_acc1[depth - 1] = np.std(
            y_hat_train == y_train) / np.sqrt(y_hat_train.shape[0])
    depths = range(1, max_depth_limit)
    plt.plot(depths, test_mean_acc1, 'r')
    plt.fill_between(depths, test_mean_acc1 - 1 * test_std_acc1,
                     test_mean_acc1 + 1 * test_std_acc1, alpha=0.10)
    plt.plot(depths, train_mean_acc1, 'm')
    plt.fill_between(depths, train_mean_acc1 - 1 * train_std_acc1,
                     train_mean_acc1 + 1 * train_std_acc1, alpha=0.10)
    plt.legend(('Test Accuracy - {}'.format(data_name),
                'Training Accuracy - {}'.format(data_name)))
    plt.ylabel('Accuracy')
    plt.xlabel('Decision Tree Depth')
    plt.tight_layout()
    plt.savefig('Figs/DT-depth-{}'.format(data_name))
    plt.clf()
# %% [markdown] # <h2> Adding TFIDF </h2> # %% tfidf_model = lr.TFIDFLogRegModel() tfidf_model.fit(train_X, train_Y) # %% pred_Y = tfidf_model.predict(test_X) util.print_accuracy_measures(test_Y, pred_Y, label="log_reg_uni_tfidf_clement") # %% util.visualize_confusion_matrix(tfidf_model, test_X, test_Y, "log_reg_uni_tfidf_clement_confusion_matrix") # %% tfidf_cv_results = cross_validate(tfidf_model, train_X, train_Y, cv=KFold(5)) util.plot_cv_score(tfidf_cv_results, title="log_reg_uni_tfidf_clement_cv_score_bar") # %% util.plot_learning_curve(tfidf_model, "log_reg_uni_tfidf_clement_learning_curve", train_X, train_Y, cv=KFold(5), n_jobs=4) # %% util.plot_word_cloud(tfidf_model, "log_reg_uni_tfidf_clement_word_cloud")
# %% pred_Y = base_model.predict(test_X) util.print_accuracy_measures(test_Y, pred_Y, label="log_reg_big_base_comp") # %% util.visualize_confusion_matrix(base_model, test_X, test_Y, "log_reg_big_base_comp_confusion_matrix") # %% base_cv_results = cross_validate(base_model, train_X, train_Y, cv=KFold(5)) util.plot_cv_score(base_cv_results, title="log_reg_big_base_comp_cv_score_bar") # %% util.plot_learning_curve(base_model, "log_reg_big_base_comp_learning_curve", train_X, train_Y, cv=KFold(5), n_jobs=4) # %% util.plot_word_cloud(base_model, "log_reg_big_base_comp_word_cloud") # %% [markdown] # <h2> Adding TFIDF </h2> # %% tfidf_model = lr.TFIDFLogRegModel(ngram=(1, 2)) tfidf_model.fit(train_X, train_Y) # %% pred_Y = tfidf_model.predict(test_X)
def run_cluster(data, labels, n_samples, n_features, n_digits, title,
                run_extra=False):
    """Sweep cluster counts (module-level `iterations`) with k-means and a
    Gaussian mixture, optionally train MLP learning curves on raw and
    cluster-augmented features, and save score-vs-k plots.

    data, labels : feature matrix and ground-truth labels.
    n_samples, n_features : accepted for interface compatibility; unused.
    n_digits : initial cluster count (overwritten inside the loop).
    title : prefix for plot titles and saved file names.
    run_extra : when True, also time MLP learning curves into the
        module-level `timings` dict.
    """
    train_score = defaultdict(list)
    train_score['k-means'] = []
    train_score['gmm'] = []
    for i in iterations:
        n_digits = i
        kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=1)
        kmeans.fit_transform(data)
        cluster_feature = kmeans.labels_
        # Supervised quality of the unsupervised clustering.
        train_score['k-means'].append(
            metrics.v_measure_score(labels, kmeans.labels_))
        gaus = GaussianMixture(n_components=n_digits)
        gaus.fit(data)
        cluster_feature2 = gaus.predict(data)
        # BIC (lower is better) for the mixture model.
        train_score['gmm'].append(gaus.bic(data))
    # print('pre', data.shape)
    # print('pre1', cluster_feature.shape)
    # NOTE(review): the features appended below come from the *last* value
    # in `iterations` — confirm that is the intended clustering.
    if run_extra:
        clf = MLPClassifier(solver='adam', max_iter=1000,
                            hidden_layer_sizes=(100, 5))
        timings[title]['default'] = 0
        start = clock()
        util.plot_learning_curve(clf, title + " Basic ANN LC", data,
                                 labels, cv=5, n_jobs=-1)
        timings[title]['default'] += clock() - start
    # Augment features with each algorithm's cluster assignments.
    data1 = np.column_stack((data, cluster_feature))
    # print('post1', data.shape)
    # print('EM', cluster_feature2.shape)
    data2 = np.column_stack((data, cluster_feature2))
    if run_extra:
        clf = MLPClassifier(solver='adam', max_iter=1000,
                            hidden_layer_sizes=(100, 5))
        timings[title]['KmeansXtra'] = 0
        start = clock()
        util.plot_learning_curve(clf, title + " KM Cluster Feature ANN LC",
                                 data1, labels, cv=5, n_jobs=-1)
        timings[title]['KmeansXtra'] += clock() - start
        clf = MLPClassifier(solver='adam', max_iter=1000,
                            hidden_layer_sizes=(100, 5))
        timings[title]['EMxtra'] = 0
        start = clock()
        util.plot_learning_curve(clf, title + " EM Cluster Feature ANN LC",
                                 data2, labels, cv=5, n_jobs=-1)
        timings[title]['EMxtra'] += clock() - start
    # V-measure vs cluster count for k-means.
    plt.figure()
    ticks = range(len(iterations))
    plt.plot(ticks, train_score['k-means'], 'o-', label="Train Score",
             color="g")
    plt.xticks(ticks, iterations)
    plt.title(title + "K Means V-Measure Score")
    plt.xlabel("N Clusters")
    plt.ylabel("V Measure")
    plt.tight_layout()
    plt.legend(loc="best")
    plt.savefig(title + 'KmeansVmeasure.png')
    # BIC vs cluster count for the Gaussian mixture.
    plt.figure()
    ticks = range(len(iterations))
    plt.plot(ticks, train_score['gmm'], 'o-', label="Train Score",
             color="g")
    plt.xticks(ticks, iterations)
    plt.title(title + "EM BIC Score")
    plt.xlabel("N Clusters")
    plt.ylabel("BIC")
    plt.tight_layout()
    plt.legend(loc="best")
    plt.savefig(title + 'EMbic.png')
# %% pred_Y = base_model.predict(test_X) util.print_accuracy_measures(test_Y, pred_Y, label="svm_uni_base_comp") # %% util.visualize_confusion_matrix(base_model, test_X, test_Y, "svm_uni_base_comp_confusion_matrix") # %% base_cv_results = cross_validate(base_model, train_X, train_Y, cv=KFold(5)) util.plot_cv_score(base_cv_results, title="svm_uni_base_comp_cv_score_bar") # %% util.plot_learning_curve(base_model, "svm_uni_base_comp_learning_curve", train_X, train_Y, cv=KFold(5), n_jobs=4) # %% util.plot_word_cloud(base_model, "svm_uni_base_comp_word_cloud") # %% [markdown] # <h2> Adding TFIDF </h2> # %% tfidf_model = svm.TFIDFSVMModel() tfidf_model.fit(train_X, train_Y) # %% pred_Y = tfidf_model.predict(test_X)
# X即特征属性值 X = train_np[:, 1:] from sklearn.preprocessing import PolynomialFeatures poly = PolynomialFeatures(2) print(X.shape) X = poly.fit_transform(X) print(X.shape) # # (5) 模型构建与训练 clf = RandomForestClassifier(criterion='gini', max_depth=1, n_estimators=1, verbose=0) # (6) 绘制learning curve plot_learning_curve(clf, u"学习曲线", X, y, cv=10) # (5) 模型构建与训练 - 训练集精度提升明显 clf = RandomForestClassifier(criterion='gini', max_depth=30, n_estimators=1, verbose=0) # (6) 绘制learning curve plot_learning_curve(clf, u"学习曲线-", X, y, cv=10) # (5) 模型构建与训练 - 可以适当减缓过拟合 clf = RandomForestClassifier(criterion='gini', max_depth=6, n_estimators=30, verbose=0)
# Train a naive DQN agent and plot score/epsilon history.
scores = []
# NOTE(review): winpct and n_episodes are never used below — n_plays
# controls the loop; confirm they are leftovers.
winpct = []
eps_history = []
n_episodes = 10000
agent = Agent(env.observation_space.shape, env.action_space.n)
n_plays = 5000
for i in tqdm(range(n_plays)):
    observation = env.reset()
    done = None
    score = 0
    # One full episode: act, learn online from each transition.
    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, done, info = env.step(action)
        score += reward
        agent.learn(observation, action, reward, next_observation)
        observation = next_observation
    scores.append(score)
    eps_history.append(agent.eps)
    # Progress report every 100 episodes (moving average over last 100).
    if i % 100 == 0:
        avg_score = np.mean(scores[-100:])
        print(
            'episode', i, 'score %.1f avg score %.1f epsilon %.2f' %
            (score, avg_score, agent.eps))
filename = 'cartpole_naive_dqn.png'
plot_learning_curve(range(n_plays), scores, eps_history, filename)