def k_nearest_neighbors(df, xcols, k=5):
    """ Run the k-nearest neighbors algorithm """
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std, y,
                                                        test_size=ts,
                                                        random_state=0)

    knn = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')
    knn.fit(X_train, y_train)
    print('Training accuracy:', knn.score(X_train, y_train))
    print('Test accuracy:', knn.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=knn)
    plt.title('K-Nearest Neighbors')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/kmeans/knn.png', dpi=300)
    plt.close()
    return knn
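# The standardize() helper called throughout this module is assumed, not
# defined here. A minimal sketch of that assumption -- z-score each column
# and return a plain ndarray so downstream code can index it positionally;
# standardize_sketch is a hypothetical stand-in, not the module's helper:
def standardize_sketch(X):
    """Hypothetical stand-in: (X - mean) / std per column, as an ndarray."""
    X = np.asarray(X, dtype=float)
    return (X - X.mean(axis=0)) / X.std(axis=0)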
def decision_tree(df, xcols, md=3):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)

    tree = DecisionTreeClassifier(criterion='entropy', max_depth=md,
                                  random_state=0)
    tree.fit(X_train, y_train)
    print('Training accuracy:', tree.score(X_train, y_train))
    print('Test accuracy:', tree.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=tree)
    plt.title('Decision Tree')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'dec_tree' + '.png', dpi=300)
    plt.close()
    export_graphviz(tree, out_file='tree.dot', feature_names=list(xcols))
def support_vector_machines(df, xcols, C=100):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    t_s = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std, y,
                                                        test_size=t_s,
                                                        random_state=0)

    svm = SVC(kernel='linear', C=C, random_state=0)
    svm.fit(X_train, y_train)
    print('Training accuracy:', svm.score(X_train, y_train))
    print('Test accuracy:', svm.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=svm)
    plt.title('Support Vector Machines')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'svm_C' + str(C) + '.png', dpi=300)
    plt.close()
    return svm
def adalineGD(df, xcols, eta=0.001, n_iter=15):
    # Replace 0 with -1 to comply with the -1/1 labels of the Adaline
    # binary classifier
    y = df['target'].replace(0, -1)
    X = df[list(xcols)]

    # Standardize the inputs (the model is fit on the full data set)
    X_std = standardize(X)

    ada = AdalineGD(n_iter=n_iter, eta=eta)
    ada.fit(X_std, y)

    plot_decision_regions(X_std, y.values, classifier=ada)
    plt.title('Adaline - Gradient Descent')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'dow/adaline_2.png', dpi=300)
    plt.close()

    plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Sum-squared-error')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'dow/adaline_3.png', dpi=300)
    plt.close()
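# The AdalineGD class used above is defined elsewhere. A minimal sketch of
# the conventional batch-gradient-descent Adaline, including the cost_
# attribute the plotting code relies on; an assumption, not the module's
# actual implementation:
class AdalineGDSketch:
    """Hypothetical ADAptive LInear NEuron trained with batch gradient descent."""

    def __init__(self, eta=0.01, n_iter=50):
        self.eta = eta        # learning rate
        self.n_iter = n_iter  # passes over the training set

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])  # weights incl. bias unit
        self.cost_ = []                     # SSE per epoch, used for plots
        for _ in range(self.n_iter):
            errors = y - self.net_input(X)
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            self.cost_.append((errors ** 2).sum() / 2.0)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return np.where(self.net_input(X) >= 0.0, 1, -1)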
def lda_scikit(df, xcols):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)

    lda = LDA(n_components=2)
    X_train_lda = lda.fit_transform(X_train, y_train)
    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)

    plot_decision_regions(X_train_lda, y_train.values, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda_scikit.png', dpi=300)
    plt.close()

    X_test_lda = lda.transform(X_test)
    plot_decision_regions(X_test_lda, y_test.values, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda_scikit_test.png', dpi=300)
    plt.close()
def run_perceptron(train_df, xcols, eta=0.1, n_iter=10):
    '''
    Takes the pruned dataframe and runs it through the perceptron class

    Parameters
    ==========
    train_df : dataframe
        dataframe with the inputs and target
    xcols : iterable
        names of the input columns
    eta : float
        learning rate between 0 and 1
    n_iter : int
        passes over the training dataset

    Return
    ======
    None
    '''
    time0 = time.time()

    # Replace 0 with -1 to comply with the -1/1 labels of the perceptron
    # binary classifier
    y_df = train_df['target'].replace(0, -1)
    x_df = train_df[list(xcols)]

    # Standardize and split the training and test data
    x_std = standardize(x_df)
    t_s = 0.3
    x_train, x_test, y_train, y_test = train_test_split(x_std, y_df,
                                                        test_size=t_s,
                                                        random_state=0)

    ppn = Perceptron(eta, n_iter)
    ppn.fit(x_train, y_train.values)
    print('Training accuracy:', ppn.score(x_train, y_train))
    print('Test accuracy:', ppn.score(x_test, y_test))

    plt.figure(figsize=(7, 4))
    plot_decision_regions(x_train, y_train.values, classifier=ppn)
    plt.xlabel(x_df.columns[0])
    plt.ylabel(x_df.columns[1])
    plt.savefig(IMG_ROOT + "perceptron_{}.png"
                "".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()

    plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_, marker='o')
    plt.xlabel('Iterations')
    plt.ylabel('Number of misclassifications')
    plt.savefig(IMG_ROOT + "perceptron_misses_{}.png"
                "".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()

    time1 = time.time()
    print("Done training data and creating charts, took {0} seconds"
          "".format(time1 - time0))
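# The Perceptron class consumed by run_perceptron is likewise assumed. A
# minimal sketch of the classic Rosenblatt update rule, with the errors_
# history and score() method the caller uses; an assumption, not the
# module's actual class:
class PerceptronSketch:
    """Hypothetical perceptron binary classifier over -1/1 labels."""

    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        self.w_ = np.zeros(1 + X.shape[1])
        self.errors_ = []  # misclassification count per epoch
        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return np.where(self.net_input(X) >= 0.0, 1, -1)

    def score(self, X, y):
        return np.mean(self.predict(X) == np.asarray(y))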
def plot_knn(self, col1='col1', col2='col2', name="knn_custom_"):
    """ Plot the decision boundaries of this knn instance """
    plot_decision_regions(self.train[:, :-1], self.train[:, -1],
                          classifier=self)
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.savefig(IMG_PATH + name +
                "{}.png".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()
def pml_knn_test():
    """ Test our knn vs sklearn """
    # Get Data
    iris = datasets.load_iris()
    x_vals = iris.data[:, [2, 3]]
    y_vals = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x_vals, y_vals,
                                                        test_size=0.3,
                                                        random_state=0)
    x_train_std = standardize(x_train)
    x_test_std = standardize(x_test)
    x_combined_std = np.vstack((x_train_std, x_test_std))
    y_combined = np.hstack((y_train, y_test))
    iris_data = np.concatenate((x_train_std, np.array([y_train]).T), axis=1)

    # Sklearn KNN
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    knn.fit(x_train_std, y_train)
    plot_decision_regions(x_combined_std, y_combined, classifier=knn,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'knn_sklearn.png', dpi=300)
    plt.close()

    # Custom KNN
    cust_knn = KNN(iris_data, k_nbrs=5, dont_div=True)
    plot_decision_regions(x_combined_std, y_combined, classifier=cust_knn,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'knn_cust.png', dpi=300)
    plt.close()
def pml_build_tree_test():
    """ Test our decision tree vs sklearn """
    # Get Data
    iris = datasets.load_iris()
    x_vals = iris.data[:, [2, 3]]
    y_vals = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x_vals, y_vals,
                                                        test_size=0.3,
                                                        random_state=0)
    iris_data = np.concatenate((x_train, np.array([y_train]).T), axis=1)
    x_combined = np.vstack((x_train, x_test))
    y_combined = np.hstack((y_train, y_test))

    # Sklearn Tree
    tree = DecisionTreeClassifier(criterion='entropy', max_depth=3,
                                  random_state=0)
    tree.fit(x_train, y_train)
    plot_decision_regions(x_combined, y_combined, classifier=tree,
                          test_break_idx=range(105, 150))
    export_graphviz(tree, out_file='tree.dot',
                    feature_names=['petal length', 'petal width'])

    # Custom Tree
    iris_tree = DecisionTree(data=iris_data)
    iris_tree.print_tree()
    plot_decision_regions(x_combined, y_combined, classifier=iris_tree,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'decision_tree_decision.png', dpi=300)
    plt.close()
    draw_tree(iris_tree)
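# The tree.dot files written by export_graphviz in decision_tree() and
# pml_build_tree_test() can be rendered to an image with the Graphviz CLI
# (dot -Tpng tree.dot -o tree.png). A small helper doing the same from
# Python; assumes Graphviz is installed and on the PATH:
import subprocess

def render_dot(dot_file='tree.dot', out_file='tree.png'):
    """Render a Graphviz .dot file to a PNG via the dot executable."""
    subprocess.run(['dot', '-Tpng', dot_file, '-o', out_file], check=True)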
def run_perceptron_multi(df, xcols, eta=0.1, n_iter=40):
    time0 = time.time()
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std, y,
                                                        test_size=ts,
                                                        random_state=0)

    ppn = perceptron_skl(n_iter=n_iter, eta0=eta, random_state=0)
    ppn.fit(X_train, y_train)
    y_pred = ppn.predict(X_test)
    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))

    plot_decision_regions(X_train, y_train.values, classifier=ppn)
    plt.savefig(IMG_ROOT + "dow/perceptron_multi.png")
    plt.close()

    time1 = time.time()
    print("Done training data and creating charts, took {0} seconds"
          "".format(time1 - time0))
def logisticRegression(df, xcols, C=100, penalty='l2'):
    # Need xcols to be a tuple for the timeme method to work -- VERY HACKY
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std, y,
                                                        test_size=ts,
                                                        random_state=0)

    # Alternative: normalization of the data to [0, 1] via min-max scaling
    # mms = MinMaxScaler()
    # X_train_norm = mms.fit_transform(X_train)
    # X_test_norm = mms.transform(X_test)

    # C: inverse regularization parameter (C = 1/lambda). Smaller C means
    # more regularization and smaller weights; higher C means less
    # regularization and larger weights.
    # penalty: type of regularization function used for weight shrinkage /
    # decay to prevent overfitting
    lr = LogisticRegression(C=C, random_state=0, penalty=penalty)
    lr.fit(X_train, y_train)

    # Shows the probability of falling into each class
    print("Class breakdowns: " + str(lr.predict_proba(X_test[0:1])))
    print('Training accuracy:', lr.score(X_train, y_train))
    print('Test accuracy:', lr.score(X_test, y_test))
    print("y-intercept: " + str(lr.intercept_))
    print("coeffs: " + str(lr.coef_))

    try:
        plot_decision_regions(X_train, y_train.values, classifier=lr)
        plt.title('Logistic Regression')
        plt.xlabel(list(X.columns)[0])
        plt.ylabel(list(X.columns)[1])
        plt.legend(loc='upper left')
        plt.tight_layout()
        plt.savefig(IMG_ROOT + 'dow/log_reg_1.png', dpi=300)
        plt.close()
    except Exception as e:
        print("Plotting failed, may have more than 2 variables:", e)
    return lr
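# Sketch: visualize how the inverse regularization strength C shrinks the
# logistic regression weights toward zero, complementing the single-C fit in
# logisticRegression() above. Assumes a two-feature X_train/y_train like the
# one built there; a hypothetical helper, not part of the original module:
def plot_regularization_path(X_train, y_train, feature_names, img_file):
    weights, params = [], []
    for c in np.arange(-5, 5):
        lr = LogisticRegression(C=10.0 ** c, random_state=0)
        lr.fit(X_train, y_train)
        weights.append(lr.coef_[0])  # coefficients for the first class
        params.append(10.0 ** c)
    weights = np.array(weights)
    plt.plot(params, weights[:, 0], label=feature_names[0])
    plt.plot(params, weights[:, 1], linestyle='--', label=feature_names[1])
    plt.xscale('log')
    plt.xlabel('C')
    plt.ylabel('weight coefficient')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(img_file, dpi=300)
    plt.close()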
def random_forest(df, xcols, estimators=5):
    """ Run the random forest algorithm """
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    t_s = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std, y,
                                                        test_size=t_s,
                                                        random_state=0)

    forest = RandomForestClassifier(criterion='entropy',
                                    n_estimators=estimators,
                                    random_state=1, n_jobs=3)
    forest.fit(X_train, y_train)

    # Shows the probability of falling into each class
    print("Class breakdowns: " + str(forest.predict_proba(X_test[0:1])))
    print('Training accuracy:', forest.score(X_train, y_train))
    print('Test accuracy:', forest.score(X_test, y_test))
    print("Feature importances: " + str(forest.feature_importances_))

    plot_decision_regions(X_std, y.values, classifier=forest)
    plt.title('Random Forest (Decision Tree Ensemble)')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/kmeans/random_forest.png', dpi=300)
    plt.close()
def nonlinear_svm(df, xcols, C=100, gamma=0.10):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
        train_test_split(X_std, y, test_size=ts, random_state=0)

    svm = SVC(kernel='rbf', random_state=0, gamma=gamma, C=C)
    svm.fit(X_train, y_train)
    print('Training accuracy:', svm.score(X_train, y_train))
    print('Test accuracy:', svm.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=svm)
    plt.title('Support Vector Machines - Non Linear')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'svm_nonlinear_C' + str(C) + '.png', dpi=300)
    plt.close()
AX[1].set_xlabel('Epochs')
AX[1].set_ylabel('Sum-squared-error')
AX[1].set_title('Adaline - Learning rate 0.0001')
plt.tight_layout()
plt.savefig(IMG_ROOT + "PML/" + 'adaline_comp.png', dpi=300)
plt.close()

# Standardize features
X_STD = np.copy(X_VALS)
X_STD[:, 0] = (X_VALS[:, 0] - X_VALS[:, 0].mean()) / X_VALS[:, 0].std()
X_STD[:, 1] = (X_VALS[:, 1] - X_VALS[:, 1].mean()) / X_VALS[:, 1].std()

# Run AdalineGD on the standardized data
ADA = AdalineGD(n_iter=15, eta=0.01)
ADA.fit(X_STD, Y_VALS)
plot_decision_regions(X_STD, Y_VALS, classifier=ADA)
plt.title('Adaline - Gradient Descent')
plt.xlabel('sepal length [standardized]')
plt.ylabel('petal length [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
plt.savefig(IMG_ROOT + "PML/" + 'adaline_2.png', dpi=300)
plt.close()

plt.plot(range(1, len(ADA.cost_) + 1), ADA.cost_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('Sum-squared-error')
plt.tight_layout()
plt.savefig(IMG_ROOT + "PML/" + 'adaline_3.png', dpi=300)
plt.close()
Y_VALS = np.where(Y_VALS == 0, -1, 1)
X_VALS = IRIS.data[0:100, [0, 2]]

# Plot the x vals of the data set
plt.scatter(X_VALS[:50, 0], X_VALS[:50, 1], color='red', marker='o',
            label='setosa')
plt.scatter(X_VALS[50:100, 0], X_VALS[50:100, 1], color='blue', marker='x',
            label='versicolor')

PPN = Perceptron(eta=0.1, n_iter=10)
PPN.fit(X_VALS, Y_VALS)
plt.xlabel('sepal length [cm]')
plt.ylabel('petal length [cm]')
plt.legend(loc='upper left')
plot_decision_regions(X_VALS, Y_VALS, classifier=PPN)
plt.savefig(IMG_ROOT + "PML/" + "iris_ch2.png", dpi=300)
plt.close()

plt.plot(range(1, len(PPN.errors_) + 1), PPN.errors_, marker='o')
plt.xlabel('Epochs')
plt.ylabel('Number of misclassifications')
plt.savefig(IMG_ROOT + "PML/" + "iris2_ch2.png", dpi=300)
plt.close()