Code example #1
def k_nearest_neighbors(df, xcols, k=5):
    """
    Run the k-nearest-neighbors algorithm
    """
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=ts,
                                                        random_state=0)

    knn = KNeighborsClassifier(n_neighbors=k, p=2, metric='minkowski')
    knn.fit(X_train, y_train)

    print('Training accuracy:', knn.score(X_train, y_train))
    print('Test accuracy:', knn.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=knn)
    plt.title('K-Nearest Neighbors')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/kmeans/knn.png', dpi=300)
    plt.close()

    return knn
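
Note: every example here relies on a shared standardize helper that is not included in these excerpts. A minimal sketch consistent with how it is used (takes the feature DataFrame, returns a NumPy array that later code indexes by column) might look like this; the project's actual helper may differ.

import numpy as np

def standardize(X):
    """Z-score each feature column: zero mean, unit variance."""
    X = np.asarray(X, dtype=float)
    return (X - X.mean(axis=0)) / X.std(axis=0)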
Code example #2
def decision_tree(df, xcols, md=3):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    tree = DecisionTreeClassifier(criterion='entropy',
                                  max_depth=md,
                                  random_state=0)
    tree.fit(X_train, y_train)

    print('Training accuracy:', tree.score(X_train, y_train))
    print('Test accuracy:', tree.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=tree)
    plt.title('Decision Tree')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'dec_tree' + '.png', dpi=300)
    plt.close()

    export_graphviz(tree, out_file='tree.dot', feature_names=list(xcols))
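
Note: once export_graphviz has written tree.dot, the tree can be rendered locally with the Graphviz command-line tools, e.g. dot -Tpng tree.dot -o tree.png.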
Code example #3
def support_vector_machines(df, xcols, C=100):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    t_s = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=t_s,
                                                        random_state=0)

    svm = SVC(kernel='linear', C=C, random_state=0)
    svm.fit(X_train, y_train)

    print('Training accuracy:', svm.score(X_train, y_train))
    print('Test accuracy:', svm.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=svm)
    plt.title('Support Vector Machines')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'svm_C' + str(C) + '.png', dpi=300)
    plt.close()
    return svm
Code example #4
def adalineGD(df, xcols, eta=0.001, n_iter=15):
    # Map class 0 to -1 so the targets match Adaline's {-1, 1} output
    y = df['target'].replace(0, -1)
    X = df[list(xcols)]

    # Standardize the inputs (Adaline is fit on the full standardized set
    # below, so no train/test split is needed here)
    X_std = standardize(X)

    ada = AdalineGD(n_iter=n_iter, eta=eta)
    ada.fit(X_std, y)

    plot_decision_regions(X_std, y.values, classifier=ada)
    plt.title('Adaline - Gradient Descent')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'dow/adaline_2.png', dpi=300)
    plt.close()

    plt.plot(range(1, len(ada.cost_) + 1), ada.cost_, marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Sum-squared-error')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'dow/adaline_3.png', dpi=300)
    plt.close()
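
Note: the AdalineGD class is imported from elsewhere in the project and not shown. A minimal batch-gradient-descent version consistent with its use here (a cost_ history, {-1, 1} targets, fit returning self) is sketched below; treat it as an illustration, not the project's actual class.

import numpy as np

class AdalineGD:
    """ADAptive LInear NEuron trained with batch gradient descent."""
    def __init__(self, eta=0.01, n_iter=50):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        self.w_ = np.zeros(1 + X.shape[1])
        self.cost_ = []
        for _ in range(self.n_iter):
            # one full-batch gradient step on the sum-of-squares cost
            errors = y - self.net_input(X)
            self.w_[1:] += self.eta * X.T.dot(errors)
            self.w_[0] += self.eta * errors.sum()
            self.cost_.append((errors ** 2).sum() / 2.0)
        return self

    def net_input(self, X):
        return np.dot(X, self.w_[1:]) + self.w_[0]

    def predict(self, X):
        return np.where(self.net_input(X) >= 0.0, 1, -1)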
Code example #5
def lda_scikit(df, xcols):
    y = df['target']
    X = df[list(xcols)]
    
    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)
    
    lda = LDA(n_components=2)
    X_train_lda = lda.fit_transform(X_train, y_train)
    lr = LogisticRegression()
    lr = lr.fit(X_train_lda, y_train)
    
    plot_decision_regions(X_train_lda, y_train.values, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda_scikit.png', dpi=300)
    plt.close()
    
    X_test_lda = lda.transform(X_test)
    
    plot_decision_regions(X_test_lda, y_test.values, classifier=lr)
    plt.xlabel('LD 1')
    plt.ylabel('LD 2')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'lda_scikit_test.png', dpi=300)
    plt.close()
Code example #6
def run_perceptron(train_df, xcols, eta=0.1, n_iter=10):
    ''' Takes the pruned dataframe and runs it through the perceptron class

        Parameters
        ==========
        train_df : dataframe
            dataframe with the inputs and target
        xcols : iterable
            names of the feature columns
        eta : float
            learning rate between 0 and 1
        n_iter : int
            passes over the training dataset

        Returns
        =======
        None
    '''
    time0 = time.time()
    # Need this replace to comply with the -1 and 1 of the perceptron binary classifier
    y_df = train_df['target'].replace(0, -1)
    x_df = train_df[list(xcols)]

    # Standardize and split the training and test data
    x_std = standardize(x_df)
    t_s = 0.3
    x_train, x_test, y_train, y_test = train_test_split(x_std,
                                                        y_df,
                                                        test_size=t_s,
                                                        random_state=0)

    plt.figure(figsize=(7, 4))
    ppn = Perceptron(eta, n_iter)
    ppn.fit(x_train, y_train.values)

    print('Training accuracy:', ppn.score(x_train, y_train))
    print('Test accuracy:', ppn.score(x_test, y_test))

    plot_decision_regions(x_train, y_train.values, classifier=ppn)
    plt.xlabel(x_df.columns[0])
    plt.ylabel(x_df.columns[1])
    plt.savefig(IMG_ROOT + "perceptron_{}.png"
                "".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()

    plt.plot(range(1, len(ppn.errors_) + 1), ppn.errors_, marker='o')
    plt.xlabel('Iterations')
    plt.ylabel('Number of misclassifications')
    plt.savefig(IMG_ROOT + "perceptron_misses_{}.png"
                "".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()
    time1 = time.time()
    print("Done training data and creating charts, took {0} seconds"
          "".format(time1 - time0))
Code example #7
# plot_knn is a method of the custom KNN class
def plot_knn(self, col1='col1', col2='col2', name="knn_custom_"):
    """
    Plot the decision boundaries of this KNN instance
    """
    plot_decision_regions(self.train[:, :-1],
                          self.train[:, -1],
                          classifier=self)
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.savefig(IMG_PATH + name +
                "{}.png".format(dt.datetime.now().strftime("%Y%m%d")))
    plt.close()
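
Note: plot_decision_regions is used by nearly every example but never defined in these excerpts. Below is a self-contained sketch of the usual meshgrid-and-contourf approach, including the optional test_break_idx highlighting seen in later examples; the project's version may differ in its details.

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, resolution=0.02,
                          test_break_idx=None):
    """Shade each predicted class over a 2-D grid and overlay the samples."""
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # predict over a dense grid spanning the two features
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = np.asarray(Z).reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # overlay the actual samples, one marker per class
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(X[y == cl, 0], X[y == cl, 1], alpha=0.8,
                    c=colors[idx], marker=markers[idx], label=cl)

    # circle the held-out test samples if their indices were given
    if test_break_idx is not None:
        X_test = X[test_break_idx, :]
        plt.scatter(X_test[:, 0], X_test[:, 1], facecolors='none',
                    edgecolors='black', alpha=1.0, linewidths=1,
                    marker='o', s=55, label='test set')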
Code example #8
def pml_knn_test():
    """
    Test our knn vs sklearn
    """
    # Get Data
    iris = datasets.load_iris()
    x_vals = iris.data[:, [2, 3]]
    y_vals = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x_vals,
                                                        y_vals,
                                                        test_size=0.3,
                                                        random_state=0)
    x_train_std = standardize(x_train)
    # note: ideally the test set would be scaled with the training set's
    # mean and std rather than its own
    x_test_std = standardize(x_test)
    x_combined_std = np.vstack((x_train_std, x_test_std))
    y_combined = np.hstack((y_train, y_test))
    iris_data = np.concatenate((x_train_std, np.array([y_train]).T), axis=1)

    # Sklearn KNN
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    knn.fit(x_train_std, y_train)
    plot_decision_regions(x_combined_std,
                          y_combined,
                          classifier=knn,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'knn_sklearn.png', dpi=300)
    plt.close()

    # Custom KNN
    cust_knn = KNN(iris_data, k_nbrs=5, dont_div=True)
    plot_decision_regions(x_combined_std,
                          y_combined,
                          classifier=cust_knn,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [standardized]')
    plt.ylabel('petal width [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'knn_cust.png', dpi=300)
    plt.close()
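
Note: the custom KNN class is not shown either. A minimal majority-vote version matching the constructor call above is sketched below; the dont_div flag is accepted but ignored here because its meaning is not visible in these excerpts, and the data is stored as train to match plot_knn above.

import numpy as np
from collections import Counter

class KNN:
    """Majority-vote k-nearest-neighbor classifier."""
    def __init__(self, data, k_nbrs=5, dont_div=False):
        # data: 2-D array whose last column is the class label;
        # dont_div is accepted only to match the call in this example --
        # its real meaning is not visible in these excerpts
        self.train = np.asarray(data, dtype=float)
        self.k = k_nbrs

    def predict(self, X):
        X = np.atleast_2d(np.asarray(X, dtype=float))
        train_x = self.train[:, :-1]
        train_y = self.train[:, -1]
        preds = []
        for row in X:
            # label by majority vote of the k closest training points
            dists = np.sqrt(((train_x - row) ** 2).sum(axis=1))
            nearest = train_y[np.argsort(dists)[:self.k]]
            preds.append(Counter(nearest).most_common(1)[0][0])
        return np.array(preds)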
Code example #9
def pml_build_tree_test():
    """
    Test our decision tree vs sklearn
    """
    # Get Data
    iris = datasets.load_iris()
    x_vals = iris.data[:, [2, 3]]
    y_vals = iris.target
    x_train, x_test, y_train, y_test = train_test_split(x_vals,
                                                        y_vals,
                                                        test_size=0.3,
                                                        random_state=0)
    iris_data = np.concatenate((x_train, np.array([y_train]).T), axis=1)

    # Sklearn Tree
    tree = DecisionTreeClassifier(criterion='entropy',
                                  max_depth=3,
                                  random_state=0)
    tree.fit(x_train, y_train)
    x_combined = np.vstack((x_train, x_test))
    y_combined = np.hstack((y_train, y_test))
    plot_decision_regions(x_combined,
                          y_combined,
                          classifier=tree,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    # save/close so this figure does not bleed into the custom tree's
    # plot below (the output filename here is an assumed name)
    plt.savefig(IMG_ROOT + "PML/" + 'decision_tree_sklearn.png', dpi=300)
    plt.close()
    export_graphviz(tree,
                    out_file='tree.dot',
                    feature_names=['petal length', 'petal width'])

    # Custom Tree
    iris_tree = DecisionTree(data=iris_data)
    iris_tree.print_tree()
    plot_decision_regions(x_combined,
                          y_combined,
                          classifier=iris_tree,
                          test_break_idx=range(105, 150))
    plt.xlabel('petal length [cm]')
    plt.ylabel('petal width [cm]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'decision_tree_decision.png', dpi=300)
    plt.close()
    draw_tree(iris_tree)
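
Note: both trees above split on criterion='entropy'. For reference, the quantities behind that criterion can be computed as follows; this is a standalone sketch, not the project's DecisionTree internals.

import numpy as np

def entropy(labels):
    """Shannon entropy of a label array: H = -sum(p_i * log2(p_i))."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -(p * np.log2(p)).sum()

def information_gain(parent, left, right):
    """Entropy drop from splitting parent into left and right children."""
    n = len(parent)
    return (entropy(parent)
            - (len(left) / n) * entropy(left)
            - (len(right) / n) * entropy(right))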
Code example #10
def run_perceptron_multi(df, xcols, eta=0.1, n_iter=40):
    time0 = time.time()
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=ts,
                                                        random_state=0)

    ppn = perceptron_skl(n_iter=n_iter, eta0=eta, random_state=0)
    ppn.fit(X_train, y_train)
    y_pred = ppn.predict(X_test)

    print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))
    plot_decision_regions(X_train, y_train.values, classifier=ppn)
    plt.savefig(IMG_ROOT + "dow/perceptron_multi.png")
    plt.close()

    time1 = time.time()
    print("Done training data and creating charts, took {0} seconds"
          "".format(time1 - time0))
Code example #11
def logisticRegression(df, xcols, C=100, penalty='l2'):
    # NOTE: xcols must be a tuple for the timeme method to work
    # (hacky, but required)
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=ts,
                                                        random_state=0)

    # Alternative: normalize the data to the [0, 1] range instead
    # mms = MinMaxScaler()
    # X_train_norm = mms.fit_transform(X_train)
    # X_test_norm = mms.transform(X_test)

    # C: inverse regularization strength (C = 1/lambda);
    # smaller C = more regularization, smaller weights;
    # higher C = less regularization, larger weights
    # penalty: type of regularization function used for weight
    # shrinkage / decay to prevent overfitting
    lr = LogisticRegression(C=C, random_state=0, penalty=penalty)
    lr.fit(X_train, y_train)

    # Show the predicted probability of each class for the first test sample
    print("Class breakdowns: " + str(lr.predict_proba(X_test[0:1])))
    print('Training accuracy:', lr.score(X_train, y_train))
    print('Test accuracy:', lr.score(X_test, y_test))
    print("y-intercept:" + str(lr.intercept_))
    print("coeffs:" + str(lr.coef_))

    try:
        plot_decision_regions(X_train, y_train.values, classifier=lr)
        plt.title('Logistic Regression')
        plt.xlabel(list(X.columns)[0])
        plt.ylabel(list(X.columns)[1])
        plt.legend(loc='upper left')
        plt.tight_layout()
        plt.savefig(IMG_ROOT + 'dow/log_reg_1.png', dpi=300)
        plt.close()
    except Exception as exc:
        print("Could not plot decision regions "
              "(likely more than 2 features):", exc)
    return lr
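
Note: since C is the inverse of the regularization strength, sweeping it makes the effect visible: the fitted coefficients grow as C increases. A hypothetical usage, with df and the column names standing in for real data:

# df, 'feat_1' and 'feat_2' are placeholders for your own frame/columns
for c in (0.01, 1.0, 100.0):
    model = logisticRegression(df, ('feat_1', 'feat_2'), C=c)
    print('C =', c, '-> coefficients:', model.coef_)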
Code example #12
def random_forest(df, xcols, estimators=5):
    """
    Run the random forest algorithm
    """
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    t_s = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X_std,
                                                        y,
                                                        test_size=t_s,
                                                        random_state=0)

    forest = RandomForestClassifier(criterion='entropy',
                                    n_estimators=estimators,
                                    random_state=1,
                                    n_jobs=3)
    forest.fit(X_train, y_train)

    # Show the predicted probability of each class for the first test sample
    print("Class breakdowns: " + str(forest.predict_proba(X_test[0:1])))
    print('Training accuracy:', forest.score(X_train, y_train))
    print('Test accuracy:', forest.score(X_test, y_test))
    print("Feature Importances :" + str(forest.feature_importances_))

    plot_decision_regions(X_std, y.values, classifier=forest)
    plt.title('Random Forest (Decision Tree Ensemble)')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + 'snp/kmeans/random_forest.png', dpi=300)
    plt.close()
Code example #13
def nonlinear_svm(df, xcols, C=100, gamma=0.10):
    y = df['target']
    X = df[list(xcols)]

    # Standardize and split the training and test data
    X_std = standardize(X)
    ts = 0.3
    X_train, X_test, y_train, y_test = \
          train_test_split(X_std, y, test_size=ts, random_state=0)

    svm = SVC(kernel='rbf', random_state=0, gamma=gamma, C=C)
    svm.fit(X_train, y_train)

    print('Training accuracy:', svm.score(X_train, y_train))
    print('Test accuracy:', svm.score(X_test, y_test))

    plot_decision_regions(X_std, y.values, classifier=svm)
    plt.title('Support Vector Machines - Nonlinear')
    plt.xlabel(list(X.columns)[0])
    plt.ylabel(list(X.columns)[1])
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_PATH + 'svm_nonlinear_C' + str(C) + '.png', dpi=300)
    plt.close()
Code example #14
    # Reconstructed opening (the snippet began mid-stream): the two-panel
    # comparison of AdalineGD learning rates on the raw inputs, inferred
    # from the surviving AX[1] calls below
    FIG, AX = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
    ADA1 = AdalineGD(n_iter=10, eta=0.01).fit(X_VALS, Y_VALS)
    AX[0].plot(range(1, len(ADA1.cost_) + 1),
               np.log10(ADA1.cost_), marker='o')
    AX[0].set_xlabel('Epochs')
    AX[0].set_ylabel('log(Sum-squared-error)')
    AX[0].set_title('Adaline - Learning rate 0.01')
    ADA2 = AdalineGD(n_iter=10, eta=0.0001).fit(X_VALS, Y_VALS)
    AX[1].plot(range(1, len(ADA2.cost_) + 1), ADA2.cost_, marker='o')
    AX[1].set_xlabel('Epochs')
    AX[1].set_ylabel('Sum-squared-error')
    AX[1].set_title('Adaline - Learning rate 0.0001')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'adaline_comp.png', dpi=300)
    plt.close()

    # standardize features
    X_STD = np.copy(X_VALS)
    X_STD[:, 0] = (X_VALS[:, 0] - X_VALS[:, 0].mean()) / X_VALS[:, 0].std()
    X_STD[:, 1] = (X_VALS[:, 1] - X_VALS[:, 1].mean()) / X_VALS[:, 1].std()

    # Implement AdalineGD on Standardized data
    ADA = AdalineGD(n_iter=15, eta=0.01)
    ADA.fit(X_STD, Y_VALS)
    plot_decision_regions(X_STD, Y_VALS, classifier=ADA)
    plt.title('Adaline - Gradient Descent')
    plt.xlabel('sepal length [standardized]')
    plt.ylabel('petal length [standardized]')
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'adaline_2.png', dpi=300)
    plt.close()

    plt.plot(range(1, len(ADA.cost_) + 1), ADA.cost_, marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Sum-squared-error')
    plt.tight_layout()
    plt.savefig(IMG_ROOT + "PML/" + 'adaline_3.png', dpi=300)
    plt.close()
    
Code example #15
    # Reconstructed opening (the snippet began mid-stream): load iris and
    # keep the first 100 rows, i.e. setosa and versicolor only
    IRIS = datasets.load_iris()
    Y_VALS = np.where(IRIS.target[0:100] == 0, -1, 1)
    X_VALS = IRIS.data[0:100, [0, 2]]

    # Plot the x vals of the data set
    plt.scatter(X_VALS[:50, 0],
                X_VALS[:50, 1],
                color='red',
                marker='o',
                label='setosa')
    plt.scatter(X_VALS[50:100, 0],
                X_VALS[50:100, 1],
                color='blue',
                marker='x',
                label='versicolor')

    PPN = Perceptron(eta=0.1, n_iter=10)
    PPN.fit(X_VALS, Y_VALS)
    plt.xlabel('sepal length [cm]')
    plt.ylabel('petal length [cm]')
    plt.legend(loc='upper left')
    plot_decision_regions(X_VALS, Y_VALS, classifier=PPN)
    plt.savefig(IMG_ROOT + "PML/" + "iris_ch2.png", dpi=300)
    plt.close()

    plt.plot(range(1, len(PPN.errors_) + 1), PPN.errors_, marker='o')
    plt.xlabel('Epochs')
    plt.ylabel('Number of misclassifications')
    plt.savefig(IMG_ROOT + "PML/" + "iris2_ch2.png", dpi=300)
    plt.close()