Ejemplo n.º 1
0
def run_4d_model():
    """
        Linear Discriminant Analysis on four principal components.

        Loads features and target through DataFeeder, projects the features
        with run_pca (n_components=4), fits the model on the training part of
        a 70/30 split, prints the accuracy on the held-out part, calls
        calculate_f1_score and plots a confusion matrix (normalize=True).
    """
    print('\nLinear Discriminant Analysis - 4 dimensions\n')
    # load the features and the target
    feeder = DataFeeder()
    X, y = feeder.get_data()
    # project the features onto four principal components
    component_names = ['pc_1', 'pc_2', 'pc_3', 'pc_4']
    X = run_pca(X, n_components=4, columns=component_names)
    # hold out 30% of the records for testing; the seed keeps the split fixed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1)
    # fit the model on the training part and predict the held-out part
    model = LinearDiscriminantAnalysis()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # report the accuracy as a percentage, then the F1 helper's output
    print('# Accuracy score: %.2f' % (accuracy_score(y_test, y_pred) * 100))
    calculate_f1_score(y_test, y_pred)
    # draw the confusion matrix and display the figure(s)
    plot_confusion_matrix(
        y_test, y_pred, normalize=True, title='Confusion Matrix')
    plt.show()
def main():
    """
        Compare runs with and without a penalty on 10 PCA components.

        Steps, in order:
          1. load features and target through DataFeeder and replace the
             features with the output of dt.pca(10);
          2. search for the best hyperparameters with an F1-based scorer
             (pos_label=0);
          3. run the train/test-split experiment without a penalty, then with
             the helper's default penalty;
          4. run the cross-validation experiment without a penalty, then with
             the helper's default penalty;
          5. show every figure that was created.

        Each console header names the run that follows it. The calls that
        pass no `penalty` argument rely on the helper's default, which the
        plot title labels as L2 (NOTE(review): confirm against
        run_train_test_split / run_cross_validation).
    """
    # create data feeder and get features and target
    dt = DataFeeder()
    features, target = dt.get_data()

    # replace the features with 10 PCA components
    features = dt.pca(10)

    # get best hyperparameters; the scorer is F1 with class 0 as the positive label
    scorer = make_scorer(f1_score, pos_label=0)
    params = find_parameters(features, target, scorer=scorer)

    separator = '#################################################'

    # run train test split without penalty
    print(separator)
    print('Train test split without penaty')
    run_train_test_split(features, target, C=params['C'], penalty='none', solver='saga')
    # run train test split with the default (L2) penalty
    print(separator)
    print('Train test split with L2 penaty')
    run_train_test_split(features, target, C=params['C'])
    # run cross validation without penalty
    print(separator)
    print('Cross Validation without penalty')
    run_cross_validation(features, target, C=params['C'], penalty='none', solver='saga', title='Cross validation with no penalty')
    # run cross validation with the default (L2) penalty
    print(separator)
    print('Cross Validation with L2 penalty')
    run_cross_validation(features, target, C=params['C'], title='Cross validation with l2 penalty')

    # display every figure produced by the runs above
    plt.show()
def main():
    """ Train and evaluate a Gaussian Naive Bayes classifier on one PCA component """
    # load the data set through the feeder; only the target of this call is
    # used below, the features are replaced by the PCA output
    feeder = DataFeeder()
    features, target = feeder.get_data()
    # keep a single principal component as the feature set
    features = feeder.pca(n_components=1)
    # stratified 70% train / 30% test split with a fixed seed
    split = train_test_split(
        features, target, test_size=0.3, stratify=target, random_state=100)
    features_train, features_test, target_train, target_test = split
    # fit the classifier on the training part, then predict the test part
    clf = GaussianNB()
    clf.fit(features_train, target_train)
    y_pred = clf.predict(features_test)
    # print each metric as a percentage with two decimals
    accuracy = accuracy_score(target_test, y_pred) * 100
    print("Accuracy Score: %.2f" % accuracy)
    f1 = f1_score(target_test, y_pred) * 100
    print("F1 score: %.2f" % f1)
    recall = recall_score(target_test, y_pred) * 100
    print("Recall score: %.2f" % recall)
    precision = precision_score(target_test, y_pred) * 100
    print("Precision score: %.2f" % precision)
Ejemplo n.º 4
0
def run_2d_model():
    """
        Linear Discriminant Analysis on PCA-reduced features, with plots.

        Loads features and target through DataFeeder, reduces the features
        with run_pca using its default settings (presumably 2 components, as
        the banner says - verify against run_pca), fits the model on the
        training part of a 70/30 split, prints the accuracy, calls
        calculate_f1_score, then draws the decision regions over the whole
        data set and a confusion matrix (normalize=True) for the test part.
    """
    print(
        '\nLinear Discriminant Analysis - 2 dimensions with decision regions\n'
    )
    # load the features and the target
    feeder = DataFeeder()
    X, y = feeder.get_data()
    # reduce the feature dimensionality with the helper's defaults
    X = run_pca(X)
    # hold out 30% of the records for testing; the seed keeps the split fixed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=1)
    # fit the model on the training part and predict the held-out part
    model = LinearDiscriminantAnalysis()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # report the accuracy as a percentage, then the F1 helper's output
    print('# Accuracy score: %.2f' % (accuracy_score(y_test, y_pred) * 100))
    calculate_f1_score(y_test, y_pred)

    # stack training and test records back together (training rows first)
    # so the region plot covers every sample
    X_all = np.vstack((X_train, X_test))
    y_all = np.hstack((y_train, y_test))
    # decision regions go on their own new figure
    plt.figure()
    plot_decision_regions(X_all, y_all, model)
    # confusion matrix for the held-out predictions
    plot_confusion_matrix(
        y_test, y_pred, normalize=True, title='Confusion Matrix')
    plt.show()
def main():
    """
        Tune n_neighbors and evaluate the model on 10 PCA components.

        Plots the diagnosis distribution, reduces the features with
        DataFeeder.pca, looks up the best `n_neighbors` via find_best_params,
        then runs the train/test-split and cross-validation helpers with it
        and shows all figures.
    """
    # data feeder providing the pre-processed features and target
    feeder = DataFeeder()
    features, target = feeder.get_data()

    # class distribution of the target
    plot_hist(
        target,
        xlabel='Diagnosis',
        ylabel='Patient Records',
        title='Patient Diagnosis Distribution',
        xlim=['M', 'B'],
    )

    # reduce data dimensionality with PCA; 2 and 4 components are the
    # alternatives tried previously
    features = feeder.pca(n_components=10)

    # pick the best number of neighbours from the search result
    best_params = find_best_params(features, target)
    n_neighbors = best_params['n_neighbors']
    print("Best number of neighbors: %d" % n_neighbors)
    # evaluate on a train/test split
    std_test_train_split(features, target, n_neighbors=n_neighbors)
    # evaluate with cross validation
    cross_validation(features, target, n_neighbors=n_neighbors)
    # display every figure created above
    plt.show()
def main():
    """
        Tune max_depth and evaluate the model on 2 PCA components.

        Loads the data through DataFeeder, reduces the features with
        DataFeeder.pca, searches for the best hyperparameters with an
        F1-based scorer (pos_label=0), then runs the train/test-split and
        cross-validation helpers with the selected `max_depth` and shows all
        figures.
    """
    # data feeder providing the feature and target data sets
    feeder = DataFeeder()
    features, target = feeder.get_data()

    # reduce the features with PCA; 4 components is the alternative setting
    features = feeder.pca(n_components=2)

    # search for the best hyperparameters, scoring F1 with class 0 as positive
    scorer = make_scorer(f1_score, pos_label=0)
    params = find_best_params(features, target, scorer=scorer)
    # the helpers below receive the depth as an int
    max_depth = int(params['max_depth'])

    # stratified split with a fixed seed; proportions are the library default
    split = train_test_split(
        features, target, stratify=target, random_state=1)
    features_train, features_test, target_train, target_test = split

    # evaluate on the train/test split
    std_train_test_split(
        features_train,
        features_test,
        target_train,
        target_test,
        max_depth=max_depth,
    )

    # evaluate with cross validation on the full data set
    cross_validation(features, target, max_depth=max_depth)
    plt.show()
def main():
    """
        Entry point: create the helper objects and trigger each step in order.

        Plots the distributions of the target and of two feature columns from
        the data loaded with normalize=False, reloads the data with the
        feeder's defaults, reduces it with PCA, searches for the best linear
        and RBF parameters on the training split, then runs K-fold cross
        validation plus one linear and one RBF SVM evaluation, and finally
        shows all figures.
    """
    # data feeding and evaluation objects
    feeder = DataFeeder()
    evaluator = Evaluator()
    # data loaded with normalize=False, used only for the distribution plots
    features, target = feeder.get_data(normalize=False)

    Plotter.plot_distribution(
        target,
        ["M", "B"],
        bins=2,
        title="Diagnosis Distribution",
        xlabel="Diagnosis",
        ylabel="Records",
    )
    # the two feature plots differ only in column position and labels
    feature_plots = (
        (1, "Texture Mean Distribution", "Texture Mean"),
        (2, "Perimeter Mean Distribution", "Perimeter Mean"),
    )
    for column, plot_title, axis_label in feature_plots:
        Plotter.plot_distribution(
            features.iloc[:, column],
            bins=50,
            title=plot_title,
            xlabel=axis_label,
            ylabel="Records",
        )
    # reload the data sets with the feeder's default settings
    features, target = feeder.get_data()

    # reduce the features with PCA; 2 and 4 components are the alternatives
    # tried previously
    features = feeder.pca(n_components=10)

    # stratified train/test split
    split = Evaluator.split(features, target, stratify=target)
    features_train, features_test, target_train, target_test = split
    # best parameters per kernel, scored by F1 with class 0 as positive
    scorer = make_scorer(f1_score, pos_label=0)
    linear_params, rbf_params = Evaluator.find_best_params(
        features_train, target_train, n_folds=10, scoring=scorer)
    # K-fold cross validation on the full data set with both parameter sets
    evaluator.k_fold_cv(
        features,
        target,
        n_splits=10,
        linear_params=linear_params,
        rbf_params=rbf_params,
    )
    # linear SVM on the train/test split with its best parameters
    evaluator.run_linear_svm(
        features_train,
        features_test,
        target_train,
        target_test,
        params=linear_params,
    )
    # RBF SVM on the train/test split with its best parameters
    evaluator.run_rbf_svm(
        features_train,
        features_test,
        target_train,
        target_test,
        params=rbf_params,
    )
    # display every figure created above
    plt.show()