Example #1
               coef0=0.0,
               decision_function_shape='ovr',
               degree=3,
               gamma='scale',
               kernel='rbf',
               max_iter=-1,
               probability=False,
               random_state=None,
               shrinking=True,
               tol=0.001,
               verbose=False)),
          ('KNN',
           KNeighborsClassifier(algorithm='auto',
                                leaf_size=30,
                                metric='minkowski',
                                metric_params=None,
                                n_jobs=None,
                                n_neighbors=5,
                                p=2,
                                weights='uniform')),
          ('NB', GaussianNB(priors=None, var_smoothing=1e-09))]

results = []
names = []
folds = 7

for name, model in models:
    kfold = model_selection.KFold(n_splits=folds, shuffle=True, random_state=folds)
    accuracy = model_selection.cross_val_score(model,
                                               X,
                                               np.ravel(Y),
                                               cv=kfold,
                                               scoring='accuracy')
    results.append(accuracy)
    names.append(name)

grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii] == 0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii] == 0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii] == 1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii] == 1]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################
from sklearn.neighbors import KNeighborsClassifier
# Create a KNN classifier
knn_clf = KNeighborsClassifier(n_neighbors=13, p=6)
# Fit the training data to build the model
knn_clf.fit(features_train, labels_train)
# Pass the test features to the classifier to get predictions
pred = knn_clf.predict(features_test)

from sklearn.metrics import accuracy_score
# Compare the predictions with the test labels to get the accuracy
acc = accuracy_score(labels_test, pred)

print(acc)
#########################################################
#Accuracy = 0.94
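
# The choice of n_neighbors=13 and p=6 above is arbitrary; a minimal sketch of
# tuning those hyperparameters with a grid search (assumes features_train and
# labels_train from above; the parameter ranges are illustrative):
from sklearn.model_selection import GridSearchCV

param_grid = {"n_neighbors": [3, 5, 9, 13, 21], "p": [1, 2, 6]}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring="accuracy")
grid.fit(features_train, labels_train)
print(grid.best_params_, grid.best_score_)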

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    pass
    if opts.print_cm:
        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time


results = []
for clf, name in ((RidgeClassifier(tol=1e-2, solver="sag"),
                   "Ridge Classifier"), (Perceptron(max_iter=50,
                                                    tol=1e-3), "Perceptron"),
                  (PassiveAggressiveClassifier(max_iter=50, tol=1e-3),
                   "Passive-Aggressive"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN"),
                  (RandomForestClassifier(n_estimators=100), "Random forest")):
    print('=' * 80)
    print(name)
    results.append(benchmark(clf))

for penalty in ["l2", "l1"]:
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    results.append(benchmark(LinearSVC(penalty=penalty, dual=False, tol=1e-3)))

    # Train SGD model
    results.append(
        benchmark(SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))
Example #4
X = pd.read_csv('.\\Datasets\\wheat.data', index_col=0)  # load dataset
X.dropna(inplace=True)

y = X.wheat_type.copy().map({'canadian': 0, 'kama': 1, 'rosa': 2})  # create numeric labels
X.drop('wheat_type', axis=1, inplace=True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=7)

svc = SVC(kernel='linear', C=C)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier(max_depth=9, random_state=2)

benchmark(decision_tree_model, X_train, X_test, y_train, y_test,
          'Decision Tree Classifier')
drawPlots(decision_tree_model, X_train, X_test, y_train, y_test,
          'Decision Tree Classifier')

benchmark(knn, X_train, X_test, y_train, y_test, 'KNeighbors')
drawPlots(knn, X_train, X_test, y_train, y_test, 'KNeighbors')

benchmark(svc, X_train, X_test, y_train, y_test, 'SVC')
drawPlots(svc, X_train, X_test, y_train, y_test, 'SVC')
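
# The benchmark and drawPlots helpers are defined elsewhere and not shown; a
# minimal sketch of what a benchmark-style helper might do (illustrative only,
# not the original implementation):
def benchmark_sketch(model, X_train, X_test, y_train, y_test, title):
    # fit on the training split, score on the held-out split
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print("{} accuracy: {:.3f}".format(title, score))
    return score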
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.35,
                                                    random_state=0)

#feature scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting classifier to the Training set
from sklearn.neighbors import KNeighborsClassifier

classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap

X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
def train_test(code_classifier):
    training_set = []
    class_training = []
    annot_training = []
    total_training_set = []
    total_class_training = []

    testing_set = []
    class_testing = []
    annot_testing = []
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    Prec = 0
    Rec = 0
    Fscore = 0
    Spec = 0
    res_sequence = []
    list_name = read_training_testing_file(2)
    if code_classifier == 0:
        clf = svm.SVC()  #
        clf_name = "SVM"
    elif code_classifier == 1:
        clf = tree.DecisionTreeClassifier(random_state=1)
        clf_name = "Decision_Tree"
    elif code_classifier == 2:
        clf = KNeighborsClassifier(n_neighbors=1, leaf_size=40)
        clf_name = "KNN"
    else:
        clf = LogisticRegression(C=1e8)
        clf_name = "Logistic"

    for name in list_name:

        for sub_name in list_name:
            if name == sub_name:
                print "loading test samples"
                testing_set, class_testing = read_file(sub_name)
            else:
                print "loading training samples " + str(sub_name)
                training_set, class_training = read_file(sub_name)
                for i in range(len(training_set)):
                    total_training_set.append(training_set[i])
                    total_class_training.append(class_training[i])

        print "training and testing " + name
        print len(total_training_set)
        clf = clf.fit(total_training_set, total_class_training)

        prediction_val = clf.predict(testing_set)

        TP, FP, TN, FN = calc_metrics(prediction_val, class_testing, name)
        #the case when the result is on the edge
        if TP == 0 and FP == 0:
            Prec = 0
        else:
            Prec = float(TP) / (TP + FP) * 100
        Rec = float(TP) / (TP + FN) * 100
        Fscore = float((2 * TP)) / ((2 * TP) + FP + FN) * 100
        Spec = float(TN) / (FP + TN) * 100
        del total_training_set[:]
        del total_class_training[:]

        result_metric = [name, TP, FP, TN, FN, Prec, Rec, Fscore, Spec]
        res_sequence.append(result_metric)

        #print_read_classifier(fin_ml, clf_name, code, percentage, True)

    return res_sequence
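
# A hedged usage sketch for train_test (classifier codes follow the mapping
# above: 0 = SVM, 1 = Decision Tree, 2 = KNN, anything else = Logistic Regression):
for row in train_test(2):
    print(row)  # [name, TP, FP, TN, FN, Prec, Rec, Fscore, Spec]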
print(counter)
# print(df)
# print(df.shape);
"""
KNN
"""
#Create arrays for features and target variable
y = df['osteoporosis']
X = df.drop('osteoporosis', axis=1)

#train test split
(X_train, X_test, y_train, y_test) = train_test_split(X,
                                                      y,
                                                      test_size=0.2,
                                                      random_state=42,
                                                      stratify=y)

knn = KNeighborsClassifier(n_neighbors=3,
                           algorithm='brute',
                           metric='euclidean')

cv_results = cross_val_score(knn, X, y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" %
      (cv_results.mean(), cv_results.std() * 2))
print("Execution time : %0.3f seconds" % (time.time() - start_time))
"""
Accuracy: 0.56 (+/- 0.12)
Execution time : 2.072 seconds
"""
Example #8
def knnClassify(trainData, trainLabel):
    knnClf = KNeighborsClassifier()  # default k=5; set explicitly, e.g. KNeighborsClassifier(n_neighbors=10)
    knnClf.fit(trainData, ravel(trainLabel))  # ravel returns a contiguous flattened array
    return knnClf
    kf = ms.KFold(n_splits=5, shuffle=True, random_state=42)

    k_score = list()

    for k in range(1, 51):
        scores = ms.cross_val_score(estimator=KNeighborsClassifier(n_neighbors=k),
                                    X=X, y=y, cv=kf, scoring='accuracy')
        k_score.append((scores.mean(), k))

    print(sorted(k_score, reverse=True))
features = scaler.fit_transform(features)

### Try a variety of classifiers
# Import classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
#from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

# Initialize classifiers
clf_NB = GaussianNB()
clf_DT = tree.DecisionTreeClassifier(min_samples_split=5, criterion='entropy')
#clf_SVC = SVC()
clf_KN = KNeighborsClassifier()
clf_RF = RandomForestClassifier()
clf_AB = AdaBoostClassifier()

# Leverage tester.py to fit and test the classifiers
test_classifier(clf_NB, my_dataset, features_list)
#test_classifier(clf_DT, my_dataset, features_list)
#test_classifier(clf_SVC, my_dataset, features_list)
#test_classifier(clf_KN, my_dataset, features_list)
#test_classifier(clf_RF, my_dataset, features_list)
#test_classifier(clf_AB, my_dataset, features_list)

# Apply Grid Search to fine tune the parameters
from sklearn.model_selection import GridSearchCV

# Set the parameters for my two chosen classifiers
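
# The parameter grids themselves are not shown; a minimal hedged sketch of what a
# grid search over the KNN and Decision Tree classifiers could look like, using the
# GridSearchCV imported above (grids and scoring choice are illustrative, not the
# author's actual settings):
kn_grid = GridSearchCV(clf_KN, {'n_neighbors': [3, 5, 7, 9],
                                'weights': ['uniform', 'distance']}, scoring='f1')
dt_grid = GridSearchCV(clf_DT, {'min_samples_split': [2, 5, 10, 20],
                                'max_depth': [None, 3, 5, 10]}, scoring='f1')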
feature_data = pd.DataFrame(x_scaled, columns=feature_data.columns)

body_type_labels = body_type_df["body_type"]
body_type_train, body_type_test, body_type_labels_train, body_type_labels_test = train_test_split(
    feature_data,
    body_type_labels,
    train_size=0.8,
    test_size=0.2,
    random_state=6)

### CLASSIFICATION

start = timeit.default_timer()

### KNeighbors
classifier = KNeighborsClassifier(n_neighbors=80)

classifier.fit(body_type_train, body_type_labels_train)

print("Score: " + str(classifier.score(body_type_test, body_type_labels_test)))

prediction = classifier.predict(body_type_test)

stop = timeit.default_timer()

print("\nAccuracy score: " +
      str(accuracy_score(body_type_labels_test, prediction)))
print("Recall score: " +
      str(recall_score(body_type_labels_test, prediction, average='micro')))
print("Precision score: " +
      str(precision_score(body_type_labels_test, prediction, average='micro')))
def random_subspace(n_estimators, M0, M1, verbose=False):
    if n_estimators == 0:
        ave_sub_model_acc = 0
        acc = 0
        sub_model_accuracies = []
    else:
        standard = False
        # M__pca_ideal = 147
        # M__lda_ideal = 46
        # if verbose:
        #    print ('M__pca_ideal = ', M__pca_ideal)
        #    print ('M__lda_ideal = ', M__lda_ideal)

        M_pca_bag = N - 1

        M_pca = 147  # M__pca_ideal
        M_lda = 46  # M__lda_ideal

        assert (M1 <= (N - 1 - M0))
        assert (M0 + M1 > M_lda)

        estimators = [('lda', LinearDiscriminantAnalysis(n_components=M_lda)), ('knn', KNeighborsClassifier(n_neighbors=1))]

        base_est = Pipeline(estimators)

        base_est.fit(X_train.T, y_train.T.ravel())

        acc = base_est.score(X_test.T, y_test.T.ravel())
        if verbose:
            print('Accuracy of base estimator with no pre PCA = %.2f%%' % (acc * 100))

        pca = PCA(n_components=M_pca_bag)
        W_train = pca.fit_transform(X_train.T)
        W_test = pca.transform(X_test.T)

        base_est.fit(W_train, y_train.T.ravel())

        acc = base_est.score(W_test, y_test.T.ravel())
        if verbose:
            print('Accuracy of base estimator with pre PCA applied = %.2f%%' % (acc * 100))

        estimators = []
        sub_model_accuracies = []
        masks = []

        for i in range(n_estimators):

            mask0 = np.arange(M0)
            mask1 = np.random.choice(np.arange(M0, (N - 1)), M1, replace=False)

            mask1 = np.array(mask1).ravel()

            mask = np.concatenate((mask0, mask1), axis=None)
            masks.append(mask)

            W_bag = W_train[:, mask]
            y_bag = y_train

            estimator = clone(base_est)

            estimator.fit(W_bag, y_bag.T.ravel())

            name = 'est_' + str(i + 1)
            estimators.append((name, estimator))

            sub_model_acc = estimator.score(W_test[:, mask], y_test.T.ravel())
            sub_model_accuracies.append(sub_model_acc)
            if verbose:
                print('Accuracy of sub model ', i + 1, ' = %.2f%%' % (sub_model_acc * 100))

        ave_sub_model_acc = sum(sub_model_accuracies) / n_estimators
        if verbose:
            print('Average accuracy of sub models = %.2f%%' % (ave_sub_model_acc * 100))

        y_hat = []

        for w in W_test:
            prediction_sum = 0
            predictions = np.empty(n_estimators, dtype=np.int64)
            for i, (name, estimator) in enumerate(estimators):
                y = estimator.predict(w[masks[i]].reshape(1, -1))

                prediction_sum = prediction_sum + float(y[0])
                predictions[i] = int(y[0])
            #sum
            prediction = round(prediction_sum / n_estimators)
            # y_hat.append(prediction)
            #voting
            counts = np.bincount(predictions)
            y_hat.append(np.argmax(counts))

        acc = accuracy_score(y_test.T, y_hat)
        if verbose:
            print('Accuracy of ensemble models = %.2f%%' % (acc * 100))

    return acc, ave_sub_model_acc, sub_model_accuracies
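
# A hedged usage sketch for random_subspace; it relies on module-level names such
# as N, X_train, X_test, y_train and y_test being defined beforehand, and the
# argument values below are illustrative (they must satisfy the asserts inside):
acc, ave_sub_acc, sub_accs = random_subspace(n_estimators=10, M0=20, M1=120, verbose=True)
print('Ensemble accuracy: %.2f%%' % (acc * 100))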
Example #13
def first_generation(X, y, seed=None):
    mlp_parameters = list(
        itertools.product([1, 2, 4, 8, 16], [0, 0.2, 0.5, 0.9], [0.3, 0.6]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ),
                      momentum=m,
                      learning_rate_init=a) for (h, m, a) in mlp_parameters
    ]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 20)]
    weighting_methods = ['uniform', 'distance', lambda x: abs(1 - x)]
    knn_clf = [
        KNeighborsClassifier(n_neighbors=nn, weights=w)
        for (nn, w) in itertools.product(neigbhors_number, weighting_methods)
    ]
    knn_name = [
        'knn_{0}_{1}'.format(*param) for param in itertools.product(
            neigbhors_number, ['uniform', 'distance', 'similarity'])
    ]

    C = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 10, 100]
    degree = [2, 3]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [
        SVC(C=c, kernel='poly', degree=d)
        for (c, d) in itertools.product(C, degree)
    ]
    svm_clf_poly_name = [
        'svm_poly_{0}_{1}'.format(*param)
        for param in itertools.product(C, degree)
    ]
    svm_clf_rbf = [
        SVC(C=c, kernel='rbf', gamma=g)
        for (c, g) in itertools.product(C, gamma)
    ]
    svm_clf_rbf_name = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]

    dt_max_depth_params = list(
        itertools.product(['gini', 'entropy'], [1, 2, 3, 4, None]))
    dt_max_depth_clf = [DecisionTreeClassifier(criterion=c, max_depth=d) \
                        for (c, d) in dt_max_depth_params]
    dt_max_depth_name = [
        'dt_max_depth_{0}_{1}'.format(*param) for param in dt_max_depth_params
    ]

    dt_max_features_params = list(
        itertools.product(['gini', 'entropy'], [None, 'sqrt', 'log2', 0.5]))
    dt_max_features_clf = [DecisionTreeClassifier(criterion=c, max_features=f) \
                           for (c, f) in dt_max_features_params]
    dt_max_features_name = [
        'dt_max_features_{0}_{1}'.format(*param)
        for param in dt_max_features_params
    ]

    dt_min_leaf_params = [2, 3]
    dt_min_leaf_clf = [
        DecisionTreeClassifier(min_samples_leaf=l) for l in dt_min_leaf_params
    ]
    dt_min_leaf_name = [
        'dt_min_leaf_{0}'.format(param) for param in dt_min_leaf_params
    ]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_max_depth_clf + dt_max_features_clf + \
           dt_min_leaf_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_max_depth_name + \
                dt_max_features_name + dt_min_leaf_name

    ensemble = VotingClassifier(estimators=list(zip(pool_name, pool)))
    ensemble.fit(X, y)
    estimators = ensemble.estimators_

    return estimators, pool_name
Example #14
def third_generation(X, y, size=200, seed=None):
    mlp_parameters = list(itertools.product([1, 2, 4, 8, 32, 128],\
                                            [0, 0.2, 0.5, 0.9],
                                            [0.1, 0.3, 0.6]))
    mlp_clf = [
        MLPClassifier(hidden_layer_sizes=(h, ),
                      momentum=m,
                      learning_rate_init=a) for (h, m, a) in mlp_parameters
    ]
    mlp_name = ['mlp_{0}_{1}_{2}'.format(*param) for param in mlp_parameters]

    neigbhors_number = [int(i) for i in np.linspace(1, X.shape[0], 40)]
    weighting_methods = ['uniform', 'distance']
    knn_clf = [
        KNeighborsClassifier(n_neighbors=nn, weights=w)
        for (nn, w) in itertools.product(neigbhors_number, weighting_methods)
    ]
    knn_name = [
        'knn_{0}_{1}'.format(*param) for param in itertools.product(
            neigbhors_number, ['uniform', 'distance'])
    ]
    C = np.logspace(-3, 7, num=11)
    degree = [2, 3, 4]
    gamma = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]
    svm_clf_poly = [
        SVC(C=c, kernel='poly', degree=d)
        for (c, d) in itertools.product(C, degree)
    ]
    svm_clf_poly_name = [
        'svm_poly_{0}_{1}'.format(*param)
        for param in itertools.product(C, degree)
    ]
    svm_clf_rbf = [
        SVC(C=c, kernel='rbf', gamma=g)
        for (c, g) in itertools.product(C, gamma)
    ]
    svm_clf_rbf_name = [
        'svm_rbf_{0}_{1}'.format(*param)
        for param in itertools.product(C, gamma)
    ]

    dt_params = list(itertools.product(['gini', 'entropy'], \
                                       [1, 2, 3, 4, 5, None], \
                                       [None, 'sqrt', 'log2'], \
                                       ['best', 'random']))
    dt_clf = [
        DecisionTreeClassifier(criterion=c,
                               max_depth=d,
                               max_features=f,
                               splitter=s) for (c, d, f, s) in dt_params
    ]
    dt_name = ['dt_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    et_clf = [
        ExtraTreeClassifier(criterion=c,
                            max_depth=d,
                            max_features=f,
                            splitter=s) for (c, d, f, s) in dt_params
    ]
    et_name = ['et_{0}_{1}_{2}_{3}'.format(*param) for param in dt_params]

    ada_params = list(itertools.product([2**i for i in range(1, 14)], \
                                        [1, 2, 3]))
    ada_dt_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=DecisionTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_et_clf = [
        AdaBoostClassifier(n_estimators=n,
                           base_estimator=ExtraTreeClassifier(max_depth=m))
        for (n, m) in ada_params
    ]
    ada_dt_name = ['ada_dt_{0}_{1}'.format(*param) for param in ada_params]
    ada_et_name = ['ada_et_{0}_{1}'.format(*param) for param in ada_params]

    nb_bag_est = 50
    nb_bag_stumps = 200
    bag_dt = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=DecisionTreeClassifier())
    bag_et = BaggingClassifier(n_estimators=nb_bag_est,
                               base_estimator=ExtraTreeClassifier())
    bag_stumps = BaggingClassifier(
        n_estimators=nb_bag_stumps,
        base_estimator=DecisionTreeClassifier(max_depth=1))
    bag_dt.fit(X, y)
    bag_et.fit(X, y)
    bag_stumps.fit(X, y)
    dt_bag_clf = bag_dt.estimators_
    et_bag_clf = bag_et.estimators_
    stump_bag_clf = bag_stumps.estimators_
    dt_bag_name = ['dt_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    et_bag_name = ['et_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_est)]
    stump_bag_name = [
        'stump_bag_{0}'.format(nb_est) for nb_est in range(nb_bag_stumps)
    ]

    bag_dt_clf = [bag_dt]
    bag_et_clf = [bag_et]
    bag_stump_clf = [bag_stumps]
    bag_dt_name = ['bag_dt_{0}'.format(str(nb_bag_est))]
    bag_et_name = ['bag_et_{0}'.format(str(nb_bag_est))]
    bag_stump_name = ['bag_stump_{0}'.format(str(200))]

    nb_rf = 15
    rf = RandomForestClassifier(n_estimators=nb_rf)
    rf.fit(X, y)
    dt_rf_clf = rf.estimators_
    dt_rf_name = ['dt_rf_{0}'.format(nb_est) for nb_est in range(nb_rf)]

    log_parameters = list(itertools.product(['l1', 'l2'],\
                                            np.logspace(-5, 9, num=15),
                                            [True, False]))
    log_clf = [
        LogisticRegression(penalty=l, C=c, fit_intercept=f)
        for (l, c, f) in log_parameters
    ]
    log_name = ['log_{0}_{1}_{2}'.format(*param) for param in log_parameters]

    sgd_parameters = list(
        itertools.product([
            'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
            'squared_loss', 'huber', 'epsilon_insensitive',
            'squared_epsilon_insensitive'
        ], ['elasticnet'], [True, False], np.arange(0, 1.1, 0.1)))
    sgd_clf = [
        SGDClassifier(loss=l, penalty=p, fit_intercept=f, l1_ratio=l1)
        for (l, p, f, l1) in sgd_parameters
    ]
    sgd_name = [
        'sgd_{0}_{1}_{2}_{3}'.format(*param) for param in sgd_parameters
    ]

    pool = mlp_clf + knn_clf + svm_clf_poly + svm_clf_rbf + dt_clf + et_clf + ada_dt_clf + ada_et_clf + \
                dt_bag_clf + et_bag_clf + stump_bag_clf + bag_dt_clf + bag_et_clf + bag_stump_clf + dt_rf_clf + \
                log_clf + sgd_clf
    pool_name = mlp_name + knn_name + svm_clf_poly_name + svm_clf_rbf_name + dt_name + et_name + ada_dt_name + \
                ada_et_name + dt_bag_name + et_bag_name + stump_bag_name + bag_dt_name + bag_et_name + \
                bag_stump_name + dt_rf_name + log_name + sgd_name

    for model in pool:
        if not check_model_is_fitted(model, X[0, :].reshape((1, -1))):
            model.fit(X, y)

    np.random.seed(seed)
    order = np.random.permutation(range(len(pool)))
    estimators = [pool[i] for i in order[:size]]

    return estimators, pool_name
#CV = model_selection.LeaveOneOut()
errors = np.zeros((K,L))
i=0
for train_index, test_index in CV.split(X):
    print('Crossvalidation fold: {0}/{1}'.format(i+1,K))    
    
    # extract training and test set for current CV fold
    X_train = X[train_index,:]
    y_train = y[train_index]
    X_test = X[test_index,:]
    y_test = y[test_index]
#    print(test_index)
    # Fit classifier and classify the test points (consider 1 to 40 neighbors)
    
    for l in range(1, L+1):
        knclassifier = KNeighborsClassifier(n_neighbors=l)
        knclassifier.fit(X_train, y_train)
        y_est = knclassifier.predict(X_test)
        errors[i,l-1] = np.sum(y_est!=y_test)
    i+=1
    
# Plot the classification error rate
#figure()
#plot(100*sum(errors,0)/N)
#xlabel('Number of neighbors')
#ylabel('Classification error rate (%)')
#show()

#error_sum = 0
#for i in range(0,len(y_test)-1):
#    if y_test[i]!=y_est[i]:
print(classification_report(y_test, y_pred))

# Accuracy
from sklearn.metrics import accuracy_score
print("Accuracy: ",accuracy_score(y_test, y_pred))# Recall
from sklearn.metrics import recall_score
print("Recall: ",recall_score(y_test, y_pred, average='weighted'))# Precision
from sklearn.metrics import precision_score
print("Precision: ",precision_score(y_test, y_pred, average='weighted'))



"""# **ML Model**"""

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_knn_pred = knn.predict(X_test)

# Accuracy
from sklearn.metrics import accuracy_score
print("Accuracy: ",accuracy_score(y_test, y_knn_pred))# Recall
from sklearn.metrics import recall_score
print("Recall: ",recall_score(y_test, y_knn_pred, average='weighted'))# Precision
from sklearn.metrics import precision_score
print("Precision: ",precision_score(y_test, y_knn_pred, average='weighted'))

"""# **Simple Neural Network**"""

# Mini-batch size used when training the network
BATCH_SIZE = 1000
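
# The neural-network code is truncated here; a minimal stand-in sketch using
# scikit-learn's MLPClassifier with the batch size above (the original likely used
# a dedicated deep-learning framework, so treat this as illustrative only):
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(32,), batch_size=BATCH_SIZE, max_iter=50)
mlp.fit(X_train, y_train)
print("MLP accuracy:", mlp.score(X_test, y_test))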
Example #17
# Normalize data
X = preprocessing.StandardScaler().fit(X).transform(X.astype(float))

# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

# Classification with KNN
# import library from Sklearn
from sklearn.neighbors import KNeighborsClassifier
# Training
k = 4
#Train Model and Predict
neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
neigh
#Predicting
yhat = neigh.predict(X_test)
# Accuracy evaluation
from sklearn import metrics
print("Train set Accuracy: ", metrics.accuracy_score(y_train, neigh.predict(X_train)))
print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))

# Find the best K
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfusionMx = []
for n in range(1, Ks):
    # train the model with k = n, predict, and record the accuracy
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    yhat = neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)
    std_acc[n-1] = np.std(yhat == y_test) / np.sqrt(yhat.shape[0])
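
# A short hedged follow-up, assuming the loop above filled mean_acc: report the
# best k that was found.
print("The best accuracy was", mean_acc.max(), "with k =", mean_acc.argmax() + 1)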
Example #18
# Let's look at these 6 algorithms:
#
# Logistic Regression (LR)
# Linear Discriminant Analysis (LDA)
# K-Nearest Neighbors (KNN)
# Classification and Regression Trees (CART)
# Gaussian Naive Bayes (NB)
# Support Vector Machines (SVM)
# The mix contains both simple linear algorithms (LR and LDA) and non-linear ones (KNN, CART, NB and SVM).
# We reset the random seed before running each algorithm so that every model is evaluated on exactly the
# same data splits, which keeps the final results directly comparable.
#
# Let's build and evaluate the models:
# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))
# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
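
# A minimal hedged sketch for comparing the algorithms visually (assumes the
# results and names lists filled in the loop above and that matplotlib is available):
import matplotlib.pyplot as plt

fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()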
Example #19
def predefined_estimators(estimator, random_state, n_jobs, p):
    """
    Provides the classifiers and parameters used by the module

    Parameters
    -----------
    estimator : str
        Name of scikit learn estimator.

    random_state : Any number
        Seed to use in randomized components.

    n_jobs : int
        Number of processing cores to use.

    p : dict
        Classifier settings (keys) and values.

    Returns
    -------
    clf : object
        Scikit-learn classifier object

    mode : str
        Flag to indicate whether classifier performs classification or
        regression.
    """
    try:
        from sklearn.experimental import enable_hist_gradient_boosting
    except ImportError:
        pass

    from sklearn.linear_model import (
        LogisticRegression,
        LinearRegression,
        SGDRegressor,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.ensemble import (
        RandomForestClassifier,
        RandomForestRegressor,
        ExtraTreesClassifier,
        ExtraTreesRegressor,
    )
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor)
    from sklearn.svm import SVC, SVR
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor

    estimators = {
        "SVC":
        SVC(C=p["C"], probability=True, random_state=random_state),
        "SVR":
        SVR(C=p["C"], epsilon=p["epsilon"]),
        "LogisticRegression":
        LogisticRegression(
            C=p["C"],
            solver="liblinear",
            random_state=random_state,
            multi_class="auto",
            n_jobs=1,
            fit_intercept=True,
        ),
        "LinearRegression":
        LinearRegression(n_jobs=n_jobs, fit_intercept=True),
        "SGDClassifier":
        SGDClassifier(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            n_jobs=n_jobs,
            random_state=random_state,
        ),
        "SGDRegressor":
        SGDRegressor(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            random_state=random_state,
        ),
        "DecisionTreeClassifier":
        DecisionTreeClassifier(
            max_depth=p["max_depth"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "DecisionTreeRegressor":
        DecisionTreeRegressor(
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "RandomForestClassifier":
        RandomForestClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "RandomForestRegressor":
        RandomForestRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "ExtraTreesClassifier":
        ExtraTreesClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            bootstrap=True,
            oob_score=True,
        ),
        "ExtraTreesRegressor":
        ExtraTreesRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            bootstrap=True,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "GradientBoostingClassifier":
        GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "GradientBoostingRegressor":
        GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "HistGradientBoostingClassifier":
        GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "HistGradientBoostingRegressor":
        GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "MLPClassifier":
        MLPClassifier(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "MLPRegressor":
        MLPRegressor(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "GaussianNB":
        GaussianNB(),
        "LinearDiscriminantAnalysis":
        LinearDiscriminantAnalysis(),
        "QuadraticDiscriminantAnalysis":
        QuadraticDiscriminantAnalysis(),
        "KNeighborsClassifier":
        KNeighborsClassifier(n_neighbors=p["n_neighbors"],
                             weights=p["weights"],
                             n_jobs=n_jobs),
        "KNeighborsRegressor":
        KNeighborsRegressor(n_neighbors=p["n_neighbors"],
                            weights=p["weights"],
                            n_jobs=n_jobs),
    }

    # define classifier
    model = estimators[estimator]

    # classification or regression
    classification_estimators = {
        "LogisticRegression", "SGDClassifier", "MLPClassifier",
        "DecisionTreeClassifier", "RandomForestClassifier",
        "ExtraTreesClassifier", "GradientBoostingClassifier",
        "HistGradientBoostingClassifier", "GaussianNB",
        "LinearDiscriminantAnalysis", "QuadraticDiscriminantAnalysis",
        "SVC", "KNeighborsClassifier",
    }
    if estimator in classification_estimators:
        mode = "classification"
    else:
        mode = "regression"

    return (model, mode)
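
# A hedged usage sketch for predefined_estimators; the keys in p must match what
# the chosen estimator expects, and the values below are illustrative:
clf, mode = predefined_estimators(
    estimator="KNeighborsClassifier",
    random_state=42,
    n_jobs=1,
    p={"n_neighbors": 5, "weights": "uniform"},
)
print(clf, mode)  # -> KNeighborsClassifier(...) classification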
Example #20
def main(verbose):

    # Data to train/test
    sentences, language = prepare_data()
    tests_language, tests_text = get_directory_content(
        "identification_langue/corpus_test1/*.txt")

    # Use cases for test
    test_cases = [
        ClassifierTest('MultinomialNB', MultinomialNB(), 1),
        ClassifierTest('MultinomialNB', MultinomialNB(), 2),
        ClassifierTest('MultinomialNB', MultinomialNB(), 3),
        ClassifierTest('LogisticRegression', LogisticRegression(), 1),
        ClassifierTest('LogisticRegression', LogisticRegression(), 2),
        ClassifierTest('LogisticRegression', LogisticRegression(), 3),
        ClassifierTest('KNeighborsClassifier 3 neighbors',
                       KNeighborsClassifier(3), 1),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 3 neighbors',
                       KNeighborsClassifier(3), 2),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 3 neighbors',
                       KNeighborsClassifier(3), 3),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 5 neighbors',
                       KNeighborsClassifier(5), 1),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 5 neighbors',
                       KNeighborsClassifier(5), 2),  # Strange predictions
        ClassifierTest('KNeighborsClassifier 5 neighbors',
                       KNeighborsClassifier(5), 3),  # Strange predictions
        ClassifierTest('LinearSVC', LinearSVC(random_state=0, tol=1e-5),
                       1),  # -
        ClassifierTest('LinearSVC', LinearSVC(random_state=0, tol=1e-5),
                       2),  # GOOD
        ClassifierTest('LinearSVC', LinearSVC(random_state=0, tol=1e-5),
                       3),  # GOOD
        ClassifierTest('SVC gamma auto', SVC(gamma='auto'), 1),  # strange
        ClassifierTest('SVC gamma auto', SVC(gamma='auto'), 2),  # strange
        ClassifierTest('SVC gamma auto', SVC(gamma='auto'), 3),  # strange
        ClassifierTest(
            'SVC avec linear', SVC(kernel="linear", C=0.025),
            1),  # This linear with 1 gram its better than the other class
        ClassifierTest('SVC avec linear', SVC(kernel="linear", C=0.025),
                       2),  # GOOD
        ClassifierTest('SVC avec linear', SVC(kernel="linear", C=0.025),
                       3),  # GOOD
        ClassifierTest('SVC gamma 2', SVC(gamma=2, C=1),
                       1),  # always english...
        ClassifierTest('SVC gamma 2', SVC(gamma=2, C=1),
                       2),  # always english...
        ClassifierTest('SVC gamma 2', SVC(gamma=2, C=1),
                       3),  # always english...
        ClassifierTest('DecisionTreeClassifier',
                       DecisionTreeClassifier(max_depth=5), 1),  # very bad
        ClassifierTest('DecisionTreeClassifier',
                       DecisionTreeClassifier(max_depth=5),
                       2),  # Strange results
        ClassifierTest('DecisionTreeClassifier',
                       DecisionTreeClassifier(max_depth=5),
                       3),  # Strange results
        ClassifierTest('SGDClassifier ', SGDClassifier(max_iter=1000), 1),  # 
        ClassifierTest('SGDClassifier ', SGDClassifier(max_iter=1000),
                       2),  # GOOD
        ClassifierTest('SGDClassifier ', SGDClassifier(max_iter=1000),
                       3),  # GOOD      

        ### ClassifierTest('GaussianNB', GaussianNB(), 1), # Doesn't work... "too dense" error
    ]

    # Just to show header
    headerClassifier = ClassifierTest('Header', None, 0)
    print(headerClassifier.str_keys())

    # tests our cases
    for test_case in test_cases:  #[:1]
        classifier = Classifier(test_case, language, sentences, verbose=False)

        predictions = []
        for test in tests_text:
            prediction = classifier.predict(test)
            predictions.append(prediction[0])
            if (verbose):
                print('# Prediction: {} | Text: {}'.format(
                    prediction, test[:70].replace("\n", "")))

        mean = np.mean(np.array(predictions) == tests_language)

        print("{}{}{} | {}".format(bcolors.HEADER, test_case, bcolors.ENDC,
                                   mean))
Example #21
Ytest = Xtest - np.ones((Ntest,1))*X.mean(0)

# Obtain the PCA solution by calculating the SVD of Y
U,S,V = linalg.svd(Y,full_matrices=False)
V = V.T


# Repeat classification for different values of K
error_rates = []
for k in K:
    # Project data onto principal component space,
    Z = Y @ V[:,:k]
    Ztest = Ytest @ V[:,:k]

    # Classify data with knn classifier
    knn_classifier = KNeighborsClassifier(n_neighbors=1)
    knn_classifier.fit(Z,y.ravel())
    y_estimated = knn_classifier.predict(Ztest)

    # Compute classification error rates
    y_estimated = y_estimated.T
    er = (sum(ytest!=y_estimated)/float(len(ytest)))*100
    error_rates.append(er)
    print('K={0}: Error rate: {1:.1f}%'.format(k, er))

# Visualize error rates vs. number of principal components considered
figure()
plot(K,error_rates,'o-')
xlabel('Number of principal components K')
ylabel('Error rate [%]')
show()
Example #22
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from Group4_SelectProcessOrganize import *

if __name__ == '__main__':
    #defining local variables, isolating explanatory variables from response
    feature_count = 8
    preprocessed = run_data_job()
    test_partition = 0.4
    X = preprocessed[:, :feature_count]
    y = preprocessed[:, feature_count]
    #Show us what we're working with
    print("Size of Feature Data : ", X.shape)
    print("Size of Label Data : ", y.shape)
    #Generate random test and train sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_partition)
    #a couple model types we experimented with
    default_model = KNeighborsClassifier(n_neighbors=2)
    LDA_model = LinearDiscriminantAnalysis()
    LR_model = LogisticRegression()
    #fit the default model with the training set and generate predictions for the test set
    default_model.fit(X_train, y_train)
    y_modeled = default_model.predict(X_test)
    #evaluate the model
    delta = abs(y_modeled - y_test)
    error_count = np.count_nonzero(delta)
    print("Classifier Accuracy:", 1 - (error_count / len(y_test)))
    print("Average Absolute Error", np.mean(delta))
    print(confusion_matrix(y_test, y_modeled))
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

# define a dictionary for different classifiers and their parameters
classifiers = {
    "Dummy"        : DummyClassifier(strategy='uniform', random_state=2),
    "KNN(3)"       : KNeighborsClassifier(3), 
    "RBF SVM"      : SVC(gamma=2, C=1), 
    "Decision Tree": DecisionTreeClassifier(max_depth=7), 
    "Random Forest": RandomForestClassifier(max_depth=7, n_estimators=10, max_features=4), 
    "xgboost"      : XGBClassifier(),
    "Neural Net"   : MLPClassifier(alpha=1), 
    "AdaBoost"     : AdaBoostClassifier(),
    "Naive Bayes"  : GaussianNB(), 
    "QDA"          : QuadraticDiscriminantAnalysis(),
    "Linear SVC"   : LinearSVC(),
    "Linear SVM"   : SVC(kernel="linear"), 
    "Gaussian Proc": GaussianProcessClassifier(1.0 * RBF(1.0)),
}
from time import time
nfast = 10      # Run only the first nfast learners; don't run the very slow ones at the end
head = list(classifiers.items())[:nfast]
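
# The evaluation loop itself is not shown; a minimal hedged sketch of fitting and
# timing the first nfast classifiers (assumes X_train, X_test, y_train and y_test
# already exist):
for clf_name, clf in head:
    t0 = time()
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%-14s accuracy=%.3f  train time=%.2fs" % (clf_name, score, time() - t0))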
Example #24
docs = corpus.split('\n')
X, y = [], []
for doc in docs:
    i, l = doc.split(':')
    X.append(i.strip())
    y.append(l.strip())

#Structure input data
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_X = vec.fit_transform(X)

#Applying K-Nearest Neighbors classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3,
                           algorithm='brute',
                           weights='distance')
knn.fit(matrix_X[:5], y[:5])
print('KNN Classifier, Label: ' + str(knn.predict(matrix_X[5])))
print('KNN Classifier, prob.' + str(knn.predict_proba(matrix_X[5])))

#Applying Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
nbc = MultinomialNB(alpha=0.2, fit_prior=False, class_prior=[0.6, 0.4])
nbc.fit(matrix_X[:5], y[:5])
print('Naive Bayes Classifier, Label: ' + str(nbc.predict(matrix_X[5])))
print('Naive Bayes Classifier, prob.' + str(nbc.predict_proba(matrix_X[5])))

#Applying Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=2)
Example #25
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE',
                          delimiter='COLUMN_SEPARATOR',
                          dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'),
                     axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    SelectPercentile(percentile=99, score_func=f_classif),
    KNeighborsClassifier(n_neighbors=2, weights="uniform"))

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
cm = confusion_matrix(y_test, svm_predictions) 
print(cm)
print(classification_report(y_test, svm_predictions, target_names = targets))
plot_confusion_matrix(svm_model_linear, X_test, y_test, normalize='true',display_labels=targets, xticks_rotation = 45) 
plt.title('Electra Query Type Classification using linear SVM')
plt.savefig('Electra Query Type Classification using linear SVM.jpg')




# In[8]:


# training a KNN classifier 
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier(n_neighbors = 5).fit(X_train, y_train) 
  
# accuracy on X_test 
accuracy = knn.score(X_test, y_test) 
print(accuracy) 
  
# creating a confusion matrix 
knn_predictions = knn.predict(X_test)  
cm = confusion_matrix(y_test, knn_predictions)
print(cm)
print(classification_report(y_test, knn_predictions, target_names = targets))
plot_confusion_matrix(knn, X_test, y_test, normalize='true',display_labels=targets, xticks_rotation = 45)
plt.title('Electra Query Type Classification using KNN classifier')
plt.savefig('Electra Query Type Classification using KNN classifier.jpg')

Example #27
def main():
    # choose which dataset(s) we are going to use ('gym' or 'song' prediction)
    for prob_name in ['gym', 'song']:

        p = global_params[prob_name]
        folds = 5

        print('\n##### Learning on {0} dataset #####\n'.format(prob_name))
        x_train, y_train, x_test, y_test = PreProcess(prob_name, False)

        learning_algos = [
            (
                DecisionTreeClassifier(
                    max_depth=p['max_depth'],
                    min_samples_split=p['min_samples_split'],
                ),
                True,  # Normalize Inputs
                {  # Grid Search Params
                    'max_depth': range(1, 30, 2),
                    'min_samples_split': range(2, 400, 20)
                },
                # chosen params
                dict(max_depth=p['max_depth'],
                     min_samples_split=p['min_samples_split'])),
            (
                KNeighborsClassifier(n_neighbors=p['n_neighbors'],
                                     weights=p['weights']),
                True,  # Normalize Inputs
                {  # Grid Search Params
                    'n_neighbors': [1] + [x * 5 for x in range(1, 9)],
                    'weights': ['distance', 'uniform']
                },
                # chosen params
                dict(n_neighbors=p['n_neighbors'], weights=p['weights'])),
            (
                AdaBoostClassifier(n_estimators=p['n_estimators'],
                                   learning_rate=p['learning_rate']),
                True,  # Normalize Inputs
                {  # Grid Search Params
                    'n_estimators': [100, 250, 400, 550, 700],
                    'learning_rate': np.logspace(-5, 0, 6)
                },
                # chosen params
                dict(n_estimators=p['n_estimators'],
                     learning_rate=p['learning_rate'])),
            (
                MLPClassifier(alpha=p['alpha'],
                              hidden_layer_sizes=p['hidden_layer_sizes'],
                              random_state=p['random_state']),
                True,  # Normalize Inputs
                {  # Grid Search Params
                    'hidden_layer_sizes': [(x * 10, ) for x in range(2, 7)] +
                    [(x * 10, y * 10) for x in range(2, 7)
                     for y in range(2, 7)] + [(x * 10, y * 10, z * 10)
                                              for x in range(2, 7)
                                              for y in range(2, 7)
                                              for z in range(2, 7)],
                    'alpha': [1e-9, 5e-8, 1e-6, 1e-5, 1e-4, 1e-3]
                },
                # chosen params
                dict(alpha=p['alpha'],
                     hidden_layer_sizes=p['hidden_layer_sizes'],
                     random_state=p['random_state'])),
            (
                svm.SVC(C=p['C'], gamma=p['gamma'], kernel=p['kernel']),
                True,  # Normalize Inputs
                [
                    {  # Grid Search Params
                        'kernel': ['rbf'],
                        'C': np.logspace(-3, 3, 7),
                        'gamma': np.logspace(-3, 3, 7)
                    },
                    {
                        'kernel': ['linear'],
                        'C': np.logspace(-3, 3, 7),
                    }
                ],
                # chosen params
                dict(C=p['C'], gamma=p['gamma'], kernel=p['kernel']))
        ]

        learning_algos_chosen = {
            'Decision Tree': learning_algos[0],
            'K-Nearest Neighbors': learning_algos[1],
            'Boosting': learning_algos[2],
            'Neural Network': learning_algos[3],
            'Support Vector Machine': learning_algos[4]
        }

        skf = StratifiedKFold(n_splits=folds)

        plotting_learning_curve, grid_search, run_cv, testing = (False, False,
                                                                 False, False)
        testing = True

        for algoName, (estimator, normalize_data, param_grid,
                       params) in learning_algos_chosen.items():
            print('\n{0} Performance\n'.format(algoName))

            if normalize_data:
                # Normalize for less sensitivity: https://www.springboard.com/blog/beginners-guide-neural-network-in-python-scikit-learn-0-18/
                scaler = StandardScaler()
                scaler.fit(x_train)
                x_train = scaler.transform(x_train)
                x_test = scaler.transform(x_test)

            if plotting_learning_curve:
                plot_learning_curve(estimator,
                                    '{0} - {1}'.format(algoName,
                                                       prob_name.upper()),
                                    x_train,
                                    y_train,
                                    train_sizes=np.linspace(.1, 1.0, 10),
                                    cv=skf)
                plt.ion()
                plt.savefig('{0}_{1}.png'.format(algoName, prob_name.upper()))
                plt.pause(0.001)
                plt.show()

            if grid_search:
                clf = None
                grid_pickle_loc = './Grid_Search_{0}_{1}'.format(
                    prob_name, algoName)

                if not os.path.exists(grid_pickle_loc):
                    print('generating grid search results')
                    # grid search and then pickle the cv_results
                    clf = GridSearchCV(estimator,
                                       param_grid,
                                       verbose=3,
                                       cv=folds)
                    clf.fit(x_train, y_train)
                    results = clf
                    pickle_out = open(grid_pickle_loc, "wb")
                    pickle.dump(results, pickle_out)
                    pickle_out.close()
                else:
                    print('loading pickled grid search results')
                    clf = pickle.load(open(grid_pickle_loc, "rb"))
                    import ipdb
                    ipdb.set_trace()

            if run_cv:
                # cross validation
                print(params)
                scores = cross_val_score(estimator,
                                         x_train,
                                         y_train,
                                         cv=folds,
                                         verbose=3)
                print("\tCross Validation Accuracy: %0.2f (+/- %0.2f)" %
                      (scores.mean(), scores.std() * 2))

            if testing:
                # testing on held out test set
                print(params)
                t1 = time.time()
                estimator.fit(x_train, y_train)
                t2 = time.time()
                avg_runtime = str(
                    datetime.timedelta(seconds=((t2 - t1) / folds)))
                print('Wall Clock Time: {0}'.format(avg_runtime))
                y_predict = estimator.predict(x_test)
                print("\tTest Set Accuracy:{0}".format(
                    np.count_nonzero(y_predict == y_test) / len(y_test)))
Example #28
def knn_classifier(train_x, train_y):  
    from sklearn.neighbors import KNeighborsClassifier  
    model = KNeighborsClassifier()  
    model.fit(train_x, train_y)  
    return model  
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
#Create a GaussianNB object
gnb = GaussianNB()
pred = gnb.fit(X, Y).predict(x)
print("Naive-Bayes accuracy: ", accuracy_score(y, pred, normalize=True))

#2. Linear Support Vector Classifier
from sklearn.svm import LinearSVC
svc_model = LinearSVC(random_state=0)
pred = svc_model.fit(X, Y).predict(x)
print("Linear SVC accuracy: ", accuracy_score(y, pred, normalize=True))

#3. k-Nearest-Neighbours classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, Y)
pred = neigh.predict(x)
print("k-Nearest-Neighbours score: ", accuracy_score(y, pred))

#4. Decision trees
from sklearn import tree
#Create a tree
clf = tree.DecisionTreeClassifier()
clf.fit(X, Y)
preds = clf.predict(x)
print("Decision tree score: ", accuracy_score(y, preds))

#5. Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
forClf = RandomForestClassifier(n_estimators=10)
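
# The snippet breaks off here; a minimal hedged completion that follows the same
# pattern as the classifiers above:
forClf.fit(X, Y)
pred = forClf.predict(x)
print("Random Forest score: ", accuracy_score(y, pred))
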
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.7339031339031339
exported_pipeline = make_pipeline(
    make_union(
        make_union(
            FunctionTransformer(copy),
            RFE(estimator=ExtraTreesClassifier(criterion="gini", max_features=0.45, n_estimators=100), step=0.4)
        ),
        FunctionTransformer(copy)
    ),
    KNeighborsClassifier(n_neighbors=16, p=1, weights="distance")
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
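
# A short hedged follow-up to check the exported pipeline on the held-out split
# (assumes the names created above):
from sklearn.metrics import accuracy_score
print("Hold-out accuracy:", accuracy_score(testing_target, results))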