Example #1
def NearestCentroidImplementation(X_train, X_test, y_train, y_test, x_classA,
                                  x_classB, userID):

    print("Implementing Nearest Centroid")

    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    print("Predicting the train data")

    trainAccuracy = clf.score(X_train, y_train)
    print("Train accuracy =", trainAccuracy)

    print("Predicting the test data")

    testAccuracy = clf.score(X_test, y_test)
    print("Test accuracy =", testAccuracy)

    # Getting the centroids of Class A and Class B
    centroids = clf.centroids_
    centroidClassA = np.array(centroids[0])
    centroidClassB = np.array(centroids[1])

    distanceA, distanceB = distForIndivClassesFromCentroid(
        x_classA, centroidClassA, centroidClassB, x_classB, userID)

    analysisClassWisePointsToCentroid(x_classA, centroidClassA, centroidClassB,
                                      x_classB, userID, distanceA, distanceB)

    analysisAllPointsToBothCentroids(x_classA, centroidClassA, centroidClassB,
                                     x_classB, userID, distanceA, distanceB)
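
The helper functions called above are not shown in this example. A minimal sketch, assuming distForIndivClassesFromCentroid returns each sample's Euclidean distance to its own class centroid (a hypothetical reconstruction, not the author's code):

def distForIndivClassesFromCentroid(x_classA, centroidClassA, centroidClassB,
                                    x_classB, userID):
    # hypothetical: distance of every sample to its own class centroid
    distanceA = np.linalg.norm(np.asarray(x_classA) - centroidClassA, axis=1)
    distanceB = np.linalg.norm(np.asarray(x_classB) - centroidClassB, axis=1)
    return distanceA, distanceB

Example #2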
def pickData(filename, class_numbers, training_instances, test_instances):

    data1 = np.genfromtxt(filename, delimiter=",")  # read the file

    data = np.array(data1)
    class_count = 0
    test_instance = test_instances
    training_instance = training_instances
    count = 1
    file_name = filename

    train_label_final = []
    test_label_final = []
    train_data_final = []
    test_data_final = []

    if (file_name == "HandWrittenLetters.txt"):
        class_count = 39  # 39 samples per class
    elif (file_name == "ATNTFaceImages400.txt"):
        class_count = 10  # 10 face images per class

    for i in range(len(class_numbers)):
        column_from = (class_numbers[i] - 1) * class_count
        column_to = column_from + class_count
        training_column_end = column_to - test_instance

        train_label = data[0, column_from:training_column_end]
        train_data = data[1:, column_from:training_column_end]

        test_label = data[0, training_column_end:column_to]
        test_data = data[1:, training_column_end:column_to]

        if (count == 1):
            train_label_final = train_label
            test_label_final = test_label
            train_data_final = train_data
            test_data_final = test_data
            count = 0
        else:
            train_label_final = np.hstack((train_label_final, train_label))
            test_label_final = np.hstack((test_label_final, test_label))
            train_data_final = np.hstack((train_data_final, train_data))
            test_data_final = np.hstack((test_data_final, test_data))

    train_data_final_t = train_data_final.transpose()
    test_data_final_t = test_data_final.transpose()

    clf = NearestCentroid()
    clf.fit(train_data_final_t, train_label_final)
    # predictions = clf.predict(test_data_final_t)
    # print("Test set predictions:\n{}".format(clf.predict(test_data_final_t)))
    # print("Test set accuracy: {:.2f}".format(clf.score(test_data_final_t, test_label_final)))
    accuracy = clf.score(test_data_final_t, test_label_final)
    return accuracy
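
A hypothetical invocation, assuming the ATNTFaceImages400.txt layout the function expects (labels in row 0, 10 images per class). Note the function derives the training size from class_count minus test_instances, so training_instances is effectively informational:

acc = pickData("ATNTFaceImages400.txt", [1, 2, 3, 4, 5], 8, 2)
print("Nearest-centroid test accuracy:", acc)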
Example #3
def ml_algo(inp):
    df = pd.read_csv("data/final_preprocess.csv")
    X = np.array(df.drop(['Result'], axis=1))
    y = np.array(df['Result'])
    X, y = shuffle(X, y, random_state=1)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.2)

    model_centroid = NearestCentroid().fit(X_train, y_train)
    model_knn = KNeighborsClassifier(25).fit(X_train, y_train)
    model_svm = SVC().fit(X_train, y_train)
    model_lr = LinearRegression().fit(X_train, y_train)
    model_nb = BernoulliNB().fit(X_train, y_train)
    # criterion -> gini or entropy; splitter -> best or random; max_depth -> any integer or None;
    # min_samples_split -> minimum number of samples required to split an internal node;
    # min_samples_leaf -> minimum number of samples required to be at a leaf node;
    # min_impurity_split -> threshold for early stopping of tree growth (deprecated in newer scikit-learn).
    model_dtree = DecisionTreeClassifier(criterion="entropy",
                                         random_state=100,
                                         max_depth=3,
                                         min_samples_leaf=5).fit(
                                             X_train, y_train)

    # print ("[1] ACCURACY OF DIFFERENT MODELS ",'\n___________________')
    accu_centroid = model_centroid.score(X_test, y_test)
    # print ("NearestCentroid -> ", accu_centroid)
    accu_knn = model_knn.score(X_test, y_test)
    # print ("Knn             -> ",accu_knn)
    accu_svm = model_svm.score(X_test, y_test)
    # print ("SVM             -> ", accu_svm,)
    accu_lr = model_lr.score(X_test, y_test)  # note: R^2 for LinearRegression, not accuracy
    # print ("Linear Regr     -> ", accu_lr)
    accu_nb = model_nb.score(X_test, y_test)
    # print ("Naive Bayes     -> ", accu_nb)
    accu_dtree = model_dtree.score(X_test, y_test)
    # print ("Decission Tree  -> ", accu_dtree, "\n")

    result_centroid = model_centroid.predict(inp)
    result_knn = model_knn.predict(inp)
    result_svm = model_svm.predict(inp)
    result_lr = model_lr.predict(inp)
    result_nb = model_nb.predict(inp)
    result_dtree = model_dtree.predict(inp)

    # disease-name, description, [list of step to be taken], [list of to whom we can contact]

    # print ("[2] PREDICTION ",'\n___________________')
    # print ("NearestCentroid -> ", result_centroid)
    # print ("knn             -> ", result_centroid)
    # print ("svm             -> ", result_svm)
    # print ("LinearReg       -> ", result_lr)
    # print ("Naive Bayes     -> ", result_nb)
    # print ("Decission Tree  -> ", result_dtree)

    # return map_disease[str(result_knn[0])]
    return result_knn[0]
Example #4
def NC_select_cv(X, Y, num_features):
    scores = []
    skf = cross_validation.StratifiedKFold(Y, n_folds=10)  # legacy sklearn API (now model_selection.StratifiedKFold)
    for train, test in skf:
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        XRF_train, imp, ind, std = fitRF(X_train, y_train, est=2000)  # RFsel
        XRF_test = X_test[:, ind]  # reorder test set after RFsel
        clf = NearestCentroid()
        clf.fit(XRF_train[:, 0:num_features], y_train)
        scores.append(clf.score(XRF_test[:, 0:num_features], y_test))
    score = np.mean(scores)
    return score
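
fitRF is not shown here; given how its outputs are used (ind reorders the test columns), it presumably ranks features by random-forest importance. A minimal sketch under that assumption (hypothetical, not the author's helper):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

def fitRF(X, y, est=2000):
    # hypothetical sketch: rank features by random-forest importance (RFsel)
    rf = RandomForestClassifier(n_estimators=est).fit(X, y)
    imp = rf.feature_importances_
    std = np.std([t.feature_importances_ for t in rf.estimators_], axis=0)
    ind = np.argsort(imp)[::-1]  # most important features first
    return X[:, ind], imp[ind], ind, std[ind]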
Example #5
def model_train(train_datas, train_labels):
    """Train a nearest-centroid model."""
    clf = NearestCentroid()

    model = clf.fit(train_datas, train_labels)

    # save the trained model
    with open(model_save, 'wb') as f:
        pickle.dump(clf, f)

    train_acc = clf.score(train_datas, train_labels)
    print("Training-set accuracy:", train_acc)
Example #6
def Centroid(i, X, y):
    kf = KFold(n_splits=i, random_state=None, shuffle=True)
    print("printing kf", kf)
    kf.get_n_splits(X)

    clf = NearestCentroid()

    accuracy_centroid = 0

    for train_index, test_index in kf.split(X):
        # print('TRAIN:', train_index, 'TEST:', test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(X_train, y_train)
        # print("Test set predictions:\n{}".format(clf.predict(X_test)))
        # print("Test set accuracy: {:.2f}".format(clf.score(X_test, y_test)))
        accuracy_centroid += clf.score(X_test, y_test)
        print("Centroid Accuracy with ", i, " Fold: ",
              (clf.score(X_test, y_test)))
    print("Average accuracy of centroid with all folds: ",
          accuracy_centroid / i)
    centroid_accuracy_list.append(accuracy_centroid / i)
Example #7
class Knn:
    def __init__(self, method, n_neighbors, weights, radius):
        if method == 'knn_class':
            self.clf = neighbors.KNeighborsClassifier(n_neighbors,
                                                      weights=weights)
        elif method == 'knn_rad':
            self.clf = RadiusNeighborsClassifier(radius=radius)
        elif method == 'knn_cent':
            self.clf = NearestCentroid()

    def train_model(self, train):
        self.clf.fit(train[0], train[1])

    def predict(self, data):
        return self.clf.predict(data)

    def test_model(self, test):
        return self.clf.score(test[0], test[1])
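
A short usage sketch for the wrapper (the data tuples X_train/y_train, X_test/y_test are assumptions):

# method='knn_cent' selects NearestCentroid; the other arguments are ignored there
model = Knn('knn_cent', n_neighbors=5, weights='uniform', radius=1.0)
model.train_model((X_train, y_train))
print(model.test_model((X_test, y_test)))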
Example #8
def main(CV=False, PLOT=True):
    """Entry Point.

    Parameters
    ----------
    CV: bool
        Cross-validation flag
    PLOT: bool
        Plotting flag
    """
    _data = fetch_data()

    if CV:
        method, params = cross_validate(_data)
    else:
        method = 'l2'
        params = {'metric': chisquare}

    data = normalise(_data, method)

    X_train, y_train = data['train']
    X_test, y_test = data['test']

    classifier = NearestCentroid(**params)
    classifier.fit(X_train, y_train)

    print('ACCURACY: ', classifier.score(X_test, y_test))

    if PLOT:

        y_hat = classifier.predict(X_test)

        cnf_matrix = confusion_matrix(y_test, y_hat)

        plot_confusion_matrix(cnf_matrix,
                              classes=list(set(y_test)),
                              title='Nearest Centroid\nConfusion Matrix',
                              cmap=plt.cm.Blues)

        plt.savefig('data/out/nc_cnf_matrix.pdf',
                    format='pdf',
                    dpi=300,
                    transparent=True)
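
The chisquare metric passed to NearestCentroid above is a project-defined callable that is not shown. One common chi-square histogram distance it might implement (an assumption; requires non-negative features):

import numpy as np

def chisquare(u, v, eps=1e-10):
    # symmetric chi-square distance between two histograms
    return 0.5 * np.sum((u - v) ** 2 / (u + v + eps))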
Example #9
def train():
    df = pd.read_csv('data.csv')
    df.drop(['id'], axis=1, inplace=True)
    X = np.array(df.drop(['move'], axis=1))
    y = np.array(df['move'])

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1)
    clf = NearestCentroid(metric='euclidean', shrink_threshold=None)
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)

    print(accuracy)
    example_measures = np.array([
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0
    ])
    example_measures = example_measures.reshape(1, -1)
    prediction = clf.predict(example_measures)
    print(prediction)
Example #10
def text_classify(X_train, X_test, y_train, y_test):
    """
    machine learning classifier
    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :return:
    """
    print('=' * 100)
    print('start launching MLP Classifier......')
    mlp = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(50, 30, 20, 20, 20, 30, 50), random_state=1)
    mlp.fit(X_train, y_train)
    print('finish launching MLP Classifier, the test accuracy is {:.5%}'.format(mlp.score(X_test, y_test)))

    print('=' * 100)
    print('start launching SVM Classifier......')
    svc = svm.SVC(decision_function_shape='ovo')
    svc.fit(X_train, y_train)
    print('finish launching SVM Classifier, the test accuracy is {:.5%}'.format(svc.score(X_test, y_test)))

    print('=' * 100)
    print('start launching Decision Tree Classifier......')
    dtree = tree.DecisionTreeClassifier()
    dtree.fit(X_train, y_train)
    print('finish launching Decision Tree Classifier, the test accuracy is {:.5%}'.format(
        dtree.score(X_test, y_test)))

    print('=' * 100)
    print('start launching Nearest Centroid Classifier......')
    knn = NearestCentroid()  # nearest centroid, despite the variable name
    knn.fit(X_train, y_train)
    print('finish launching Nearest Centroid Classifier, the test accuracy is {:.5%}'.format(knn.score(X_test, y_test)))

    print('=' * 100)
    print('start launching Random Forest Classifier......')
    rf = RandomForestClassifier(n_estimators=20)
    rf.fit(X_train, y_train)
    print('finish launching Random Forest Classifier, the test accuracy is {:.5%}'.format(rf.score(X_test, y_test)))
Example #11
def ncc_classify(X_train, y_train, X_test, y_test):
    # try both distance metrics and keep the one with the best test accuracy
    params = {
        'dist': ['euclidean', 'manhattan'],
    }
    best_accuracy = 0
    best_param = params['dist'][0]
    for i in range(0, 2):
        model = NearestCentroid(metric=params['dist'][i])
        model.fit(X_train, y_train)

        accuracy = model.score(X_test, y_test)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_param = model.metric

    # refit with the best metric so the returned model and predictions match
    model = NearestCentroid(metric=best_param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_confusion_matrix3(y_test, y_pred)
    print(model.metric)

    return model, y_pred
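Example #12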
def nc_train(training_data):
    '''
    nearest centroid
    adapted from the kernel https://www.kaggle.com/archaeocharlie/a-beginner-s-approach-to-classification
    '''
    labeled_images = pd.read_csv(training_data)
    images = labeled_images.iloc[0:10000, 1:]
    labels = labeled_images.iloc[0:10000, :1]
    train_images, test_images, train_labels, test_labels = train_test_split(
        images, labels, train_size=0.8, random_state=0)

    # convert all pixels to black and white
    test_images[test_images > 0] = 1
    train_images[train_images > 0] = 1

    clf = NearestCentroid(shrink_threshold=1)
    # Train the model using the training sets and check score
    start = time.time()
    clf.fit(train_images, train_labels.values.ravel())
    end = time.time()
    print("Training time: ", end - start)
    print("Accuracy: ", clf.score(test_images, test_labels))

    return clf
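Example #13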
#scores = cross_validation.cross_val_score(clf, data[:, 3:15], data[:, 2], cv=5)
#print scores

# Nearest Neighbor
nbrs = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
nbrs_y_pred = nbrs.predict(X_test)
nbrs_pr = precision_score(y_test,nbrs_y_pred)
nbrs_rc = recall_score(y_test,nbrs_y_pred)
nbrs_CM = confusion_matrix(y_test,nbrs_y_pred)
print "------------------"
print "\tNearest Neighbor"
print "------------------"
print "Real: "
print y_test
print "Predict"
print nbrs_y_pred
print "Score:"
print nbrs_pr

# NearestCentroid
clf = NearestCentroid().fit(X_train, y_train)
print "------------------"
print "\tNearest Centroid"
print "------------------"
print "Real: "
print y_test
print "Predict"
print clf.predict(X_test)
print "Score: "
print clf.score(X_test, y_test)
Example #14
    print('=' * 100)
    print('start launching Decision Tree Classifier......')
    dtree = tree.DecisionTreeClassifier()
    dtree.fit(train_X, training_label)
    print(
        'finish launching Decision Tree Classifier, the test accuracy is {:.5%}'
        .format(dtree.score(test_X, test_label)))

    print('=' * 100)
    print('start launching Nearest Centroid Classifier......')
    knn = NearestCentroid()  # nearest centroid, despite the variable name
    knn.fit(train_X, training_label)
    print(
        'finish launching Nearest Centroid Classifier, the test accuracy is {:.5%}'.format(
            knn.score(test_X, test_label)))

    print('=' * 100)
    print('start launching Random Forest Classifier......')
    rf = RandomForestClassifier(n_estimators=10)
    rf.fit(train_X, training_label)
    print(
        'finish launching Random Forest Classifier, the test accuracy is {:.5%}'
        .format(rf.score(test_X, test_label)))
"""
    train_X, training_label, test_X, test_label = init_20groups_data(TEXT_DIR)
    print('=' * 100)
    print('start launching MLP Classifier......')
    mlp = MLPClassifier(solver='lbfgs', alpha=1e-4, hidden_layer_sizes=(50, 30, 20, 20), random_state=1)
    mlp.fit(train_X, training_label)
    print('finish launching MLP Classifier, the test accuracy is {:.5%}'.format(mlp.score(test_X, test_label)))
Example #15
import numpy as np
import scipy.io as sio
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestCentroid

banana = sio.loadmat("banana.mat")
train_data = banana["train_data"]
train_labels = banana["train_labels"]
train_labels = np.array(train_labels)
test_data = banana["test_data"]
test_labels = banana["test_labels"]
test_labels = np.array(test_labels)

data = np.concatenate((train_data, test_data), axis=0)
data_labels = np.concatenate((train_labels, test_labels), axis=0)

# division of the whole set, training 30%, testing 70%
train, test, train_targets, test_targets = train_test_split(
    data, data_labels.ravel(), test_size=0.70, random_state=42)

#Training the Classifier
tmp = NearestCentroid()

tmp.fit(train, train_targets)
#score
print("the percentage of correct classifications:",
      tmp.score(test, test_targets))
Example #16
import scipy.io as sio
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.neighbors import NearestCentroid

banana = sio.loadmat("banana.mat")
train_data = banana["train_data"]
train_labels = banana["train_labels"]
train_labels = np.array(train_labels)
test_data = banana["test_data"]
test_labels = banana["test_labels"]
test_labels = np.array(test_labels)

train, dummy, train_targets, dummy = train_test_split(train_data,
                                                      train_labels.ravel(),
                                                      test_size=0.70)
dummy, test, dummy, test_targets = train_test_split(test_data,
                                                    test_labels.ravel(),
                                                    test_size=0.70)

clf = NearestCentroid()
clf.fit(train, train_targets)
Z = clf.predict(test)

print("Procent poprawnych klasyfikacji:",
      round(clf.score(test, test_targets) * 100, 2), "%")
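Example #17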
# -*- coding: utf-8 -*-
"""
Created on Sun Jun  4 09:20:28 2017

@author: 凯风
"""

from sklearn.neighbors import NearestCentroid
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# prepare the data
iris_dataset = load_iris()
X, Y = iris_dataset.data, iris_dataset.target
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=.3)
'''
    Nearest centroid classification:
        much like KNN; the centroid of each class is computed from that
        class's samples, and that centroid then represents the class.
        A fairly simple base classifier with few parameters.
'''

rlf = NearestCentroid(metric='euclidean', shrink_threshold=None)
rlf.fit(trainX, trainY)
rlf.score(testX, testY)
preY = rlf.predict(testX)
'''
    metric                      how distances are computed
    shrink_threshold            threshold for shrinking centroids to remove features
'''
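
To make the comment above concrete, a small check (using the iris split created above) that each fitted centroid is simply the per-class mean of the training samples:

import numpy as np

# with the euclidean metric each centroid is the per-class feature mean
# (with manhattan, scikit-learn uses the per-class median instead)
for idx, cls in enumerate(rlf.classes_):
    manual = trainX[trainY == cls].mean(axis=0)
    assert np.allclose(rlf.centroids_[idx], manual)

Example #18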
    for train_index, test_index in kfold.split(data_opto_SOM, target):

        ## CONTROL ##
        x_train, x_test = data_control[train_index, :], data_control[
            test_index, :]
        y_train, y_test = target[train_index], target[test_index]
        mul_lr = LogisticRegression(multi_class='multinomial',
                                    solver='newton-cg',
                                    max_iter=300)
        mul_lr.fit(x_train, y_train)
        score_control_LR[n, f] = mul_lr.score(x_test, y_test) * 100
        print(mul_lr.score(x_test, y_test))

        clf = NearestCentroid(metric='euclidean', shrink_threshold=None)
        clf.fit(x_train, y_train)
        score_control_NN[n, f] = clf.score(x_test, y_test) * 100

        lda = LinearDiscriminantAnalysis(solver='svd')
        lda.fit(x_train, y_train)
        score_control_LDA[n, f] = lda.score(x_test, y_test) * 100
        print(lda.score(x_test, y_test))

        svm_algo = svm.SVC(decision_function_shape='ovo', kernel='linear')
        svm_algo.fit(x_train, y_train)
        score_control_SVM[n, f] = svm_algo.score(x_test, y_test) * 100

        ## DBS ##
        x_train, x_test = data_DBS[train_index, :], data_DBS[test_index, :]
        y_train, y_test = target[train_index], target[test_index]

        mul_lr = LogisticRegression(multi_class='multinomial',
Example #19
print 'Reading features... Done!'

# STEP 2 - computing scores
print 'Training...'
tfidf = models.TfidfModel(dictionary=features) # Computing tfidf model to be queried.
tfidf.save('reuters/data/tfidf.model')

# STEP 3 - computing centroids
tfidf = models.TfidfModel.load('reuters/data/tfidf.model')
features = corpora.Dictionary.load_from_text('reuters/data/word.dict')
by_bow = Corpus2Dictionary(features)
train_corpus = ReutersCorpus('training')
tfidf_train = tfidf[by_bow[by_word[train_corpus]]]
X = matutils.corpus2csc(tfidf_train)  # convert the gensim corpus to a scipy sparse matrix
X = X.transpose() # from csc (document as column) to csr (document as row)
y = train_corpus.category_mask # label for doc
rocchio = NearestCentroid()
rocchio.fit(X, y)
print 'Training... Done!'

# STEP 4 - evaluate prediction
test_corpus = ReutersCorpus('test')
tfidf_test = tfidf[by_bow[by_word[test_corpus]]]
# num_terms required: otherwise X shrinks to the largest feature index found
X = matutils.corpus2csc(tfidf_test, num_terms=len(features))
X = X.transpose()
y_true = test_corpus.category_mask
y_pred = rocchio.predict(X)
# print precision_score(y_true, y_pred)
print rocchio.score(X, y_true)
Example #20
predicted = clf.predict(X_test)

#import report, confusion matrix for results
print(clf.score(X_test, y_test))
print("Classification report for kNN classifier %s:\n%s\n" %
      (clf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print(clf.score(X_test, y_test))

# time the fitting of the Nearest Centroid classifier
start = int(round(time.time() * 1000))

classifier = NearestCentroid()
classifier.fit(X_lda, y_train)  # note: fitted on LDA-transformed features but scored on X_test below
print(classifier)

end = int(round(time.time() * 1000))
print("--Centroid fitting finished in ", (end - start), "ms")

expected = y_test
predicted = classifier.predict(X_test)

#import report,confusion matrix for results
print(classifier.score(X_test, y_test))
print("Classification report for Centroid classifier %s:\n%s\n" %
      (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))
print(classifier.score(X_test, y_test))
Example #21
def cross_validate(data):
    """Cross-Validate KNN.

    Parameters
    ----------
    data: dict
        * train: tuple
            - X: features
            - y: labels
        * test: tuple
            - X: features
            - y: labels

    Returns
    -------
    method: str
        Transformation function
    params: dict
        * metric: function | str
            Distance metric function
        * metric_params: dict
            Parameters of `metric` function
    """

    norm_methods = [
        'none', 'l1', 'l2', 'max', 'standard', 'maxabs', 'minmax', 'robust'
    ]

    params_grid = [('Intersection', {
        'metric': intersection
    }), ('Correlation', {
        'metric': correlation
    }), ('Manhattan', {
        'metric': 'manhattan'
    }), ('Euclidean', {
        'metric': 'euclidean'
    }), ('Chebyshev', {
        'metric': 'chebyshev'
    }), ('Chi-Square', {
        'metric': chisquare
    })]
    results = {}
    best_params = {}
    best_score = -1

    for method in norm_methods:

        # normalise from the raw input each time so that methods do not compound
        norm_data = normalise(data, method=method)

        X_train, y_train = norm_data['train']
        X_test, y_test = norm_data['test']

        results[method] = {}

        for name, params in params_grid:

            classifier = NearestCentroid(**params)
            acc = cross_val_score(classifier, X_train, y_train, cv=3).mean()
            results[method][name] = acc

        best_metric = None
        best_accuracy = -1

        for name, accuracy in results[method].items():
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_metric = name

        best_params_ = {**dict(params_grid)[best_metric]}
        print('')
        print('[%s]' % method, 'Best params:', best_params_)
        print('[%s]' % method, 'Best CV Score:', results[method][best_metric])

        best_classifier_ = NearestCentroid(**best_params_)
        best_classifier_.fit(X_train, y_train)

        best_score_ = best_classifier_.score(X_test, y_test)

        print('[%s]' % method, 'Accuracy:', best_score_)

        if best_score_ > best_score:
            print('[%s]' % method, 'New Best:', best_params_)
            best_params = (method, best_params_)
            best_score = best_score_
    print('')
    print('Cross Validation Results:', best_params)
    print('')
    return best_params
Example #22
counter1 = 0
counter2 = 0
for i in range(len(x)):
    if z[i] == 1:
        plt.scatter(x[i], y[i], c="RED")
        counter1 += 1
    else:
        plt.scatter(x[i], y[i], c="GREEN")
        counter2 += 1
c1 = [c1[0] / counter1, c1[1] / counter1]
c2 = [c2[0] / counter2, c2[1] / counter2]
plt.scatter(c1[0], c1[1], c="BLUE")
plt.scatter(c2[0], c2[1], c="BROWN")
plt.show()

# Zadanie 8:
print("Classifier efficiency: %f" % clf.score(train, np.ravel(train_targets)))

# Zadanie 9:
k_best = [0, 0]
for k in range(5, 17):
    knn = neighbors.KNeighborsClassifier(k,
                                         weights='uniform',
                                         metric='euclidean')
    knn.fit(train, np.ravel(train_targets))
    predicted = knn.predict(test)
    sc = knn.score(train, np.ravel(train_targets))  # note: scored on the training set
    if sc > k_best[1]:
        k_best = [k, sc]
print("Best efficiency in 5-16 is for k =", k_best[0], " --> efficiency : ",
      k_best[1])
Example #23
### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

## nearest centroid
from sklearn.neighbors import NearestCentroid
clf = NearestCentroid()
T0 = time()
clf = clf.fit(features_train, labels_train)
print("nearest centroid training time:",
      round(time() - T0, 3), "s")
T1 = time()
PRED = clf.predict(features_test)
print("nearest centroid prediction time:",
      round(time() - T1, 3), "s")
ACC = clf.score(features_test, labels_test)
print(ACC)

# ## adaboost
from sklearn import ensemble
clf = ensemble.AdaBoostClassifier()
T0 = time()
clf = clf.fit(features_train, labels_train)
print("adaboost training time:", round(time() - T0, 3), "s")
T1 = time()
PRED = clf.predict(features_test)
print("adaboost predition time:", round(time() - T1, 3), "s")
ACC = clf.score(features_test, labels_test)
print(ACC)

# ## random forest
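Example #24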
plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

### your code here!  name your classifier object clf if you want the
### visualization code (prettyPicture) to show you the decision boundary

# nearest centroid
from sklearn.neighbors import NearestCentroid
clf = NearestCentroid()
clf.fit(features_train, labels_train)
predict = clf.predict(features_test)
acc = clf.score(features_test, labels_test)
print(acc)
# # 0.908

# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(features_train, labels_train)
clf.predict(features_test)
acc = clf.score(features_test, labels_test)
print(acc)
# 0.924

# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
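Example #25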
def myclassify(numfiers=5,xtrain=xtrain,ytrain=ytrain,xtest=xtest,ytest=ytest):
    count = 0



    bagging2 = BaggingClassifier(ETC(),bootstrap=False,bootstrap_features=False)
    bagging2.fit(xtrain,ytrain)
    #print bagging2.score(xtest,ytest)
    count += 1
    classifiers = [bagging2.score(xtest,ytest)]

    if count < numfiers:

        tree2 = ETC()
        tree2.fit(xtrain,ytrain)
        #print tree2.fit(xtrain,ytrain)
        #print tree2.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree2.score(xtest,ytest))
        print "1"
        print tree2.score(xtest,ytest)

    if count < numfiers:
        bagging1 = BaggingClassifier(ETC())
        bagging1.fit(xtrain,ytrain)
        #print bagging1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging1.score(xtest,ytest))
        print "2"
        print bagging1.score(xtest,ytest)

#     if count < numfiers:
#         # votingClassifiers combine completely different machine learning classifiers and use a majority vote
#         clff1 = SVC()
#         clff2 = RFC(bootstrap=False)
#         clff3 = ETC()
#         clff4 = neighbors.KNeighborsClassifier()
#         clff5 = quadda()
#         print"3"


#         eclf = VotingClassifier(estimators = [('svc',clff1),('rfc',clff2),('etc',clff3),('knn',clff4),('qda',clff5)])
#         eclf = eclf.fit(xtrain,ytrain)
#         #print(eclf.score(xtest,ytest))
#         # for claf, label in zip([clff1,clff2,clff3,clff4,clff5,eclf],['SVC','RFC','ETC','KNN','QDA','Ensemble']):
#         #     cla
#         #     scores = crossvalidation.cross_val_score(claf,xtrain,ytrain,scoring='accuracy')
#         #     print ()
#         count+=1
#         classifiers = np.append(classifiers,eclf.score(xtest,ytest))


#     if count < numfiers:
#         svc1 = SVC()
#         svc1.fit(xtrain,ytrain)
#         dec = svc1.score(xtest,ytest)
#         count+=1
#         classifiers = np.append(classifiers,svc1.score(xtest,ytest))
#         print "3"

    if count < numfiers:
        # Quadratic discriminant analysis - classifier with quadratic decision boundary -
        qda = quadda()
        qda.fit(xtrain,ytrain)
        #print(qda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,qda.score(xtest,ytest))
        print "4"


    if count < numfiers:

        tree1 = DTC()
        tree1.fit(xtrain,ytrain)
        #print tree1.fit(xtrain,ytrain)
        #print tree1.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree1.score(xtest,ytest))

    if count < numfiers:
        knn1 = neighbors.KNeighborsClassifier() # this classifies based on the k nearest neighbors, where k is defined by the user.
        knn1.fit(xtrain,ytrain)
        #print(knn1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn1.score(xtest,ytest))

    if count < numfiers:
        # linear discriminant analysis - classifier with linear decision boundary -
        lda = linda()
        lda.fit(xtrain,ytrain)
        #print(lda.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,lda.score(xtest,ytest))

    if count < numfiers:
        tree3 = RFC()
        tree3.fit(xtrain,ytrain)
        #print tree3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree3.score(xtest,ytest))

    if count < numfiers:
        bagging3 = BaggingClassifier(RFC(),bootstrap=False,bootstrap_features=False)
        bagging3.fit(xtrain,ytrain)
        #print bagging3.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging3.score(xtest,ytest))


    if count < numfiers:
        bagging4 = BaggingClassifier(SVC(),bootstrap=False,bootstrap_features=False)
        bagging4.fit(xtrain,ytrain)
        #print bagging4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,bagging4.score(xtest,ytest))

    if count < numfiers:
        tree4 = RFC(bootstrap=False)
        tree4.fit(xtrain,ytrain)
        #print tree4.score(xtest,ytest)
        count+=1
        classifiers = np.append(classifiers,tree4.score(xtest,ytest))

    if count < numfiers:
        tree6 = GBC()
        tree6.fit(xtrain,ytrain)
        #print(tree6.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree6.score(xtest,ytest))

    if count < numfiers:
        knn2 = neighbors.KNeighborsClassifier(n_neighbors = 10)
        knn2.fit(xtrain,ytrain)
        #print(knn2.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn2.score(xtest,ytest))

    if count < numfiers:
        knn3 = neighbors.KNeighborsClassifier(n_neighbors = 3)
        knn3.fit(xtrain,ytrain)
        #print(knn3.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn3.score(xtest,ytest))

    if count < numfiers:
        knn4 = neighbors.KNeighborsClassifier(algorithm = 'ball_tree')
        knn4.fit(xtrain,ytrain)
        #print(knn4.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn4.score(xtest,ytest))

    if count < numfiers:
        knn5 = neighbors.KNeighborsClassifier(algorithm = 'kd_tree')
        knn5.fit(xtrain,ytrain)
        #print(knn5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,knn5.score(xtest,ytest))

    if count < numfiers:
        ncc1 = NearestCentroid()
        ncc1.fit(xtrain,ytrain)
        #print (ncc1.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,ncc1.score(xtest,ytest))

    if count < numfiers:
        # Nearest shrunken Centroid
        for shrinkage in [None,0.05,0.1,0.2,0.3,0.4,0.5]:
            ncc2 = NearestCentroid(shrink_threshold = shrinkage)
            ncc2.fit(xtrain,ytrain)
            #print(ncc2.score(xtest,ytest))

        count+=1
        # note: only the score for the last shrinkage value (0.5) is recorded
        classifiers = np.append(classifiers,ncc2.score(xtest,ytest))

    if count < numfiers:
        tree5 = ABC()
        tree5.fit(xtrain,ytrain)
        #print(tree5.score(xtest,ytest))
        count+=1
        classifiers = np.append(classifiers,tree5.score(xtest,ytest))

    classifierlabel = ["BaggingETC (with bootstraps set to false)","ETC","BaggingETC","Voting Classifier","svm","QDA","DTC","KNN (default)","LDA","RFC",
                       "BaggingRFC (with bootstraps set to false)","BaggingSVC (with bootstraps set to false)","RFC (bootstrap false)","GBC",
                        "knn (n_neighbors = 10)","knn (n_neighbors = 3)","knn (ball tree algorithm)","knn (kd_tree algorithm)",
                       "Nearest Centroid","Shrunken Centroid?","ABC"]


    classifierlabel = classifierlabel[:len(classifiers)]
    #print len(classifiers)
    #print classifiers
    for i in range(len(classifiers)):


        print ("{} classifier has percent correct {}".format(classifierlabel[i],classifiers[i]))
Example #26
def Euclidean_MDC(X_train, X_test, y_train, y_test):
    clf = NearestCentroid(metric='euclidean')
    clf.fit(X_train, y_train.values.ravel())
    print(clf.score(X_test, y_test))
Example #27
import pickle
import numpy as np
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestCentroid

features = [[20,11],[20,5],[20,12],[17,7],[16,7],[18,7],[19,7],[20,4],[20,9],[20,10]]
r_fea = [[a[1], a[0]] for a in features]
#labels = [[0],[1],[1],[0],[1],[0],[1],[0],[0],[1]]
labels = [0,1,1,0,1,0,1,0,0,1]
r_lab = [(a-1)*(a-1) for a in labels]
X = np.array(features+r_fea)
y = np.array(labels + r_lab)
clf = NearestCentroid()
clf.fit(X, y)
print(clf.centroids_)
print(clf.score(X,y))
print(clf.predict([[20, 7]]))

print(clf.predict([[7, 20]]))

list_pickle = open('lr.pkl', 'wb')
pickle.dump(clf, list_pickle)

cmap_light = ListedColormap(['#FFAAAA',  '#AAAAFF'])
h = .02
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
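
The snippet ends right after computing Z; a likely continuation (not part of the original) would reshape it and draw the decision regions:

Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
plt.show()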
Example #28
import winsound as s
s.Beep(1300,2000)
neigh.score(arr[n:], target[n:])
# Rocchio
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import cross_val_score
import numpy as np
import winsound as s

n=50000
clf = NearestCentroid()
clf.fit(arr[:n], target[:n])
s.Beep(1300,2000)
s.Beep(1300,2000)
print(clf.score(arr[n:],target[n:]))
s.Beep(1300,2000)
scores = cross_val_score(clf, arr[n:], target[n:], cv=5)
print(scores)
# # Naive-bayes
from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# y_pred = gnb.fit(arr[:500], target[:500]).predict(arr[500:600])
# print("Number of mislabeled points out of a total %d points : %d"
#       % (arr.shape[0],(target != y_pred).sum()))
n=50000

clf = GaussianNB()
clf.fit(arr[:n], target[:n])
clf.score(arr[n:],target[n:])
clf_pf = GaussianNB()
Example #29
def Mahalanobis_MDC(X_train, X_test, y_train, y_test):
    clf = NearestCentroid(metric='mahalanobis')
    clf.fit(X_train, y_train.values.ravel())
    print(clf.score(X_test, y_test))
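Example #30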
    t[:, 2:30, 2:30, 1] = x_test[k:(k + 100)]
    t[:, 2:30, 2:30, 2] = x_test[k:(k + 100)]
    _ = model.predict(t)
    out = [model.layers[5].output]
    func = K.function([model.input, K.learning_phase()], out)
    test[k:(k + 100), :] = func([t, 1.])[0]
    k += 100
np.save("mnist_test_embedded.npy", test)

#  use the 128 element vectors as training data for other models
print("Full MNIST dataset:")
print()
print("Training Nearest Centroid")
clf0 = NearestCentroid()
clf0.fit(train, y_train)
nscore = 100.0 * clf0.score(test, y_test)

print("Training 3-NN")
clf1 = KNeighborsClassifier(n_neighbors=3)
clf1.fit(train, y_train)
kscore = 100.0 * clf1.score(test, y_test)

print("Training Random Forest")
clf2 = RandomForestClassifier(n_estimators=50)
clf2.fit(train, y_train)
rscore = 100.0 * clf2.score(test, y_test)

print("Training Linear SVM")
clf3 = LinearSVC(C=0.1)
clf3.fit(train, y_train)
sscore = 100.0 * clf3.score(test, y_test)
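Example #31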
                          usecols=tuple(columns))
# standard normally distributed data: Gaussian with zero mean and unit variance
trainingData_scaled = preprocessing.scale(trainingData)

# get a 50000 x 1 column array for all of the results (boolean) (just 1000 x 1 for now)
results = np.loadtxt(filePath,
                     delimiter=',',
                     skiprows=numRowsToSkip,
                     usecols=(622, ))

# TRAIN THE MODELS

# randomly split the data into training set and test set (40% testing)
X_train, X_test, y_train, y_test = train_test_split(trainingData_scaled,
                                                    results,
                                                    test_size=0.4,
                                                    random_state=0)

# Accuracy: 70%
model = NearestCentroid()

# Fit the model according to the given training data
model.fit(X_train, y_train)

# evaluate the trained model on the test set
# Returns the mean accuracy on the given test data and labels
testAccuracy = model.score(X_test, y_test)

print("Final results for '%s': testing accuracy of %f%%" %
      (model, testAccuracy * 100))
Example #32
prec = precision_score(y_test,y_pred)
rec = recall_score(y_test,y_pred)
conf = confusion_matrix(y_test, y_pred)

print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0],(y_test != y_pred).sum()))

print("K-nn" ,acc, prec, rec, conf)

#+++++++++++++++++++++++++++++++++++++++++++ Nearest Centroid ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

from sklearn.neighbors import NearestCentroid
clf = NearestCentroid()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(clf.predict(X_test))
print(clf.score(X_test,y_test))
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0],(y_test != y_pred).sum()))
acc = accuracy_score(y_test,y_pred)
prec = precision_score(y_test,y_pred)
rec = recall_score(y_test,y_pred)
conf = confusion_matrix(y_test, y_pred)

print("Nearest Centroid" ,acc, prec, rec, conf)

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++ EM +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2)
gmm.fit(X_train)
#print(gmm.means_)
#print(gmm.covariances_)
Example #33
                       hidden_layer_sizes=(3, 3),
                       random_state=1)
model3 = model3.fit(textBowTrain, y_train)
model3.score(textBowTest, y_test)
predictions3 = model3.predict(textBowTest)
# obtain the metrics for the model
print(classification_report(y_test, predictions3))
predictions3 = pd.DataFrame(predictions3)
predictions3.to_csv('predictmodelMLP.csv', index=False)

# Nearest Centroid (labelled K-NN in the original script)
from sklearn.neighbors import NearestCentroid
# create the learning model
model4 = NearestCentroid()
model4 = model4.fit(textBowTrain, y_train)
model4.score(textBowTest, y_test)
predictions4 = model4.predict(textBowTest)
print(classification_report(y_test, predictions4))
predictions4 = pd.DataFrame(predictions4)
predictions4.to_csv('predictmodelKNN.csv', index=False)

# build the confusion matrix
predictions = [predictions1, predictions2, predictions3, predictions4]

for i in range(len(predictions)):

    cm = confusion_matrix(y_test, predictions[i])
    print(cm)
    classes = [0, 1, 2]
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("title")
# Run the random forest classifier with 10, 100, and 500 trees, 10 times each
for j in 10, 100, 500:
    print("Number of trees: " + str(j))
    soma = 0
    for i in range(0, 10):
        clf = RandomForestClassifier(n_estimators=j)
        clf = clf.fit(treino_caracteristicas, treino_rotulos)
        print(clf.score(teste_caracteristicas, teste_rotulos))
        soma += clf.score(teste_caracteristicas, teste_rotulos)
    #print(soma)
    media = soma / 10
    print("Mean: " + str(media))

featureImp = clf.feature_importances_

print(featureImp)
print("posicao odor: " + str(featureImp[4]))

clf2 = svm.SVC()
clf2.fit(treino_caracteristicas, treino_rotulos)
print("SVM")
print(clf2.score(teste_caracteristicas, teste_rotulos))
#print(clf2.support_vectors_)

clf3 = NearestCentroid()
clf3.fit(treino_caracteristicas, treino_rotulos)
print("Nearest Centroid")
print(clf3.score(teste_caracteristicas, teste_rotulos))
#print(clf3.centroids_)
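Example #35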
# rnc1.fit(xtrain,ytrain1)
# print (rnc1.score(xtest,ytest1))


# In[ ]:

get_ipython().magic(u'whos')


# In[17]:

# Nearest centroid
from sklearn.neighbors import NearestCentroid
ncc1 = NearestCentroid()
ncc1.fit(xtrain,ytrain1)
print (ncc1.score(xtest,ytest1))


# In[18]:

# Nearest shrunken Centroid
for shrinkage in [None,0.05,0.1,0.2,0.3,0.4,0.5]:
    ncc2 = NearestCentroid(shrink_threshold = shrinkage)
    ncc2.fit(xtrain,ytrain1)
    print(ncc2.score(xtest,ytest1))


# In[19]:

# linear discriminant analysis - classifier with linear decision boundary - 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as linda