def test_classification_toy():
    # Check classification on a toy dataset, including sparse versions.
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # Same test, but with a sparse matrix to fit and test.
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit with sparse, test with non-sparse
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T), true_result)

    # Fit with non-sparse, test with sparse
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit and predict with non-CSR sparse matrices
    clf = NearestCentroid()
    clf.fit(X_csr.tocoo(), y)
    assert_array_equal(clf.predict(T_csr.tolil()), true_result)
Example #3
def _nearestcentroid(*, train, test, x_predict=None, metrics, metric='euclidean', shrink_threshold=None):
    """
    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestCentroid.html#sklearn.neighbors.NearestCentroid
    """

    model = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold)
    model.fit(train[0], train[1])
    model_name = 'Nearest Centroid'
    y_hat = model.predict(test[0])

    if metrics == 'accuracy':
        accuracy = accuracy_score(test[1], y_hat)
    elif metrics == 'f1':
        accuracy = f1_score(test[1], y_hat)
    elif metrics == 'jaccard':
        accuracy = jaccard_score(test[1], y_hat)
    else:
        raise ValueError("metrics must be 'accuracy', 'f1' or 'jaccard'")

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
def nearest_centroid_classifier(train, validation, verbose=False):
    nearest_centroid = NearestCentroid()
    nearest_centroid.fit(train['data'], train['labels'])
    # Find the prediction and accuracy on the training set.
    Yhat_svc_linear_train = nearest_centroid.predict(train['data'])
    acc_train = np.mean(Yhat_svc_linear_train == train['labels'])

    # Find the prediction and accuracy on the test set.
    Yhat_svc_linear_test = nearest_centroid.predict(validation['data'])
    acc_validation = np.mean(Yhat_svc_linear_test == validation['labels'])
    if verbose:
        print('Train Accuracy for nearest centroid classifier = {0:f}'.format(acc_train))
        print('Validation Accuracy for nearest centroid classifier = {0:f}'.format(acc_validation))
    return acc_train, acc_validation
def test_predict_translated_data():
    # Test that NearestCentroid gives same results on translated data

    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    y_init = clf.predict(X)
    clf = NearestCentroid(shrink_threshold=0.1)
    X_noise = X + noise
    clf.fit(X_noise, y)
    y_translate = clf.predict(X_noise)
    assert_array_equal(y_init, y_translate)
Example #7
class NC:
    def __init__(self):
        self.clf = NearestCentroid()
        self.centroids = []

    """ 
    Calculates the mean of each class in the training data. 
    param:
        @train_data: training data
        @train_lbls: training labels
    """

    def fit(self, train_data, train_lbls):
        self.clf.fit(train_data, train_lbls)
        self.centroids = self.clf.centroids_
        return self

    """ 
    Classifies test data using the class means of the training and Nearest Centroid algorithm.
    param:
        @test_data: testing data
        @test_lbls: testing labels
    returns:
        @classification: numpy array with classification labels
        @score: the mean accuracy classifications
    """

    def predict(self, test_data, test_lbls):
        classification = self.clf.predict(test_data)
        try:
            score = accuracy_score(test_lbls, classification)
        except ValueError:
            score = None

        return classification, score
    def agglomerative_validation(self, iterations):
        print('\nPerforming holdout validation for agglomerative clusterer...')

        rands = []

        for i in range(0, iterations):

            X_train, X_test = train_test_split(self.gene_df,
                                               test_size=0.2,
                                               random_state=i)

            agglomerative_training = cluster.AgglomerativeClustering( \
                n_clusters=self.n_clusters, linkage='ward', affinity='euclidean').fit(X_train)

            agglomerative_testing = cluster.AgglomerativeClustering( \
                n_clusters=self.n_clusters, linkage='ward', affinity='euclidean').fit(X_test)

            #classify testing data using centroids of training data clusters
            clf = NearestCentroid()
            clf.fit(X_train, agglomerative_training.labels_)

            #calculate rand score between clustering labels and prediction labels of held out samples
            rands.append(
                adjusted_rand_score(clf.predict(X_test),
                                    agglomerative_testing.labels_))

        ##print('rand scores of kmeans and held out kmeans cluster samples', rands)
        print('average of rand scores', sum(rands) / len(rands))
        print('variance of rand scores', statistics.variance(rands))
def NC(data_train, data_train_vectors, data_test_vectors, **kwargs):
    # Implement a classification model using NearestCentroid
    clf_nc = NearestCentroid()
    clf_nc.fit(data_train_vectors, data_train.target)
    y_pred = clf_nc.predict(data_test_vectors)
    
    return y_pred
def nsc_fit(Xtrain, Xtrain_lbls, Xtest, Xtest_lbls, n_clust, rng, start, name, datat, t0=None):
    if t0 is None:  # a default of t0=time() would be evaluated once at import time
        t0 = time()
    centers = []
    labels = []
    correct_labels = []

    #Cluster the data
    # 'start' is the first class index: MNIST labels start at 0, ORL at 1.
    # 'rng' is the end of the class range: MNIST has 10 classes, ORL has 40.
    for i in range(start, rng):
        data = Xtrain[np.nonzero(Xtrain_lbls == i)]
        kmeans = KMeans(n_clusters=n_clust, random_state=42).fit(data)

        # Get the centers for the amount of clusters specified.
        for k in range(0, n_clust):
            centers.append(kmeans.cluster_centers_[k, :])
            labels.append(str(i) + '_' + str(k))

    #Fit with nearest centroid
    clf = NearestCentroid()
    clf.fit(centers, labels)
    pred = clf.predict(Xtest)

    for i in range(0, len(pred)):
        correct_labels.append(int(pred[i].split('_')[0]))

    #Calculate score
    score = accuracy_score(Xtest_lbls, correct_labels)

    print('%-9s\t%.2fs\t%-9s\t%-9s'
          % (name, (time() - t0), score, datat))
    return pred
Example #11
class AlgorithmRunner:
    """
    initalize the algorithms
    :param  algo_name: the name of the current algorithm
    """
    def __init__(self, algo_name):
        if algo_name == "KNN":
            self.algorithm = KNeighborsClassifier(n_neighbors=10)
        if algo_name == "Rocchio":
            self.algorithm = NearestCentroid()

    """
    call to the fit method from sklearn.neighbours
    :param  train_features: the features that we train on
            train_labels: the labels for the features that we train on
    """

    def fit(self, train_features, train_labels):
        self.algorithm.fit(train_features, train_labels)

    """
    call to the predict method from sklearn.neighbours
    :param  test_features: the features that we test on
    """

    def predict(self, test_features):
        return self.algorithm.predict(test_features)
Example #12
def Centroid(X_train, Y_train, X_test, Y_test):
	# Parameter 'shrinkage' is tuned 
	#Cross validation
	shrinkages = np.linspace(0, 10, 100)
	tuned_parameters = [{'shrink_threshold': shrinkages}]
	cv = GridSearchCV(NearestCentroid(), tuned_parameters)
	cv.fit(X_train, Y_train)
	
	#Optimal parameters
	print('Best Params: ')
	print(cv.best_params_)

	#Optimal Model
	clf = NearestCentroid()
	clf.set_params(shrink_threshold=cv.best_params_['shrink_threshold'])
	clf.fit(X_train, Y_train)
	pred = clf.predict(X_test)
	test_error = mean_squared_error(Y_test, pred)
	acc_score = accuracy_score(Y_test, pred)
	print('Nearest Centroid Test Error: ' + str(test_error))
	print('Nearest Centroid Accuracy Score: ' + str(acc_score))
	print('First 10 predictions: ')
	print(pred[:10])
	print('First 10 actual: ')
	print(Y_test[:10])
	print('Centroid of each class: ')
	print(clf.centroids_[0])
	print('Class labels known to the classifier: ')
	print(clf.classes_)
	return clf, test_error, acc_score
def nearest_centroid_classifier(X_train, categories, X_test, test_categories):
    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid().fit(X_train, categories)
    y_roccio_predicted = clf.predict(X_test)
    print "\n Here is the classification report for NearestCentroid classifier:"
    print metrics.classification_report(test_categories, y_roccio_predicted)
    to_latex(test_categories, y_roccio_predicted)  
Example #14
def run_NearestCentroid(X_train, X_test, y_train):
    centroid = NearestCentroid()

    centroid.fit(X_train, y_train)
    cents = centroid.centroids_
    y_pred = centroid.predict(X_test)
    return y_pred, cents
Example #15
def run_knn(clf_output_file, x_train, x_test, y_train, y_test):
    """
    Builds and saves a trained K nearest neighbour classifier.
    :param training_path: String
        File path for the training matrix.
    :param test_size: float
        Proportion of data to use for testing.
    :param clf_output_file: String
        Name of file to save the classifier to.
    """

    # Train nearest centroid classifier
    start = time.time() # Performance
    clf = NearestCentroid()
    clf = clf.fit(x_train, y_train)
    # Save the model
    joblib.dump(clf, clf_output_file)
    end = time.time() # Performance
    print('Nearest centroid: train & save model in: ' + str(end - start)) # Performance

    # Predict on the testing data
    y_predict = clf.predict(x_test)

    # Performance measurements
    knn_acc = accuracy_score(y_test, y_predict)
    knn_mcc = matthews_corrcoef(y_test, y_predict)
    knn_auc = roc_auc_score(y_test, y_predict)
    
    print("KNN classifier:")
    print("acc: " + str(knn_acc))
    print("mcc: " + str(knn_mcc))
    print("auc: " + str(knn_auc))

    return knn_acc, knn_mcc, knn_auc
Example #16
def nearest_mean_classifier(X_train, y_train, X_validation, X_test):
    # Returns the labels for test_data, predicted by the nearest mean classifier trained on X_train and y_train
    # Input:
    # X_train - num_train x num_features matrix with features for the training data
    # y_train - num_train x 1 vector with labels for the training data
    # X_validation - num_test x num_features matrix with features for the validation data
    # X_test - num_test x num_features matrix with features for the test data
    # Output:
    # y_pred_validation - num_test x 1 predicted vector with labels for the validation data
    # y_pred_test - num_test x 1 predicted vector with labels for the test data

    # Not in use

    X_test_val = np.vstack((X_validation, X_test))
    # Stack the validation and test sets together

    clf = NearestCentroid()
    clf.fit(X_train, y_train)  # Compute the class means

    predicted_labels = clf.predict(X_test_val)  # Predict on the stacked data

    # Store the predictions
    y_pred_validation = predicted_labels[:len(X_validation)]
    y_pred_test = predicted_labels[len(X_validation):]
    return y_pred_validation, y_pred_test
Example #17
def test_iris_shrinkage():
    # Check consistency on dataset iris, when using shrinkage.
    for metric in ("euclidean", "cosine"):
        for shrink_threshold in [None, 0.1, 0.5]:
            clf = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold)
            clf = clf.fit(iris.data, iris.target)
            score = np.mean(clf.predict(iris.data) == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)
Example #18
    def nearestcentrclassif(self, shrinkage=0.1):
        # we create an instance of Neighbours Classifier and fit the data.
        clf = NearestCentroid(shrink_threshold=shrinkage)
        clf.fit(self.x_train, self.y_train)
        z = clf.predict(self.x_test)
        print(np.mean(self.y_test == z))

        return z
Example #19
def nearestNeighbour():
	import numpy as np
	import pylab as pl
	from matplotlib.colors import ListedColormap
	from sklearn import datasets
	from sklearn.neighbors import NearestCentroid

	n_neighbors = 15  # unused in this function

	# import some data to play with
	iris = datasets.load_iris()
	X = iris.data[:, :2]  # we only take the first two features. We could
	                      # avoid this ugly slicing by using a two-dim dataset
	y = iris.target

	h = .02  # step size in the mesh

	# Create color maps
	cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
	cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

	for shrinkage in [None, 0.1]:
	    # we create an instance of Neighbours Classifier and fit the data.
	    clf = NearestCentroid(shrink_threshold=shrinkage)
	    clf.fit(X, y)
	    y_pred = clf.predict(X)
	    print(shrinkage, np.mean(y == y_pred))
	    # Plot the decision boundary. For that, we will assign a color to each
	    # point in the mesh [x_min, x_max]x[y_min, y_max].
	    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
	    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
	    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
	                         np.arange(y_min, y_max, h))
	    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

	    # Put the result into a color plot
	    Z = Z.reshape(xx.shape)
	    pl.figure()
	    pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

	    # Plot also the training points
	    pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
	    pl.title("3-Class classification (shrink_threshold=%r)"
	             % shrinkage)
	    pl.axis('tight')
Example #20
def get_centeroids_per_class(features, labels):
    clf = NearestCentroid()
    clf.fit(features, labels)
    centroids = clf.centroids_
    class_labels = clf.predict(clf.centroids_)
    return {
        class_label: centroid
        for class_label, centroid in zip(class_labels, centroids)
    }
Example #22
class BinBasedCluster(BaseEstimator):
    def __init__(self, bins=[0, 0.5, 1] + list(range(5, 36))):
        self.bins = bins

    def fit(self, X, y):

        biny = self.bin_data(y)

        self.pred = NearestCentroid().fit(X, biny)
        return self

    def predict(self, X):
        return self.pred.predict(X)

    def score(self, X, y, is_raw=True):
        clusters = self.pred.predict(X)
        if is_raw:
            return adjusted_rand_score(self.bin_data(y), clusters)
        else:
            return adjusted_rand_score(y, clusters)

    def bin_data(self, y):
        return np.digitize(y, self.bins)

    def make_vern_points(self, X, y):

        sel = SelectKBest(score_func=normalized_mutual_info_score_scorefunc)
        sdata = sel.fit_transform(X, y)
        print(X.shape, sdata.shape)

        pca = PCA(n_components=2)
        pca_trans = pca.fit_transform(sdata)

        biny = self.bin_data(y)

        pred = NearestCentroid().fit(pca_trans, biny)

        x_min, x_max = pca_trans[:, 0].min() - 1, pca_trans[:, 0].max() + 1
        y_min, y_max = pca_trans[:, 1].min() - 1, pca_trans[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50), np.linspace(y_min, y_max, 50))
        Z = pred.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        return pca_trans, biny, xx, yy, Z
Example #24
def nearest_centroid_classifier(X_train, X_test, y_train, y_test):
    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid().fit(X_train, y_train)

    evaluate_cross_validation(clf,X_train, y_train, 5)


    y_roccio_predicted = clf.predict(X_test)
    print "\n Here is the classification report for NearestCentroid classifier:"
    print metrics.classification_report(y_test, y_roccio_predicted)
Example #25
def assign_mg2k_centroids(X, centroids=None):
    """
    Assigns Mg II k centroids found in the study
    'Identifying typical Mg II flare spectra using machine learning', by
    B. Panos et. al. 2018 to the Mg II k spectra supplied in X. The centroids
    are assigned using a nearest neighbour procedure.
    
    The spectra in X have to be interpolated to 216 wavelength bins between 
    LAMBDA_MIN = 2793.8500976562500 and LAMBDA_MAX = 2799.3239974882454. For example::
        
        X = raster.get_interpolated_image_step( 
                step = <step>, 
                lambda_min = LAMBDA_MIN, 
                lambda_max = LAMBDA_MAX, 
                n_breaks = 216  
                )
    
    Parameters
    ----------
    X : numpy.array
        interpolated raster image of shape (_,bins)
    centroids : numpy.array
        If None, the centroids defined in the above study will be used, otherwise an array of shape (n_centroids, n_bins) should be passed.
        Important: both the spectra in 'X' and in 'centroids' should be constrained to the same wavelength region!
    
    Returns
    -------
    assigned_mg2k_centroids
        numpy vector with shape (X.shape[0],)
    """

    # load default centroids if no centroids are passed
    if centroids is None:
        centroids = get_mg2k_centroids(bins=X.shape[1])

    # create list of numbered centroid ids
    centroid_ids = list(range(centroids.shape[0]))

    # check whether X comes in the correct dimensions
    if X.shape[1] != centroids.shape[1]:
        raise ValueError(
            "Expecting X to have shape (_,{}). Please interpolate accordingly (more information with 'help(assign_mg2k_centroids)')."
            .format(centroids.shape[1]))

    # create nearest centroid finder instance and fit it
    knc = NearestCentroid()
    knc.fit(X=centroids, y=centroid_ids)

    # predict nearest centroids for the supplied spectra
    # (making sure that X is normalized)
    assigned_mg2k_centroids = knc.predict(normalize(X))

    # return vector of assigned centroids
    return assigned_mg2k_centroids
Example #26
def test_nc_classify_with_sklearn(trainingData, trainingLabels, testData,
                                  testLabels):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X = np.array(trainingData)
        y = np.array(trainingLabels)
        clf = NearestCentroid()
        clf.fit(X, y)

        predictions = clf.predict(testData)
        printCorrectWrong(predictions, testLabels)
Example #27
def nearest_class_centroid(images_training, labels_training, images_testing,
                           labels_testing):

    pca = PCA(n_components=2)
    training_images_pca = pca.fit_transform(images_training)
    # Project the test images with the PCA basis fitted on the training
    # images; refitting on the test set would give an inconsistent projection.
    test_images_pca = pca.transform(images_testing)

    clf = NearestCentroid()
    clf.fit(training_images_pca, labels_training)
    print("Centoids: \n", clf.centroids_)

    return (clf.predict(test_images_pca), clf.centroids_, training_images_pca,
            test_images_pca)
Example #28
def rocchio(X_train, X_test, y_train, y_test,string):
    clf = NearestCentroid()
    clf.fit(X_train, y_train.values.ravel())
    #pickles.criarModelo(clf,"Rocchio "+string)
    if("Fold" in string):
        pickles.criarModelo(clf,"oraculo/"+string) # SAVE THE MODEL
    y_predito = clf.predict(X_test)
    micro = f1_score(y_test,y_predito,average='micro')
    macro = f1_score(y_test,y_predito,average='macro')
    #f1_individual = f1_score(y_test,y_predito,average=None)
    #salvar_dados.salvar(y_test,y_predito,micro, macro, f1_individual," Rocchio "+string)
    print("The micro F1 score of Rocchio", string, "is:", micro)
    print("The macro F1 score of Rocchio", string, "is:", macro)
Example #29
class NearestCentroidImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
Example #30
class NearestMeanClassifier(BaseClassifier):
    def __init__(self, feature_length, num_classes):
        super().__init__(feature_length, num_classes)
        self.num_classes = num_classes

        # model build
        # set shrink_threshold to a float to obtain the Nearest Shrunken Centroid classifier
        self.model = NearestCentroid(metric='manhattan')

    def train(self, features, labels):
        """
        Using a set of features and labels, trains the classifier and returns the training accuracy.
        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to train to predict
        :return: Prediction accuracy, as a float between 0 and 1
        """
        labels = self.labels_to_categorical(labels)
        self.model.fit(features, labels)
        accuracy = self.model.score(features, labels)
        return accuracy


    def get_prediction(self, features):
        '''
        Gets predictions from the model.
        :param features: samples to predict
        :return: predictions from the model
        '''
        return self.model.predict(features)

    def predict(self, features, labels):
        """
        Using a set of features and labels, predicts the labels from the features,
        and returns the accuracy of predicted vs actual labels.
        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to test prediction accuracy on
        :return: Prediction accuracy, as a float between 0 and 1
        """
        labels = self.labels_to_categorical(labels)
        accuracy = self.model.score(features, labels)
        return accuracy

    def labels_to_categorical(self, labels):
        '''
        Convert the labels from strings to numbers.
        :param labels: list of string labels
        :return: labels converted to numbers
        '''
        _, IDs = unique(labels, return_inverse=True)
        return IDs
def cluster(dataname,
            trn_X,
            trn_Y,
            dev_X,
            dev_Y,
            k,
            trial_num,
            link_type='ward',
            m='euclidean'):
    print("[{}] clustering with: linkage={}, m={}, n_clusters={}...".format(
        trial_num, link_type, m, k))
    clustering = AgglomerativeClustering(n_clusters=k,
                                         linkage=link_type,
                                         affinity=m)
    # clustering = KMeans(n_clusters=k)
    clustering.fit(trn_X)
    labels = clustering.labels_
    print('[{}] finished clustering.'.format(trial_num))

    ## labels: new_id -> cluster_number
    trn_id2i = dict()
    for rid, eid in trn_Y.items():
        trn_id2i[rid] = labels[eid]
    trn_oname = '../../resources/topicreps/{}_{}_{}_{}-train.labels.pkl'.format(
        dataname, link_type, m, k)
    pickle.dump(trn_id2i, open(trn_oname, 'wb'))
    print("[{}] saved to {}".format(trial_num, trn_oname))

    print("[{}] fitting centroid classifier ...".format(trial_num))
    clf = NearestCentroid()
    clf.fit(trn_X, labels)
    print("[{}] finished fitting classifier.".format(trial_num))
    cen_oname = '../../resources/topicreps/{}_{}_{}_{}.centroids.npy'.format(
        dataname, link_type, m, k)
    np.save(cen_oname, clf.centroids_)
    print("[{}] saved to {}".format(trial_num, cen_oname))

    dev_labels = clf.predict(dev_X)
    sse = calculate_sse(clf.centroids_, dev_X, dev_labels)
    print("[{}] Sum Squared Error: {}".format(trial_num, sse))

    dev_id2i = dict()
    for rid, eid in dev_Y.items():
        dev_id2i[rid] = dev_labels[eid]
    dev_oname = '../../resources/topicreps/{}_{}_{}_{}-dev.labels.pkl'.format(
        dataname, link_type, m, k)
    pickle.dump(dev_id2i, open(dev_oname, 'wb'))
    print("[{}] saved to {}".format(trial_num, dev_oname))
    print()
    return sse
Example #32
def classification(train_img, train_label, test_img, distance):
    """
    It trains the nearest centroid classification and output the predicting label.
    
    :param train_img: feature vector of training images 
    :param train_label: labels of training images 
    :param test_img: feature vector of test images
    :param distance: 'l1','l2' or 'cosine'
    :return: predicting labels of testing images and distance of all test feature vectors to the centroids
    """
    clf = NearestCentroid(metric=distance)
    clf.fit(train_img, train_label)
    predict_label = clf.predict(test_img)
    dist = pairwise_distances(test_img, clf.centroids_, metric=clf.metric)
    return predict_label, dist
def nc_fit(Xtrain, Xtest, Xtrain_lbls, Xtest_lbls, name, data, t0=None):
    if t0 is None:  # a default of t0=time() would be evaluated once at import time
        t0 = time()
    # Create a nearest centroid classifier
    clf = NearestCentroid()
    # Train with the data
    clf.fit(Xtrain, Xtrain_lbls)

    # Create prediction for test data
    y_pred_test = clf.predict(Xtest)

    # How well does it fit
    score = clf.score(Xtest, Xtest_lbls)

    print('%-9s\t%.2fs\t%-9s\t%-9s'
          % (name, (time() - t0), score, data))

    return y_pred_test
Example #34
def centroid_knn(data,
                 ref_data,
                 label,
                 ref_label,
                 using_boostrap=False,
                 output_mode=0):
    # clf = KNeighborsClassifier(n_neighbors=5)
    clf = NearestCentroid()
    clf.fit(ref_data, ref_label)
    pred = clf.predict(data)
    # print(confusion_matrix(pred, label))
    # print(classification_report(pred, label, digits=4))
    if using_boostrap:
        bootstrap(pred, label, output_mode)
    else:
        no_bootstrap(pred, label, output_mode)
Example #35
class NCClassifier(Classifier):
    """Rocchio classifier"""
    def __init__(self, shrink=None):
        self.cl = NearestCentroid(shrink_threshold=shrink)
        self.shrink = shrink

    def retrain(self, vectorFeature, vectorTarget):
        if self.shrink is not None:
            self.cl.fit([v.toarray()[0] for v in vectorFeature], vectorTarget)
        else:
            super(NCClassifier, self).retrain(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        if self.shrink is not None:
            return self.cl.predict(vectorizedTest.toarray()[0])[0]
        else:
            return super(NCClassifier, self).classify(vectorizedTest)
def ROCCHIO(FeatureMatrix, Labels):

    samples, features = FeatureMatrix.shape

    XTrain, XTest, LabelTrain, LabelTest = train_test_split(FeatureMatrix,
                                                            Labels,
                                                            test_size=0.1)

    # training model on dataset
    clf = NearestCentroid()
    clf.fit(XTrain, LabelTrain)

    # testing model on dataset
    expected = LabelTest
    predicted = clf.predict(XTest)

    return (expected, predicted)
    def __test_epoch_cluster(self):
        train_embeddings, train_targets = self.__extract_embeddings(
            self.eval_train_loader)
        test_embeddings, test_targets = self.__extract_embeddings(
            self.eval_test_loader)

        nc = NearestCentroid()
        nc.fit(train_embeddings, train_targets)
        predictions = nc.predict(test_embeddings)
        #classification_report = sklearn.metrics.classification_report(test_targets, predictions, target_names=['Open','Partial','Closed'])
        classification_report = sklearn.metrics.classification_report(
            test_targets, predictions, target_names=['Open', 'Closed'])
        classification_metrics = sklearn.metrics.precision_recall_fscore_support(
            test_targets, predictions, average='macro')
        confusion_matrix = sklearn.metrics.confusion_matrix(
            test_targets, predictions)

        return classification_report, classification_metrics, confusion_matrix
    def predict(self, user_id):
        train_set = pd.read_csv(
            f'../FileCenter/FeaturesPerUser/user{user_id}_train_features.csv')
        test_set = pd.read_csv(
            f'../FileCenter/FeaturesPerUser/user{user_id}_test_features.csv')
        clf = NearestCentroid()
        x_train = train_set.iloc[:, :-1]
        clf.fit(x_train, train_set['label'])
        x_test = test_set.iloc[:, :-1]
        plot_confusion_matrix(clf, x_test, test_set['label'],
                              normalize='true')
        plt.show()
        predicted = clf.predict(x_test)
        with open(
                '../FileCenter/classifiers_predictions/predicted_NearestCentroid',
                'wb') as fp:
            pickle.dump(predicted, fp)
        return predicted
Example #41
# Assuming ind is a boolean mask: invert it with ~ (np.negative would
# compute an arithmetic negation, not the complement of the mask).
train_images = faces.images[~ind]
train_targets = faces.target[~ind]
n_train = len(train_images)

test_images = faces.images[ind]
test_targets = faces.target[ind]
n_tests = len(test_images)
# Corrupt each test image in place: add noise, blank a band of rows and
# columns, then clip the result back to [0, 1].
for idx, test in enumerate(test_images):
    test = test + norm.rvs(scale=10, size=test.shape)
    for i in range(25, 30):
        test[i, :] = 0
        test[:, i] = 0
    test_images[idx] = np.clip(test, 0, 1)

train = train_images.reshape((n_train, -1))
train_pca = pca.fit_transform(train)
    
test = test_images.reshape((n_tests, -1))
test_pca = pca.transform(test)

neigh = NearestCentroid()

neigh.fit(train, train_targets)
print("Sans ACP :", np.count_nonzero(neigh.predict(test) - test_targets), n_tests)

neigh.fit(train_pca, train_targets)
print("Avec ACP :", np.count_nonzero(neigh.predict(test_pca) - test_targets), n_tests)

iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target

h = 0.02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

for shrinkage in [None, 0.1]:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(shrinkage, np.mean(y == y_pred))
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure()
    pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
clf5=RandomForestClassifier(n_estimators=100)   #RandomForest Classifier
clf5.fit(X_train, y_train)
pred = clf5.predict(X_test)
writeToDisk(pred,"RandomForestClassifier")

clf6=Pipeline([('feature_selection',            #LinearSVC with L2-based feature selection
    LinearSVC(penalty="l2", dual=False, tol=1e-3)),
    ('classification', LinearSVC())])
clf6.fit(X_train, y_train)
pred = clf6.predict(X_test)
writeToDisk(pred,"LinearSVC")

clf7=NearestCentroid()                          #NearestCentroid (aka Rocchio classifier), no threshold 
clf7.fit(X_train, y_train)
pred = clf7.predict(X_test)
writeToDisk(pred,"NearestCentroid")

clf8=SVC(C=1.0, class_weight=None, coef0=0.0,   #SVC
    decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=1, shrinking=True,
    tol=0.001, verbose=False)
clf8.fit(X_train, y_train)
pred = clf8.predict(X_test)
writeToDisk(pred,"SVC")
'''
clf9=VotingClassifier(estimators=[
    ('Ridge',clf1),('MultiNB',clf2),('BernNB',clf3),('KNN',clf4),
    ('RF',clf5),('LinearSVC',clf6),('NearC',clf7),('SVC',clf8)
    ],voting='soft')
Example #44
        y_test = labels[272:,i]
    else:
        X_train = training
        y_train = labels[:172,i]
        X_test = sampletest
        y_test = labels[172:,i]

    posterior = np.empty([100,72,6])
    box = np.zeros([6,6])
    for j in range(4,5):
        for k in range(1,2):
            accuracy = np.zeros(100)
            for m in range(0,100):
                ncc = NearestCentroid()
                ncc.fit(X_train, y_train)
                y_pred = ncc.predict(X_test)
                
                n=0
                for i in range(0,len(y_pred)):
                    if y_pred[i] == y_test[i]:
                #print i, y_pred[i], y_test[i]
                        n = n+1
                        accuracy[m] = accuracy[m]+1
                    box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1
                #posterior[m] =  knc.predict_proba(X_test)
            print(j, k, np.mean(accuracy)/0.72, np.std(accuracy)/0.72)
            #print 30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0
        '''
    means = np.empty([72,6])
    stds = np.empty([72,6])
    grid = np.empty([6,6])
def test_iris():
    # Check consistency on dataset iris.
    for metric in ('euclidean', 'cosine'):
        clf = NearestCentroid(metric=metric).fit(iris.data, iris.target)
        score = np.mean(clf.predict(iris.data) == iris.target)
        assert score > 0.9, "Failed with score = " + str(score)
def test_precomputed():
    clf = NearestCentroid(metric="precomputed")
    clf.fit(X, y)
    S = pairwise_distances(T, clf.centroids_)
    assert_array_equal(clf.predict(S), true_result)
# Nearest Centroid
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import NearestCentroid
# load the iris datasets
dataset = datasets.load_iris()
# fit a nearest centroid model to the data
model = NearestCentroid()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))