def test_classification_toy():
    # Check classification on a toy dataset for every combination of dense
    # and sparse inputs at fit time and at predict time.
    cases = [
        (X, T),  # dense fit, dense predict
        (X_csr, T_csr),  # sparse fit, sparse predict
        (X_csr, T),  # sparse fit, dense predict
        (X, T_csr),  # dense fit, sparse predict
        (X_csr.tocoo(), T_csr.tolil()),  # non-CSR sparse formats
    ]
    for fit_data, predict_data in cases:
        model = NearestCentroid()
        model.fit(fit_data, y)
        assert_array_equal(model.predict(predict_data), true_result)
def test_classification_toy():
    """Verify toy-dataset predictions for dense and sparse inputs alike."""

    def check(fit_data, predict_data):
        # A fresh estimator per scenario so no state leaks between them.
        estimator = NearestCentroid()
        estimator.fit(fit_data, y)
        assert_array_equal(estimator.predict(predict_data), true_result)

    # Dense everywhere.
    check(X, T)
    # Sparse matrix for both fitting and testing.
    check(X_csr, T_csr)
    # Sparse fit, dense test.
    check(X_csr, T)
    # Dense fit, sparse test.
    check(X, T_csr)
    # Sparse formats other than CSR.
    check(X_csr.tocoo(), T_csr.tolil())
def _nearestcentroid(*, train, test, x_predict=None, metrics, metric='euclidean', shrink_threshold=None): """ For more info visit : https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestCentroid.html#sklearn.neighbors.NearestCentroid """ model = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold) model.fit(train[0], train[1]) model_name = 'Nearest Centroid' y_hat = model.predict(test[0]) if metrics == 'accuracy': accuracy = accuracy_score(test[1], y_hat) if metrics == 'f1': accuracy = f1_score(test[1], y_hat) if metrics == 'jaccard': accuracy = jaccard_score(test[1], y_hat) if x_predict is None: return (model_name, accuracy, None) y_predict = model.predict(x_predict) return (model_name, accuracy, y_predict)
def nearest_centroid_classifier(train, validation, verbose=False):
    """Fit a nearest-centroid classifier and report its accuracies.

    :param train: mapping with ``'data'`` (features) and ``'labels'`` entries
        used for fitting.
    :param validation: mapping with the same two entries, used for evaluation.
    :param verbose: when true, print both accuracies.
    :return: ``(acc_train, acc_validation)``, each the fraction of samples
        whose predicted label equals the given label.
    """
    model = NearestCentroid()
    model.fit(train['data'], train['labels'])

    # Accuracy on the data the model was fitted on.
    train_pred = model.predict(train['data'])
    acc_train = np.mean(train_pred == train['labels'])

    # Accuracy on the held-out validation data.
    validation_pred = model.predict(validation['data'])
    acc_validation = np.mean(validation_pred == validation['labels'])

    if verbose:
        # The messages previously mislabelled this model as "lda classifier".
        print('Train Accuracy for nearest centroid classifier, = {0:f}'.format(acc_train))
        print('Validation Accuracy for nearest centroid classifier, = {0:f}'.format(acc_validation))
    return acc_train, acc_validation
def test_predict_translated_data():
    # NearestCentroid predictions must not change when the same offset
    # vector is added to every sample.
    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)

    def fit_then_predict(data):
        # Fit a fresh shrunken-centroid model and label its own training data.
        model = NearestCentroid(shrink_threshold=0.1)
        model.fit(data, y)
        return model.predict(data)

    y_init = fit_then_predict(X)
    y_translate = fit_then_predict(X + noise)
    assert_array_equal(y_init, y_translate)
def test_predict_translated_data():
    # Shifting every sample by the same vector must leave the predicted
    # labels of NearestCentroid untouched.
    random_state = np.random.RandomState(0)
    X = random_state.rand(50, 50)
    y = random_state.randint(0, 3, 50)
    shift = random_state.rand(50)

    # Reference predictions on the original data.
    reference_model = NearestCentroid(shrink_threshold=0.1)
    reference_model.fit(X, y)
    y_init = reference_model.predict(X)

    # Same experiment on the translated data.
    shifted_model = NearestCentroid(shrink_threshold=0.1)
    X_shifted = X + shift
    shifted_model.fit(X_shifted, y)
    y_translate = shifted_model.predict(X_shifted)

    assert_array_equal(y_init, y_translate)
class NC:
    """Thin wrapper around ``NearestCentroid`` that also reports accuracy."""

    def __init__(self):
        self.clf = NearestCentroid()
        # Filled with the fitted estimator's ``centroids_`` by ``fit``.
        self.centroids = []

    def fit(self, train_data, train_lbls):
        """Calculate the mean of each class in the training data.

        :param train_data: training data
        :param train_lbls: training labels
        :return: this instance, to allow call chaining
        """
        self.clf.fit(train_data, train_lbls)
        self.centroids = self.clf.centroids_
        return self

    def predict(self, test_data, test_lbls):
        """Classify test data with the fitted class means.

        :param test_data: testing data
        :param test_lbls: testing labels
        :return: ``(classification, score)`` where ``classification`` holds
            the predicted labels and ``score`` is the accuracy, or ``None``
            when ``accuracy_score`` rejects the labels with ``ValueError``.
        """
        classification = self.clf.predict(test_data)
        try:
            score = accuracy_score(test_lbls, classification)
        except ValueError:
            # Best effort: still return the predictions when scoring fails.
            score = None
        return classification, score
def agglomerative_validation(self, iterations):
    """Repeated hold-out validation of the agglomerative clustering.

    For each of ``iterations`` seeded 80/20 splits of ``self.gene_df``, both
    parts are clustered independently; the held-out samples are then assigned
    to the centroids of the training clusters, and that assignment is compared
    with the held-out clustering through the adjusted Rand score. The mean
    and variance of the scores are printed.
    """
    print('\nPerforming holdout validation for agglomerative clusterer...')
    rands = []
    for seed in range(iterations):
        X_train, X_test = train_test_split(
            self.gene_df, test_size=0.2, random_state=seed)

        train_clusterer = cluster.AgglomerativeClustering(
            n_clusters=self.n_clusters, linkage='ward', affinity='euclidean')
        agglomerative_training = train_clusterer.fit(X_train)

        test_clusterer = cluster.AgglomerativeClustering(
            n_clusters=self.n_clusters, linkage='ward', affinity='euclidean')
        agglomerative_testing = test_clusterer.fit(X_test)

        # Classify held-out samples by the centroids of the training clusters.
        centroid_model = NearestCentroid()
        centroid_model.fit(X_train, agglomerative_training.labels_)
        held_out_prediction = centroid_model.predict(X_test)

        # Agreement between the predicted and the independently found labels.
        rands.append(
            adjusted_rand_score(held_out_prediction,
                                agglomerative_testing.labels_))

    print('average of rand scores', sum(rands) / len(rands))
    print('variance of rand scores', statistics.variance(rands))
def NC(data_train, data_train_vectors, data_test_vectors, **kwargs):
    """Predict test labels with a nearest-centroid classifier.

    The model is fitted on ``data_train_vectors`` against
    ``data_train.target``; extra keyword arguments are accepted and ignored.
    """
    model = NearestCentroid()
    model.fit(data_train_vectors, data_train.target)
    return model.predict(data_test_vectors)
def nsc_fit(Xtrain, Xtrain_lbls, Xtest, Xtest_lbls, n_clust, rng, start, name, datat, t0=None):
    """Classify by nearest sub-class centroid and print a timing/score line.

    Every class ``i`` in ``range(start, rng)`` is split into ``n_clust``
    k-means clusters. The cluster centres, labelled ``"<class>_<cluster>"``,
    train a ``NearestCentroid`` model, and the class part of each predicted
    label is scored against ``Xtest_lbls``.

    :param start: first class index (per the original note: MNIST starts at
        0 and ORL at 1).
    :param rng: exclusive upper bound of the class indices.
    :param name: label printed in the report line.
    :param datat: dataset description printed in the report line.
    :param t0: reference time for the elapsed-time column; defaults to the
        moment of the call.
    :return: the raw ``"<class>_<cluster>"`` predictions for ``Xtest``.
    """
    # A ``t0=time()`` default is evaluated once, when the function is
    # defined, so elapsed time would be counted from import, not from the call.
    if t0 is None:
        t0 = time()

    centers = []
    labels = []
    # Cluster each class separately and collect its sub-class centres.
    for i in range(start, rng):
        data = Xtrain[np.nonzero(Xtrain_lbls == i)]
        kmeans = KMeans(n_clusters=n_clust, random_state=42).fit(data)
        for k in range(n_clust):
            centers.append(kmeans.cluster_centers_[k, :])
            labels.append(str(i) + '_' + str(k))

    # Fit the nearest-centroid model on the sub-class centres.
    clf = NearestCentroid()
    clf.fit(centers, labels)
    pred = clf.predict(Xtest)

    # Strip the cluster suffix to recover the class of every prediction.
    correct_labels = [int(label.split('_')[0]) for label in pred]

    score = accuracy_score(Xtest_lbls, correct_labels)
    print('%-9s\t%.2fs\t%-9s\t%-9s' % (name, (time() - t0), score, datat))
    return pred
class AlgorithmRunner:
    """Run one of the supported classifiers behind a common interface."""

    def __init__(self, algo_name):
        """Initialize the algorithm.

        :param algo_name: the name of the current algorithm, ``"KNN"`` or
            ``"Rocchio"``
        :raises ValueError: if ``algo_name`` is not a supported name
        """
        if algo_name == "KNN":
            self.algorithm = KNeighborsClassifier(n_neighbors=10)
        elif algo_name == "Rocchio":
            self.algorithm = NearestCentroid()
        else:
            # An unknown name used to leave ``self.algorithm`` undefined,
            # which only failed later with an AttributeError.
            raise ValueError(
                "Unknown algorithm name: {!r}".format(algo_name))

    def fit(self, train_features, train_labels):
        """Call the wrapped estimator's ``fit`` method.

        :param train_features: the features that we train on
        :param train_labels: the labels for the features that we train on
        """
        self.algorithm.fit(train_features, train_labels)

    def predict(self, test_features):
        """Call the wrapped estimator's ``predict`` method.

        :param test_features: the features that we test on
        :return: whatever the wrapped estimator's ``predict`` returns
        """
        return self.algorithm.predict(test_features)
def Centroid(X_train, Y_train, X_test, Y_test):
    """Tune, fit and evaluate a nearest-centroid classifier.

    ``shrink_threshold`` is chosen by a grid search over 100 evenly spaced
    values between 0 and 10. A new model using the best value is fitted on
    the training data and evaluated on the test data; diagnostics are printed.

    :return: ``(clf, test_error, acc_score)``: the fitted model, the mean
        squared error and the accuracy of its test predictions.
    """
    # Cross-validated search over the shrinkage parameter.
    param_grid = [{'shrink_threshold': np.linspace(0, 10, 100)}]
    search = GridSearchCV(NearestCentroid(), param_grid)
    search.fit(X_train, Y_train)

    # Optimal parameters
    print('Best Params: ')
    print(search.best_params_)

    # Optimal model, refitted from scratch with the selected threshold.
    best_threshold = search.best_params_['shrink_threshold']
    clf = NearestCentroid()
    clf.set_params(shrink_threshold=best_threshold)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)
    test_error = mean_squared_error(Y_test, pred)
    acc_score = accuracy_score(Y_test, pred)

    print('Nearest Centroid Test Error: ' + str(test_error))
    print('Nearest Centroid Accuracy Score: ' + str(acc_score))
    print('First 10 predictions: ')
    print(pred[:10])
    print('First 10 actual: ')
    print(Y_test[:10])
    print('Centroid of each class: ')
    # Only the first row of the centroid matrix is printed.
    print(clf.centroids_[0])
    print('Class labels known to the classifier: ')
    print(clf.classes_)
    return clf, test_error, acc_score
def nearest_centroid_classifier(X_train, categories, X_test, test_categories):
    """Fit a nearest-centroid classifier and report on its test predictions.

    Prints a classification report for ``X_test`` against ``test_categories``
    and passes the true and predicted labels to ``to_latex``.
    """
    from sklearn.neighbors import NearestCentroid

    clf = NearestCentroid().fit(X_train, categories)
    y_roccio_predicted = clf.predict(X_test)
    # Single-argument print(...) produces the same output on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("\n Here is the classification report for NearestCentroid classifier:")
    print(metrics.classification_report(test_categories, y_roccio_predicted))
    to_latex(test_categories, y_roccio_predicted)
def run_NearestCentroid(X_train, X_test, y_train):
    """Fit a nearest-centroid model and label the test samples.

    :return: ``(y_pred, cents)``: the predictions for ``X_test`` and the
        fitted model's ``centroids_``.
    """
    model = NearestCentroid()
    model.fit(X_train, y_train)
    fitted_centroids = model.centroids_
    return model.predict(X_test), fitted_centroids
def run_knn(clf_output_file, x_train, x_test, y_train, y_test):
    """Train, save and evaluate a classifier.

    NOTE: despite the function name and the "KNN" wording of the printed
    messages, the estimator built here is ``NearestCentroid``.

    :param clf_output_file: String
        Name of the file the fitted classifier is dumped to with joblib.
    :param x_train: training features.
    :param x_test: test features.
    :param y_train: training labels.
    :param y_test: true test labels.
    :return: ``(accuracy, Matthews correlation coefficient, ROC AUC)`` of
        the test predictions.
    """
    # Time the training and the saving of the model together.
    started = time.time()
    clf = NearestCentroid().fit(x_train, y_train)
    joblib.dump(clf, clf_output_file)
    elapsed = time.time() - started
    print('KNN train & save model in: ' + str(elapsed))

    # Evaluate on the held-out data.
    y_predict = clf.predict(x_test)
    knn_acc = accuracy_score(y_test, y_predict)
    knn_mcc = matthews_corrcoef(y_test, y_predict)
    knn_auc = roc_auc_score(y_test, y_predict)

    print("KNN classifier:")
    print("acc: " + str(knn_acc))
    print("mcc: " + str(knn_mcc))
    print("auc: " + str(knn_auc))
    return knn_acc, knn_mcc, knn_auc
def nearest_mean_classifier(X_train, y_train, X_validation, X_test):
    # Returns the labels predicted by a nearest mean classifier trained on
    # X_train and y_train.
    # Input:
    #   X_train - num_train x num_features matrix with features for the training data
    #   y_train - num_train x 1 vector with labels for the training data
    #   X_validation - matrix with features for the validation data
    #   X_test - matrix with features for the test data
    # Output:
    #   y_pred_validation - predicted labels for the validation data
    #   y_pred_test - predicted labels for the test data
    # NOTE(review): the original Dutch comments also said "not in use";
    # it is unclear what that remark referred to.
    n_validation = len(X_validation)

    # Stack both datasets so a single predict call covers them.
    stacked = np.vstack((X_validation, X_test))

    model = NearestCentroid()
    model.fit(X_train, y_train)  # determine the class means
    all_predictions = model.predict(stacked)

    # Split the predictions back into their validation and test parts.
    return all_predictions[:n_validation], all_predictions[n_validation:]
def test_iris_shrinkage():
    # Training accuracy on iris must stay above 0.8 for every combination
    # of metric and shrink threshold.
    thresholds = [None, 0.1, 0.5]
    for metric in ("euclidean", "cosine"):
        for shrink_threshold in thresholds:
            model = NearestCentroid(metric=metric,
                                    shrink_threshold=shrink_threshold)
            fitted = model.fit(iris.data, iris.target)
            hits = fitted.predict(iris.data) == iris.target
            score = np.mean(hits)
            assert score > 0.8, "Failed with score = " + str(score)
def nearestcentrclassif(self, shrinkage=0.1):
    """Fit a shrunken nearest-centroid model and label ``self.x_test``.

    Prints the fraction of test samples whose prediction equals
    ``self.y_test`` and returns the predictions.
    """
    model = NearestCentroid(shrink_threshold=shrinkage)
    model.fit(self.x_train, self.y_train)
    predicted = model.predict(self.x_test)
    accuracy = np.mean(self.y_test == predicted)
    print(accuracy)
    return predicted
def nearestNeighbour():
    """Plot nearest-centroid decision regions on two iris features.

    For each shrink threshold in ``[None, 0.1]`` a model is fitted on the
    first two iris features, its training accuracy is printed, and a new
    figure is created showing the decision regions with the training points
    drawn on top.
    """
    import numpy as np
    import pylab as pl
    from matplotlib.colors import ListedColormap
    from sklearn import datasets
    from sklearn.neighbors import NearestCentroid

    # import some data to play with
    iris = datasets.load_iris()
    # we only take the first two features. We could avoid this ugly slicing
    # by using a two-dim dataset
    X = iris.data[:, :2]
    y = iris.target

    h = .02  # step size in the mesh

    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    for shrinkage in [None, 0.1]:
        # we create an instance of the classifier and fit the data.
        clf = NearestCentroid(shrink_threshold=shrinkage)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # Formatting through %s keeps the output of the former Python 2
        # ``print a, b`` statement while also being valid Python 3.
        print("%s %s" % (shrinkage, np.mean(y == y_pred)))

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        pl.figure()
        pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points
        pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        pl.title("3-Class classification (shrink_threshold=%r)" % shrinkage)
        pl.axis('tight')
def get_centeroids_per_class(features, labels):
    """Map each class label to the centroid of its samples.

    The label of every centroid is obtained by asking the fitted model to
    classify the centroid itself.
    """
    model = NearestCentroid()
    model.fit(features, labels)
    centroids = model.centroids_
    centroid_labels = model.predict(centroids)
    return dict(zip(centroid_labels, centroids))
def test_iris_shrinkage():
    # Consistency check on iris when shrinkage is used: the training
    # accuracy has to exceed 0.8 for each metric/threshold pair.
    metrics_to_try = ('euclidean', 'cosine')
    for metric in metrics_to_try:
        for shrink_threshold in [None, 0.1, 0.5]:
            estimator = NearestCentroid(metric=metric,
                                        shrink_threshold=shrink_threshold)
            estimator = estimator.fit(iris.data, iris.target)
            predicted = estimator.predict(iris.data)
            score = np.mean(predicted == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)
class BinBasedCluster(BaseEstimator):
    """Nearest-centroid model trained on binned target values.

    Targets are discretised with ``np.digitize`` against ``bins`` and the
    resulting bin indices serve as class labels.
    """

    # ``range`` must be turned into a list explicitly: on Python 3 the
    # expression ``list + range`` raises TypeError while the class is defined.
    # NOTE: the default list is shared between instances; it is never mutated
    # by this class.
    def __init__(self, bins=[0, 0.5, 1] + list(range(5, 36))):
        self.bins = bins

    def fit(self, X, y):
        """Fit a ``NearestCentroid`` model on ``X`` against the binned ``y``."""
        biny = self.bin_data(y)
        self.pred = NearestCentroid().fit(X, biny)
        return self

    def predict(self, X):
        """Return the predicted bin index for every row of ``X``."""
        return self.pred.predict(X)

    def score(self, X, y, is_raw=True):
        """Adjusted Rand score between the predictions and the targets.

        ``y`` is binned first when ``is_raw`` is true; otherwise it is used
        as given.
        """
        clusters = self.pred.predict(X)
        if is_raw:
            return adjusted_rand_score(self.bin_data(y), clusters)
        return adjusted_rand_score(y, clusters)

    def bin_data(self, y):
        """Return the index of the bin each value of ``y`` falls into."""
        return np.digitize(y, self.bins)

    def make_vern_points(self, X, y):
        """Build a 2-D projection and a prediction grid for plotting.

        Features are selected with ``SelectKBest``, projected onto two
        principal components, and a nearest-centroid model fitted on that
        projection is evaluated on a 50 x 50 grid around the data.

        :return: ``(pca_trans, biny, xx, yy, Z)``: projected data, binned
            targets, the grid coordinates and the grid predictions.
        """
        sel = SelectKBest(score_func=normalized_mutual_info_score_scorefunc)
        sdata = sel.fit_transform(X, y)
        # Same output as the former Python 2 ``print a, b`` statement.
        print("%s %s" % (X.shape, sdata.shape))

        pca = PCA(n_components=2)
        pca_trans = pca.fit_transform(sdata)

        biny = self.bin_data(y)
        pred = NearestCentroid().fit(pca_trans, biny)

        # Grid extends one unit beyond the projected data on every side.
        x_min, x_max = pca_trans[:, 0].min() - 1, pca_trans[:, 0].max() + 1
        y_min, y_max = pca_trans[:, 1].min() - 1, pca_trans[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50),
                             np.linspace(y_min, y_max, 50))
        Z = pred.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        return pca_trans, biny, xx, yy, Z
def nearestNeighbour():
    """Draw nearest-centroid decision regions for two iris features.

    A model is fitted for each shrink threshold in ``[None, 0.1]`` on the
    first two iris features; its training accuracy is printed and a new
    figure with the decision regions and the training points is created.
    """
    import numpy as np
    import pylab as pl
    from matplotlib.colors import ListedColormap
    from sklearn import datasets
    from sklearn.neighbors import NearestCentroid

    # import some data to play with
    iris = datasets.load_iris()
    # we only take the first two features. We could avoid this ugly slicing
    # by using a two-dim dataset
    X = iris.data[:, :2]
    y = iris.target

    h = .02  # step size in the mesh

    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    for shrinkage in [None, 0.1]:
        # we create an instance of the classifier and fit the data.
        clf = NearestCentroid(shrink_threshold=shrinkage)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # %s formatting reproduces the output of the former Python 2
        # ``print a, b`` statement and is valid on Python 3 as well.
        print("%s %s" % (shrinkage, np.mean(y == y_pred)))

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        pl.figure()
        pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points
        pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        pl.title("3-Class classification (shrink_threshold=%r)" % shrinkage)
        pl.axis('tight')
def nearest_centroid_classifier(X_train, X_test, y_train, y_test):
    """Fit a nearest-centroid classifier and print its evaluation.

    The fitted model is first passed to ``evaluate_cross_validation`` with
    the training data and the value 5, then a classification report for its
    predictions on ``X_test`` is printed.
    """
    from sklearn.neighbors import NearestCentroid

    clf = NearestCentroid().fit(X_train, y_train)
    evaluate_cross_validation(clf, X_train, y_train, 5)
    y_roccio_predicted = clf.predict(X_test)
    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # former print statements were a SyntaxError on Python 3.
    print("\n Here is the classification report for NearestCentroid classifier:")
    print(metrics.classification_report(y_test, y_roccio_predicted))
def assign_mg2k_centroids(X, centroids=None):
    """
    Assigns Mg II k centroids found in the study 'Identifying typical Mg II
    flare spectra using machine learning', by B. Panos et. al. 2018 to the
    Mg II k spectra supplied in X. The centroids are assigned using a nearest
    neighbour procedure.

    The spectra in X have to be interpolated to 216 wavelength bins between
    LAMBDA_MIN = 2793.8500976562500 and LAMBDA_MAX = 2799.3239974882454.

    For example::

        X = raster.get_interpolated_image_step(
            step = <step>,
            lambda_min = LAMBDA_MIN,
            lambda_max = LAMBDA_MAX,
            n_breaks = 216
        )

    Parameters
    ----------
    X : numpy.array
        interpolated raster image of shape (_,bins)
    centroids : numpy.array
        If None, the centroids defined in the above study will be used,
        otherwise an array of shape (n_centroids, n_bins) should be passed.
        Important: both the spectra in 'X' and in 'centroids' should be
        constrained to the same wavelength region!

    Returns
    -------
    assigned_mg2k_centroids
        numpy vector with one centroid id per spectrum, i.e. of shape
        (X.shape[0],)

    Raises
    ------
    ValueError
        If the number of bins in 'X' differs from that of 'centroids'.
    """
    # load default centroids if no centroids are passed
    if centroids is None:
        centroids = get_mg2k_centroids(bins=X.shape[1])

    # create list of numbered centroid ids
    centroid_ids = list(range(centroids.shape[0]))

    # check whether X comes in the correct dimensions
    if X.shape[1] != centroids.shape[1]:
        raise ValueError(
            "Expecting X to have shape (_,{}). Please interpolate accordingly (More information with 'help(assign_mg2k_centroids)')."
            .format(centroids.shape[1]))

    # create nearest centroid finder instance and fit it
    knc = NearestCentroid()
    knc.fit(X=centroids, y=centroid_ids)

    # predict nearest centroids for the supplied spectra
    # (X is passed through normalize() first)
    assigned_mg2k_centroids = knc.predict(normalize(X))

    # return vector of assigned centroids
    return assigned_mg2k_centroids
def test_nc_classify_with_sklearn(trainingData, trainingLabels, testData, testLabels):
    """Classify ``testData`` with scikit-learn's ``NearestCentroid``.

    Runs with all warnings silenced and hands the predictions together with
    ``testLabels`` to ``printCorrectWrong``.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        features = np.array(trainingData)
        targets = np.array(trainingLabels)
        model = NearestCentroid()
        model.fit(features, targets)
        printCorrectWrong(model.predict(testData), testLabels)
def nearest_class_centroid(images_training, labels_training, images_testing, labels_testing):
    """Classify test images by nearest class centroid in a 2-D PCA space.

    The PCA projection is learned from the training images only and then
    applied unchanged to the test images, so both sets live in the same
    coordinate system. ``labels_testing`` is accepted but not used.

    :return: ``(predictions, centroids, training_images_pca,
        test_images_pca)``.
    """
    pca = PCA(n_components=2)
    training_images_pca = pca.fit_transform(images_training)
    # Reuse the training projection. Re-fitting on the test images (as
    # fit_transform did) yields axes unrelated to the ones the centroids
    # were computed in.
    test_images_pca = pca.transform(images_testing)

    clf = NearestCentroid()
    clf.fit(training_images_pca, labels_training)
    print("Centoids: \n", clf.centroids_)
    return (clf.predict(test_images_pca), clf.centroids_,
            training_images_pca, test_images_pca)
def rocchio(X_train, X_test, y_train, y_test,string):
    """Fit a Rocchio (nearest-centroid) model and print its F1 scores.

    The model is saved through ``pickles.criarModelo`` only when ``string``
    contains ``"Fold"``. Micro- and macro-averaged F1 scores of the test
    predictions are printed; nothing is returned.
    """
    model = NearestCentroid()
    model.fit(X_train, y_train.values.ravel())

    # Save the model
    if "Fold" in string:
        pickles.criarModelo(model, "oraculo/" + string)

    predicted = model.predict(X_test)
    micro = f1_score(y_test, predicted, average='micro')
    macro = f1_score(y_test, predicted, average='macro')

    print("O f1Score micro do Rocchio ", string ," é: ",micro)
    print("O f1Score macro do Rocchio ", string ," é: ",macro)
class NearestCentroidImpl:
    """Adapter that forwards ``fit`` and ``predict`` to a wrapped ``Op``."""

    def __init__(self, **hyperparams):
        # Keep the hyperparameters and build the wrapped model from them.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is passed on only when given."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self._wrapped_model.predict(X)
class NearestMeanClassifier(BaseClassifier):
    """Nearest-mean classifier built on ``NearestCentroid`` (manhattan metric)."""

    def __init__(self, feature_length, num_classes):
        super().__init__(feature_length, num_classes)
        self.num_classes = num_classes
        # Distinct labels seen by the latest train() call; None until then.
        self._train_classes = None
        # model build
        self.model = NearestCentroid(metric='manhattan')

    def train(self, features, labels):
        """
        Using a set of features and labels, trains the classifier and returns the training accuracy.

        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to train to predict
        :return: Prediction accuracy, as a float between 0 and 1
        """
        # Remember the label -> integer encoding so predict() can reuse it.
        self._train_classes, ids = unique(labels, return_inverse=True)
        self.model.fit(features, ids)
        accuracy = self.model.score(features, ids)
        return accuracy

    def get_prediction(self, features):
        '''
        Get the prediction of the model.

        :param features: sample to predict
        :return: prediction from the model (integer class ids)
        '''
        return self.model.predict(features)

    def predict(self, features, labels):
        """
        Using a set of features and labels, predicts the labels from the features,
        and returns the accuracy of predicted vs actual labels.

        :param features: An MxN matrix of features to use in prediction
        :param labels: An M row list of labels to test prediction accuracy on
        :return: Prediction accuracy, as a float between 0 and 1
        """
        ids = self._encode_like_training(labels)
        accuracy = self.model.score(features, ids)
        return accuracy

    def labels_to_categorical(self, labels):
        '''
        Convert the labels to numbers, numbering the distinct values of
        ``labels`` itself.

        :param labels: labels list of string
        :return: labels converted in number
        '''
        _, IDs = unique(labels, return_inverse=True)
        return IDs

    def _encode_like_training(self, labels):
        """Encode ``labels`` with the integer ids assigned during training.

        Encoding the test labels on their own gives different ids whenever
        they do not contain exactly the training classes. Labels unknown to
        the training set map to -1, which never equals a prediction.
        """
        classes = getattr(self, '_train_classes', None)
        if classes is None:
            # Not trained through train(): keep the stand-alone encoding.
            return self.labels_to_categorical(labels)
        lookup = {label: index for index, label in enumerate(classes.tolist())}
        seen, inverse = unique(labels, return_inverse=True)
        remap = [lookup.get(label, -1) for label in seen.tolist()]
        return [remap[position] for position in inverse.ravel().tolist()]
def cluster(dataname, trn_X, trn_Y, dev_X, dev_Y, k, trial_num, link_type='ward', m='euclidean'):
    """Cluster the training data, label the dev data and save the results.

    The training vectors are clustered agglomeratively into ``k`` clusters.
    A nearest-centroid classifier fitted on those cluster labels then assigns
    a cluster to every dev vector. Training labels, centroids and dev labels
    are written under ``../../resources/topicreps/``.

    :param trn_Y: mapping from an id to the row index of that item in
        ``trn_X``.
    :param dev_Y: mapping from an id to the row index of that item in
        ``dev_X``.
    :param trial_num: tag printed at the start of every progress message.
    :return: the value ``calculate_sse`` returns for the dev assignment.
    """
    print("[{}] clustering with: linkage={}, m={}, n_clusters={}...".format(
        trial_num, link_type, m, k))
    clustering = AgglomerativeClustering(n_clusters=k, linkage=link_type, affinity=m)
    clustering.fit(trn_X)
    labels = clustering.labels_
    print('[{}] finished clustering.'.format(trial_num))

    # id -> cluster number for the training items
    trn_id2i = {rid: labels[eid] for rid, eid in trn_Y.items()}
    trn_oname = '../../resources/topicreps/{}_{}_{}_{}-train.labels.pkl'.format(
        dataname, link_type, m, k)
    # The context manager closes the file; a bare open() handle never was.
    with open(trn_oname, 'wb') as trn_file:
        pickle.dump(trn_id2i, trn_file)
    print("[{}] saved to {}".format(trial_num, trn_oname))

    print("[{}] fitting centroid classifier ...".format(trial_num))
    clf = NearestCentroid()
    clf.fit(trn_X, labels)
    print("[{}] finished fitting classifier.".format(trial_num))

    cen_oname = '../../resources/topicreps/{}_{}_{}_{}.centroids.npy'.format(
        dataname, link_type, m, k)
    np.save(cen_oname, clf.centroids_)
    print("[{}] saved to {}".format(trial_num, cen_oname))

    dev_labels = clf.predict(dev_X)
    sse = calculate_sse(clf.centroids_, dev_X, dev_labels)
    print("[{}] Sum Squared Error: {}".format(trial_num, sse))

    # id -> cluster number for the dev items
    dev_id2i = {rid: dev_labels[eid] for rid, eid in dev_Y.items()}
    dev_oname = '../../resources/topicreps/{}_{}_{}_{}-dev.labels.pkl'.format(
        dataname, link_type, m, k)
    with open(dev_oname, 'wb') as dev_file:
        pickle.dump(dev_id2i, dev_file)
    print("[{}] saved to {}".format(trial_num, dev_oname))
    print()
    return sse
def classification(train_img, train_label, test_img, distance):
    """
    Train a nearest centroid classifier and label the test images.

    :param train_img: feature vector of training images
    :param train_label: labels of training images
    :param test_img: feature vector of test images
    :param distance: metric name, passed to ``NearestCentroid`` and reused
        for the distance computation
    :return: predicted labels of the test images and the distances of all
        test feature vectors to the centroids
    """
    model = NearestCentroid(metric=distance)
    model.fit(train_img, train_label)
    labels = model.predict(test_img)
    distances = pairwise_distances(test_img, model.centroids_,
                                   metric=model.metric)
    return labels, distances
def nc_fit(Xtrain, Xtest, Xtrain_lbls, Xtest_lbls, name, data, t0=None):
    """Fit a nearest-centroid classifier and print a timing/score line.

    :param name: label printed in the report line.
    :param data: dataset description printed in the report line.
    :param t0: reference time for the elapsed-time column; defaults to the
        moment of the call.
    :return: the predictions for ``Xtest``.
    """
    # A ``t0=time()`` default is evaluated once, when the function is
    # defined, so elapsed time would be counted from import, not from the call.
    if t0 is None:
        t0 = time()

    # Create a nearest centroid classifier and train it.
    clf = NearestCentroid()
    clf.fit(Xtrain, Xtrain_lbls)

    # Predictions for the test data.
    y_pred_test = clf.predict(Xtest)

    # Mean accuracy on the test data.
    score = clf.score(Xtest, Xtest_lbls)
    print('%-9s\t%.2fs\t%-9s\t%-9s' % (name, (time() - t0), score, data))
    return y_pred_test
def centroid_knn(data, ref_data, label, ref_label, using_boostrap=False, output_mode=0):
    """Label ``data`` with a nearest-centroid model fitted on reference data.

    The predictions and ``label`` are handed to ``bootstrap`` when
    ``using_boostrap`` is true and to ``no_bootstrap`` otherwise, together
    with ``output_mode``. Nothing is returned.
    """
    model = NearestCentroid()
    model.fit(ref_data, ref_label)
    predicted = model.predict(data)

    report = bootstrap if using_boostrap else no_bootstrap
    report(predicted, label, output_mode)
class NCClassifier(Classifier):
    """Rocchio classifier"""

    def __init__(self, shrink=None):
        self.cl = NearestCentroid(shrink_threshold=shrink)
        self.shrink = shrink

    def retrain(self, vectorFeature, vectorTarget):
        """Fit the classifier.

        With a shrink threshold set, every feature vector is converted to a
        dense row first; otherwise the base class implementation is used.
        """
        if self.shrink is not None:
            self.cl.fit([v.toarray()[0] for v in vectorFeature], vectorTarget)
        else:
            super(NCClassifier, self).retrain(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        """Return the predicted label for one vectorized sample."""
        if self.shrink is not None:
            # predict() expects a 2-D input: wrap the single dense row in a
            # list, mirroring the row layout used in retrain(). Passing the
            # bare 1-D row is rejected by current scikit-learn.
            return self.cl.predict([vectorizedTest.toarray()[0]])[0]
        else:
            return super(NCClassifier, self).classify(vectorizedTest)
class NCClassifier(Classifier):
    """Rocchio classifier"""

    def __init__(self, shrink=None):
        self.cl = NearestCentroid(shrink_threshold=shrink)
        self.shrink = shrink

    def retrain(self, vectorFeature, vectorTarget):
        """Fit the classifier.

        When a shrink threshold is configured, each feature vector is turned
        into a dense row before fitting; without one the work is delegated
        to the base class.
        """
        if self.shrink is not None:
            self.cl.fit([v.toarray()[0] for v in vectorFeature], vectorTarget)
        else:
            super(NCClassifier, self).retrain(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        """Return the predicted label for one vectorized sample."""
        if self.shrink is not None:
            # A single sample must still be 2-D for predict(): wrap the dense
            # row in a list, as retrain() does for its rows. The bare 1-D row
            # is rejected by current scikit-learn.
            return self.cl.predict([vectorizedTest.toarray()[0]])[0]
        else:
            return super(NCClassifier, self).classify(vectorizedTest)
def ROCCHIO(FeatureMatrix, Labels):
    """Evaluate a nearest-centroid model on a random 90/10 split.

    :return: ``(expected, predicted)``: the true and the predicted labels of
        the held-out 10 percent.
    """
    # The values are unused, but the unpacking fails for input whose shape
    # does not have exactly two entries.
    _n_samples, _n_features = FeatureMatrix.shape

    split = train_test_split(FeatureMatrix, Labels, test_size=0.1)
    train_X, test_X, train_y, test_y = split

    # training model on dataset
    model = NearestCentroid()
    model.fit(train_X, train_y)

    # testing model on dataset
    return (test_y, model.predict(test_X))
def __test_epoch_cluster(self):
    """Evaluate the embeddings with a nearest-centroid classifier.

    Embeddings are extracted from the evaluation train and test loaders; a
    ``NearestCentroid`` model fitted on the former labels the latter.

    :return: ``(classification_report, classification_metrics,
        confusion_matrix)`` computed on the test targets, the metrics being
        macro-averaged precision/recall/F-score/support.
    """
    train_embeddings, train_targets = self.__extract_embeddings(
        self.eval_train_loader)
    test_embeddings, test_targets = self.__extract_embeddings(
        self.eval_test_loader)

    centroid_model = NearestCentroid()
    centroid_model.fit(train_embeddings, train_targets)
    predictions = centroid_model.predict(test_embeddings)

    report = sklearn.metrics.classification_report(
        test_targets, predictions, target_names=['Open', 'Closed'])
    macro_metrics = sklearn.metrics.precision_recall_fscore_support(
        test_targets, predictions, average='macro')
    confusion = sklearn.metrics.confusion_matrix(test_targets, predictions)
    return report, macro_metrics, confusion
def predict(self, user_id):
    """Train on a user's feature file and predict that user's test labels.

    Reads the train and test feature CSV files of ``user_id``, fits a
    ``NearestCentroid`` model on every column but the last against the
    ``label`` column, shows a normalised confusion matrix, pickles the
    predictions to a fixed path and returns them.
    """
    train_set = pd.read_csv(
        f'../FileCenter/FeaturesPerUser/user{user_id}_train_features.csv')
    test_set = pd.read_csv(
        f'../FileCenter/FeaturesPerUser/user{user_id}_test_features.csv')

    # Features are all columns except the last one.
    model = NearestCentroid()
    train_features = train_set.iloc[:, :-1]
    model.fit(train_features, train_set['label'])

    test_features = test_set.iloc[:, :-1]
    plot_confusion_matrix(model, test_features, test_set['label'],
                          normalize='true')
    plt.show()

    predicted = model.predict(test_features)
    output_path = '../FileCenter/classifiers_predictions/predicted_NearestCentroid'
    with open(output_path, 'wb') as fp:
        pickle.dump(predicted, fp)
    return predicted
def make_vern_points(self, X, y):
    """Build a 2-D projection and a prediction grid for plotting.

    Features are selected with ``SelectKBest``, projected onto two principal
    components, and a nearest-centroid model fitted on that projection
    against the binned targets is evaluated on a 50 x 50 grid around the data.

    :return: ``(pca_trans, biny, xx, yy, Z)``: projected data, binned
        targets, the grid coordinates and the grid predictions.
    """
    sel = SelectKBest(score_func=normalized_mutual_info_score_scorefunc)
    sdata = sel.fit_transform(X, y)
    # %s formatting reproduces the output of the former Python 2
    # ``print a, b`` statement and is valid on Python 3 as well.
    print("%s %s" % (X.shape, sdata.shape))

    pca = PCA(n_components=2)
    pca_trans = pca.fit_transform(sdata)

    biny = self.bin_data(y)
    pred = NearestCentroid().fit(pca_trans, biny)

    # Grid extends one unit beyond the projected data on every side.
    x_min, x_max = pca_trans[:, 0].min() - 1, pca_trans[:, 0].max() + 1
    y_min, y_max = pca_trans[:, 1].min() - 1, pca_trans[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50),
                         np.linspace(y_min, y_max, 50))
    Z = pred.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    return pca_trans, biny, xx, yy, Z
# Split the faces into a training part and a test part using `ind`.
# NOTE(review): np.negative(ind) looks intended as "everything not selected by
# ind", which only holds for a boolean mask, and NumPy 1.13+ raises TypeError
# when negating booleans -- confirm the type of `ind`.
train_images = faces.images[np.negative(ind)]
train_targets = faces.target[np.negative(ind)]
n_train = len(train_images)
test_images = faces.images[ind]
test_targets = faces.target[ind]
n_tests = len(test_images)

# NOTE(review): the nesting of this loop was reconstructed from a flattened
# line -- confirm. `test` is rebound to a new array on its first statement,
# so the edits below do not appear to reach `test_images`.
for test in test_images:
    # Add noise drawn from norm.rvs with scale 10.
    test = test + norm.rvs(scale=10, size=test.shape)
    # Blank out rows and columns 25..29.
    for i in range(25, 30):
        test[i, :] = 0
        test[:, i] = 0
    # Clip the values to the range [0, 1].
    test = np.minimum(test, 1)
    test = np.maximum(test, 0)
    test = np.zeros(test.shape)

# Flatten every image to one row and fit the PCA on the training rows only.
train = train_images.reshape((n_train, -1))
train_pca = pca.fit_transform(train)
test = test_images.reshape((n_tests, -1))
test_pca = pca.transform(test)

# Count the test samples whose predicted target differs from the true one,
# first on the raw pixels ("Sans ACP"), then on the PCA projection
# ("Avec ACP").
neigh = NearestCentroid()
neigh.fit(train, train_targets)
print("Sans ACP :", np.count_nonzero(neigh.predict(test) - test_targets), n_tests)
neigh.fit(train_pca, train_targets)
print("Avec ACP :", np.count_nonzero(neigh.predict(test_pca) - test_targets), n_tests)
# Load the iris data and keep two features so the result can be drawn in 2-D.
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target

h = 0.02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

# One model, one printed training accuracy and one figure per threshold.
for shrinkage in [None, 0.1]:
    # we create an instance of the nearest-centroid classifier and fit the data.
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    # Fraction of training samples that are predicted correctly.
    print(shrinkage, np.mean(y == y_pred))
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure()
    pl.pcolormesh(xx, yy, Z, cmap=cmap_light)
    # Plot also the training points
    pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
clf5=RandomForestClassifier(n_estimators=100) #RandomForest Classifier clf5.fit(X_train, y_train) pred = clf5.predict(X_test) writeToDisk(pred,"RandomForestClassifier") clf6=Pipeline([('feature_selection', #LinearSVC with L2-based feature selection LinearSVC(penalty="l2", dual=False, tol=1e-3)), ('classification', LinearSVC())]) clf6.fit(X_train, y_train) pred = clf6.predict(X_test) writeToDisk(pred,"LinearSVC") clf7=NearestCentroid() #NearestCentroid (aka Rocchio classifier), no threshold clf7.fit(X_train, y_train) pred = clf7.predict(X_test) writeToDisk(pred,"NearestCentroid") clf8=SVC(C=1.0, class_weight=None, coef0=0.0, #SVC decision_function_shape=None, degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001, verbose=False) clf8.fit(X_train, y_train) pred = clf8.predict(X_test) writeToDisk(pred,"SVC") ''' clf9=VotingClassifier(estimators=[ ('Ridge',clf1),('MultiNB',clf2),('BernNB',clf3),('KNN',clf4), ('RF',clf5),('LinearSVC',clf6),('NearC',clf7),('SVC',clf8) ],voting='soft')
y_test = labels[272:,i] else: X_train = training y_train = labels[:172,i] X_test = sampletest y_test = labels[172:,i] posterior = np.empty([100,72,6]) box = np.zeros([6,6]) for j in range(4,5): for k in range(1,2): accuracy = np.zeros(100) for m in range(0,100): ncc = NearestCentroid() ncc.fit(X_train, y_train) y_pred = ncc.predict(X_test) n=0 for i in range(0,len(y_pred)): if y_pred[i] == y_test[i]: #print i, y_pred[i], y_test[i] n = n+1 accuracy[m] = accuracy[m]+1 box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1 #posterior[m] = knc.predict_proba(X_test) print j, k, np.mean(accuracy)/0.72, np.std(accuracy)/0.72 #print 30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0 ''' means = np.empty([72,6]) stds = np.empty([72,6]) grid = np.empty([6,6])
def test_iris():
    # Consistency check on iris: the training accuracy has to exceed 0.9
    # for both supported metrics.
    for metric in ('euclidean', 'cosine'):
        model = NearestCentroid(metric=metric)
        fitted = model.fit(iris.data, iris.target)
        hits = fitted.predict(iris.data) == iris.target
        score = np.mean(hits)
        assert score > 0.9, "Failed with score = " + str(score)
def test_precomputed():
    # With metric="precomputed", predict() is given the distances between
    # the test points and the fitted centroids instead of raw features.
    model = NearestCentroid(metric="precomputed")
    model.fit(X, y)
    distances_to_centroids = pairwise_distances(T, model.centroids_)
    predicted = model.predict(distances_to_centroids)
    assert_array_equal(predicted, true_result)
# Nearest Centroid from sklearn import datasets from sklearn import metrics from sklearn.neighbors import NearestCentroid # load the iris datasets dataset = datasets.load_iris() # fit a nearest centroid model to the data model = NearestCentroid() model.fit(dataset.data, dataset.target) print(model) # make predictions expected = dataset.target predicted = model.predict(dataset.data) # summarize the fit of the model print(metrics.classification_report(expected, predicted)) print(metrics.confusion_matrix(expected, predicted))