def train_with(self, training_data_list, answers):
    """Fit the estimator selected by ``self.model_name`` and keep it.

    The raw training data is converted with ``self.get_sparse_matrix``.
    If that helper returns ``False`` nothing is trained. Likewise an
    unrecognised ``self.model_name`` leaves ``self.model`` untouched.

    :param training_data_list: raw training samples, passed to
        ``self.get_sparse_matrix``
    :param answers: target labels handed to the estimator's ``fit``
    """
    features = self.get_sparse_matrix(training_data_list)
    if features is False:
        return

    name = self.model_name
    if name == "random_forest":
        estimator = RandomForestClassifier(n_estimators=100)
    elif name == "centroid_prediction":
        estimator = NearestCentroid()
    elif name == "linearSVC":
        estimator = LinearSVC()
    elif name == "nearest_neighbor":
        estimator = KNeighborsClassifier()
    elif name == "decision_tree":
        estimator = tree.DecisionTreeClassifier()
    elif name == "svc":
        estimator = svm.SVC()
    else:
        return

    # Only these two models are fed the matrix as returned by the helper;
    # every other model receives a densified copy.
    if name in ("centroid_prediction", "svc"):
        fit_input = features
    else:
        fit_input = features.todense()
    self.model = estimator.fit(fit_input, answers)
def _nearestcentroid(*, train, test, x_predict=None, metrics, metric='euclidean', shrink_threshold=None): """ For more info visit : https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestCentroid.html#sklearn.neighbors.NearestCentroid """ model = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold) model.fit(train[0], train[1]) model_name = 'Nearest Centroid' y_hat = model.predict(test[0]) if metrics == 'accuracy': accuracy = accuracy_score(test[1], y_hat) if metrics == 'f1': accuracy = f1_score(test[1], y_hat) if metrics == 'jaccard': accuracy = jaccard_score(test[1], y_hat) if x_predict is None: return (model_name, accuracy, None) y_predict = model.predict(x_predict) return (model_name, accuracy, y_predict)
class NC:
    """Small wrapper around a ``NearestCentroid`` estimator."""

    def __init__(self):
        self.clf = NearestCentroid()
        self.centroids = []

    def fit(self, train_data, train_lbls):
        """Fit the wrapped estimator and cache its centroids.

        param:
            @train_data: training data
            @train_lbls: training labels
        returns:
            this instance, so calls can be chained
        """
        self.clf.fit(train_data, train_lbls)
        self.centroids = self.clf.centroids_
        return self

    def predict(self, test_data, test_lbls):
        """Classify test data with the fitted estimator.

        param:
            @test_data: testing data
            @test_lbls: testing labels
        returns:
            @classification: predicted labels
            @score: accuracy of the predictions, or None when
                ``accuracy_score`` raises ValueError
        """
        predicted = self.clf.predict(test_data)
        try:
            return predicted, accuracy_score(test_lbls, predicted)
        except ValueError:
            return predicted, None
def nearest_mean_classifier(X_train, y_train, X_validation, X_test):
    """Predict validation and test labels with a nearest-mean classifier.

    Input:
        X_train      - num_train x num_features matrix with training features
        y_train      - num_train x 1 vector with training labels
        X_validation - matrix with features for the validation data
        X_test       - matrix with features for the test data
    Output:
        y_pred_validation - predicted labels for the validation data
        y_pred_test       - predicted labels for the test data
                            (marked "not in use" in the original comments)
    """
    n_validation = len(X_validation)

    # Stack both sets so a single predict call covers them.
    stacked = np.vstack((X_validation, X_test))

    model = NearestCentroid()
    model.fit(X_train, y_train)  # determine the class means
    predictions = model.predict(stacked)

    # Split the predictions back into their validation / test parts.
    return predictions[:n_validation], predictions[n_validation:]
def test_classification_toy():
    # Check classification on a toy dataset, including sparse versions.
    scenarios = (
        (X, T),                           # dense fit, dense predict
        (X_csr, T_csr),                   # sparse fit, sparse predict
        (X_csr, T),                       # sparse fit, dense predict
        (X, T_csr),                       # dense fit, sparse predict
        (X_csr.tocoo(), T_csr.tolil()),   # non-CSR sparse formats
    )
    for fit_input, predict_input in scenarios:
        # A fresh estimator per scenario, as in the unrolled version.
        model = NearestCentroid()
        model.fit(fit_input, y)
        assert_array_equal(model.predict(predict_input), true_result)
class AlgorithmRunner:
    """Uniform fit/predict wrapper around a classifier chosen by name."""

    def __init__(self, algo_name):
        """Initialize the algorithm.

        :param algo_name: the name of the current algorithm, either
            "KNN" or "Rocchio"
        :raises ValueError: if ``algo_name`` is not a supported name
        """
        if algo_name == "KNN":
            self.algorithm = KNeighborsClassifier(n_neighbors=10)
        elif algo_name == "Rocchio":
            self.algorithm = NearestCentroid()
        else:
            # Previously this left ``self.algorithm`` unset and the error
            # only appeared later as an AttributeError in fit/predict.
            raise ValueError(
                "Unknown algorithm name {!r}; expected 'KNN' or 'Rocchio'".format(algo_name))

    def fit(self, train_features, train_labels):
        """Call the fit method of the wrapped estimator.

        :param train_features: the features that we train on
        :param train_labels: the labels for the features that we train on
        """
        self.algorithm.fit(train_features, train_labels)

    def predict(self, test_features):
        """Call the predict method of the wrapped estimator.

        :param test_features: the features that we test on
        :return: whatever the wrapped estimator's ``predict`` returns
        """
        return self.algorithm.predict(test_features)
def plot_mnist_centroids(data, labels, title="", fp="", draw=False):
    """Plot the per-class mean vectors as 28 x 28 gray scale images.

    :param data: samples; each centroid is reshaped to (28, 28), so rows
        are expected to hold 784 values
    :param labels: class label of every sample
    :param title: figure super-title
    :param fp: file path to save the figure to; nothing is saved when ""
    :param draw: when true, ``plt.draw()`` is called at the end
    """
    # Calculate mean vector of each class
    clf = NearestCentroid()
    clf.fit(data, labels)

    # https://stackoverflow.com/questions/37228371/visualize-mnist-dataset-using-opencv-or-matplotlib-pyplot
    plt.figure()
    plt.suptitle(title, fontsize=14)
    # Pair each centroid with the estimator's own class list: the order of
    # ``set(labels)`` is not guaranteed to match the rows of ``centroids_``.
    for i, (label, class_center) in enumerate(zip(clf.classes_, clf.centroids_)):
        pixels = np.array(class_center, dtype='uint8')

        # Reshape the array into 28 x 28 array (2-dimensional array)
        pixels = pixels.reshape((28, 28))

        # Plot each mean vector as a gray scale image in a subplot
        plt.subplot(2, 5, i + 1)
        plt.title('Label: {label}'.format(label=label))
        plt.imshow(pixels, cmap='gray')
        # Booleans instead of the legacy 'off' strings, which current
        # matplotlib no longer interprets as "disabled".
        plt.tick_params(which='both', bottom=False, left=False,
                        labelbottom=False, labelleft=False)

    if fp != "":
        plt.savefig(fp, bbox_inches='tight', pad_inches=0)

    if draw:
        plt.draw()
def plot_orl_centroids(data, labels, title="", fp="", draw=False):
    """Plot the per-class mean vectors as gray scale images.

    Each centroid is reshaped to (30, 40) and transposed before display.

    :param data: samples; rows are expected to hold 1200 values
    :param labels: class label of every sample
    :param title: figure super-title
    :param fp: file path to save the figure to; nothing is saved when ""
    :param draw: when true, ``plt.draw()`` is called at the end
    """
    # Calculate mean vector of each class
    clf = NearestCentroid()
    clf.fit(data, labels)

    plt.figure(figsize=(18, 12))
    plt.suptitle(title, fontsize=14)
    # Pair each centroid with the estimator's own class list: the order of
    # ``set(labels)`` is not guaranteed to match the rows of ``centroids_``.
    for i, (label, class_center) in enumerate(zip(clf.classes_, clf.centroids_)):
        pixels = np.array(class_center, dtype='float')

        # Reshape the array into 30 x 40 array (2-dimensional array)
        pixels = pixels.reshape((30, 40)).transpose()  # image vectors are sideways

        # Plot each mean vector as a gray scale image in a subplot
        plt.subplot(4, 10, i + 1)
        plt.title('Label: {label}'.format(label=label))
        plt.imshow(pixels, cmap='gray')
        # Booleans instead of the legacy 'off' strings, which current
        # matplotlib no longer interprets as "disabled".
        plt.tick_params(which='both', bottom=False, left=False,
                        labelbottom=False, labelleft=False)

    if fp != "":
        plt.savefig(fp, bbox_inches='tight', pad_inches=0)

    if draw:
        plt.draw()
def NC(data_train, data_train_vectors, data_test_vectors, **kwargs):
    """Classify the test vectors with a NearestCentroid model.

    The model is fitted on ``data_train_vectors`` with
    ``data_train.target`` as labels. Extra keyword arguments are accepted
    but not used.

    :return: predicted labels for ``data_test_vectors``
    """
    model = NearestCentroid()
    return model.fit(data_train_vectors, data_train.target).predict(data_test_vectors)
def agglomerative_validation(self, iterations):
    """Run repeated hold-out validation of the agglomerative clustering.

    For every iteration ``self.gene_df`` is split 80/20 (seeded with the
    iteration index). Both parts are clustered independently with ward
    linkage. A NearestCentroid model fitted on the training clusters then
    labels the held-out samples, and those labels are compared with the
    held-out clustering via the adjusted Rand score. The mean and variance
    of the scores are printed.

    :param iterations: number of random splits to evaluate
    """
    print('\nPerforming holdout validation for agglomerative clusterer...')
    rands = []
    for seed in range(iterations):
        X_train, X_test = train_test_split(self.gene_df, test_size=0.2, random_state=seed)

        # Ward linkage only works with Euclidean distance, which is the
        # default. The explicit ``affinity`` keyword is omitted because it
        # was renamed in newer scikit-learn releases.
        agglomerative_training = cluster.AgglomerativeClustering(
            n_clusters=self.n_clusters, linkage='ward').fit(X_train)
        agglomerative_testing = cluster.AgglomerativeClustering(
            n_clusters=self.n_clusters, linkage='ward').fit(X_test)

        # classify testing data using centroids of training data clusters
        clf = NearestCentroid()
        clf.fit(X_train, agglomerative_training.labels_)

        # rand score between prediction labels and clustering labels of
        # the held out samples
        rands.append(
            adjusted_rand_score(clf.predict(X_test), agglomerative_testing.labels_))

    print('average of rand scores', sum(rands) / len(rands))
    print('variance of rand scores', statistics.variance((rands)))
def nsc_fit(Xtrain, Xtrain_lbls, Xtest, Xtest_lbls, n_clust, rng, start, name, datat, t0=None):
    """Nearest sub-class centroid classification.

    Every class is clustered with KMeans into ``n_clust`` sub-classes. A
    NearestCentroid model is then fitted on the cluster centres, each
    labelled ``"<class>_<cluster>"``, and used to classify ``Xtest``.
    One result line (name, elapsed seconds, accuracy, ``datat``) is printed.

    :param Xtrain: training samples (indexed with a label mask)
    :param Xtrain_lbls: training labels
    :param Xtest: test samples
    :param Xtest_lbls: true test labels, used for the printed accuracy
    :param n_clust: number of KMeans clusters per class
    :param rng: exclusive upper bound of the class ids
        (per the original comments: MNIST has 10 classes, ORL has 40)
    :param start: first class id (MNIST starts at 0, ORL at 1)
    :param name: label printed in the result line
    :param datat: data set description printed in the result line
    :param t0: start timestamp for the elapsed-time report; defaults to
        the moment of the call
    :return: the raw ``"<class>_<cluster>"`` predictions for ``Xtest``
    """
    # The old default ``t0=time()`` was evaluated once at definition time,
    # so the reported duration counted from module import.
    if t0 is None:
        t0 = time()

    centers = []
    labels = []

    # Cluster the data of every class separately.
    for i in range(start, rng):
        data = Xtrain[np.nonzero(Xtrain_lbls == i)]
        kmeans = KMeans(n_clusters=n_clust, random_state=42).fit(data)
        # Collect the centres for the amount of clusters specified.
        for k in range(n_clust):
            centers.append(kmeans.cluster_centers_[k, :])
            labels.append(str(i) + '_' + str(k))

    # Fit with nearest centroid
    clf = NearestCentroid()
    clf.fit(centers, labels)
    pred = clf.predict(Xtest)

    # Strip the sub-cluster suffix to recover the class id.
    correct_labels = [int(label.split('_')[0]) for label in pred]

    # Calculate score
    score = accuracy_score(Xtest_lbls, correct_labels)
    print('%-9s\t%.2fs\t%-9s\t%-9s' % (name, (time() - t0), score, datat))
    return pred
def calc_cluster_props(self):
    """Calculate cluster properties.

    Returns
    -------
    centroid
        Row 1 of the fitted centroids, mapped back to the original
        (unscaled) feature space.

    Notes
    -----
    Add column `silh` to DataFrame.
    """
    scaler = StandardScaler()
    scaled = scaler.fit_transform(self.df[self.relevant_groups()])

    # calculate silhouette scores for all samples
    self.df['silh'] = silhouette_samples(scaled, self.df['hgt'].tolist())

    # calculate centroid
    model = NearestCentroid()
    model.fit(scaled, self.df['hgt'])
    # NOTE(review): index 1 presumably selects the class of interest in
    # `hgt` -- confirm against the caller.
    return scaler.inverse_transform(model.centroids_[1])
def test_classification_toy():
    """Check classification on a toy dataset, including sparse versions."""

    def check(fit_input, predict_input):
        # Fit a fresh estimator and compare its predictions with the
        # expected labels.
        model = NearestCentroid()
        model.fit(fit_input, y)
        assert_array_equal(model.predict(predict_input), true_result)

    # Dense fit and predict.
    check(X, T)
    # Same test, but with a sparse matrix to fit and test.
    check(X_csr, T_csr)
    # Fit with sparse, test with non-sparse
    check(X_csr, T)
    # Fit with non-sparse, test with sparse
    check(X, T_csr)
    # Fit and predict with non-CSR sparse matrices
    check(X_csr.tocoo(), T_csr.tolil())
def run_NearestCentroid(X_train, X_test, y_train):
    """Fit a NearestCentroid model and predict the test set.

    :return: tuple ``(y_pred, cents)`` -- the predicted labels for
        ``X_test`` and the fitted ``centroids_``
    """
    model = NearestCentroid()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions, model.centroids_
def Centroid(X_train, Y_train, X_test, Y_test):
    """Tune, fit and evaluate a NearestCentroid classifier.

    ``shrink_threshold`` is selected by grid search with cross validation
    over 100 evenly spaced values in [0, 10]. A fresh model with the best
    value is fitted on the training data and evaluated on the test data.
    Diagnostics are printed along the way.

    :return: tuple ``(clf, test_error, acc_score)``
    """
    # Cross validation over the shrinkage parameter
    search_space = [{'shrink_threshold': np.linspace(0, 10, 100)}]
    search = GridSearchCV(NearestCentroid(), search_space)
    search.fit(X_train, Y_train)

    # Optimal parameters
    print('Best Params: ')
    print(search.best_params_)

    # Optimal model
    best_threshold = search.best_params_['shrink_threshold']
    clf = NearestCentroid()
    clf.set_params(shrink_threshold=best_threshold)
    clf.fit(X_train, Y_train)

    pred = clf.predict(X_test)
    test_error = mean_squared_error(Y_test, pred)
    acc_score = accuracy_score(Y_test, pred)

    print('Nearest Centroid Test Error: ' + str(test_error))
    print('Nearest Centroid Accuracy Score: ' + str(acc_score))
    print('First 10 predictions: ')
    print(pred[:10])
    print('First 10 actual: ')
    print(Y_test[:10])
    # NOTE(review): only the first centroid is printed under this heading.
    print('Centroid of each class: ')
    print(clf.centroids_[0])
    print('Class labels known to the classifier: ')
    print(clf.classes_)
    return clf, test_error, acc_score
def print_accuracy(test_features, control_group, folds, classifiers):
    """Train seven classifiers and report their train/test accuracy.

    The data is split with ``train_test_split`` (seeded by ``folds``),
    scaled with a ``MinMaxScaler`` fitted on the training part, and then
    used to fit, in this order: logistic regression, decision tree,
    k-nearest neighbours, Gaussian naive Bayes, LDA, SVC and nearest
    centroid.

    :param test_features: feature matrix
    :param control_group: target labels
    :param folds: value used as ``random_state`` for the split
    :param classifiers: sequence whose length selects how many of the
        seven scores (in the order above) are returned
    :return: ``(training_sets, test_sets)``, two tuples of floats
    :raises IndexError: if ``classifiers`` has more than seven entries
    """
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.neighbors import NearestCentroid

    x_train, x_test, y_train, y_test = train_test_split(
        test_features, control_group, random_state=folds)

    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    logreg = LogisticRegression()
    logreg.fit(x_train, y_train)
    clf2 = DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train, y_train)
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    lda = LinearDiscriminantAnalysis()
    lda.fit(x_train, y_train)
    svm = SVC()
    svm.fit(x_train, y_train)
    cent = NearestCentroid()
    cent.fit(x_train, y_train)

    fitted_models = (logreg, clf2, knn, gnb, lda, svm, cent)

    def get_accuracy(x, y):
        # One score per fitted model, in the fixed order above.
        return tuple(model.score(x, y) for model in fitted_models)

    # Score each split once; the old loop re-scored all seven models
    # twice per iteration just to pick a single entry.
    train_scores = get_accuracy(x_train, y_train)
    test_scores = get_accuracy(x_test, y_test)

    wanted = range(len(classifiers))
    training_sets = tuple(float(train_scores[i]) for i in wanted)
    test_sets = tuple(float(test_scores[i]) for i in wanted)
    return (training_sets, test_sets)
def test_shrinkage_threshold_decoded_y():
    # Centroids must not depend on how the class labels are encoded:
    # fitting with -1 relabelled as 0 has to give the same centroids as
    # fitting with the original labels.
    model = NearestCentroid(shrink_threshold=0.01)

    relabelled = np.asarray(y)
    relabelled[relabelled == -1] = 0
    centroids_relabelled = model.fit(X, relabelled).centroids_

    model.fit(X, y)
    assert_array_equal(centroids_relabelled, model.centroids_)
def nearestcentrclassif(self, shrinkage=0.1):
    """Fit a NearestCentroid model on the training split and predict.

    Prints the fraction of ``self.y_test`` entries equal to the prediction.

    :param shrinkage: value passed as ``shrink_threshold``
    :return: predicted labels for ``self.x_test``
    """
    model = NearestCentroid(shrink_threshold=shrinkage)
    model.fit(self.x_train, self.y_train)
    predicted = model.predict(self.x_test)
    hit_rate = np.mean(self.y_test == predicted)
    print(hit_rate)
    return predicted
def test_should_recognise_mean_classifier(self):
    """A fitted NearestCentroid must be recognised as a MEAN classifier."""
    # given
    fitted = NearestCentroid()
    fitted.fit(self.X, self.y)
    expected_type = ClassifLibrary.ClfType.MEAN

    # when
    detected_type = ClassifLibraryOld.determine_clf_type(fitted)

    # then
    self.assertEqual(detected_type, expected_type)
def test_manhattan_metric():
    # Test the manhattan metric: dense and sparse input must produce the
    # same centroids, and those must match the expected values.
    model = NearestCentroid(metric='manhattan')

    from_dense = model.fit(X, y).centroids_
    from_sparse = model.fit(X_csr, y).centroids_

    assert_array_equal(from_sparse, from_dense)
    assert_array_equal(from_dense, [[-1, -1], [1, 1]])
def get_centeroids_per_class(features, labels):
    """Return a mapping from class label to that class's centroid.

    :param features: feature matrix used to fit a NearestCentroid model
    :param labels: class label of every sample
    :return: dict ``{class_label: centroid}``
    """
    clf = NearestCentroid()
    clf.fit(features, labels)
    # Read the labels from ``classes_``, which is row-aligned with
    # ``centroids_``. Predicting on the centroids costs an extra pass and
    # could map two classes with coinciding centroids to the same key.
    return dict(zip(clf.classes_, clf.centroids_))
def trainKnn(imagen):
    """Fit a NearestCentroid classifier and attach it to ``imagen``.

    fvp: object generated by computeFeatureVector (read from imagen.fvp)
    NUM: number of training points, taken from imagen.NUM

    The label vector is NUM ones followed by NUM zeros.

    :return: the fitted classifier (also stored via ``imagen.set_clf``)
    """
    features = imagen.fvp
    points = imagen.NUM
    targets = [1] * int(points) + [0] * int(points)

    model = NearestCentroid()
    model.fit(features, targets)
    imagen.set_clf(model)
    return model
def test_nc_classify_with_sklearn(trainingData, trainingLabels, testData, testLabels):
    """Classify ``testData`` with NearestCentroid and report the outcome.

    Warnings raised during the run are suppressed. The predictions are
    handed to ``printCorrectWrong`` together with ``testLabels``.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        features = np.array(trainingData)
        targets = np.array(trainingLabels)
        model = NearestCentroid()
        model.fit(features, targets)
        printCorrectWrong(model.predict(testData), testLabels)
def print_accuracy(test_features, control_group, folds, classifiers):
    """Train seven classifiers and report their train/test accuracy.

    The data is split 60/40 (seeded by ``folds``), scaled with a
    ``MinMaxScaler`` fitted on the training part, and then used to fit, in
    this order: logistic regression, decision tree, k-nearest neighbours,
    Gaussian naive Bayes, LDA, SVC and nearest centroid.

    :param test_features: feature matrix
    :param control_group: target labels
    :param folds: value used as ``random_state`` for the split
    :param classifiers: sequence whose length selects how many of the
        seven scores (in the order above) are returned
    :return: ``(training_sets, test_sets)``, two tuples of floats
    :raises IndexError: if ``classifiers`` has more than seven entries
    """
    # Split the score sets into training and test variables.
    x_train, x_test, y_train, y_test = train_test_split(
        test_features, control_group, test_size=0.40, random_state=folds)

    # Scale the features (scaler is fitted on the training part only).
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Name the classifiers.
    logreg = LogisticRegression()
    clf2 = DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
    knn = KNeighborsClassifier()
    gnb = GaussianNB()
    lda = LinearDiscriminantAnalysis()
    svm = SVC()
    cent = NearestCentroid()

    # Fit the remaining classifiers.
    logreg.fit(x_train, y_train)
    knn.fit(x_train, y_train)
    gnb.fit(x_train, y_train)
    lda.fit(x_train, y_train)
    svm.fit(x_train, y_train)
    cent.fit(x_train, y_train)

    fitted_models = (logreg, clf2, knn, gnb, lda, svm, cent)

    def get_accuracy(x, y):
        """Get one accuracy score per fitted model, in the order above."""
        return tuple(model.score(x, y) for model in fitted_models)

    # Score each split once; the old loop re-scored all seven models
    # twice per iteration just to pick a single entry.
    train_scores = get_accuracy(x_train, y_train)
    test_scores = get_accuracy(x_test, y_test)

    wanted = range(len(classifiers))
    training_sets = tuple(float(train_scores[i]) for i in wanted)
    test_sets = tuple(float(test_scores[i]) for i in wanted)
    return (training_sets, test_sets)
def assign_mg2k_centroids(X, centroids=None):
    """
    Assigns Mg II k centroids found in the study 'Identifying typical Mg II
    flare spectra using machine learning', by B. Panos et. al. 2018 to the
    Mg II k spectra supplied in X.

    The centroids are assigned using a nearest centroid procedure.

    The spectra in X have to be interpolated to 216 wavelength bins between
    LAMBDA_MIN = 2793.8500976562500 and LAMBDA_MAX = 2799.3239974882454.

    For example::

        X = raster.get_interpolated_image_step(
            step = <step>,
            lambda_min = LAMBDA_MIN,
            lambda_max = LAMBDA_MAX,
            n_breaks = 216
        )

    Parameters
    ----------
    X : numpy.array
        interpolated raster image of shape (_,bins)
    centroids : numpy.array
        If None, the centroids defined in the above study will be used,
        otherwise an array of shape (n_centroids, n_bins) should be passed.
        Important: both the spectra in 'X' and in 'centroids' should be
        constrained to the same wavelength region!

    Returns
    -------
    assigned_mg2k_centroids
        numpy vector with one centroid id per spectrum, i.e. shape
        (X.shape[0],)

    Raises
    ------
    ValueError
        If X and the centroids do not have the same number of bins.
    """
    # load default centroids if no centroids are passed
    if centroids is None:
        centroids = get_mg2k_centroids(bins=X.shape[1])

    # create list of numbered centroid ids
    centroid_ids = list(range(centroids.shape[0]))

    # check whether X comes in the correct dimensions
    if X.shape[1] != centroids.shape[1]:
        # ValueError is a subclass of Exception, so callers catching the
        # previously raised bare Exception keep working.
        raise ValueError(
            "Expecting X to have shape (_,{}). Please interpolate accordingly (More information with 'help(assign_mg2k_centroids)')."
            .format(centroids.shape[1]))

    # create nearest centroid finder instance and fit it
    knc = NearestCentroid()
    knc.fit(X=centroids, y=centroid_ids)

    # predict nearest centroids for the supplied spectra
    # (making sure that X is normalized)
    assigned_mg2k_centroids = knc.predict(normalize(X))

    # return vector of assigned centroids
    return assigned_mg2k_centroids
def test_shrinkage_correct():
    # Ensure that the shrinking is correct.
    # The expected result is calculated by R (pamr),
    # which is implemented by the author of the original paper.
    # (One need to modify the code to output the new centroid in pamr.predict)
    samples = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
    targets = np.array([1, 1, 2, 2, 2])
    expected_centroids = np.array([[0.7787310, 0.8545292],
                                   [2.814179, 2.763647]])

    model = NearestCentroid(shrink_threshold=0.1)
    model.fit(samples, targets)

    np.testing.assert_array_almost_equal(model.centroids_, expected_centroids)
def caculate_metrics(self):
    """Compute and print quality metrics for the current clustering.

    Uses ``self.gene_df`` and ``self.clusterer.labels_`` to compute the
    silhouette score, the Calinski-Harabasz score, a within-cluster score
    (sum of squared distances of every member to its cluster centre), a
    between-cluster score (sum of squared distances between every pair of
    cluster centres) and their ratio. Everything is printed; nothing is
    returned.

    The distance computations use every feature column. The previous
    version hard-coded columns 0 and 1, which is identical for
    two-dimensional data.
    """
    labels = np.asarray(self.clusterer.labels_)

    #-compute silhouette score
    SC = metrics.silhouette_score(self.gene_df, self.clusterer.labels_, metric='euclidean')
    #-compute calinski-harabasz score
    CH = metrics.calinski_harabasz_score(self.gene_df, self.clusterer.labels_)

    # convert dataframe to numpy array
    genes = self.gene_df.to_numpy()

    # get number of clusters
    K = len(dict.fromkeys(self.clusterer.labels_))

    # calculate centroids
    nc = NearestCentroid()
    nc.fit(genes, self.clusterer.labels_)
    centroids = nc.centroids_

    #-compute the within-cluster score
    # assumes labels are 0..K-1 so that row i of the centroids belongs to
    # label i, as the original indexing did -- TODO confirm for clusterers
    # that emit other label values
    within = np.zeros((K))
    for i in range(K):
        # squared distance from each member of cluster i to its centre
        within[i] = np.sum(np.square(genes[labels == i] - centroids[i]))
    WC = np.sum(within)

    #-compute the between-cluster score
    between = np.zeros((K))
    for i in range(K):
        # squared distance from this centre to the remaining centres
        between[i] = np.sum(np.square(centroids[i + 1:K] - centroids[i]))
    BC = np.sum(between)

    #-compute overall clustering score
    score = BC / WC

    #-print results for this value of K
    print('\nCluster metrics:')
    print('K = %d, Within Cluster Score = %.4f, Between Cluster score = %.4f, Overall Cluster Score = %.4f, Silhouette = %f, Calinski-Harabasz = %.4f'
          % (K, WC, BC, score, SC, CH))
def test_features_zero_var():
    # Test that features with 0 variance throw error
    constant_features = np.empty((10, 2))
    constant_features[:, 0] = -0.13725701
    constant_features[:, 1] = -0.9853293

    targets = np.zeros((10))
    targets[0] = 1

    model = NearestCentroid(shrink_threshold=0.1)
    with pytest.raises(ValueError):
        model.fit(constant_features, targets)
def centroids_initialize(self, input: Tensor, labels: Tensor):
    """
    (Re-)initialize the centers based on nearest centroids algorithm

    :param input: sample tensor; moved to CPU, detached and converted to
        a numpy array before fitting
    :param labels: label tensor; converted the same way and flattened
    """
    features = input.cpu().detach().numpy()
    targets = labels.cpu().detach().numpy().ravel()

    model = NearestCentroid()
    model.fit(features, targets)

    # Copy the fitted centroids into the existing weight storage, on the
    # device the weights already live on.
    new_centers = torch.Tensor(model.centroids_).to(self.weight.data.device)
    self.weight.data.copy_(new_centers)
def nearest_class_centroid(images_training, labels_training, images_testing, labels_testing):
    """Classify test images with NearestCentroid in a 2-D PCA space.

    :param images_training: training samples
    :param labels_training: training labels
    :param images_testing: test samples
    :param labels_testing: accepted for interface compatibility; not used
    :return: tuple ``(predictions, centroids, training_images_pca,
        test_images_pca)``
    """
    pca = PCA(n_components=(2))
    training_images_pca = pca.fit_transform(images_training)
    # Project the test images with the components learned from the
    # training set. Re-fitting on the test data (the former
    # ``fit_transform``) put them in a different coordinate system than
    # the centroids.
    test_images_pca = pca.transform(images_testing)

    clf = NearestCentroid()
    clf.fit(training_images_pca, labels_training)
    print("Centoids: \n", clf.centroids_)
    return (clf.predict(test_images_pca), clf.centroids_, training_images_pca, test_images_pca)
def train_nearest_class_centroid_model(traning_set):
    """Train a NearestCentroid model on 2-component PCA features.

    Every element of ``traning_set`` contributes its ``raw_bytes`` as the
    sample and its ``label`` as the target.

    NOTE(review): the fitted PCA object is not returned, so callers cannot
    project new data with the same components -- confirm how predictions
    are made downstream.

    :return: the trained model
    """
    indices = range(len(traning_set))
    samples = [traning_set[idx].raw_bytes for idx in indices]
    targets = [traning_set[idx].label for idx in indices]

    reducer = PCA(n_components=2)
    reduced_samples = reducer.fit_transform(samples)

    # for each class calculate the mean of the class = centroid
    model = NearestCentroid()
    model.fit(reduced_samples, targets)
    return model
def rocchio(X_train, X_test, y_train, y_test,string):
    """Train a Rocchio (NearestCentroid) classifier and print its F1 scores.

    When ``string`` contains "Fold" the fitted model is also saved through
    ``pickles.criarModelo`` under ``"oraculo/" + string``.

    :param y_train: object exposing ``.values.ravel()`` (the labels are
        flattened before fitting)
    :param string: run description used in the printed messages
    """
    model = NearestCentroid()
    model.fit(X_train, y_train.values.ravel())

    # save the model for fold runs
    save_requested = "Fold" in string
    if save_requested:
        pickles.criarModelo(model, "oraculo/" + string)

    y_predito = model.predict(X_test)
    micro = f1_score(y_test, y_predito, average='micro')
    macro = f1_score(y_test, y_predito, average='macro')

    print("O f1Score micro do Rocchio ", string ," é: ",micro)
    print("O f1Score macro do Rocchio ", string ," é: ",macro)
def test_pickle():
    """A pickled and restored estimator must score exactly like the original."""
    import pickle

    # classification
    original = NearestCentroid()
    original.fit(iris.data, iris.target)
    score = original.score(iris.data, iris.target)

    restored = pickle.loads(pickle.dumps(original))
    assert_equal(type(restored), original.__class__)

    score2 = restored.score(iris.data, iris.target)
    assert_array_equal(score, score2,
                       "Failed to generate same score"
                       " after pickling (classification).")
def test_predict_translated_data():
    # Test that NearestCentroid gives same results on translated data
    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)

    def fit_and_predict(samples):
        # Fresh estimator each time so the two runs share no state.
        model = NearestCentroid(shrink_threshold=0.1)
        model.fit(samples, y)
        return model.predict(samples)

    y_init = fit_and_predict(X)
    y_translate = fit_and_predict(X + noise)
    assert_array_equal(y_init, y_translate)
class NCClassifier(Classifier):
    """Rocchio classifier"""

    def __init__(self, shrink=None):
        """Create the wrapped NearestCentroid estimator.

        :param shrink: value passed as ``shrink_threshold``. When it is not
            None, training and classification use dense arrays instead of
            the base-class implementation.
        """
        self.cl = NearestCentroid(shrink_threshold=shrink)
        self.shrink = shrink

    def retrain(self, vectorFeature, vectorTarget):
        """Fit the classifier.

        With shrinkage enabled every feature vector is densified
        (``toarray()[0]``) before fitting; otherwise the base class
        handles training.
        """
        if self.shrink is not None:
            self.cl.fit([v.toarray()[0] for v in vectorFeature], vectorTarget)
        else:
            super(NCClassifier, self).retrain(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        """Return the predicted class for a single vectorized sample."""
        if self.shrink is not None:
            # ``[:1]`` keeps the first row as a 2-D (1, n_features) array.
            # Current scikit-learn rejects the 1-D array the former ``[0]``
            # produced.
            return self.cl.predict(vectorizedTest.toarray()[:1])[0]
        return super(NCClassifier, self).classify(vectorizedTest)
def nearestNeighbour():
    """Plot NearestCentroid decision regions on two iris features.

    A classifier is fitted once without shrinkage and once with
    ``shrink_threshold=0.1``. For each, the training accuracy is printed
    and a figure with the decision regions and the training points is
    created.
    """
    import numpy as np
    import pylab as pl
    from matplotlib.colors import ListedColormap
    from sklearn import datasets
    from sklearn.neighbors import NearestCentroid

    # import some data to play with
    iris = datasets.load_iris()
    X = iris.data[:, :2]  # we only take the first two features. We could
    # avoid this ugly slicing by using a two-dim dataset
    y = iris.target

    h = .02  # step size in the mesh

    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    for shrinkage in [None, 0.1]:
        # we create an instance of the classifier and fit the data.
        clf = NearestCentroid(shrink_threshold=shrinkage)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        # Formatted so the output is the same under Python 2 and 3; the
        # former print statement was a syntax error on Python 3.
        print("%s %s" % (shrinkage, np.mean(y == y_pred)))

        # Plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        pl.figure()
        pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points
        pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        pl.title("3-Class classification (shrink_threshold=%r)" % shrinkage)
        pl.axis('tight')
def create_and_train_model(engine):
    """Train a NearestCentroid review-rating classifier from the database.

    Reads ``review_rating`` and ``review_text`` from ``bf_reviews``, keeps
    reviews longer than 300 characters, and uses at most the first 1000 of
    them. The texts are TF-IDF vectorized (vectorization time and matrix
    shape are printed) and the classifier is fitted on the result.

    :param engine: database connection passed to ``pd.read_sql_query``
    :return: tuple ``(clf, vectorizer)``
    """
    query = "SELECT review_rating, review_text FROM bf_reviews"
    reviews = pd.read_sql_query(query, engine)
    long_reviews = reviews[reviews['review_text'].str.len() > 300].copy()

    texts = long_reviews['review_text'].values[:1000]
    ratings = long_reviews['review_rating'].values[:1000]

    started = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(texts)
    elapsed = time() - started
    print('vectorized in {:.2f} seconds.'.format(elapsed))
    print(X_train.shape)

    clf = NearestCentroid()
    clf.fit(X_train, ratings)
    return clf, vectorizer
def test_iris_shrinkage():
    # Check consistency on dataset iris, when using shrinkage.
    thresholds = [None, 0.1, 0.5]
    for metric in ('euclidean', 'cosine'):
        for shrink_threshold in thresholds:
            model = NearestCentroid(metric=metric,
                                    shrink_threshold=shrink_threshold)
            model = model.fit(iris.data, iris.target)
            predicted = model.predict(iris.data)
            score = np.mean(predicted == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)
def test_disambiguator_store(self):
    """Round-trip an estimator, vocabulary and target names through the store."""
    # Create a silly classifier that disambiguates between "stam" (tree
    # trunk) or "romp" (body trunk) as the Dutch translation of the
    # English noun "trunk"
    lempos = u"trunk/n"
    # FIXME: store_fit() should only accept unicode strings
    target_names = u"stam romp".encode("utf-8").split()
    vocab = u"boom hoofd".split()

    X = np.array([[0,1], [1,0], [0,1], [1,0]])
    y = np.array([1,0,1,0])

    estimator = NearestCentroid()
    estimator.fit(X, y)
    centroids = estimator.centroids_
    score = estimator.score(X, y)

    # Store estimator
    fname = tempfile.NamedTemporaryFile().name
    writer = DisambiguatorStore(fname, "w")
    writer.save_estimator(NearestCentroid())
    writer.save_vocab(vocab)
    writer.store_fit(lempos, estimator)
    writer.save_target_names(lempos, target_names)
    writer.close()

    # Restore estimator
    reader = DisambiguatorStore(fname)
    estimator2 = reader.load_estimator()
    vocab2 = reader.load_vocab()
    reader.restore_fit(lempos, estimator2)
    target_names2 = reader.load_target_names(lempos)
    centroids2 = estimator2.centroids_
    score2 = estimator2.score(X, y)

    # Everything read back must equal what was written.
    assert_array_equal(centroids, centroids2)
    assert target_names == target_names2
    assert vocab == vocab2
    assert score == score2
# Script fragment: compare NearestCentroid on raw pixels versus PCA features.
# `faces`, `ind`, `pca` and `norm` are defined outside this excerpt.
# NOTE(review): the source was flattened onto one line; the extent of the
# `for` loops below was reconstructed and should be confirmed.

# Split images/targets with `ind`; np.negative(ind) selects the training part.
# NOTE(review): presumably `ind` is a boolean mask and this is meant as its
# complement -- confirm, recent NumPy rejects np.negative on booleans.
train_images = faces.images[np.negative(ind)]
train_targets = faces.target[np.negative(ind)]
n_train = len(train_images)
test_images = faces.images[ind]
test_targets = faces.target[ind]
n_tests = len(test_images)

# NOTE(review): the loop only rebinds the local name `test`, and `test` is
# reassigned right after it, so it looks like it has no effect on
# `test_images` -- confirm intent.
for test in test_images:
    test = test + norm.rvs(scale=10, size=test.shape)
    for i in range(25, 30):
        test[i, :] = 0
        test[:, i] = 0
    test = np.minimum(test, 1)
    test = np.maximum(test, 0)
    test = np.zeros(test.shape)

# Flatten every image into one feature row; PCA is fitted on the training
# rows and then applied to the test rows.
train = train_images.reshape((n_train, -1))
train_pca = pca.fit_transform(train)
test = test_images.reshape((n_tests, -1))
test_pca = pca.transform(test)

# Count predictions that differ from the true targets, first on raw pixels
# ("Sans ACP" = without PCA), then on PCA features ("Avec ACP" = with PCA).
neigh = NearestCentroid()
neigh.fit(train, train_targets)
print("Sans ACP :", np.count_nonzero(neigh.predict(test) - test_targets), n_tests)
neigh.fit(train_pca, train_targets)
print("Avec ACP :", np.count_nonzero(neigh.predict(test_pca) - test_targets), n_tests)
X_test = sample2 y_test = labels[272:,i] else: X_train = training y_train = labels[:172,i] X_test = sampletest y_test = labels[172:,i] posterior = np.empty([100,72,6]) box = np.zeros([6,6]) for j in range(4,5): for k in range(1,2): accuracy = np.zeros(100) for m in range(0,100): ncc = NearestCentroid() ncc.fit(X_train, y_train) y_pred = ncc.predict(X_test) n=0 for i in range(0,len(y_pred)): if y_pred[i] == y_test[i]: #print i, y_pred[i], y_test[i] n = n+1 accuracy[m] = accuracy[m]+1 box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1 #posterior[m] = knc.predict_proba(X_test) print j, k, np.mean(accuracy)/0.72, np.std(accuracy)/0.72 #print 30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0 ''' means = np.empty([72,6]) stds = np.empty([72,6])
# import some data to play with iris = datasets.load_iris() X = iris.data[:, :2] # we only take the first two features. We could # avoid this ugly slicing by using a two-dim dataset y = iris.target h = 0.02 # step size in the mesh # Create color maps cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"]) cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"]) for shrinkage in [None, 0.1]: # we create an instance of Neighbours Classifier and fit the data. clf = NearestCentroid(shrink_threshold=shrinkage) clf.fit(X, y) y_pred = clf.predict(X) print(shrinkage, np.mean(y == y_pred)) # Plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, m_max]x[y_min, y_max]. x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1 y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) # Put the result into a color plot Z = Z.reshape(xx.shape) pl.figure() pl.pcolormesh(xx, yy, Z, cmap=cmap_light) # Plot also the training points
# Nearest Centroid
# Fit a nearest centroid model on the iris data and report how well it
# reproduces the training labels.
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import NearestCentroid

# load the iris datasets
dataset = datasets.load_iris()
expected = dataset.target

# fit a nearest centroid model to the data
model = NearestCentroid()
model.fit(dataset.data, expected)
print(model)

# make predictions
predicted = model.predict(dataset.data)

# summarize the fit of the model
report = metrics.classification_report(expected, predicted)
print(report)
confusion = metrics.confusion_matrix(expected, predicted)
print(confusion)
def test_precomputed():
    # Fitting with metric='precomputed' must be rejected with ValueError.
    model = NearestCentroid(metric='precomputed')
    with assert_raises(ValueError):
        model.fit(X, y)
def test_precomputed():
    # Fitting with metric='precomputed' must raise exactly ValueError
    # (the captured exception's type is compared, not just isinstance).
    model = NearestCentroid(metric='precomputed')
    with assert_raises(ValueError) as caught:
        model.fit(X, y)
    raised_type = type(caught.exception)
    assert_equal(ValueError, raised_type)
def get_results(city,no):
    """Score the hotels of *city* and return the *no* best-scoring ones.

    The function runs the preprocessing step, builds TF-IDF features from
    'preprocess1.txt', derives topic labels for the training lines with NMF,
    keeps the best chi2 features, trains three classifiers on them and then
    scores every hotel returned by ``read_hotels`` with ``calculate``.

    Parameters:
        city: passed through to ``read_hotels`` to select the hotels.
        no: number of top-scoring hotels to print and return.

    Returns:
        A list with the ``dic`` entries of the ``no`` highest-scoring hotels,
        best first.

    Side effects:
        Prints the vocabulary, the selected features, the top words of each
        topic and the final recommendations to standard output.
    """
    processing.preprocessing()

    # Context managers close the three input files even when reading fails;
    # previously the handles were left open.
    with open('preprocess1.txt') as pre:
        train_set = pre.readlines()
    with open('positive-words.txt') as pos:
        positive = pos.read().split()
    with open('negative-words.txt') as neg:
        negative = neg.read().split()

    stopWords = stopwords.words('english')
    vectorizer = CountVectorizer(stop_words=stopWords)
    transformer = TfidfTransformer()

    v = vectorizer.fit_transform(train_set)
    tfidf = transformer.fit_transform(v)
    dense_tfidf = tfidf.todense()

    # Every print below uses the call form with a single argument, which
    # produces the same output under Python 2 and also parses under Python 3.
    feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print("\n")

    # Number of top-weighted terms kept per topic: a fifth of the vocabulary.
    l = len(feature_names) // 5

    # First pass: topics over the full vocabulary give the initial labels.
    nmf = decomposition.NMF(n_components=3, init='random',
                            random_state=0).fit(dense_tfidf)
    topic_list = [topic.argsort()[:-l - 1:-1] for topic in nmf.components_]
    train_target = [calculate_Topic(arr, topic_list) for arr in v.toarray()]

    # Keep the 2*l features that score best against those labels.
    ch2 = SelectKBest(chi2, k=l*2)
    X_train = ch2.fit_transform(dense_tfidf, train_target)
    cs = ch2.scores_.argsort()[::-1][:l*2]
    cs_featurenames = [feature_names[x] for x in cs]
    print(cs_featurenames)
    print("\n")

    # The columns of X_train keep the original vocabulary order, not the
    # score order of cs_featurenames, so the topic words must be looked up
    # through the selector's own support indices to get the right labels.
    selected_names = [feature_names[i]
                      for i in ch2.get_support(indices=True)]

    # Second pass: topics over the selected features give the final labels.
    nmf1 = decomposition.NMF(n_components=3, init='random',
                             random_state=0).fit(X_train)
    topic_list = []
    for topic_idx, topic in enumerate(nmf1.components_):
        z = topic.argsort()[:-l - 1:-1]
        topic_list.append(z)
        print("Topic #%d:---------------------------------------" % topic_idx)
        for y in z:
            print(selected_names[y])
    train_target = [calculate_Topic(arr, topic_list) for arr in X_train]

    # Per-label sample counts.
    # NOTE(review): assumes calculate_Topic returns labels in 0..3 - confirm.
    train_count = [0]*4
    for x in train_target:
        train_count[x] += 1

    clf = MultinomialNB()
    clf2 = LinearSVC()
    clf1 = NearestCentroid()
    clf.fit(X_train, train_target)
    clf1.fit(X_train, train_target)
    clf2.fit(X_train, train_target)

    # dic is handed to read_hotels empty and indexed below, so read_hotels
    # presumably fills it with one entry per hotel index.
    dic = {}
    hotels = read_hotels(city, dic)
    temp = [calculate(vectorizer, transformer, train_count, ch2, each,
                      clf, clf1, clf2, positive, negative, train_set)
            for each in hotels]

    # Hotel indices ordered from highest to lowest score.
    temp1 = numpy.array(temp).argsort()[::-1]
    print("Top %d recommendations are as follows[in the FORMAT Index,(Hotel name,Location),Score]:\n" % no)
    res = []
    for g in temp1[:no]:
        print("%s %s %s" % (g, dic[g], temp[g]))
        res.append(dic[g])
    return res
writeToDisk(pred,"KNeighborsClassifier") clf5=RandomForestClassifier(n_estimators=100) #RandomForest Classifier clf5.fit(X_train, y_train) pred = clf5.predict(X_test) writeToDisk(pred,"RandomForestClassifier") clf6=Pipeline([('feature_selection', #LinearSVC with L2-based feature selection LinearSVC(penalty="l2", dual=False, tol=1e-3)), ('classification', LinearSVC())]) clf6.fit(X_train, y_train) pred = clf6.predict(X_test) writeToDisk(pred,"LinearSVC") clf7=NearestCentroid() #NearestCentroid (aka Rocchio classifier), no threshold clf7.fit(X_train, y_train) pred = clf7.predict(X_test) writeToDisk(pred,"NearestCentroid") clf8=SVC(C=1.0, class_weight=None, coef0=0.0, #SVC decision_function_shape=None, degree=3, gamma='auto', kernel='linear', max_iter=-1, probability=False, random_state=1, shrinking=True, tol=0.001, verbose=False) clf8.fit(X_train, y_train) pred = clf8.predict(X_test) writeToDisk(pred,"SVC") ''' clf9=VotingClassifier(estimators=[ ('Ridge',clf1),('MultiNB',clf2),('BernNB',clf3),('KNN',clf4), ('RF',clf5),('LinearSVC',clf6),('NearC',clf7),('SVC',clf8) ],voting='soft')
def test_precomputed():
    """Predict from a precomputed distance matrix and compare to true_result."""
    model = NearestCentroid(metric="precomputed")
    model.fit(X, y)
    # Distances from every sample in T to each fitted centroid.
    distances = pairwise_distances(T, model.centroids_)
    predictions = model.predict(distances)
    assert_array_equal(predictions, true_result)