def nearest_centroid_classifier(X_train, categories, X_test, test_categories):
    from sklearn import metrics
    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid().fit(X_train, categories)
    y_rocchio_predicted = clf.predict(X_test)
    print("\n Here is the classification report for NearestCentroid classifier:")
    print(metrics.classification_report(test_categories, y_rocchio_predicted))
    to_latex(test_categories, y_rocchio_predicted)
def NC(data_train, data_train_vectors, data_test_vectors, **kwargs):
    # Implementing classification model - using NearestCentroid
    clf_nc = NearestCentroid()
    clf_nc.fit(data_train_vectors, data_train.target)
    y_pred = clf_nc.predict(data_test_vectors)
    return y_pred
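# Hedged usage sketch for NC(): data_train is assumed to be a scikit-learn
# "bunch" exposing a .target attribute, and the *_vectors arguments are
# precomputed feature matrices; the 20-newsgroups data below is illustrative,
# not part of the original module.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid

data_train = fetch_20newsgroups(subset='train')
data_test = fetch_20newsgroups(subset='test')
vectorizer = TfidfVectorizer(stop_words='english')
y_pred = NC(data_train,
            vectorizer.fit_transform(data_train.data),
            vectorizer.transform(data_test.data))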
def train_with(self, training_data_list, answers):
    # Put the data in the right format
    training_data = self.get_sparse_matrix(training_data_list)
    if training_data is not False:
        # Make the model
        if self.model_name == "random_forest":
            forest = RandomForestClassifier(n_estimators=100)
            self.model = forest.fit(training_data.todense(), answers)
        elif self.model_name == "centroid_prediction":
            clf = NearestCentroid()
            self.model = clf.fit(training_data, answers)
        elif self.model_name == "linearSVC":
            svc = LinearSVC()  # renamed from SVC to avoid shadowing the class
            self.model = svc.fit(training_data.todense(), answers)
        elif self.model_name == "nearest_neighbor":
            near = KNeighborsClassifier()
            self.model = near.fit(training_data.todense(), answers)
        elif self.model_name == "decision_tree":
            clf = tree.DecisionTreeClassifier()
            self.model = clf.fit(training_data.todense(), answers)
        elif self.model_name == "svc":
            clf = svm.SVC()
            self.model = clf.fit(training_data, answers)
def test_shrinkage_threshold_decoded_y():
    clf = NearestCentroid(shrink_threshold=0.01)
    y_ind = np.asarray(y)
    y_ind[y_ind == -1] = 0
    clf.fit(X, y_ind)
    centroid_encoded = clf.centroids_
    clf.fit(X, y)
    assert_array_equal(centroid_encoded, clf.centroids_)
def test_iris_shrinkage():
    # Check consistency on dataset iris, when using shrinkage.
    for metric in ('euclidean', 'cosine'):
        for shrink_threshold in [None, 0.1, 0.5]:
            clf = NearestCentroid(metric=metric,
                                  shrink_threshold=shrink_threshold)
            clf = clf.fit(iris.data, iris.target)
            score = np.mean(clf.predict(iris.data) == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)
def test_manhattan_metric():
    # Test the manhattan metric.
    clf = NearestCentroid(metric='manhattan')
    clf.fit(X, y)
    dense_centroid = clf.centroids_
    clf.fit(X_csr, y)
    assert_array_equal(clf.centroids_, dense_centroid)
    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
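# Hedged illustration of the behaviour the test above relies on: with
# metric='manhattan', scikit-learn uses the feature-wise median as the class
# centroid instead of the mean. The toy data below is an assumption for
# illustration only.
import numpy as np
from sklearn.neighbors import NearestCentroid

X_demo = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 0.0],
                   [0.0, 3.0], [0.0, 4.0], [0.0, 8.0]])
y_demo = np.array([0, 0, 0, 1, 1, 1])
print(NearestCentroid(metric='euclidean').fit(X_demo, y_demo).centroids_)  # means: [[2, 0], [0, 5]]
print(NearestCentroid(metric='manhattan').fit(X_demo, y_demo).centroids_)  # medians: [[1, 0], [0, 4]]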
def nearest_centroid_classifier(X_train, X_test, y_train, y_test):
    from sklearn import metrics
    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid().fit(X_train, y_train)
    evaluate_cross_validation(clf, X_train, y_train, 5)
    y_rocchio_predicted = clf.predict(X_test)
    print("\n Here is the classification report for NearestCentroid classifier:")
    print(metrics.classification_report(y_test, y_rocchio_predicted))
def test_shrinkage_correct():
    # Ensure that the shrinking is correct.
    # The expected result is calculated by R (pamr),
    # which is implemented by the author of the original paper.
    # (One needs to modify the code to output the new centroid in pamr.predict.)
    X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
    y = np.array([1, 1, 2, 2, 2])
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])
    np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
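# Hedged illustration of what shrink_threshold does: each class centroid is
# pulled toward the overall data centroid by soft-thresholding its
# standardized deviations, so weakly informative features drop out. A sketch
# for intuition, not the pamr reference computation used above.
import numpy as np
from sklearn.neighbors import NearestCentroid

X_demo = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]], dtype=float)
y_demo = np.array([1, 1, 2, 2, 2])
plain = NearestCentroid().fit(X_demo, y_demo).centroids_
shrunk = NearestCentroid(shrink_threshold=0.1).fit(X_demo, y_demo).centroids_
print(plain)   # per-class feature means
print(shrunk)  # shrunken centroids, matching expected_result in the test above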
def test_pickle():
    import pickle
    # classification
    obj = NearestCentroid()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)
    s = pickle.dumps(obj)
    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(iris.data, iris.target)
    assert_array_equal(score, score2,
                       "Failed to generate same score"
                       " after pickling (classification).")
class NCClassifier(Classifier):
    """Rocchio classifier"""

    def __init__(self, shrink=None):
        self.cl = NearestCentroid(shrink_threshold=shrink)
        self.shrink = shrink

    def retrain(self, vectorFeature, vectorTarget):
        if self.shrink is not None:
            self.cl.fit([v.toarray()[0] for v in vectorFeature], vectorTarget)
        else:
            super(NCClassifier, self).retrain(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        if self.shrink is not None:
            return self.cl.predict(vectorizedTest.toarray()[0])[0]
        else:
            return super(NCClassifier, self).classify(vectorizedTest)
def nearestNeighbour():
    import numpy as np
    import pylab as pl
    from matplotlib.colors import ListedColormap
    from sklearn import datasets
    from sklearn.neighbors import NearestCentroid

    n_neighbors = 15

    # import some data to play with
    iris = datasets.load_iris()
    X = iris.data[:, :2]  # we only take the first two features. We could
                          # avoid this ugly slicing by using a two-dim dataset
    y = iris.target

    h = .02  # step size in the mesh

    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

    for shrinkage in [None, 0.1]:
        # we create an instance of the NearestCentroid classifier and fit the data.
        clf = NearestCentroid(shrink_threshold=shrinkage)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        print(shrinkage, np.mean(y == y_pred))

        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        pl.figure()
        pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points
        pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        pl.title("3-Class classification (shrink_threshold=%r)" % shrinkage)
        pl.axis('tight')
def create_and_train_model(engine):
    cmd = "SELECT review_rating, review_text FROM bf_reviews"
    bfdf = pd.read_sql_query(cmd, engine)
    bfdfl = bfdf[bfdf['review_text'].str.len() > 300].copy()
    train_data = bfdfl['review_text'].values[:1000]
    y_train = bfdfl['review_rating'].values[:1000]

    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(train_data)
    duration = time() - t0
    print('vectorized in {:.2f} seconds.'.format(duration))
    print(X_train.shape)

    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    return clf, vectorizer
class BinBasedCluster(BaseEstimator):

    def __init__(self, bins=[0, 0.5, 1] + list(range(5, 36))):
        self.bins = bins

    def fit(self, X, y):
        biny = self.bin_data(y)
        self.pred = NearestCentroid().fit(X, biny)
        return self

    def predict(self, X):
        return self.pred.predict(X)

    def score(self, X, y, is_raw=True):
        clusters = self.pred.predict(X)
        if is_raw:
            return adjusted_rand_score(self.bin_data(y), clusters)
        else:
            return adjusted_rand_score(y, clusters)

    def bin_data(self, y):
        return np.digitize(y, self.bins)

    def make_vern_points(self, X, y):
        sel = SelectKBest(score_func=normalized_mutual_info_score_scorefunc)
        sdata = sel.fit_transform(X, y)
        print(X.shape, sdata.shape)
        pca = PCA(n_components=2)
        pca_trans = pca.fit_transform(sdata)
        biny = self.bin_data(y)
        pred = NearestCentroid().fit(pca_trans, biny)
        x_min, x_max = pca_trans[:, 0].min() - 1, pca_trans[:, 0].max() + 1
        y_min, y_max = pca_trans[:, 1].min() - 1, pca_trans[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50),
                             np.linspace(y_min, y_max, 50))
        Z = pred.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        return pca_trans, biny, xx, yy, Z
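# Minimal usage sketch for BinBasedCluster, assuming continuous targets in the
# 0-35 range the default bin edges were designed for; the synthetic data is
# illustrative only.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 5)
y_demo = rng.uniform(0, 35, 200)
clust = BinBasedCluster().fit(X_demo, y_demo)
print(clust.score(X_demo, y_demo))  # adjusted Rand index vs. the binned targets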
def test_disambiguator_store(self):
    # Create a silly classifier that disambiguates between "stam" (tree
    # trunk) or "romp" (body trunk) as the Dutch translation of the
    # English noun "trunk"
    lempos = u"trunk/n"
    # FIXME: store_fit() should only accept unicode strings
    target_names = u"stam romp".encode("utf-8").split()
    vocab = u"boom hoofd".split()

    X = np.array([[0, 1], [1, 0], [0, 1], [1, 0]])
    y = np.array([1, 0, 1, 0])

    estimator = NearestCentroid()
    estimator.fit(X, y)
    centroids = estimator.centroids_
    score = estimator.score(X, y)

    # Store estimator
    fname = tempfile.NamedTemporaryFile().name
    f = DisambiguatorStore(fname, "w")
    f.save_estimator(NearestCentroid())
    f.save_vocab(vocab)
    f.store_fit(lempos, estimator)
    f.save_target_names(lempos, target_names)
    f.close()

    # Restore estimator
    f2 = DisambiguatorStore(fname)
    estimator2 = f2.load_estimator()
    vocab2 = f2.load_vocab()
    f2.restore_fit(lempos, estimator2)
    target_names2 = f2.load_target_names(lempos)
    centroids2 = estimator2.centroids_
    score2 = estimator2.score(X, y)

    assert_array_equal(centroids, centroids2)
    assert target_names == target_names2
    assert vocab == vocab2
    assert score == score2
def test_predict_translated_data():
    # Test that NearestCentroid gives same results on translated data
    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    y_init = clf.predict(X)
    clf = NearestCentroid(shrink_threshold=0.1)
    X_noise = X + noise
    clf.fit(X_noise, y)
    y_translate = clf.predict(X_noise)
    assert_array_equal(y_init, y_translate)
def select_classifier(X, y, n_splits=10, test_size=0.1, random_state=0, show=True):
    classifiers = [
        AdaBoostClassifier(),
        BaggingClassifier(),
        BernoulliNB(),
        CalibratedClassifierCV(),
        DecisionTreeClassifier(),
        ExtraTreeClassifier(),
        GaussianNB(),
        GaussianProcessClassifier(),
        GradientBoostingClassifier(),
        KNeighborsClassifier(),
        LinearDiscriminantAnalysis(),
        LinearSVC(),
        LogisticRegression(),
        LogisticRegressionCV(),
        MLPClassifier(),
        MultinomialNB(),
        NearestCentroid(),
        NuSVC(),
        PassiveAggressiveClassifier(),
        Perceptron(),
        QuadraticDiscriminantAnalysis(),
        RadiusNeighborsClassifier(),
        RandomForestClassifier(),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(),
        SVC(),
    ]
    names = [clf.__class__.__name__ for clf in classifiers]
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
                                random_state=random_state)
    scores = {}
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for name, clf in zip(names, classifiers):
            try:
                clf.fit(X_train, y_train)
                train_predictions = clf.predict(X_test)
                acc = accuracy_score(y_test, train_predictions)
            except Exception:
                # Some classifiers cannot handle this data at all
                # (e.g. negative features for MultinomialNB); score them as 0.
                acc = 0
            s = scores.get(name, [])
            s.append(acc)
            scores[name] = s
    scores = [[n, np.mean(s)] for n, s in scores.items()]
    scores = pd.DataFrame(scores, columns=['Classifier', 'Score']).sort_values(
        by='Score', ascending=False)
    if show:
        print(scores)
    return scores.iloc[0, 0], classifiers[scores.iloc[0].name], scores
def get_hyperparameters_model():
    metric = ['euclidean', 'manhattan']
    param_dist = {'cls__metric': metric}

    clf = NearestCentroid()

    model = {
        'nearest_centroid': {
            'model': clf,
            'param_distributions': param_dist,
        }
    }
    return model
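# Hedged usage sketch for get_hyperparameters_model(): the 'cls__' prefix in
# param_dist implies the classifier sits in a Pipeline step named 'cls'. The
# iris data and the GridSearchCV wiring below are assumptions for
# illustration, not part of the original module.
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

spec = get_hyperparameters_model()['nearest_centroid']
pipe = Pipeline([('cls', spec['model'])])  # step name matches the 'cls__' prefix
search = GridSearchCV(pipe, param_grid=spec['param_distributions'], cv=5)
X_demo, y_demo = load_iris(return_X_y=True)
search.fit(X_demo, y_demo)
print(search.best_params_)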
def nearest_mean_classifier(X_train, y_train, X_validation, X_test):
    # Returns the labels for test_data, predicted by the nearest mean
    # classifier trained on X_train and y_train
    # Input:
    #   X_train - num_train x num_features matrix with features for the training data
    #   y_train - num_train x 1 vector with labels for the training data
    #   X_validation - num_test x num_features matrix with features for the validation data
    #   X_test - num_test x num_features matrix with features for the test data
    # Output:
    #   y_pred_validation - num_test x 1 predicted vector with labels for the validation data
    #   y_pred_test - num_test x 1 predicted vector with labels for the test data

    X_test_val = np.vstack((X_validation, X_test))  # Stack the datasets together
    clf = NearestCentroid()
    clf.fit(X_train, y_train)  # Compute the class means
    predicted_labels = clf.predict(X_test_val)  # Predict the labels

    # Store the predictions
    y_pred_validation = predicted_labels[:len(X_validation)]
    y_pred_test = predicted_labels[len(X_validation):]
    return y_pred_validation, y_pred_test
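# Minimal usage sketch for nearest_mean_classifier() on synthetic data
# (the shapes are illustrative assumptions).
import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(60, 4)
y_train = rng.randint(0, 2, 60)
X_validation = rng.rand(10, 4)
X_test = rng.rand(15, 4)
y_pred_val, y_pred_test = nearest_mean_classifier(X_train, y_train,
                                                  X_validation, X_test)
assert len(y_pred_val) == 10 and len(y_pred_test) == 15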
def _cluster_func(self, n_clusters, km, pars=None, lsi=None):
    """ A helper function for clustering, includes base method used by
    all clustering implementations """
    import warnings
    from sklearn.neighbors import NearestCentroid

    if pars is None:
        pars = {}
    pars.update(km.get_params(deep=True))

    X = joblib.load(os.path.join(self.fe.dsid_dir, 'features'))

    mid, mid_dir = setup_model(self.model_dir)

    if lsi is not None:
        X = lsi.fit_transform(X)
        joblib.dump(X, os.path.join(self.model_dir, mid, 'lsi_features'),
                    compress=9)
    pars['lsi'] = lsi

    with warnings.catch_warnings():
        if type(km).__name__ != "DBSCAN":
            warnings.filterwarnings("ignore", category=DeprecationWarning)
        km.fit(X)

    pars['lsi'] = lsi
    self.mid = mid
    self.mid_dir = mid_dir

    labels_ = km.labels_
    if type(km).__name__ == "DBSCAN":
        labels_ = _dbscan_noisy2unique(labels_)
        n_clusters = len(np.unique(labels_))
        km.labels_ = labels_

    if not hasattr(km, 'cluster_centers_'):
        # i.e. model is not MiniBatchKMeans => compute centroids
        km.cluster_centers_ = NearestCentroid().fit(X, labels_).centroids_

    pars['n_clusters'] = n_clusters

    joblib.dump(km, os.path.join(self.model_dir, mid, 'model'), compress=9)
    joblib.dump(pars, os.path.join(self.model_dir, mid, 'pars'), compress=9)

    self.km = km
    self._pars = pars
    htree = self._get_htree(km)
    return labels_, htree
class scikit_NearestCentroid(MLAlgo):

    def __init__(self):
        self.clf = NearestCentroid()
        self.className = self.__class__.__name__

    def train(self, train_data):
        train_X = train_data[:, :-1]
        train_Y = train_data[:, -1]
        self.clf.fit(train_X, train_Y)
        print("NearestCentroid model built.")
        return self.className + " Training finished...\n"

    def test(self, test_data):
        test_X = test_data[:, :-1]
        test_Y = test_data[:, -1]
        print("Accuracy: ", self.clf.score(test_X, test_Y))
        return self.className + " Testing finished...\n"

    def predict(self, predict_data):
        print("Predictions: ", self.clf.predict(predict_data))
        return self.className + " Prediction finished...\n"

    def cross_validate(self, train_data):
        X_ = train_data[:, :-1]
        Y_ = train_data[:, -1]
        predicted = cross_val_predict(self.clf, X_, Y_, cv=10)
        accuracy = metrics.accuracy_score(Y_, predicted)
        print("Cross-validation accuracy: ", accuracy)
        if accuracy > MLAlgo.cross_validate_accuracy:
            MLAlgo.cross_validate_accuracy = accuracy
            MLAlgo.classifier = self.clf
            MLAlgo.trained_instance = self
        return self.className + " Cross validation finished...\n"
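# Hedged usage sketch: the MLAlgo base class is assumed from the snippet, and
# the column convention (features first, label in the last column) follows the
# slicing above. The iris data is illustrative.
import numpy as np
from sklearn.datasets import load_iris

X_demo, y_demo = load_iris(return_X_y=True)
data = np.column_stack([X_demo, y_demo])
algo = scikit_NearestCentroid()
print(algo.train(data))
print(algo.test(data))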
def ModelsIteration(self):
    results = []
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
            (Perceptron(max_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN"),
            # (RandomForestClassifier(n_estimators=100), "Random forest"),
            (SVC(C=1e-8, gamma=1.0 / self.X_train.shape[1], kernel='rbf'),
             "SVM with RBF Kernel")):
        print('=' * 80)
        print(name)
        results.append(self.benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(self.benchmark(
            LinearSVC(loss='squared_hinge', penalty=penalty, dual=False,
                      tol=1e-3),
            'LinearSVC'))
        # Train SGD model
        results.append(self.benchmark(
            SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty),
            'SGDClassifier'))

    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(self.benchmark(
        SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(self.benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(self.benchmark(MultinomialNB(alpha=.01), 'MultinomialNB'))
    results.append(self.benchmark(BernoulliNB(alpha=.01), 'BernoulliNB'))
    # results.append(self.benchmark(GaussianNB(), 'GaussianNB'))

    print('=' * 80)
    print("LinearSVC")
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    results.append(self.benchmark(LinearSVC(), 'LinearSVC'))
    return results
def classify():
    results = []
    for clf, name in (
            (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            (Perceptron(max_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN")):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(benchmark(
            LinearSVC(loss='squared_hinge', penalty=penalty, dual=False,
                      tol=1e-3)))
        # Train SGD model
        results.append(benchmark(
            SGDClassifier(alpha=.0001, max_iter=50, penalty=penalty)))

    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(benchmark(
        SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))

    print('=' * 80)
    print("LinearSVC with L1-based feature selection")
    results.append(benchmark(L1LinearSVC()))
    return results
def fit(self, X, y):
    """
    Performs lightweight, memory-based fitting, primarily storing `X`
    and `y` in memory. In the case of a profile-based verifier, we store
    a single mean centroid per author in memory.

    Parameters
    ----------
    X : floats, array-like [nb_documents, nb_features]
        The 2D matrix representing the training instances to be memorized.
    y : array of ints [nb_documents]
        An int-encoded representation of the correct authorship for each
        training document.

    References
    ----------
    - Daelemans, W. & van den Bosch, A. (2005). Memory-Based Language
      Processing. Cambridge University Press.
    - M. Koppel and S. Seidman (2013), Automatically Identifying
      Pseudepigraphic Texts, EMNLP-13: 1449-1454.
    """
    self.train_X = NearestCentroid().fit(X, y).centroids_  # mean centroids
    self.train_y = np.array(range(self.train_X.shape[0]))

    nb_items = self.train_X.shape[0]

    # calculate all pairwise distances in the data set:
    distances = []
    idxs = range(self.train_X.shape[0])
    for i, j in combinations(range(nb_items), 2):
        distances.append(self.metric_fn(self.train_X[i], self.train_X[j],
                                        idxs))

    # fit a 0-1 scaler on the distances:
    distances = np.array(distances, dtype='float32').transpose()
    distances = distances[~np.isnan(distances)]
    self.distance_scaler1 = StandardScaler().fit(distances)
    distances = self.distance_scaler1.transform(distances.transpose())
    self.distance_scaler2 = MinMaxScaler().fit(distances)
def prepare_models():
    models = []
    # Non-ensemble classifiers to be included in the classifier test with
    # default params. Some classifiers have non-default params to reduce
    # training time significantly.
    models.append(('Dummy', DummyClassifier(strategy="uniform")))
    models.append(('LogisticRegression', LogisticRegression(C=0.001)))
    models.append(('Ridge', RidgeClassifier()))  # Non-probabilistic
    models.append(('Perceptron', Perceptron()))  # Non-probabilistic
    models.append(('PassiveAggressive',
                   PassiveAggressiveClassifier(C=0.001)))  # Non-probabilistic
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
    models.append(('Naive_Bayes_Gaussian', GaussianNB()))
    models.append(('LinearSVC', LinearSVC(C=0.001)))  # Non-probabilistic
    models.append(('DecisionTree', DecisionTreeClassifier(max_depth=5)))
    models.append(('NearestCentroid', NearestCentroid()))  # Non-probabilistic
    models.append(('MultiLayerPerceptron', MLPClassifier()))
    models.append(('Keras', KerasClassifier(build_fn=keras_baseline_model,
                                            nb_epoch=5, batch_size=100,
                                            verbose=0)))
    return models
def test_plot13(self):
    np.random.seed(seed)
    X, y = iris_data()
    X = X[:, [0, 2]]
    dml = NCMML()
    clf = NearestCentroid()
    dml_plot(X, y, clf, cmap="gist_rainbow", figsize=(15, 8))
    self.newsave()
    dml_plot(X, y, dml=dml, clf=clf, cmap="gist_rainbow", figsize=(15, 8))
    self.newsave()
    dml_pairplots(X, y, dml=dml, clf=clf, cmap="gist_rainbow",
                  figsize=(15, 8))
    self.newsave()
    plt.close()
def get_lots_o_models():
    """
    Returns a list of SKLearn classifiers to exercise
    :return: List of instantiated classifiers.
    """
    the_models = []
    the_models.append((RidgeClassifier(tol=1e-2, solver="lsqr"),
                       'Ridge_Classifier'))
    the_models.append((Perceptron(max_iter=50), "Perceptron"))
    the_models.append((PassiveAggressiveClassifier(max_iter=50),
                       "Passive_Aggressive"))
    the_models.append((KNeighborsClassifier(n_neighbors=10), "kNN"))
    the_models.append((RandomForestClassifier(n_estimators=100),
                       "Random_Forest_100"))
    # the_models.append((RandomForestClassifier(n_estimators=10), "Random_Forest_10"))
    # the_models.append((RandomForestClassifier(n_estimators=1000), "Random_Forest_1000"))

    for penalty in ["l2", "l1"]:
        the_models.append((LinearSVC(loss='squared_hinge', penalty=penalty,
                                     dual=False, tol=1e-3),
                           "%s_penalty" % penalty.upper()))
        the_models.append((SGDClassifier(alpha=.0001, max_iter=50,
                                         penalty=penalty),
                           "%s penalty" % penalty.upper()))

    the_models.append((SGDClassifier(alpha=.0001, max_iter=50,
                                     penalty="elasticnet"),
                       "Elastic-Net penalty"))
    the_models.append((NearestCentroid(),
                       "NearestCentroid_aka_Rocchio_classifier"))
    the_models.append((MultinomialNB(alpha=.01), 'Naive_Bayes_Multi'))
    the_models.append((BernoulliNB(alpha=.01), 'Naive_Bayes_Bernoulli'))
    the_models.append((Pipeline([
        ('feature_selection',
         SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
        ('classification', LinearSVC())
    ]), 'LinearSVC_with_L1'))
    return the_models
def get_classifier_with_best_parameters(classifier_enum, best_parameters):
    if classifier_enum == Classifier.ADA_BOOST_CLASSIFIER:
        return AdaBoostClassifier(**best_parameters)
    elif classifier_enum == Classifier.BERNOULLI_NB:
        return BernoulliNB(**best_parameters)
    elif classifier_enum == Classifier.COMPLEMENT_NB:
        return ComplementNB(**best_parameters)
    elif classifier_enum == Classifier.DECISION_TREE_CLASSIFIER:
        return DecisionTreeClassifier(**best_parameters)
    elif classifier_enum == Classifier.GRADIENT_BOOSTING_CLASSIFIER:
        return GradientBoostingClassifier(**best_parameters)
    elif classifier_enum == Classifier.K_NEIGHBORS_CLASSIFIER:
        return KNeighborsClassifier(**best_parameters)
    elif classifier_enum == Classifier.LINEAR_SVC:
        return LinearSVC(**best_parameters)
    elif classifier_enum == Classifier.LOGISTIC_REGRESSION:
        return LogisticRegression(**best_parameters)
    elif classifier_enum == Classifier.MULTINOMIAL_NB:
        return MultinomialNB(**best_parameters)
    elif classifier_enum == Classifier.NEAREST_CENTROID:
        return NearestCentroid(**best_parameters)
    elif classifier_enum == Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER:
        return PassiveAggressiveClassifier(**best_parameters)
    elif classifier_enum == Classifier.PERCEPTRON:
        return Perceptron(**best_parameters)
    elif classifier_enum == Classifier.RANDOM_FOREST_CLASSIFIER:
        return RandomForestClassifier(**best_parameters)
    elif classifier_enum == Classifier.RIDGE_CLASSIFIER:
        return RidgeClassifier(**best_parameters)
def nearest_centroid_classifier(train, validation, verbose=False):
    nearest_centroid = NearestCentroid()
    nearest_centroid.fit(train['data'], train['labels'])

    # Find the prediction and accuracy on the training set.
    Yhat_train = nearest_centroid.predict(train['data'])
    acc_train = np.mean(Yhat_train == train['labels'])

    # Find the prediction and accuracy on the validation set.
    Yhat_validation = nearest_centroid.predict(validation['data'])
    acc_validation = np.mean(Yhat_validation == validation['labels'])

    if verbose:
        print('Train Accuracy for nearest centroid classifier = {0:f}'.format(acc_train))
        print('Validation Accuracy for nearest centroid classifier = {0:f}'.format(acc_validation))
    return acc_train, acc_validation
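# Minimal usage sketch: train/validation are dicts with 'data' and 'labels'
# keys, as the indexing above implies; synthetic data for illustration.
import numpy as np

rng = np.random.RandomState(0)
train = {'data': rng.rand(80, 3), 'labels': rng.randint(0, 2, 80)}
validation = {'data': rng.rand(20, 3), 'labels': rng.randint(0, 2, 20)}
acc_train, acc_validation = nearest_centroid_classifier(train, validation,
                                                        verbose=True)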
def __init__(self, metric='euclidean', shrink_threshold=None, ranking_size=30):
    """
    :param metric: The metric to use when calculating distance between
        instances. The default metric is Euclidean. Choices are:
        - 'euclidean': standard Euclidean distance
        - 'manhattan': Manhattan distance
        - 'haversine': distances between (latitude, longitude) points only
        - 'cosine': cosine similarity
    :param shrink_threshold: The threshold for shrinking centroids to
        remove features
    """
    self.metric = metric
    self.shrink_threshold = shrink_threshold
    self.ranking_size = ranking_size
    self.clf = NearestCentroid(metric=metric,
                               shrink_threshold=shrink_threshold)
def proceed_classification(X, y, text="Classification Experiment"):
    print("===========" + text + "===========")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    clr = LogisticRegression()
    clr.fit(X_train, y_train)
    print("Logistic Regression: %f" % (clr.score(X_test, y_test)))

    clr = RidgeClassifier()
    clr.fit(X_train, y_train)
    print("Ridge: %f" % (clr.score(X_test, y_test)))

    clr = MultinomialNB()
    clr.fit(X_train, y_train)
    print("MultinomialNB: %f" % (clr.score(X_test, y_test)))

    clr = GaussianNB()
    clr.fit(X_train, y_train)
    print("GaussianNB: %f" % (clr.score(X_test, y_test)))

    clr = SGDClassifier()
    clr.fit(X_train, y_train)
    print("SGDClassifier: %f" % (clr.score(X_test, y_test)))

    clr = Perceptron()
    clr.fit(X_train, y_train)
    print("Perceptron: %f" % (clr.score(X_test, y_test)))

    clr = BernoulliNB()
    clr.fit(X_train, y_train)
    print("BernoulliNB: %f" % (clr.score(X_test, y_test)))

    clr = KNeighborsClassifier()
    clr.fit(X_train, y_train)
    print("KNeighbors: %f" % (clr.score(X_test, y_test)))

    clr = NearestCentroid()
    clr.fit(X_train, y_train)
    print("NearestCentroid: %f" % (clr.score(X_test, y_test)))

    clr = RandomForestClassifier()
    clr.fit(X_train, y_train)
    print("RandomForestClassifier: %f" % (clr.score(X_test, y_test)))

    clr = MLPClassifier()
    clr.fit(X_train, y_train)
    print("Neural network: %f" % (clr.score(X_test, y_test)))

    clr = SVC(kernel="rbf")
    clr.fit(X_train, y_train)
    print("Kernel SVM: %f" % (clr.score(X_test, y_test)))
    print("\n")
def sklearn_get_clasifier(X, y, method):
    if method == "Svm":
        classifier = LinearSVC(loss='hinge')
    elif method == "Svc":
        classifier = SVC(kernel='linear', probability=True)
    elif method == "BernoulliNB":
        classifier = BernoulliNB()
    elif method == "MultinomialNB":
        classifier = MultinomialNB()
    elif method == "Centroid":
        # Other options: metric='manhattan', 'euclidean', 'l2', 'l1',
        # 'cityblock'; shrink_threshold=None
        classifier = NearestCentroid()
    elif method == "MaxEnt":
        classifier = LogisticRegression()
    elif method == "KNeighbors":
        # p=1: Manhattan distance, p=2: Euclidean; otherwise: Minkowski
        classifier = KNeighborsClassifier(n_neighbors=5, p=3)
    # elif method == "DecisionTree":
    #     classifier = DecisionTreeClassifier()
    classifier.fit(X, y)
    return classifier
def ncentroid(args):
    """Uses scikit-learn's NearestCentroid: each class is represented by its
    centroid, with test samples classified to the class with the nearest
    centroid.

    Parameters
    ----------
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array.
    shrink_threshold : float
        Threshold for shrinking centroids to remove features.
    """
    st = None
    if args[1].find("None") == -1:
        st = float(args[1])
    met = args[2]
    return NearestCentroid(metric=met, shrink_threshold=st)
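# Hedged usage sketch: args is assumed to be a list of strings where args[1]
# holds the shrink threshold ('None' disables shrinkage) and args[2] the
# metric, matching the parsing above.
clf_plain = ncentroid(['ncentroid', 'None', 'euclidean'])
clf_shrunk = ncentroid(['ncentroid', '0.2', 'manhattan'])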
def initialize_classifiers(self):
    classifiers = []
    for kernel in self.kernels:
        print(kernel)
        # Bind `kernel` as a default argument; otherwise every lambda would
        # capture the last value of the loop variable.
        fun = lambda X_train, y_train, X_test, kernel=kernel: SVC(
            kernel=kernel).fit(X_train, y_train).predict(X_test)
        classifiers.append(fun)
    fun = lambda X_train, y_train, X_test: LinearSVC(
        multi_class='crammer_singer').fit(X_train, y_train).predict(X_test)
    classifiers.append(fun)
    fun = lambda X_train, y_train, X_test: KNeighborsClassifier(
        n_neighbors=1).fit(X_train, y_train).predict(X_test)
    classifiers.append(fun)
    fun = lambda X_train, y_train, X_test: NearestCentroid().fit(
        X_train, y_train).predict(X_test)
    classifiers.append(fun)
    return classifiers
def nc_fit(Xtrain, Xtest, Xtrain_lbls, Xtest_lbls, name, data, t0=None):
    # Default the timer here rather than in the signature, where time()
    # would be evaluated only once, at import.
    if t0 is None:
        t0 = time()
    # Create a nearest centroid classifier
    clf = NearestCentroid()
    # Train with the data
    clf.fit(Xtrain, Xtrain_lbls)
    # Create predictions for the test data
    y_pred_test = clf.predict(Xtest)
    # How well does it fit
    score = clf.score(Xtest, Xtest_lbls)
    print('%-9s\t%.2fs\t%-9s\t%-9s' % (name, (time() - t0), score, data))
    return y_pred_test
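# Hedged usage sketch for nc_fit() on sklearn's digits data; the 'name' and
# 'data' arguments are just labels echoed in the timing printout.
from time import time
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X_demo, y_demo = load_digits(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(X_demo, y_demo, random_state=0)
nc_fit(Xtr, Xte, ytr, yte, name='NC', data='digits')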
def __init__(self, object):
    if object == 'randomforest':
        self.models = {(RandomForestClassifier(), "Random forest")}
    if object == 'sklearnmodels':
        self.models = {
            (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
            (Perceptron(max_iter=50), "Perceptron"),
            (PassiveAggressiveClassifier(max_iter=50), "Passive-Aggressive"),
            (KNeighborsClassifier(n_neighbors=10), "kNN"),
            (RandomForestClassifier(), "Random forest"),
            (LinearSVC(penalty="l2", dual=False, tol=1e-3), "L2 Linear SVC"),
            (LinearSVC(penalty="l1", dual=False, tol=1e-3), "L1 Linear SVC"),
            (SGDClassifier(alpha=.0001, max_iter=50, penalty="l2"),
             "L2 SGDClassifier"),
            (SGDClassifier(alpha=.0001, max_iter=50, penalty="l1"),
             "L1 SGDClassifier"),
            (SGDClassifier(alpha=.0001, max_iter=50, penalty="elasticnet"),
             "Elastic-Net penalty SGDClassifier"),
            (NearestCentroid(), "NearestCentroid (aka Rocchio classifier)"),
            (MultinomialNB(alpha=.01), "Naive Bayes MultinomialNB"),
            (BernoulliNB(alpha=.01), "Naive Bayes BernoulliNB"),
            (ComplementNB(alpha=.1), "Naive Bayes ComplementNB"),
            (Pipeline([
                ('feature_selection', SelectFromModel(
                    LinearSVC(penalty="l1", dual=False, tol=1e-3))),
                ('classification', LinearSVC(penalty="l2"))
            ]), "LinearSVC with L1-based feature selection"),
        }
def populate_label(X_train, y_train, X_test, log_prob=False):
    predictions = []
    train_dat = []
    for clf, name in (
            # (LassoLars(), "LassoLars"),
            # (BayesianRidge(), "BayesianRidge"),
            # (GaussianNB(), "Gaussian NB"),  # dense
            (GradientBoostingClassifier(), "Gradient Boosting"),
            (ExtraTreesClassifier(), "ExtraTreesClassifier"),
            (AdaBoostClassifier(), "AdaBoostClassifier"),
            (LinearSVC(), "LinearSVC"),
            (NearestCentroid(), "NearestCentroid"),
            (BernoulliNB(binarize=False, fit_prior=True, alpha=0.1),
             "BernoulliNB"),
            (Lasso(), "Lasso"),  # regressor
            # (ElasticNet(), "ElasticNet"),  # regressor
            # (SGDClassifier(), "SGDClassifier"),
            (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier sag"),
            (Perceptron(max_iter=150), "Perceptron"),
            (PassiveAggressiveClassifier(max_iter=150),
             "Passive-Aggressive hinge"),  # hinge > squared hinge
            (KNeighborsClassifier(n_neighbors=8), "kNN8"),
            (RandomForestClassifier(n_estimators=100), "Random forest")):
        if log_prob:
            try:
                predictions.append(predict_logprob(clf, X_train=X_train,
                                                   X_test=X_test,
                                                   y_train=y_train))
            except Exception:
                # It's just input data for the Dense layer, so mixing log
                # probabilities and labels is acceptable here.
                predictions.append(predict(clf, X_train=X_train,
                                           X_test=X_test, y_train=y_train))
        else:
            clf = fit_clf(clf, X_train, y_train)
            predictions.append(predict(clf, X_test=X_test))
            train_dat.append(predict(clf, X_test=X_train))
    return np.asarray(train_dat), np.asarray(predictions)
def test_plot16(self):
    np.random.seed(seed)
    X, y = toy_datasets.balls_toy_dataset(
        centers=[[-1.0, 0.0], [0.0, 0.0], [1.0, 0.0]],
        rads=[0.3, 0.3, 0.3],
        samples=[50, 50, 50],
        noise=[0.1, 0.1, 0.1])
    y[y == 2] = 0
    y = y.astype(int)
    ncm = NearestCentroid()
    ncmc = NCMC_Classifier(centroids_num=[2, 1])
    dml_multiplot(X, y, nrow=1, ncol=2, clfs=[ncm, ncmc], cmap='rainbow',
                  subtitles=['NCM', 'NCMC'], figsize=(6, 3))
    self.newsave()
    plt.close()
def getDiscreetClassifier(name, params={}):
    if name == 'svm':
        return SVC(**params)
    elif name == 'knearest':
        return KNeighborsClassifier(**params)
    elif name == 'guassNB':
        return GaussianNB()
    elif name == 'sgd':
        return SGDClassifier(**params)
    elif name == 'adaBoost':
        return AdaBoostClassifier(**params)
    elif name == 'randomForest':
        return RandomForestClassifier(**params)
    elif name == 'perceptron':
        return Perceptron(**params)
    elif name == 'nearestCentroid':
        return NearestCentroid(**params)
    elif name == 'passiveAggressive':
        return PassiveAggressiveClassifier(**params)
    elif name == 'decisionTree':
        return DecisionTreeClassifier(**params)
    elif name == 'leastSquares':
        return LinearRegression()
    elif name == 'ridge':
        return Ridge()
    elif name == 'lasso':
        return Lasso()
    elif name == 'elasticNet':
        return ElasticNet()
    elif name == 'lars':
        return Lars()
    elif name == 'orthogonalMatchingPursuit':
        return OrthogonalMatchingPursuit()
    elif name == 'bayesianRidge':
        return BayesianRidge()
    elif name == 'logisticRegression':
        return LogisticRegression()
    else:
        raise ValueError('Classifier ' + name + ' is not supported')
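# Hedged usage sketch for getDiscreetClassifier(); the parameter dict shown
# is an illustrative assumption.
clf = getDiscreetClassifier('nearestCentroid', {'metric': 'manhattan'})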
def __init__(self, metric='euclidean', shrink_threshold=None, k=5):
    NearestCentroid.__init__(self, metric=metric,
                             shrink_threshold=shrink_threshold)
    self.k = k
def get_results(city, no):
    processing.preprocessing()
    pre = open('preprocess1.txt')
    train_set = []
    line = pre.readline()
    while line != '':
        train_set.append(line)
        line = pre.readline()

    pos = open('positive-words.txt')
    neg = open('negative-words.txt')
    positive = []
    negative = []
    for i in pos.read().split():
        positive.append(i)
    for j in neg.read().split():
        negative.append(j)

    stopWords = stopwords.words('english')
    vectorizer = CountVectorizer(stop_words=stopWords)
    transformer = TfidfTransformer()

    v = vectorizer.fit_transform(train_set)
    tfidf = transformer.fit_transform(v)

    # Invert the vectorizer vocabulary to be able to map features back to terms
    feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print("\n")

    # Factor the tf-idf matrix into 3 topics with NMF
    nmf = decomposition.NMF(n_components=3, init='random',
                            random_state=0).fit(tfidf.todense())
    topic_list = []
    l = int(len(feature_names) / 5)
    for topic_idx, topic in enumerate(nmf.components_):
        topic_list.append(topic.argsort()[:-l - 1:-1])

    train_target = []
    for arr in v.toarray():
        train_target.append(calculate_Topic(arr, topic_list))

    # Keep the 2*l best features by chi-squared score
    ch2 = SelectKBest(chi2, k=l * 2)
    X_train = ch2.fit_transform(tfidf.todense(), train_target)
    cs = ch2.scores_.argsort()[::-1]
    cs_featurenames = []
    cs = cs[:l * 2]
    for x in cs:
        cs_featurenames.append(feature_names[x])
    print(cs_featurenames)
    print("\n")

    # Re-run NMF on the reduced matrix and print the topics
    nmf1 = decomposition.NMF(n_components=3, init='random',
                             random_state=0).fit(X_train)
    topic_list = []
    l = int(len(feature_names) / 5)
    for topic_idx, topic in enumerate(nmf1.components_):
        z = topic.argsort()[:-l - 1:-1]
        topic_list.append(z)
        print("Topic #%d:---------------------------------------" % topic_idx)
        for y in z:
            print(cs_featurenames[y])

    train_target = []
    for arr in X_train:
        train_target.append(calculate_Topic(arr, topic_list))

    train_count = [0] * 4
    for x in train_target:
        train_count[x] = train_count[x] + 1

    # Train the three classifiers on the selected features
    clf = MultinomialNB()
    clf2 = LinearSVC()
    clf1 = NearestCentroid()
    clf.fit(X_train, train_target)
    clf1.fit(X_train, train_target)
    clf2.fit(X_train, train_target)

    dic = {}
    hotels = read_hotels(city, dic)
    temp = []
    for each in hotels:
        temp.append(calculate(vectorizer, transformer, train_count, ch2,
                              each, clf, clf1, clf2, positive, negative,
                              train_set))
    res = []
    temp1 = numpy.array(temp).argsort()[::-1]
    print("Top %d recommendations are as follows [in the FORMAT Index, "
          "(Hotel name, Location), Score]:\n" % no)
    for g in temp1[:no]:
        print(g, dic[g], temp[g])
        res.append(dic[g])
    return res
pred = clf4.predict(X_test)
writeToDisk(pred, "KNeighborsClassifier")

clf5 = RandomForestClassifier(n_estimators=100)  # RandomForest classifier
clf5.fit(X_train, y_train)
pred = clf5.predict(X_test)
writeToDisk(pred, "RandomForestClassifier")

clf6 = Pipeline([('feature_selection',  # LinearSVC with L2-based feature selection
                  LinearSVC(penalty="l2", dual=False, tol=1e-3)),
                 ('classification', LinearSVC())])
clf6.fit(X_train, y_train)
pred = clf6.predict(X_test)
writeToDisk(pred, "LinearSVC")

clf7 = NearestCentroid()  # NearestCentroid (aka Rocchio classifier), no threshold
clf7.fit(X_train, y_train)
pred = clf7.predict(X_test)
writeToDisk(pred, "NearestCentroid")

clf8 = SVC(C=1.0, class_weight=None, coef0=0.0,  # SVC
           decision_function_shape=None, degree=3, gamma='auto',
           kernel='linear', max_iter=-1, probability=False, random_state=1,
           shrinking=True, tol=0.001, verbose=False)
clf8.fit(X_train, y_train)
pred = clf8.predict(X_test)
writeToDisk(pred, "SVC")

'''
clf9 = VotingClassifier(estimators=[
    ('Ridge', clf1), ('MultiNB', clf2), ('BernNB', clf3), ('KNN', clf4),
    ('RF', clf5), ('LinearSVC', clf6), ('NearC', clf7), ('SVC', clf8)
def test_iris():
    # Check consistency on dataset iris.
    for metric in ('euclidean', 'cosine'):
        clf = NearestCentroid(metric=metric).fit(iris.data, iris.target)
        score = np.mean(clf.predict(iris.data) == iris.target)
        assert score > 0.9, "Failed with score = " + str(score)
def test_precomputed():
    clf = NearestCentroid(metric="precomputed")
    clf.fit(X, y)
    S = pairwise_distances(T, clf.centroids_)
    assert_array_equal(clf.predict(S), true_result)
def test_classification_toy():
    # Check classification on a toy dataset, including sparse versions.
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # Same test, but with a sparse matrix to fit and test.
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit with sparse, test with non-sparse
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T), true_result)

    # Fit with non-sparse, test with sparse
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit and predict with non-CSR sparse matrices
    clf = NearestCentroid()
    clf.fit(X_csr.tocoo(), y)
    assert_array_equal(clf.predict(T_csr.tolil()), true_result)
    y_train = labels[100:172, i]
    X_test = sample2
    y_test = labels[272:, i]
else:
    X_train = training
    y_train = labels[:172, i]
    X_test = sampletest
    y_test = labels[172:, i]

posterior = np.empty([100, 72, 6])
box = np.zeros([6, 6])
for j in range(4, 5):
    for k in range(1, 2):
        accuracy = np.zeros(100)
        for m in range(0, 100):
            ncc = NearestCentroid()
            ncc.fit(X_train, y_train)
            y_pred = ncc.predict(X_test)
            n = 0
            for i in range(0, len(y_pred)):
                if y_pred[i] == y_test[i]:
                    # print(i, y_pred[i], y_test[i])
                    n = n + 1
                    accuracy[m] = accuracy[m] + 1
                    box[y_test[i] - 1, y_pred[i] - 1] = \
                        box[y_test[i] - 1, y_pred[i] - 1] + 1
            # posterior[m] = knc.predict_proba(X_test)
        print(j, k, np.mean(accuracy) / 0.72, np.std(accuracy) / 0.72)
        # print(30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0,
        #       sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0,
        #       sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0,
        #       sum(accuracy)/72.0)
'''
means = np.empty([72, 6])
# Nearest Centroid
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import NearestCentroid

# load the iris datasets
dataset = datasets.load_iris()
# fit a nearest centroid model to the data
model = NearestCentroid()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
def __init__(self):
    # Log of the Euclidean distance between a sample and a centroid
    self.metric = lambda x1, x2: np.log(np.sqrt(np.sum(np.square(x1 - x2))))
    NearestCentroid.__init__(self, metric=self.metric)
def test_precomputed():
    clf = NearestCentroid(metric='precomputed')
    with assert_raises(ValueError) as context:
        clf.fit(X, y)
    assert_equal(ValueError, type(context.exception))
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
                      # avoid this ugly slicing by using a two-dim dataset
y = iris.target

h = 0.02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

for shrinkage in [None, 0.1]:
    # we create an instance of the NearestCentroid classifier and fit the data.
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(shrinkage, np.mean(y == y_pred))

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure()
    pl.pcolormesh(xx, yy, Z, cmap=cmap_light)
# ~ind inverts the boolean mask (np.negative would arithmetically negate it,
# which is wrong here)
train_images = faces.images[~ind]
train_targets = faces.target[~ind]
n_train = len(train_images)

test_images = faces.images[ind]
test_targets = faces.target[ind]
n_tests = len(test_images)

# Note: rebinding `test` on the first line means the edits below act on a
# copy and never modify test_images; this looks like leftover experiment code.
for test in test_images:
    test = test + norm.rvs(scale=10, size=test.shape)
    for i in range(25, 30):
        test[i, :] = 0
        test[:, i] = 0
    test = np.minimum(test, 1)
    test = np.maximum(test, 0)
    test = np.zeros(test.shape)

train = train_images.reshape((n_train, -1))
train_pca = pca.fit_transform(train)
test = test_images.reshape((n_tests, -1))
test_pca = pca.transform(test)

neigh = NearestCentroid()
neigh.fit(train, train_targets)
print("Without PCA:", np.count_nonzero(neigh.predict(test) - test_targets), n_tests)
neigh.fit(train_pca, train_targets)
print("With PCA:", np.count_nonzero(neigh.predict(test_pca) - test_targets), n_tests)