def nearest_centroid_classifier(X_train, categories, X_test, test_categories):
    from sklearn.neighbors import NearestCentroid
    from sklearn import metrics
    clf = NearestCentroid().fit(X_train, categories)
    y_roccio_predicted = clf.predict(X_test)
    print("\n Here is the classification report for NearestCentroid classifier:")
    print(metrics.classification_report(test_categories, y_roccio_predicted))
    to_latex(test_categories, y_roccio_predicted)
def NC(data_train, data_train_vectors, data_test_vectors, **kwargs):
    # Implement the classification model using NearestCentroid
    clf_nc = NearestCentroid()
    clf_nc.fit(data_train_vectors, data_train.target)
    y_pred = clf_nc.predict(data_test_vectors)
    
    return y_pred
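A minimal hedged usage sketch for the NC helper above, assuming the 20 newsgroups corpus and a TF-IDF vectorizer (both are illustrative choices, not part of the original snippet):

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid

# Assumed data source: any object with a .target attribute and matching vectors works.
data_train = fetch_20newsgroups(subset='train')
data_test = fetch_20newsgroups(subset='test')

vectorizer = TfidfVectorizer(stop_words='english')
data_train_vectors = vectorizer.fit_transform(data_train.data)
data_test_vectors = vectorizer.transform(data_test.data)

y_pred = NC(data_train, data_train_vectors, data_test_vectors)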
Example #3
    def train_with(self, training_data_list, answers):
        #put data in right format
        training_data = self.get_sparse_matrix(training_data_list)

        if training_data is not False:

            # make the model
            if self.model_name == "random_forest":
                forest = RandomForestClassifier(n_estimators=100)
                self.model = forest.fit(training_data.todense(), answers)
            elif self.model_name == "centroid_prediction":
                clf = NearestCentroid()
                self.model = clf.fit(training_data, answers)
            elif self.model_name == "linearSVC":
                SVC = LinearSVC()
                self.model = SVC.fit(training_data.todense(), answers)
            elif self.model_name == "nearest_neighbor":
                near = KNeighborsClassifier()
                self.model = near.fit(training_data.todense(), answers)
            elif self.model_name == "decision_tree":
                clf = tree.DecisionTreeClassifier()
                self.model = clf.fit(training_data.todense(), answers)
            elif self.model_name == "svc":
                clf = svm.SVC()
                self.model = clf.fit(training_data, answers)
def test_shrinkage_threshold_decoded_y():
    clf = NearestCentroid(shrink_threshold=0.01)
    y_ind = np.asarray(y)
    y_ind[y_ind == -1] = 0
    clf.fit(X, y_ind)
    centroid_encoded = clf.centroids_
    clf.fit(X, y)
    assert_array_equal(centroid_encoded, clf.centroids_)
def test_iris_shrinkage():
    # Check consistency on dataset iris, when using shrinkage.
    for metric in ('euclidean', 'cosine'):
        for shrink_threshold in [None, 0.1, 0.5]:
            clf = NearestCentroid(metric=metric,
                                  shrink_threshold=shrink_threshold)
            clf = clf.fit(iris.data, iris.target)
            score = np.mean(clf.predict(iris.data) == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)
def test_manhattan_metric():
    # Test the manhattan metric.

    clf = NearestCentroid(metric='manhattan')
    clf.fit(X, y)
    dense_centroid = clf.centroids_
    clf.fit(X_csr, y)
    assert_array_equal(clf.centroids_, dense_centroid)
    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
Example #7
def nearest_centroid_classifier(X_train, X_test, y_train, y_test):
    from sklearn.neighbors import NearestCentroid
    from sklearn import metrics
    clf = NearestCentroid().fit(X_train, y_train)

    evaluate_cross_validation(clf, X_train, y_train, 5)

    y_roccio_predicted = clf.predict(X_test)
    print("\n Here is the classification report for NearestCentroid classifier:")
    print(metrics.classification_report(y_test, y_roccio_predicted))
def test_shrinkage_correct():
    # Ensure that the shrinking is correct.
    # The expected result is calculated by R (pamr),
    # which is implemented by the author of the original paper.
    # (One needs to modify the code to output the new centroid in pamr.predict.)

    X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
    y = np.array([1, 1, 2, 2, 2])
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])
    np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
def test_pickle():
    import pickle

    # classification
    obj = NearestCentroid()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(iris.data, iris.target)
    assert_array_equal(score, score2,
                       "Failed to generate same score"
                       " after pickling (classification).")
Example #10
class NCClassifier(Classifier):
    """Rocchio classifier"""
    def __init__(self, shrink=None):
        self.cl = NearestCentroid(shrink_threshold=shrink)
        self.shrink = shrink

    def retrain(self, vectorFeature, vectorTarget):
        if self.shrink is not None:
            self.cl.fit([v.toarray()[0] for v in vectorFeature], vectorTarget)
        else:
            super(NCClassifier, self).retrain(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        if self.shrink is not None:
            return self.cl.predict(vectorizedTest.toarray()[0])[0]
        else:
            return super(NCClassifier, self).classify(vectorizedTest)
Example #11
def nearestNeighbour():
	import numpy as np
	import pylab as pl
	from matplotlib.colors import ListedColormap
	from sklearn import datasets
	from sklearn.neighbors import NearestCentroid

	n_neighbors = 15

	# import some data to play with
	iris = datasets.load_iris()
	X = iris.data[:, :2]  # we only take the first two features. We could
	                      # avoid this ugly slicing by using a two-dim dataset
	y = iris.target

	h = .02  # step size in the mesh

	# Create color maps
	cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
	cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

	for shrinkage in [None, 0.1]:
	    # we create an instance of Neighbours Classifier and fit the data.
	    clf = NearestCentroid(shrink_threshold=shrinkage)
	    clf.fit(X, y)
	    y_pred = clf.predict(X)
	    print(shrinkage, np.mean(y == y_pred))
	    # Plot the decision boundary. For that, we will assign a color to each
	    # point in the mesh [x_min, x_max]x[y_min, y_max].
	    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
	    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
	    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
	                         np.arange(y_min, y_max, h))
	    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

	    # Put the result into a color plot
	    Z = Z.reshape(xx.shape)
	    pl.figure()
	    pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

	    # Plot also the training points
	    pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
	    pl.title("3-Class classification (shrink_threshold=%r)"
	             % shrinkage)
	    pl.axis('tight')
def create_and_train_model(engine):
    cmd = "SELECT review_rating, review_text FROM bf_reviews"
    bfdf = pd.read_sql_query(cmd, engine)
    bfdfl = bfdf[bfdf['review_text'].str.len() > 300].copy()
    train_data = bfdfl['review_text'].values[:1000]
    y_train = bfdfl['review_rating'].values[:1000]

    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(train_data)
    duration = time() - t0
    print('vectorized in {:.2f} seconds.'.format(duration))
    print(X_train.shape)

    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    return clf, vectorizer
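A hypothetical invocation of create_and_train_model, assuming a SQLAlchemy engine pointing at whatever database holds the bf_reviews table (the connection string below is a placeholder):

from sqlalchemy import create_engine

engine = create_engine('postgresql://user:password@localhost:5432/reviews_db')  # placeholder URL
clf, vectorizer = create_and_train_model(engine)

# Score a new review with the fitted vectorizer + classifier pair.
X_new = vectorizer.transform(["Great product, arrived quickly and works as advertised."])
print(clf.predict(X_new))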
Example #13
class BinBasedCluster(BaseEstimator):
    def __init__(self, bins=[0, 0.5, 1] + list(range(5, 36))):
        self.bins = bins

    def fit(self, X, y):

        biny = self.bin_data(y)

        self.pred = NearestCentroid().fit(X, biny)
        return self

    def predict(self, X):
        return self.pred.predict(X)

    def score(self, X, y, is_raw=True):
        clusters = self.pred.predict(X)
        if is_raw:
            return adjusted_rand_score(self.bin_data(y), clusters)
        else:
            return adjusted_rand_score(y, clusters)

    def bin_data(self, y):
        return np.digitize(y, self.bins)

    def make_vern_points(self, X, y):

        sel = SelectKBest(score_func=normalized_mutual_info_score_scorefunc)
        sdata = sel.fit_transform(X, y)
        print(X.shape, sdata.shape)

        pca = PCA(n_components=2)
        pca_trans = pca.fit_transform(sdata)

        biny = self.bin_data(y)

        pred = NearestCentroid().fit(pca_trans, biny)

        x_min, x_max = pca_trans[:, 0].min() - 1, pca_trans[:, 0].max() + 1
        y_min, y_max = pca_trans[:, 1].min() - 1, pca_trans[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50), np.linspace(y_min, y_max, 50))
        Z = pred.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        return pca_trans, biny, xx, yy, Z
Example #14
 def test_disambiguator_store(self):
     # Create a silly classifier that disambiguates between "stam" (tree
     # trunk) or "romp" (body trunk) as the Dutch translation of the
     # English noun "trunk"
     lempos = u"trunk/n"
     # FIXME: store_fit() should only accept unicode strings
     target_names = u"stam romp".encode("utf-8").split()
     vocab = u"boom hoofd".split()
     
     X = np.array([[0,1],
                   [1,0],
                   [0,1],
                   [1,0]])
     y = np.array([1,0,1,0])
     
     estimator = NearestCentroid()
     estimator.fit(X, y)
     
     centroids = estimator.centroids_
     score = estimator.score(X, y)
     
     # Store estimator
     fname = tempfile.NamedTemporaryFile().name
     f = DisambiguatorStore(fname, "w")
     f.save_estimator(NearestCentroid())
     f.save_vocab(vocab)
     f.store_fit(lempos, estimator)
     f.save_target_names(lempos, target_names)
     f.close()
     
     # Restore estimator    
     f2 = DisambiguatorStore(fname) 
     estimator2 = f2.load_estimator()
     vocab2 = f2.load_vocab()
     f2.restore_fit(lempos, estimator2)
     target_names2 = f2.load_target_names(lempos)
     centroids2 = estimator2.centroids_
     score2 = estimator2.score(X, y)
     
     assert_array_equal(centroids, centroids2)
     assert target_names == target_names2
     assert vocab == vocab2
     assert score == score2
Example #15
    def make_vern_points(self, X, y):

        sel = SelectKBest(score_func=normalized_mutual_info_score_scorefunc)
        sdata = sel.fit_transform(X, y)
        print(X.shape, sdata.shape)

        pca = PCA(n_components=2)
        pca_trans = pca.fit_transform(sdata)

        biny = self.bin_data(y)

        pred = NearestCentroid().fit(pca_trans, biny)

        x_min, x_max = pca_trans[:, 0].min() - 1, pca_trans[:, 0].max() + 1
        y_min, y_max = pca_trans[:, 1].min() - 1, pca_trans[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 50), np.linspace(y_min, y_max, 50))
        Z = pred.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        return pca_trans, biny, xx, yy, Z
def test_predict_translated_data():
    # Test that NearestCentroid gives same results on translated data

    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    y_init = clf.predict(X)
    clf = NearestCentroid(shrink_threshold=0.1)
    X_noise = X + noise
    clf.fit(X_noise, y)
    y_translate = clf.predict(X_noise)
    assert_array_equal(y_init, y_translate)
Example #17
def test_shrinkage_threshold_decoded_y():
    clf = NearestCentroid(shrink_threshold=0.01)
    y_ind = np.asarray(y)
    y_ind[y_ind == -1] = 0
    clf.fit(X, y_ind)
    centroid_encoded = clf.centroids_
    clf.fit(X, y)
    assert_array_equal(centroid_encoded, clf.centroids_)
Example #18
def select_classifier(X, y, n_splits=10, test_size=0.1, random_state=0, show=True):
    classifiers = [
        AdaBoostClassifier(),
        BaggingClassifier(),
        BernoulliNB(),
        CalibratedClassifierCV(),
        DecisionTreeClassifier(),
        ExtraTreeClassifier(),
        GaussianNB(),
        GaussianProcessClassifier(),
        GradientBoostingClassifier(),
        KNeighborsClassifier(),
        LinearDiscriminantAnalysis(),
        LinearSVC(),
        LogisticRegression(),
        LogisticRegressionCV(),
        MLPClassifier(),
        MultinomialNB(),
        NearestCentroid(),
        NuSVC(),
        PassiveAggressiveClassifier(),
        Perceptron(),
        QuadraticDiscriminantAnalysis(),
        RadiusNeighborsClassifier(),
        RandomForestClassifier(),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(),
        SVC()
    ]
    names = [clf.__class__.__name__ for clf in classifiers]
    cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    scores = {}
    for train_index, test_index in cv.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        for name, clf in zip(names, classifiers):
            try:
                clf.fit(X_train, y_train)
                train_predictions = clf.predict(X_test)
                acc = accuracy_score(y_test, train_predictions)
            except Exception:
                acc = 0
            s = scores.get(name, [])
            s.append(acc)
            scores[name] = s
    scores = [[n, np.mean(s)] for n, s in scores.items()]
    scores = pd.DataFrame(scores, columns=['Classifier', 'Score']).sort_values(by='Score', ascending=False)
    if show:
        print(scores)
    return scores.iloc[0, 0], classifiers[scores.iloc[0].name], scores
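A quick hedged sketch of calling select_classifier on the iris arrays (X and y must be NumPy arrays so the fancy indexing inside the loop works; the split settings here are arbitrary):

from sklearn.datasets import load_iris

iris = load_iris()
best_name, best_clf, all_scores = select_classifier(iris.data, iris.target,
                                                    n_splits=3, test_size=0.2)
print(best_name)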
Example #19
def get_hyperparameters_model():
    metric = ['euclidean', 'manhattan']

    param_dist = {'cls__metric': metric}

    clf = NearestCentroid()

    model = {
        'nearest_centroid': {
            'model': clf,
            'param_distributions': param_dist
        }
    }
    return model
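A hedged sketch of how the returned dictionary could feed a hyperparameter search; the 'cls__metric' key suggests the estimator lives in a pipeline step named 'cls', and the vectorizer and search settings below are assumptions:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

entry = get_hyperparameters_model()['nearest_centroid']
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('cls', entry['model']),  # step name matches the 'cls__' prefix in param_distributions
])
search = RandomizedSearchCV(pipeline,
                            param_distributions=entry['param_distributions'],
                            n_iter=2, cv=3)
# search.fit(documents, labels) would then pick the better metric.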
def nearest_mean_classifier(X_train, y_train, X_validation, X_test):
    # Returns the labels for test_data, predicted by the nearest mean classifier trained on X_train and y_train
    # Input:
    # X_train - num_train x num_features matrix with features for the training data
    # y_train - num_train x 1 vector with labels for the training data
    # X_validation - num_val x num_features matrix with features for the validation data
    # X_test - num_test x num_features matrix with features for the test data
    # Output:
    # y_pred_validation - num_val x 1 predicted vector with labels for the validation data
    # y_pred_test - num_test x 1 predicted vector with labels for the test data

    X_test_val = np.vstack((X_validation, X_test))
    # Stack the validation and test sets together

    clf = NearestCentroid()
    clf.fit(X_train, y_train)  # Compute the class means

    predicted_labels = clf.predict(X_test_val)  # Predict labels for the combined set

    # Split the predictions back into validation and test parts
    y_pred_validation = predicted_labels[:len(X_validation)]
    y_pred_test = predicted_labels[len(X_validation):]
    return y_pred_validation, y_pred_test
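A minimal hedged sketch of calling the helper above; the arrays are synthetic and only illustrate the expected shapes:

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(20, 4)            # 20 training samples, 4 features
y_train = np.array([0, 1] * 10)      # binary labels
X_validation = rng.rand(5, 4)
X_test = rng.rand(8, 4)

y_pred_validation, y_pred_test = nearest_mean_classifier(
    X_train, y_train, X_validation, X_test)
print(y_pred_validation.shape, y_pred_test.shape)  # (5,) (8,)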
Example #21
    def _cluster_func(self, n_clusters, km, pars=None, lsi=None):
        """ A helper function for clustering, includes base method used by
        all clustering implementations """
        import warnings
        from sklearn.neighbors import NearestCentroid
        if pars is None:
            pars = {}
        pars.update(km.get_params(deep=True))
        X = joblib.load(os.path.join(self.fe.dsid_dir, 'features'))

        mid, mid_dir = setup_model(self.model_dir)

        if lsi is not None:
            X = lsi.fit_transform(X)
            joblib.dump(X,
                        os.path.join(self.model_dir, mid, 'lsi_features'),
                        compress=9)
            pars['lsi'] = lsi

        with warnings.catch_warnings():
            if type(km).__name__ != "DBSCAN":
                warnings.filterwarnings("ignore", category=DeprecationWarning)
            km.fit(X)
        pars['lsi'] = lsi
        self.mid = mid
        self.mid_dir = mid_dir

        labels_ = km.labels_
        if type(km).__name__ == "DBSCAN":
            labels_ = _dbscan_noisy2unique(labels_)
            n_clusters = len(np.unique(labels_))
            km.labels_ = labels_

        if not hasattr(km, 'cluster_centers_'):
            # i.e. model is not MiniBatchKMeans => compute centroids
            km.cluster_centers_ = NearestCentroid().fit(X, labels_).centroids_

        pars['n_clusters'] = n_clusters

        joblib.dump(km, os.path.join(self.model_dir, mid, 'model'), compress=9)
        joblib.dump(pars,
                    os.path.join(self.model_dir, mid, 'pars'),
                    compress=9)

        self.km = km
        self._pars = pars

        htree = self._get_htree(km)

        return labels_, htree
class scikit_NearestCentroid(MLAlgo):
    def __init__(self):
        self.clf = NearestCentroid()
        self.className = self.__class__.__name__

    def train(self, train_data):
        train_X = train_data[:, :-1]
        train_Y = train_data[:, -1]
        self.clf.fit(train_X, train_Y)
        print("NearestCentroid model built.")
        return self.className + " Training finished...\n"

    def test(self, test_data):
        test_X = test_data[:, :-1]
        test_Y = test_data[:, -1]
        print("Accuracy: ", self.clf.score(test_X, test_Y))
        return self.className + " Testing finished...\n"

    def predict(self, predict_data):
        print("Predictions: ", self.clf.predict(predict_data))
        return self.className + " Prediction finished...\n"

    def cross_validate(self, train_data):
        X_ = train_data[:, :-1]
        Y_ = train_data[:, -1]
        predicted = cross_val_predict(self.clf, X_, Y_, cv=10)
        print("Cross-validation accuracy: ",
              metrics.accuracy_score(Y_, predicted))

        if metrics.accuracy_score(Y_,
                                  predicted) > MLAlgo.cross_validate_accuracy:
            MLAlgo.cross_validate_accuracy = metrics.accuracy_score(
                Y_, predicted)
            MLAlgo.classifier = self.clf
            MLAlgo.trained_instance = self

        return self.className + " Cross validation finished...\n"
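A hedged usage sketch for the wrapper class above, assuming the MLAlgo base class and the snippet's imports are available; the iris data is stacked so the last column holds the label, matching the [:, :-1] / [:, -1] slicing in train() and test():

import numpy as np
from sklearn.datasets import load_iris

iris = load_iris()
data = np.column_stack([iris.data, iris.target])
data = data[np.random.RandomState(0).permutation(len(data))]  # shuffle so all classes appear in both splits

model = scikit_NearestCentroid()
print(model.train(data[:120]))
print(model.test(data[120:]))
print(model.predict(data[120:, :-1]))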
Example #23
    def ModelsIteration(self):
        results = []
        for clf, name in (
                (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier"),
                (Perceptron(n_iter=50), "Perceptron"),
                (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
                (KNeighborsClassifier(n_neighbors=10), "kNN"),
                # (RandomForestClassifier(n_estimators=100), "Random forest"),
                (SVC(C=1e-8, gamma=1.0/self.X_train.shape[1], kernel='rbf'), "SVM with RBF Kernel")):

            print('=' * 80)
            print(name)
            results.append(self.benchmark(clf))

        for penalty in ["l2", "l1"]:
            print('=' * 80)
            print("%s penalty" % penalty.upper())
            # Train Liblinear model
            results.append(self.benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                                    dual=False, tol=1e-3), 'LinearSVC'))

            # Train SGD model
            results.append(self.benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty), 'SGDClassifier'))

        # Train SGD with Elastic Net penalty
        print('=' * 80)
        print("Elastic-Net penalty")
        results.append(self.benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")))

        # Train NearestCentroid without threshold
        print('=' * 80)
        print("NearestCentroid (aka Rocchio classifier)")
        results.append(self.benchmark(NearestCentroid()))

        # Train sparse Naive Bayes classifiers
        print('=' * 80)
        print("Naive Bayes")
        results.append(self.benchmark(MultinomialNB(alpha=.01), 'MultinomialNB'))
        results.append(self.benchmark(BernoulliNB(alpha=.01), 'BernoulliNB'))
        # results.append(self.benchmark(GaussianNB(), 'GaussianNB'))

        print('=' * 80)
        print("LinearSVC")
        # The smaller C, the stronger the regularization.
        # The more regularization, the more sparsity.
        results.append(self.benchmark(LinearSVC(), 'LinearSVC'))

        return results
def classify():

    results = []
    for clf, name in ((RidgeClassifier(tol=1e-2,
                                       solver="lsqr"), "Ridge Classifier"),
                      (Perceptron(n_iter=50),
                       "Perceptron"), (PassiveAggressiveClassifier(n_iter=50),
                                       "Passive-Aggressive"),
                      (KNeighborsClassifier(n_neighbors=10), "kNN")):
        print('=' * 80)
        print(name)
        results.append(benchmark(clf))

    for penalty in ["l2", "l1"]:
        print('=' * 80)
        print("%s penalty" % penalty.upper())
        # Train Liblinear model
        results.append(
            benchmark(
                LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))

        # Train SGD model
        results.append(
            benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty)))

    # Train SGD with Elastic Net penalty
    print('=' * 80)
    print("Elastic-Net penalty")
    results.append(
        benchmark(SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet")))

    # Train NearestCentroid without threshold
    print('=' * 80)
    print("NearestCentroid (aka Rocchio classifier)")
    results.append(benchmark(NearestCentroid()))

    # Train sparse Naive Bayes classifiers
    print('=' * 80)
    print("Naive Bayes")
    results.append(benchmark(MultinomialNB(alpha=.01)))
    results.append(benchmark(BernoulliNB(alpha=.01)))

    print('=' * 80)
    print("LinearSVC with L1-based feature selection")
    results.append(benchmark(L1LinearSVC()))

    return results
Example #25
    def fit(self, X, y):
        """
        Runs very light, memory-based like fitting Method
        which primarily stores `X` and `y` in memory. In the
        case of profile-based verifier, we store a single,
        mean centroid per author in memory.

        Parameters
        ----------
        X: floats, array-like [nb_documents, nb_features]
            The 2D matrix representing the training instances
            to be memorized.

        y: array of ints [nb_documents]
            An int-encoded representation of the correct authorship
            for each training document.

        References
        ----------
        - Daelemans, W. & van den Bosch, A. (2005). Memory-Based
          Language Processing. Cambridge University Press.
        - M. Koppel and S. Seidman (2013), Automatically
          Identifying Pseudepigraphic Texts, EMNLP-13: 1449-1454.

        """

        self.train_X = NearestCentroid().fit(X, y).centroids_ # mean centroids
        self.train_y = np.array(range(self.train_X.shape[0]))

        nb_items = self.train_X.shape[0]

        # calculate all pairwise distances in data set:
        distances = []
        idxs = range(self.train_X.shape[0])
        for i, j in combinations(range(nb_items), 2):
            distances.append(self.metric_fn(self.train_X[i],
                                            self.train_X[j],
                                            idxs))

        # fit a 0-1 scaler on the distances:
        distances = np.array(distances, dtype='float32').transpose()
        distances = distances[~np.isnan(distances)]
        self.distance_scaler1 = StandardScaler().fit(distances)
        distances = self.distance_scaler1.transform(distances.transpose())
        self.distance_scaler2 = MinMaxScaler().fit(distances)
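The core step in the fit above is using NearestCentroid only to compute one mean vector per class; a standalone hedged sketch of that step on synthetic data:

import numpy as np
from sklearn.neighbors import NearestCentroid

X = np.random.RandomState(42).rand(12, 5)           # 12 documents, 5 features
y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])  # 3 authors

# centroids_ holds one row per class: the per-author "profile" vectors.
profiles = NearestCentroid().fit(X, y).centroids_
print(profiles.shape)  # (3, 5)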
Example #26
def prepare_models():
    models = []
    # Non-ensemble classifiers to be included in the classifier test with default params
    # Some classifiers have non-default params to reduce training time significantly
    models.append(('Dummy', DummyClassifier(strategy="uniform")))
    models.append(('LogisticRegression', LogisticRegression(C=0.001)))
    models.append(('Ridge', RidgeClassifier())) # Non-probabilistic
    models.append(('Perceptron', Perceptron())) # Non-probabilistic
    models.append(('PassiveAggressive', PassiveAggressiveClassifier(C=0.001))) # Non-probabilistic
    models.append(('LDA', LinearDiscriminantAnalysis()))
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
    models.append(('Naive_Bayes_Gaussian', GaussianNB()))
    models.append(('LinearSVC', LinearSVC(C=0.001))) # Non-probabilistic
    models.append(('DecisionTree', DecisionTreeClassifier(max_depth=5)))
    models.append(('NearestCentroid', NearestCentroid())) # Non-probabilistic
    models.append(('MultiLayerPerceptron', MLPClassifier()))
    models.append(('Keras', KerasClassifier(build_fn=keras_baseline_model, nb_epoch=5, batch_size=100, verbose=0)))
    return models
Example #27
 def test_plot13(self):
     np.random.seed(seed)
     X, y = iris_data()
     X = X[:, [0, 2]]
     dml = NCMML()
     clf = NearestCentroid()
     dml_plot(X, y, clf, cmap="gist_rainbow", figsize=(15, 8))
     self.newsave()
     dml_plot(X, y, dml=dml, clf=clf, cmap="gist_rainbow", figsize=(15, 8))
     self.newsave()
     dml_pairplots(X,
                   y,
                   dml=dml,
                   clf=clf,
                   cmap="gist_rainbow",
                   figsize=(15, 8))
     self.newsave()
     plt.close()
Example #28
def get_lots_o_models():
    """
    Returns a list of SKLearn classifiers to exercise
    :return: List of instantiated classifiers.
    """
    the_models = []
    the_models.append((RidgeClassifier(tol=1e-2,
                                       solver="lsqr"), 'Ridge_Classifier'))
    the_models.append((Perceptron(n_iter=50), "Perceptron"))
    the_models.append(
        (PassiveAggressiveClassifier(n_iter=50), "Passive_Aggressive"))
    the_models.append((KNeighborsClassifier(n_neighbors=10), "kNN"))
    the_models.append(
        (RandomForestClassifier(n_estimators=100), "Random_Forest_100"))
    #    the_models.append((RandomForestClassifier(n_estimators=10), "Random_Forest_10"))
    #    the_models.append((RandomForestClassifier(n_estimators=1000), "Random_Forest_1000"))

    for penalty in ["l2", "l1"]:
        the_models.append(
            (LinearSVC(loss='squared_hinge',
                       penalty=penalty,
                       dual=False,
                       tol=1e-3), "%s_penalty" % penalty.upper()))

        the_models.append(
            (SGDClassifier(alpha=.0001, n_iter=50,
                           penalty=penalty), "%s penalty" % penalty.upper()))

    the_models.append(
        (SGDClassifier(alpha=.0001, n_iter=50,
                       penalty="elasticnet"), "Elastic-Net penalty"))

    the_models.append(
        (NearestCentroid(), "NearestCentroid_aka_Rocchio_classifier"))
    the_models.append((MultinomialNB(alpha=.01), 'Naive_Bayes_Multi'))
    the_models.append((BernoulliNB(alpha=.01), 'Naive_Bayes_Bernoulli'))
    the_models.append((Pipeline([
        ('feature_selection',
         SelectFromModel(LinearSVC(penalty="l1", dual=False, tol=1e-3))),
        ('classification', LinearSVC())
    ]), 'LinearSVC_with_L1'))

    return the_models
def get_classifier_with_best_parameters(classifier_enum, best_parameters):

    if classifier_enum == Classifier.ADA_BOOST_CLASSIFIER:
        return AdaBoostClassifier(**best_parameters)

    elif classifier_enum == Classifier.BERNOULLI_NB:
        return BernoulliNB(**best_parameters)

    elif classifier_enum == Classifier.COMPLEMENT_NB:
        return ComplementNB(**best_parameters)

    elif classifier_enum == Classifier.DECISION_TREE_CLASSIFIER:
        return DecisionTreeClassifier(**best_parameters)

    elif classifier_enum == Classifier.GRADIENT_BOOSTING_CLASSIFIER:
        return GradientBoostingClassifier(**best_parameters)

    elif classifier_enum == Classifier.K_NEIGHBORS_CLASSIFIER:
        return KNeighborsClassifier(**best_parameters)

    elif classifier_enum == Classifier.LINEAR_SVC:
        return LinearSVC(**best_parameters)

    elif classifier_enum == Classifier.LOGISTIC_REGRESSION:
        return LogisticRegression(**best_parameters)

    elif classifier_enum == Classifier.MULTINOMIAL_NB:
        return MultinomialNB(**best_parameters)

    elif classifier_enum == Classifier.NEAREST_CENTROID:
        return NearestCentroid(**best_parameters)

    elif classifier_enum == Classifier.PASSIVE_AGGRESSIVE_CLASSIFIER:
        return PassiveAggressiveClassifier(**best_parameters)

    elif classifier_enum == Classifier.PERCEPTRON:
        return Perceptron(**best_parameters)

    elif classifier_enum == Classifier.RANDOM_FOREST_CLASSIFIER:
        return RandomForestClassifier(**best_parameters)

    elif classifier_enum == Classifier.RIDGE_CLASSIFIER:
        return RidgeClassifier(**best_parameters)
def nearest_centroid_classifier(train, validation, verbose=False):
    nearest_centroid = NearestCentroid()
    nearest_centroid.fit(train['data'], train['labels'])
    # Find the prediction and accuracy on the training set.
    Yhat_svc_linear_train = nearest_centroid.predict(train['data'])
    acc_train = np.mean(Yhat_svc_linear_train == train['labels'])

    # Find the prediction and accuracy on the validation set.
    Yhat_svc_linear_test = nearest_centroid.predict(validation['data'])
    acc_validation = np.mean(Yhat_svc_linear_test == validation['labels'])
    if verbose:
        print('Train Accuracy for nearest centroid classifier = {0:f}'.format(acc_train))
        print('Validation Accuracy for nearest centroid classifier = {0:f}'.format(acc_validation))
    return acc_train, acc_validation
 def __init__(self,
              metric='euclidean',
              shrink_threshold=None,
              ranking_size=30):
     """
       :param metric:
        The metric to use when calculating distance between instances.
        The default metric is Euclidean. Choices are:
         - 'euclidean': standard Euclidean distance
         - 'manhattan': Manhattan distance
         - 'haversine': distance between (latitude, longitude) points only
         - 'cosine': cosine similarity
       :param shrink_threshold:
        The threshold for shrinking centroids to remove features.
     """
     self.metric = metric
     self.shrink_threshold = shrink_threshold
     self.ranking_size = ranking_size
     self.clf = NearestCentroid(metric=metric,
                                shrink_threshold=shrink_threshold)
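For reference, a hedged standalone sketch of the underlying scikit-learn estimator this wrapper configures; note that recent scikit-learn releases restrict NearestCentroid's metric to 'euclidean' and 'manhattan':

from sklearn.datasets import load_iris
from sklearn.neighbors import NearestCentroid

X, y = load_iris(return_X_y=True)

# With the Manhattan metric the per-class centroid is the feature-wise median
# rather than the mean.
clf = NearestCentroid(metric='manhattan')
clf.fit(X, y)
print(clf.score(X, y))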
def proceed_classification(X, y, text="Classification Experiment"):

    print("===========" + text + "===========")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    clr = LogisticRegression()
    clr.fit(X_train, y_train)
    print("Logistic Regression: %f" % (clr.score(X_test, y_test)))
    clr = RidgeClassifier()
    clr.fit(X_train, y_train)
    print("Ridge: %f" % (clr.score(X_test, y_test)))
    clr = MultinomialNB()
    clr.fit(X_train, y_train)
    print("Multinomial: %f" % (clr.score(X_test, y_test)))
    clr = GaussianNB()
    clr.fit(X_train, y_train)
    print("GaussianNB: %f" % (clr.score(X_test, y_test)))
    clr = SGDClassifier()
    clr.fit(X_train, y_train)
    print("SGDClassifier: %f" % (clr.score(X_test, y_test)))
    clr = Perceptron()
    clr.fit(X_train, y_train)
    print("Perceptron: %f" % (clr.score(X_test, y_test)))
    clr = BernoulliNB()
    clr.fit(X_train, y_train)
    print("BernoulliNB: %f" % (clr.score(X_test, y_test)))
    clr = KNeighborsClassifier()
    clr.fit(X_train, y_train)
    print("KNeighbors: %f" % (clr.score(X_test, y_test)))
    clr = NearestCentroid()
    clr.fit(X_train, y_train)
    print("NearestCentroid: %f" % (clr.score(X_test, y_test)))
    clr = RandomForestClassifier()
    clr.fit(X_train, y_train)
    print("RandomForestClassifier: %f" % (clr.score(X_test, y_test)))
    clr = MLPClassifier()
    clr.fit(X_train, y_train)
    print("Neural network: %f" % (clr.score(X_test, y_test)))
    clr = SVC(kernel="rbf")
    clr.fit(X_train, y_train)
    print("Kernel SVM: %f" % (clr.score(X_test, y_test)))
    print("\n")
Example #33
def sklearn_get_clasifier(X, y, method):
    if   method == "Svm":   
        classifier = LinearSVC(loss='l1')
    elif method == "Svc":   
        classifier = SVC(kernel='linear', probability=True)
    elif method == "BernoulliNB":   
        classifier = BernoulliNB()
    elif method == "MultinomialNB":   
        classifier = MultinomialNB()
    elif method == "Centroid":   
        classifier = NearestCentroid()  # metric='manhattan', shrink_threshold=None; options: manhattan, euclidean, l2, l1, cityblock
    elif method == "MaxEnt":   
        classifier = LogisticRegression()    
    elif method == "KNeighbors":
        classifier = KNeighborsClassifier(n_neighbors=5, p=3)  # p=1: Manhattan distance, p=2: Euclidean; otherwise: Minkowski
    #elif method == "DecisionTree":   
    #    classifier = DecisionTreeClassifier()
        
    classifier.fit(X, y)
    
    return classifier
Example #34
def ncentroid(args):
    """Uses scikit-learn's NearestCentroid: each class is represented by its centroid, and test samples are classified to the class with the nearest centroid.

    Parameters
    ----------
    metric : string, or callable
        The metric to use when calculating distance between instances in a feature array.

    shrink_threshold : float
        Threshold for shrinking centroids to remove features.
    """

    st = None
    if (args[1].find("None") == -1):
        st = float(args[1])

    met = args[2]

    return NearestCentroid(metric=met, shrink_threshold=st)
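A hedged example of how this helper appears to be invoked, based only on the indices it reads: args[1] is the shrink threshold (or the string "None") and args[2] is the metric; args[0] is presumably a method name and is ignored here:

clf = ncentroid(["ncentroid", "0.2", "euclidean"])         # NearestCentroid(metric='euclidean', shrink_threshold=0.2)
clf_plain = ncentroid(["ncentroid", "None", "manhattan"])  # no shrinkage, Manhattan metric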
Example #35
    def initialize_classifiers(self):
        classifiers = []
        for kernel in self.kernels:
            print(kernel)
            # Bind kernel as a default argument so each closure keeps its own kernel
            # (late binding would otherwise make every lambda use the last kernel).
            fun = lambda X_train, y_train, X_test, kernel=kernel: SVC(kernel=kernel).fit(
                X_train, y_train).predict(X_test)
            classifiers.append(fun)

        fun = lambda X_train, y_train, X_test: LinearSVC(
            multi_class='crammer_singer').fit(X_train, y_train).predict(X_test)
        classifiers.append(fun)

        fun = lambda X_train, y_train, X_test: KNeighborsClassifier(
            n_neighbors=1).fit(X_train, y_train).predict(X_test)
        classifiers.append(fun)

        fun = lambda X_train, y_train, X_test: NearestCentroid().fit(
            X_train, y_train).predict(X_test)
        classifiers.append(fun)

        return classifiers
def nc_fit(Xtrain, Xtest, Xtrain_lbls, Xtest_lbls, name, data, t0=time()):
    #Create a nearest centroid
    clf = NearestCentroid()
    # Train with the data
    clf.fit(Xtrain, Xtrain_lbls)

    # Create prediction for test data
    y_pred_test = clf.predict(Xtest)

    # How well does it fit
    score = clf.score(Xtest, Xtest_lbls)

    print('%-9s\t%.2fs\t%-9s\t%-9s'
          % (name, (time() - t0), score, data))

    return y_pred_test
Example #37
 def __init__(self, object):
     if object == 'randomforest':
         self.models = {(RandomForestClassifier(), "Random forest")}
     if object == 'sklearnmodels':
         self.models = {(RidgeClassifier(tol=1e-2,
                                         solver="sag"), "Ridge Classifier"),
                        (Perceptron(max_iter=50), "Perceptron"),
                        (PassiveAggressiveClassifier(max_iter=50),
                         "Passive-Aggressive"),
                        (KNeighborsClassifier(n_neighbors=10), "kNN"),
                        (RandomForestClassifier(), "Random forest"),
                        (LinearSVC(penalty="l2", dual=False,
                                   tol=1e-3), "L2 Linear SVC"),
                        (LinearSVC(penalty="l1", dual=False,
                                   tol=1e-3), "L1 Linear SVC"),
                        (SGDClassifier(alpha=.0001,
                                       max_iter=50,
                                       penalty="l2"), "L2 SGDClassifier"),
                        (SGDClassifier(alpha=.0001,
                                       max_iter=50,
                                       penalty="l1"), "L1 SGDClassifier"),
                        (SGDClassifier(alpha=.0001,
                                       max_iter=50,
                                       penalty="elasticnet"),
                         "Elastic-Net penalty SGDClassifier"),
                        (NearestCentroid(),
                         "NearestCentroid (aka Rocchio classifier)"),
                        (MultinomialNB(alpha=.01),
                         "Naive Bayes MultinomialNB"),
                        (BernoulliNB(alpha=.01), "Naive Bayes BernoulliNB"),
                        (ComplementNB(alpha=.1), "Naive Bayes ComplementB"),
                        (Pipeline([
                            ('feature_selection',
                             SelectFromModel(
                                 LinearSVC(penalty="l1",
                                           dual=False,
                                           tol=1e-3))),
                            ('classification', LinearSVC(penalty="l2"))
                        ]), "LinearSVC with L1-based feature selection")}
 def populate_label( X_train, y_train, X_test, log_prob = False ):
     predictions = []
     train_dat = []
     for clf, name in (
         #(LassoLars(),"LassoLars"),
         #(BayesianRidge(),"BayesianRidge"),
         #(GaussianNB(),"Gaussian NB"), #dense
         (GradientBoostingClassifier(),"Gradient Boosting"),
         (ExtraTreesClassifier(),"ExtraTreesClassifier"),
         (AdaBoostClassifier(),"AdaBoostClassifier"),
         (LinearSVC(),"LinearSVC"),
         (NearestCentroid(),"NearestCentroid"),
         (BernoulliNB(binarize=False, fit_prior=True, alpha=0.1),"BernoulliNB"),
         (Lasso(),"Lasso"),  # regressor
         #(ElasticNet(),"ElasticNet"), # regressor
         #(SGDClassifier(),"SGDClassifier"),
         (RidgeClassifier(tol=1e-2, solver="sag"), "Ridge Classifier sag"),
         (Perceptron(max_iter=150), "Perceptron"),
         (PassiveAggressiveClassifier(max_iter=150), "Passive-Aggressive hinge"), # hinge > squarehinge
         (KNeighborsClassifier(n_neighbors=8), "kNN8"),
         (RandomForestClassifier(n_estimators=100), "Random forest")):
     
     
     
         if log_prob:
             try:
                 predictions.append(predict_logprob(clf, X_train=X_train, X_test=X_test, y_train = y_train))
             except Exception:
                 # it's just input data for the Dense layer, so mixing log probabilities and labels is fine
                 predictions.append(predict(clf, X_train=X_train, X_test=X_test, y_train=y_train))
         else:
         
         
             clf = fit_clf(clf, X_train,y_train)
         
             predictions.append(predict(clf,X_test=X_test))
             train_dat.append(predict(clf, X_test=X_train))
     
     return np.asarray(train_dat), np.asarray(predictions)
Example #39
    def test_plot16(self):
        np.random.seed(seed)
        X, y = toy_datasets.balls_toy_dataset(centers=[[-1.0, 0.0], [0.0, 0.0],
                                                       [1.0, 0.0]],
                                              rads=[0.3, 0.3, 0.3],
                                              samples=[50, 50, 50],
                                              noise=[0.1, 0.1, 0.1])
        y[y == 2] = 0
        y = y.astype(int)

        ncm = NearestCentroid()
        ncmc = NCMC_Classifier(centroids_num=[2, 1])
        dml_multiplot(X,
                      y,
                      nrow=1,
                      ncol=2,
                      clfs=[ncm, ncmc],
                      cmap='rainbow',
                      subtitles=['NCM', 'NCMC'],
                      figsize=(6, 3))
        self.newsave()
        plt.close()
Example #40
def getDiscreetClassifier(name, params={}):
	if(name == 'svm'):
		return SVC(**params)
	elif(name == 'knearest'):
		return KNeighborsClassifier(**params)
	elif(name == 'guassNB'):
		return GaussianNB()
	elif(name == 'sgd'):
		return SGDClassifier(**params)
	elif(name == 'adaBoost'):
		return AdaBoostClassifier(**params)
	elif(name == 'randomForest'):
		return RandomForestClassifier(**params)
	elif(name == 'perceptron'):
		return Perceptron(**params)
	elif(name == 'nearestCentroid'):
		return NearestCentroid(**params)
	elif(name == 'passiveAggressive'):
		return PassiveAggressiveClassifier(**params)
	elif(name == 'decisionTree'):
		return DecisionTreeClassifier(**params)
	elif(name == 'leastSquares'):
		return LinearRegression()
	elif(name == 'ridge'):
		return Ridge()
	elif(name == 'lasso'):
		return Lasso()
	elif(name == 'elasticNet'):
		return ElasticNet()
	elif(name == 'lars'):
		return Lars()
	elif(name == 'orthogonalMatchingPursuit'):
		return OrthogonalMatchingPursuit()
	elif(name == 'bayesianRidge'):
		return BayesianRidge()
	elif(name == 'logisticRegression'):
		return LogisticRegression()
	else:
		raise ValueError('Classifier ' + name + ' is not supported')
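A short hedged example of requesting models from the factory above; the keyword arguments passed through params must be valid constructor arguments for the chosen estimator:

nc_clf = getDiscreetClassifier('nearestCentroid', {'shrink_threshold': 0.2})
svm_clf = getDiscreetClassifier('svm', {'kernel': 'linear', 'C': 1.0})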
Example #41
 def __init__(self, metric = 'euclidean', shrink_threshold = None, k=5):
     NearestCentroid.__init__(self, metric, shrink_threshold)
     self.k = k
def get_results(city,no):
	processing.preprocessing()
	pre=open('preprocess1.txt')
	train_set=[]
	line=pre.readline()
	while(line!=''):
		train_set.append(line)
		#print line
		line=pre.readline()
	#print train_set	
	pos=open('positive-words.txt')
	neg=open('negative-words.txt')
	positive=[]
	negative=[]
	for i in pos.read().split():
		positive.append(i)	

	for j in neg.read().split():
		negative.append(j)

	stopWords = stopwords.words('english')
	vectorizer = CountVectorizer(stop_words = stopWords)
	transformer = TfidfTransformer()

	#train_set=get_traindata()
	

	#l=[]
	#l.append(test_set)
	#l.append(test_set1)

	#trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
	#print vectorizer.get_feature_names()
	#testVectorizerArray = vectorizer.transform(test_set).toarray()
	#testVectorizerArray1 = vectorizer.transform(l[1]).toarray()
	#print 'Fit Vectorizer to train set', trainVectorizerArray
	#print 'Transform Vectorizer to test set', testVectorizerArray
	#print testVectorizerArray1[0]

	#transformer.fit(trainVectorizerArray)
	v= vectorizer.fit_transform(train_set)
	#print v.toarray()
	tfidf= transformer.fit_transform(v)
	#transformer.fit(testVectorizerArray)

	#tfidf = transformer.transform(trainVectorizerArray)
	#print tfidf.todense()

	#print("done in %0.3fs." % (time() - t0))
	#print nmf.components_
	# Inverse the vectorizer vocabulary to be able
	feature_names = vectorizer.get_feature_names()
	#print (feature_names)
	#if 'area' in feature_names: 
	print (feature_names)

	print ("\n")
	#-------

	nmf = decomposition.NMF(n_components=3, init='random',random_state=0).fit(tfidf.todense())
	topic_list=[]
	l= int(len(feature_names)/5)
	#print l
	for topic_idx, topic in enumerate(nmf.components_):
		topic_list.append(topic.argsort()[:-l-1:-1])
    
    #print("Topic #%d:" % topic_idx)
    #print (topic)
	#print "Hello----"
	#print topic_list


	train_target=[]	
	for arr in v.toarray():
		train_target.append(calculate_Topic(arr,topic_list))
	#print train_target
	#clf = MultinomialNB()
	#clf2= LinearSVC()
	#clf1=NearestCentroid()
	#clf.fit(tfidf.todense(),train_target)
	#clf1.fit(tfidf.todense(),train_target)
	#clf2.fit(tfidf.todense(),train_target)
	#print (clf.predict(X_test))
	#print (clf1.predict(X_test))
	#print (clf2.predict(X_test))
	#print "Hello"
	ch2 = SelectKBest(chi2, k=l*2)
	X_train = ch2.fit_transform(tfidf.todense(), train_target)

	cs= ch2.scores_.argsort()[::-1]
	cs_featurenames=[]
	cs=cs[:l*2]
	for x in cs:
		cs_featurenames.append(feature_names[x])

	print (cs_featurenames)
	print("\n")

	nmf1 = decomposition.NMF(n_components=3, init='random',random_state=0).fit(X_train)
	topic_list=[]
	l= int(len(feature_names)/5)
	#print l
	for topic_idx, topic in enumerate(nmf1.components_):
		z=topic.argsort()[:-l-1:-1]
		topic_list.append(z)
		print("Topic #%d:---------------------------------------" % topic_idx)
		for y in z:
			print(cs_featurenames[y])
    #print (topic)
	#print "Hello----"    
	#print topic_list
	train_target=[]	
	for arr in X_train:
		train_target.append(calculate_Topic(arr,topic_list))

	#---------
	#print "hello"
	#print train_target
	#print ch2.get_feature_names()
	#print X_train
	#print train_target
	#print "=--------------"
	#print ta
	#print X_test
	train_count=[0]*4
	#print train_target
	for x in train_target:
		train_count[x]=train_count[x]+1
	#print "hello"
	#print train_count	


	clf = MultinomialNB()
	clf2= LinearSVC()
	clf1=NearestCentroid()
	clf.fit(X_train,train_target)
	clf1.fit(X_train,train_target)
	clf2.fit(X_train,train_target)	
	dic={}
	hotels=read_hotels(city,dic)
	temp=[]
	for each in hotels:
		temp.append(calculate(vectorizer,transformer,train_count,ch2,each,clf,clf1,clf2,positive,negative,train_set))
	res=[]	
	temp1=numpy.array(temp).argsort()[::-1]
	#print temp1
	print("Top %d recommendations are as follows [in the FORMAT Index, (Hotel name, Location), Score]:\n" % no)

	for g in temp1[:no]:
		print(g, dic[g], temp[g])
		res.append(dic[g])

	return res	
pred = clf4.predict(X_test)
writeToDisk(pred,"KNeighborsClassifier")

clf5=RandomForestClassifier(n_estimators=100)   #RandomForest Classifier
clf5.fit(X_train, y_train)
pred = clf5.predict(X_test)
writeToDisk(pred,"RandomForestClassifier")

clf6=Pipeline([('feature_selection',            #LinearSVC with L2-based feature selection
    LinearSVC(penalty="l2", dual=False, tol=1e-3)),
    ('classification', LinearSVC())])
clf6.fit(X_train, y_train)
pred = clf6.predict(X_test)
writeToDisk(pred,"LinearSVC")

clf7=NearestCentroid()                          #NearestCentroid (aka Rocchio classifier), no threshold 
clf7.fit(X_train, y_train)
pred = clf7.predict(X_test)
writeToDisk(pred,"NearestCentroid")

clf8=SVC(C=1.0, class_weight=None, coef0=0.0,   #SVC
    decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=1, shrinking=True,
    tol=0.001, verbose=False)
clf8.fit(X_train, y_train)
pred = clf8.predict(X_test)
writeToDisk(pred,"SVC")
'''
clf9=VotingClassifier(estimators=[
    ('Ridge',clf1),('MultiNB',clf2),('BernNB',clf3),('KNN',clf4),
    ('RF',clf5),('LinearSVC',clf6),('NearC',clf7),('SVC',clf8)
Example #44
 def __init__(self, shrink=None):
     self.cl = NearestCentroid(shrink_threshold=shrink)
     self.shrink = shrink
def test_iris():
    # Check consistency on dataset iris.
    for metric in ('euclidean', 'cosine'):
        clf = NearestCentroid(metric=metric).fit(iris.data, iris.target)
        score = np.mean(clf.predict(iris.data) == iris.target)
        assert score > 0.9, "Failed with score = " + str(score)
def test_precomputed():
    clf = NearestCentroid(metric="precomputed")
    clf.fit(X, y)
    S = pairwise_distances(T, clf.centroids_)
    assert_array_equal(clf.predict(S), true_result)
def test_classification_toy():
    # Check classification on a toy dataset, including sparse versions.
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # Same test, but with a sparse matrix to fit and test.
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit with sparse, test with non-sparse
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T), true_result)

    # Fit with non-sparse, test with sparse
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit and predict with non-CSR sparse matrices
    clf = NearestCentroid()
    clf.fit(X_csr.tocoo(), y)
    assert_array_equal(clf.predict(T_csr.tolil()), true_result)
Example #48
        y_train = labels[100:172,i]
        X_test = sample2
        y_test = labels[272:,i]
    else:
        X_train = training
        y_train = labels[:172,i]
        X_test = sampletest
        y_test = labels[172:,i]

    posterior = np.empty([100,72,6])
    box = np.zeros([6,6])
    for j in range(4,5):
        for k in range(1,2):
            accuracy = np.zeros(100)
            for m in range(0,100):
                ncc = NearestCentroid()
                ncc.fit(X_train, y_train)
                y_pred = ncc.predict(X_test)
                
                n=0
                for i in range(0,len(y_pred)):
                    if y_pred[i] == y_test[i]:
                #print i, y_pred[i], y_test[i]
                        n = n+1
                        accuracy[m] = accuracy[m]+1
                    box[y_test[i]-1,y_pred[i]-1] = box[y_test[i]-1,y_pred[i]-1] + 1
                #posterior[m] =  knc.predict_proba(X_test)
            print(j, k, np.mean(accuracy)/0.72, np.std(accuracy)/0.72)
            #print 30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0
        '''
    means = np.empty([72,6])
Example #49
    def fit(self, X, y):

        biny = self.bin_data(y)

        self.pred = NearestCentroid().fit(X, biny)
        return self
# Nearest Centroid
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import NearestCentroid
# load the iris datasets
dataset = datasets.load_iris()
# fit a nearest centroid model to the data
model = NearestCentroid()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
 def __init__(self):
     self.metric = lambda x1, x2: np.log(np.sqrt(np.sum(np.square(x1 - x2))))
     NearestCentroid.__init__(self, metric=self.metric)
def test_precomputed():
    clf = NearestCentroid(metric='precomputed')
    with assert_raises(ValueError) as context:
        clf.fit(X, y)
    assert_equal(ValueError, type(context.exception))
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target

h = 0.02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

for shrinkage in [None, 0.1]:
    # we create an instance of Neighbours Classifier and fit the data.
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(shrinkage, np.mean(y == y_pred))
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure()
    pl.pcolormesh(xx, yy, Z, cmap=cmap_light)
Example #54
train_images = faces.images[np.negative(ind)]
train_targets = faces.target[np.negative(ind)]
n_train = len(train_images)

test_images = faces.images[ind]
test_targets = faces.target[ind]
n_tests = len(test_images)
for test in test_images:
    test = test + norm.rvs(scale=10, size=test.shape)
    for i in range(25, 30):
        test[i, :] = 0
        test[:, i] = 0
    test = np.minimum(test, 1)
    test = np.maximum(test, 0)
    test = np.zeros(test.shape)

train = train_images.reshape((n_train, -1))
train_pca = pca.fit_transform(train)
    
test = test_images.reshape((n_tests, -1))
test_pca = pca.transform(test)

neigh = NearestCentroid()

neigh.fit(train, train_targets)
print("Without PCA:", np.count_nonzero(neigh.predict(test) - test_targets), n_tests)

neigh.fit(train_pca, train_targets)
print("With PCA:", np.count_nonzero(neigh.predict(test_pca) - test_targets), n_tests)