Example #1
    def train_with(self, training_data_list, answers):
        # put the data in the right format
        training_data = self.get_sparse_matrix(training_data_list)

        if training_data is not False:
            # make model
            if self.model_name == "random_forest":
                forest = RandomForestClassifier(n_estimators=100)
                self.model = forest.fit(training_data.todense(), answers)
            elif self.model_name == "centroid_prediction":
                clf = NearestCentroid()
                self.model = clf.fit(training_data, answers)
            elif self.model_name == "linearSVC":
                SVC = LinearSVC()
                self.model = SVC.fit(training_data.todense(), answers)
            elif self.model_name == "nearest_neighbor":
                near = KNeighborsClassifier()
                self.model = near.fit(training_data.todense(), answers)
            elif self.model_name == "decision_tree":
                clf = tree.DecisionTreeClassifier()
                self.model = clf.fit(training_data.todense(), answers)
            elif self.model_name == "svc":
                clf = svm.SVC()
                self.model = clf.fit(training_data, answers)
Example #2
def _nearestcentroid(*, train, test, x_predict=None, metrics, metric='euclidean', shrink_threshold=None):
    """
    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestCentroid.html#sklearn.neighbors.NearestCentroid
    """

    model = NearestCentroid(metric=metric, shrink_threshold=shrink_threshold)
    model.fit(train[0], train[1])
    model_name = 'Nearest Centroid'
    y_hat = model.predict(test[0])

    if metrics == 'accuracy':
        accuracy = accuracy_score(test[1], y_hat)
    elif metrics == 'f1':
        accuracy = f1_score(test[1], y_hat)
    elif metrics == 'jaccard':
        accuracy = jaccard_score(test[1], y_hat)
    else:
        raise ValueError("metrics must be 'accuracy', 'f1' or 'jaccard'")

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
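A minimal usage sketch for the helper above, assuming it is in scope together with the scikit-learn metrics it calls; the iris split is illustrative only:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
# all arguments are keyword-only, per the signature above
name, acc, _ = _nearestcentroid(train=(X_tr, y_tr), test=(X_te, y_te),
                                metrics='accuracy')
print(name, acc)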
Example #3
class NC:
    def __init__(self):
        self.clf = NearestCentroid()
        self.centroids = []

    """ 
    Calculates the mean of each class in the training data. 
    param:
        @train_data: training data
        @train_lbls: training labels
    """

    def fit(self, train_data, train_lbls):
        self.clf.fit(train_data, train_lbls)
        self.centroids = self.clf.centroids_
        return self

    """ 
    Classifies test data using the class means of the training and Nearest Centroid algorithm.
    param:
        @test_data: testing data
        @test_lbls: testing labels
    returns:
        @classification: numpy array with classification labels
        @score: the mean accuracy classifications
    """

    def predict(self, test_data, test_lbls):
        classification = self.clf.predict(test_data)
        try:
            score = accuracy_score(test_lbls, classification)
        except ValueError:
            score = None

        return classification, score
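A hedged usage sketch for the NC wrapper above, assuming NearestCentroid and accuracy_score are imported in its module; the digits data is just an example:

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
model = NC().fit(X_tr, y_tr)
labels, score = model.predict(X_te, y_te)  # predictions plus mean accuracy
print(score)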
Example #4
def nearest_mean_classifier(X_train, y_train, X_validation, X_test):
    # Returns the labels for test_data, predicted by the nearest mean classifier trained on X_train and y_train
    # Input:
    # X_train - num_train x num_features matrix with features for the training data
    # y_train - num_train x 1 vector with labels for the training data
    # X_validation - num_test x num_features matrix with features for the validation data
    # X_test - num_test x num_features matrix with features for the test data
    # Output:
    # y_pred_validation - num_test x 1 predicted vector with labels for the validation data
    # y_pred_test - num_test x 1 predicted vector with labels for the test data

    # Not in use

    X_test_val = np.vstack((X_validation, X_test))
    # Merge the datasets

    clf = NearestCentroid()
    clf.fit(X_train, y_train)  # Determine the means

    predicted_labels = clf.predict(X_test_val)  # Predict the data

    # Store the predictions
    y_pred_validation = predicted_labels[:len(X_validation)]
    y_pred_test = predicted_labels[len(X_validation):]
    return y_pred_validation, y_pred_test
Example #5
def test_classification_toy():
    # Check classification on a toy dataset, including sparse versions.
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # Same test, but with a sparse matrix to fit and test.
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit with sparse, test with non-sparse
    clf = NearestCentroid()
    clf.fit(X_csr, y)
    assert_array_equal(clf.predict(T), true_result)

    # Fit with non-sparse, test with sparse
    clf = NearestCentroid()
    clf.fit(X, y)
    assert_array_equal(clf.predict(T_csr), true_result)

    # Fit and predict with non-CSR sparse matrices
    clf = NearestCentroid()
    clf.fit(X_csr.tocoo(), y)
    assert_array_equal(clf.predict(T_csr.tolil()), true_result)
Example #6
class AlgorithmRunner:
    """
    initalize the algorithms
    :param  algo_name: the name of the current algorithm
    """
    def __init__(self, algo_name):
        if algo_name == "KNN":
            self.algorithm = KNeighborsClassifier(n_neighbors=10)
        if algo_name == "Rocchio":
            self.algorithm = NearestCentroid()

    """
    call to the fit method from sklearn.neighbours
    :param  train_features: the features that we train on
            train_labels: the labels for the features that we train on
    """

    def fit(self, train_features, train_labels):
        self.algorithm.fit(train_features, train_labels)

    """
    call to the predict method from sklearn.neighbours
    :param  test_features: the features that we test on
    """

    def predict(self, test_features):
        return self.algorithm.predict(test_features)
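A short sketch of how the runner might be driven; the toy feature matrix is made up for illustration:

import numpy as np

X_train = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [6.0, 5.0]])
y_train = np.array([0, 0, 1, 1])
runner = AlgorithmRunner("Rocchio")  # or "KNN"
runner.fit(X_train, y_train)
print(runner.predict(np.array([[0.2, 0.4], [5.5, 5.1]])))  # expected: [0 1]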
Example #7
def plot_mnist_centroids(data, labels, title="", fp="", draw=False):
    # Create set of classes in data set
    classes = list(set(labels))

    # Calculate mean vector of each class
    clf = NearestCentroid()
    clf.fit(data, labels)
    centroids = clf.centroids_

    # https://stackoverflow.com/questions/37228371/visualize-mnist-dataset-using-opencv-or-matplotlib-pyplot
    plt.figure()
    plt.suptitle(title, fontsize=14)
    for i, class_center in enumerate(centroids):
        pixels = np.array(class_center, dtype='uint8')

        # Reshape the array into 28 x 28 array (2-dimensional array)
        pixels = pixels.reshape((28, 28))

        # Plot each mean vector as a gray scale image in a subplot
        plt.subplot(2, 5, i + 1)
        plt.title('Label: {label}'.format(label=classes[i]))
        plt.imshow(pixels, cmap='gray')
        plt.tick_params(which='both',
                        bottom=False,
                        left=False,
                        labelbottom=False,
                        labelleft=False)
    if fp != "":
        plt.savefig(fp, bbox_inches='tight', pad_inches=0)
    if draw:
        plt.draw()
Example #8
def plot_orl_centroids(data, labels, title="", fp="", draw=False):
    # Create set of classes in data set
    classes = list(set(labels))

    # Calculate mean vector of each class
    clf = NearestCentroid()
    clf.fit(data, labels)
    centroids = clf.centroids_

    plt.figure(figsize=(18, 12))
    plt.suptitle(title, fontsize=14)
    for i, class_center in enumerate(centroids):
        pixels = np.array(class_center, dtype='float')

        # Reshape the array into 30 x 40 array (2-dimensional array)
        pixels = pixels.reshape(
            (30, 40)).transpose()  # image vectors are sideways

        # Plot each mean vector as a gray scale image in a subplot
        plt.subplot(4, 10, i + 1)
        plt.title('Label: {label}'.format(label=classes[i]))
        plt.imshow(pixels, cmap='gray')
        plt.tick_params(which='both',
                        bottom=False,
                        left=False,
                        labelbottom=False,
                        labelleft=False)

    if fp != "":
        plt.savefig(fp, bbox_inches='tight', pad_inches=0)
    if draw:
        plt.draw()
Example #9
def NC(data_train, data_train_vectors, data_test_vectors, **kwargs):
    # Implementing classification model- using NearestCentroid
    clf_nc = NearestCentroid()
    clf_nc.fit(data_train_vectors, data_train.target)
    y_pred = clf_nc.predict(data_test_vectors)
    
    return y_pred
Example #10
    def agglomerative_validation(self, iterations):
        print('\nPerforming holdout validation for agglomerative clusterer...')

        rands = []

        for i in range(0, iterations):

            X_train, X_test = train_test_split(self.gene_df,
                                               test_size=0.2,
                                               random_state=i)

            agglomerative_training = cluster.AgglomerativeClustering( \
                n_clusters=self.n_clusters, linkage='ward', affinity='euclidean').fit(X_train)

            agglomerative_testing = cluster.AgglomerativeClustering( \
                n_clusters=self.n_clusters, linkage='ward', affinity='euclidean').fit(X_test)

            #classify testing data using centroids of training data clusters
            clf = NearestCentroid()
            clf.fit(X_train, agglomerative_training.labels_)

            #calculate rand score between clustering labels and prediction labels of held out samples
            rands.append(
                adjusted_rand_score(clf.predict(X_test),
                                    agglomerative_testing.labels_))

        ## print('rand scores of clusterings on held-out samples', rands)
        print('average of rand scores', sum(rands) / len(rands))
        print('variance of rand scores', statistics.variance(rands))
Example #11
def nsc_fit(Xtrain, Xtrain_lbls, Xtest, Xtest_lbls, n_clust, rng, start, name, datat, t0=time()):
    centers = []
    labels = []
    correct_labels = []

    # Cluster the data.
    # 'start' is the first class index: MNIST classes start at 0, ORL at 1.
    # 'rng' is the end of the class range: MNIST has 10 classes, ORL has 40.
    for i in range(start, rng):
        data = Xtrain[np.nonzero(Xtrain_lbls == i)]
        kmeans = KMeans(n_clusters=n_clust, random_state=42).fit(data)

        # Get the centers for the amount of clusters specified.
        for k in range(0, n_clust):
            centers.append(kmeans.cluster_centers_[k, :])
            labels.append(str(i) + '_' + str(k))

    #Fit with nearest centroid
    clf = NearestCentroid()
    clf.fit(centers, labels)
    pred = clf.predict(Xtest)

    for i in range(0, len(pred)):
        correct_labels.append(int(pred[i].split('_')[0]))

    #Calculate score
    score = accuracy_score(Xtest_lbls, correct_labels)

    print('%-9s\t%.2fs\t%-9s\t%-9s'
          % (name, (time() - t0), score, datat))
    return pred
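nsc_fit above is a nearest-subclass-centroid scheme: each class is first split into n_clust k-means clusters, and test points are then assigned to the nearest cluster centre. A smoke-test sketch on synthetic blobs, assuming the function's imports (numpy, KMeans, NearestCentroid, accuracy_score, time) are in scope; the name/datat strings are only echoed in the printout:

from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=200, centers=4, random_state=0)
y = y % 2  # two classes, each a union of two blobs
pred = nsc_fit(X[:150], y[:150], X[150:], y[150:],
               n_clust=2, rng=2, start=0, name='NSC', datat='blobs')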
Example #12
    def calc_cluster_props(self):
        """Calculate cluster properties.

        Returns
        -------
        numpy.ndarray
            centroid (in the original, unscaled feature space)

        Notes
        -----
        Add column `silh` to DataFrame.
        """
        data = self.df[self.relevant_groups()]
        scaler = StandardScaler()
        data_ = scaler.fit_transform(data)
        labels = self.df['hgt'].tolist()

        # calculate silhouette scores for all samples
        self.df['silh'] = silhouette_samples(data_, labels)

        # calculate centroid
        clf = NearestCentroid()
        clf.fit(data_, self.df['hgt'])
        cent = scaler.inverse_transform(clf.centroids_[1])
        return cent
Example #14
def run_NearestCentroid(X_train, X_test, y_train):
    centroid = NearestCentroid()

    centroid.fit(X_train, y_train)
    cents = centroid.centroids_
    y_pred = centroid.predict(X_test)
    return y_pred, cents
Example #15
def Centroid(X_train, Y_train, X_test, Y_test):
	# Parameter 'shrink_threshold' is tuned
	# Cross validation
	shrinkages = np.linspace(0, 10, 100)
	tuned_parameters = [{'shrink_threshold': shrinkages}]
	cv = GridSearchCV(NearestCentroid(), tuned_parameters)
	cv.fit(X_train, Y_train)
	
	#Optimal parameters
	print('Best Params: ')
	print(cv.best_params_)

	#Optimal Model
	clf = NearestCentroid()
	clf.set_params(shrink_threshold=cv.best_params_['shrink_threshold'])
	clf.fit(X_train, Y_train)
	pred = clf.predict(X_test)
	test_error = mean_squared_error(Y_test, pred)
	acc_score = accuracy_score(Y_test, pred)
	print('Nearest Centroid Test Error: ' + str(test_error))
	print('Nearest Centroid Accuracy Score: ' + str(acc_score))
	print('First 10 predictions: ')
	print(pred[:10])
	print('First 10 actual: ')
	print(Y_test[:10])
	print('Centroid of the first class: ')
	print(clf.centroids_[0])
	print('Class labels known to the classifier: ')
	print(clf.classes_)
	return clf, test_error, acc_score
Example #16
def print_accuracy(test_features, control_group, folds, classifiers):
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.neighbors import NearestCentroid

    x_train, x_test, y_train, y_test = train_test_split(test_features,
                                                        control_group,
                                                        random_state=folds)

    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    logreg = LogisticRegression()
    logreg.fit(x_train, y_train)

    clf2 = DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train, y_train)
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    lda = LinearDiscriminantAnalysis()
    lda.fit(x_train, y_train)
    svm = SVC()
    svm.fit(x_train, y_train)
    cent = NearestCentroid()
    cent.fit(x_train, y_train)

    def get_accuracy(x, y):
        a = logreg.score(x, y)
        b = clf2.score(x, y)
        c = knn.score(x, y)
        d = gnb.score(x, y)
        e = lda.score(x, y)
        f = svm.score(x, y)
        g = cent.score(x, y)

        return (a, b, c, d, e, f, g)

    training_sets = []
    test_sets = []

    for i in range(len(classifiers)):
        train = float(get_accuracy(x_train, y_train)[i])
        test = float(get_accuracy(x_test, y_test)[i])

        training_sets.append(train)
        test_sets.append(test)

    training_sets = tuple(training_sets)
    test_sets = tuple(test_sets)

    return (training_sets, test_sets)
Example #18
    def nearestcentrclassif(self, shrinkage=0.1):
        # create an instance of the NearestCentroid classifier and fit the data
        clf = NearestCentroid(shrink_threshold=shrinkage)
        clf.fit(self.x_train, self.y_train)
        z = clf.predict(self.x_test)
        print(np.mean(self.y_test == z))

        return z
Example #19
def test_shrinkage_threshold_decoded_y():
    clf = NearestCentroid(shrink_threshold=0.01)
    y_ind = np.asarray(y)
    y_ind[y_ind == -1] = 0
    clf.fit(X, y_ind)
    centroid_encoded = clf.centroids_
    clf.fit(X, y)
    assert_array_equal(centroid_encoded, clf.centroids_)
Example #20
    def test_should_recognise_mean_classifier(self):
        # given
        clf = NearestCentroid()
        clf.fit(self.X, self.y)
        # when
        clf_type = ClassifLibraryOld.determine_clf_type(clf)
        # then
        self.assertEqual(clf_type, ClassifLibrary.ClfType.MEAN)
Example #22
def test_manhattan_metric():
    # Test the manhattan metric.

    clf = NearestCentroid(metric='manhattan')
    clf.fit(X, y)
    dense_centroid = clf.centroids_
    clf.fit(X_csr, y)
    assert_array_equal(clf.centroids_, dense_centroid)
    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
Example #23
def get_centroids_per_class(features, labels):
    clf = NearestCentroid()
    clf.fit(features, labels)
    centroids = clf.centroids_
    class_labels = clf.predict(clf.centroids_)
    return {
        class_label: centroid
        for class_label, centroid in zip(class_labels, centroids)
    }
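A quick sanity check of the helper above on a hand-made feature matrix; the per-class means are easy to verify by eye:

import numpy as np

features = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [6.0, 5.0]])
labels = np.array([0, 0, 1, 1])
print(get_centroids_per_class(features, labels))
# roughly: {0: array([0. , 0.5]), 1: array([5.5, 5. ])}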
Example #24
def trainKnn(imagen):
    # fvp: object produced by computeFeatureVector
    # NUM: number of training points, assigned via the Image.NUM attribute
    fvp = imagen.fvp
    NUM = imagen.NUM
    clf = NearestCentroid()
    labels = [1] * int(NUM) + [0] * int(NUM)
    clf.fit(fvp, labels)
    imagen.set_clf(clf)
    return clf
Example #25
def test_nc_classify_with_sklearn(trainingData, trainingLabels, testData,
                                  testLabels):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        X = np.array(trainingData)
        y = np.array(trainingLabels)
        clf = NearestCentroid()
        clf.fit(X, y)

        predictions = clf.predict(testData)
        printCorrectWrong(predictions, testLabels)
Example #26
def print_accuracy(test_features, control_group, folds, classifiers):
    # Split the score sets into training and test variables.
    x_train, x_test, y_train, y_test = train_test_split(test_features,
                                                        control_group,
                                                        test_size=0.40,
                                                        random_state=folds)

    # Scale the feature values.
    scaler = MinMaxScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Instantiate the classifiers.
    logreg = LogisticRegression()
    clf2 = DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
    knn = KNeighborsClassifier()
    gnb = GaussianNB()
    lda = LinearDiscriminantAnalysis()
    svm = SVC()
    cent = NearestCentroid()

    # Fit the classifiers.
    logreg.fit(x_train, y_train)
    knn.fit(x_train, y_train)
    gnb.fit(x_train, y_train)
    lda.fit(x_train, y_train)
    svm.fit(x_train, y_train)
    cent.fit(x_train, y_train)
    """GET ACCURACY SCORE"""
    def get_accuracy(x, y):
        a = logreg.score(x, y)
        b = clf2.score(x, y)
        c = knn.score(x, y)
        d = gnb.score(x, y)
        e = lda.score(x, y)
        f = svm.score(x, y)
        g = cent.score(x, y)

        return (a, b, c, d, e, f, g)

    training_sets = []
    test_sets = []

    for i in range(len(classifiers)):
        train = float(get_accuracy(x_train, y_train)[i])
        test = float(get_accuracy(x_test, y_test)[i])

        training_sets.append(train)
        test_sets.append(test)

    training_sets = tuple(training_sets)
    test_sets = tuple(test_sets)

    return (training_sets, test_sets)
Example #27
def assign_mg2k_centroids(X, centroids=None):
    """
    Assigns Mg II k centroids found in the study
    'Identifying typical Mg II flare spectra using machine learning', by
    B. Panos et al. 2018 to the Mg II k spectra supplied in X. The centroids
    are assigned using a nearest neighbour procedure.
    
    The spectra in X have to be interpolated to 216 wavelength bins between 
    LAMBDA_MIN = 2793.8500976562500 and LAMBDA_MAX = 2799.3239974882454. For example::
        
        X = raster.get_interpolated_image_step( 
                step = <step>, 
                lambda_min = LAMBDA_MIN, 
                lambda_max = LAMBDA_MAX, 
                n_breaks = 216  
                )
    
    Parameters
    ----------
    X : numpy.array
        interpolated raster image of shape (_,bins)
    centroids : numpy.array
        If None, the centroids defined in the above study will be used, otherwise an array of shape (n_centroids, n_bins) should be passed.
        Important: both the spectra in 'X' and in 'centroids' should be constrained to the same wavelength region!
    
    Returns
    -------
    assigned_mg2k_centroids
        numpy vector with shape (X.shape[0],)
    """

    # load default centroids if no centroids are passed
    if centroids is None:
        centroids = get_mg2k_centroids(bins=X.shape[1])

    # create list of numbered centroid ids
    centroid_ids = list(range(centroids.shape[0]))

    # check whether X comes in the correct dimensions
    if not X.shape[1] == centroids.shape[1]:
        raise Exception(
            "Expecting X to have shape (_,{}). Please interpolate accordingly (More information with 'help(assign_mg2k_centroids)')."
            .format(centroids.shape[1]))

    # create nearest centroid finder instance and fit it
    knc = NearestCentroid()
    knc.fit(X=centroids, y=centroid_ids)

    # predict nearest centroids for the supplied spectra
    # (making sure that X is normalized)
    assigned_mg2k_centroids = knc.predict(normalize(X))

    # return vector of assigned centroids
    return assigned_mg2k_centroids
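A hedged call sketch with user-supplied centroids, sidestepping the packaged defaults; random arrays stand in for real interpolated spectra:

import numpy as np

rng = np.random.RandomState(0)
centroids = rng.rand(3, 216)  # (n_centroids, n_bins)
X = rng.rand(10, 216)         # ten spectra on the same 216-bin grid
ids = assign_mg2k_centroids(X, centroids=centroids)  # values in {0, 1, 2}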
Example #29
def test_shrinkage_correct():
    # Ensure that the shrinking is correct.
    # The expected result is calculated by R (pamr),
    # which is implemented by the author of the original paper.
    # (One needs to modify the code to output the new centroid in pamr.predict)

    X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
    y = np.array([1, 1, 2, 2, 2])
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])
    np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
Example #30
    def calculate_metrics(self):
        #-compute silhouette score
        SC = metrics.silhouette_score(self.gene_df,
                                      self.clusterer.labels_,
                                      metric='euclidean')

        #-compute calinski-harabaz score
        CH = metrics.calinski_harabasz_score(self.gene_df,
                                             self.clusterer.labels_)

        # convert dataframe to numpy array
        genes = self.gene_df.to_numpy()

        #get number of clusters
        K = len(list(dict.fromkeys(self.clusterer.labels_)))

        #-tally members of each cluster
        members = [[] for i in range(K)]  # lists of members of each cluster
        for j in range(len(self.gene_df)):  # loop through instances
            members[self.clusterer.labels_[j]].append(
                j)  # add this instance to cluster returned by scikit function

        #calculate centroids
        nc = NearestCentroid()
        nc.fit(genes, self.clusterer.labels_)

        #-compute the within-cluster score
        within = np.zeros((K))
        for i in range(K):  # loop through all clusters
            within[i] = 0.0
            for j in members[i]:  # loop through members of this cluster
                # tally the distance to this cluster centre from each of its members
                within[i] += ( np.square( genes[j,0]-nc.centroids_[i][0] ) \
                               + np.square( genes[j,1]-nc.centroids_[i][1] ))
        WC = np.sum(within)

        #-compute the between-cluster score
        between = np.zeros((K))
        for i in range(K):  # loop through all clusters
            between[i] = 0.0
            for l in range(i + 1, K):  # loop through remaining clusters
                # tally the distance from this cluster centre to the centres of the remaining clusters
                between[i] += ( np.square( nc.centroids_[i][0]-nc.centroids_[l][0] ) \
                                + np.square( nc.centroids_[i][1]-nc.centroids_[l][1] ))
        BC = np.sum(between)

        #-compute overall clustering score
        score = BC / WC

        #-print results for this value of K
        print('\nCluster metrics:')
        print('K = %d,  Within Cluster Score = %.4f,  Between Cluster score = %.4f,  Overall Cluster Score = %.4f, Silhouette = %f,  Calinski-Harabasz = %.4f' \
              % ( K, WC, BC, score, SC, CH ))
Example #31
def test_features_zero_var():
    # Test that features with 0 variance throw error

    X = np.empty((10, 2))
    X[:, 0] = -0.13725701
    X[:, 1] = -0.9853293
    y = np.zeros((10))
    y[0] = 1

    clf = NearestCentroid(shrink_threshold=0.1)
    with pytest.raises(ValueError):
        clf.fit(X, y)
Example #32
    def centroids_initialize(self, input: Tensor, labels: Tensor):
        """
        (Re-)initialize the centers based on nearest centroids algorithm

        :param input:
        :param labels:
        """
        model = NearestCentroid()
        model.fit(input.cpu().detach().numpy(),
                  labels.cpu().detach().numpy().ravel())
        self.weight.data.copy_(
            torch.Tensor(model.centroids_).to(self.weight.data.device))
Example #33
def nearest_class_centroid(images_training, labels_training, images_testing,
                           labels_testing):

    pca = PCA(n_components=2)
    training_images_pca = pca.fit_transform(images_training)
    # use the projection fitted on the training data for the test data
    test_images_pca = pca.transform(images_testing)

    clf = NearestCentroid()
    clf.fit(training_images_pca, labels_training)
    print("Centroids: \n", clf.centroids_)

    return (clf.predict(test_images_pca), clf.centroids_, training_images_pca,
            test_images_pca)
Example #34
def train_nearest_class_centroid_model(training_set):
    training_data = [training_set[i].raw_bytes for i in range(len(training_set))]
    training_labels = [training_set[i].label for i in range(len(training_set))]

    pca_images = PCA(n_components=2)
    pca_training_image = pca_images.fit_transform(training_data)

    nearest_class_centroid_model = NearestCentroid()
    # for each class, calculate the mean of the class (= the centroid)
    nearest_class_centroid_model.fit(pca_training_image, training_labels)

    # return the trained model
    return nearest_class_centroid_model
Example #35
def rocchio(X_train, X_test, y_train, y_test, string):
    clf = NearestCentroid()
    clf.fit(X_train, y_train.values.ravel())
    # pickles.criarModelo(clf, "Rocchio " + string)
    if "Fold" in string:
        pickles.criarModelo(clf, "oraculo/" + string)  # SAVE THE MODEL
    y_predito = clf.predict(X_test)
    micro = f1_score(y_test, y_predito, average='micro')
    macro = f1_score(y_test, y_predito, average='macro')
    # f1_individual = f1_score(y_test, y_predito, average=None)
    # salvar_dados.salvar(y_test, y_predito, micro, macro, f1_individual, " Rocchio " + string)
    print("The micro f1-score of Rocchio", string, "is:", micro)
    print("The macro f1-score of Rocchio", string, "is:", macro)
def test_pickle():
    import pickle

    # classification
    obj = NearestCentroid()
    obj.fit(iris.data, iris.target)
    score = obj.score(iris.data, iris.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert_equal(type(obj2), obj.__class__)
    score2 = obj2.score(iris.data, iris.target)
    assert_array_equal(score, score2,
                       "Failed to generate same score"
                       " after pickling (classification).")
def test_predict_translated_data():
    # Test that NearestCentroid gives same results on translated data

    rng = np.random.RandomState(0)
    X = rng.rand(50, 50)
    y = rng.randint(0, 3, 50)
    noise = rng.rand(50)
    clf = NearestCentroid(shrink_threshold=0.1)
    clf.fit(X, y)
    y_init = clf.predict(X)
    clf = NearestCentroid(shrink_threshold=0.1)
    X_noise = X + noise
    clf.fit(X_noise, y)
    y_translate = clf.predict(X_noise)
    assert_array_equal(y_init, y_translate)
Example #38
class NCClassifier(Classifier):
    """Rocchio classifier"""
    def __init__(self, shrink=None):
        self.cl = NearestCentroid(shrink_threshold=shrink)
        self.shrink = shrink

    def retrain(self, vectorFeature, vectorTarget):
        if self.shrink is not None:
            self.cl.fit([v.toarray()[0] for v in vectorFeature], vectorTarget)
        else:
            super(NCClassifier, self).retrain(vectorFeature, vectorTarget)

    def classify(self, vectorizedTest):
        if self.shrink is not None:
            return self.cl.predict(vectorizedTest.toarray()[0])[0]
        else:
            return super(NCClassifier, self).classify(vectorizedTest)
Example #39
def nearestNeighbour():
	import numpy as np
	import pylab as pl
	from matplotlib.colors import ListedColormap
	from sklearn import datasets
	from sklearn.neighbors import NearestCentroid

	n_neighbors = 15

	# import some data to play with
	iris = datasets.load_iris()
	X = iris.data[:, :2]  # we only take the first two features. We could
	                      # avoid this ugly slicing by using a two-dim dataset
	y = iris.target

	h = .02  # step size in the mesh

	# Create color maps
	cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
	cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

	for shrinkage in [None, 0.1]:
	    # create an instance of the NearestCentroid classifier and fit the data
	    clf = NearestCentroid(shrink_threshold=shrinkage)
	    clf.fit(X, y)
	    y_pred = clf.predict(X)
	    print(shrinkage, np.mean(y == y_pred))
	    # Plot the decision boundary. For that, we will assign a color to each
	    # point in the mesh [x_min, x_max]x[y_min, y_max].
	    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
	    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
	    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
	                         np.arange(y_min, y_max, h))
	    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

	    # Put the result into a color plot
	    Z = Z.reshape(xx.shape)
	    pl.figure()
	    pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

	    # Plot also the training points
	    pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
	    pl.title("3-Class classification (shrink_threshold=%r)"
	             % shrinkage)
	    pl.axis('tight')
Example #40
def create_and_train_model(engine):
    cmd = "SELECT review_rating, review_text FROM bf_reviews"
    bfdf = pd.read_sql_query(cmd, engine)
    bfdfl = bfdf[bfdf['review_text'].str.len() > 300].copy()
    train_data = bfdfl['review_text'].values[:1000]
    y_train = bfdfl['review_rating'].values[:1000]

    t0 = time()
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(train_data)
    duration = time() - t0
    print('vectorized in {:.2f} seconds.'.format(duration))
    print(X_train.shape)

    clf = NearestCentroid()
    clf.fit(X_train, y_train)
    return clf, vectorizer
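A sketch of how the returned pair might be used to score a new review; the connection string is hypothetical and assumes a database exposing the bf_reviews table:

from sqlalchemy import create_engine

engine = create_engine("postgresql:///reviews")  # hypothetical DSN
clf, vectorizer = create_and_train_model(engine)
print(clf.predict(vectorizer.transform(["Great product, arrived quickly."])))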
Example #41
def test_iris_shrinkage():
    # Check consistency on dataset iris, when using shrinkage.
    for metric in ('euclidean', 'cosine'):
        for shrink_threshold in [None, 0.1, 0.5]:
            clf = NearestCentroid(metric=metric,
                                  shrink_threshold=shrink_threshold)
            clf = clf.fit(iris.data, iris.target)
            score = np.mean(clf.predict(iris.data) == iris.target)
            assert score > 0.8, "Failed with score = " + str(score)
Example #42
    def test_disambiguator_store(self):
        # Create a silly classifier that disambiguates between "stam" (tree
        # trunk) or "romp" (body trunk) as the Dutch translation of the
        # English noun "trunk"
        lempos = u"trunk/n"
        # FIXME: store_fit() should only accept unicode strings
        target_names = u"stam romp".encode("utf-8").split()
        vocab = u"boom hoofd".split()

        X = np.array([[0, 1],
                      [1, 0],
                      [0, 1],
                      [1, 0]])
        y = np.array([1, 0, 1, 0])

        estimator = NearestCentroid()
        estimator.fit(X, y)

        centroids = estimator.centroids_
        score = estimator.score(X, y)

        # Store estimator
        fname = tempfile.NamedTemporaryFile().name
        f = DisambiguatorStore(fname, "w")
        f.save_estimator(NearestCentroid())
        f.save_vocab(vocab)
        f.store_fit(lempos, estimator)
        f.save_target_names(lempos, target_names)
        f.close()

        # Restore estimator
        f2 = DisambiguatorStore(fname)
        estimator2 = f2.load_estimator()
        vocab2 = f2.load_vocab()
        f2.restore_fit(lempos, estimator2)
        target_names2 = f2.load_target_names(lempos)
        centroids2 = estimator2.centroids_
        score2 = estimator2.score(X, y)

        assert_array_equal(centroids, centroids2)
        assert target_names == target_names2
        assert vocab == vocab2
        assert score == score2
Example #43
train_images = faces.images[np.logical_not(ind)]
train_targets = faces.target[np.logical_not(ind)]
n_train = len(train_images)

test_images = faces.images[ind]
test_targets = faces.target[ind]
n_tests = len(test_images)
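# NOTE: the loop below only reassigns its loop variable; test_images itself is never modified in place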
for test in test_images:
    test = test + norm.rvs(scale=10, size=test.shape)
    for i in range(25, 30):
        test[i, :] = 0
        test[:, i] = 0
    test = np.minimum(test, 1)
    test = np.maximum(test, 0)
    test = np.zeros(test.shape)

train = train_images.reshape((n_train, -1))
train_pca = pca.fit_transform(train)
    
test = test_images.reshape((n_tests, -1))
test_pca = pca.transform(test)

neigh = NearestCentroid()

neigh.fit(train, train_targets)
print("Sans ACP :", np.count_nonzero(neigh.predict(test) - test_targets), n_tests)

neigh.fit(train_pca, train_targets)
print("Avec ACP :", np.count_nonzero(neigh.predict(test_pca) - test_targets), n_tests)

Example #44
        X_test = sample2
        y_test = labels[272:,i]
    else:
        X_train = training
        y_train = labels[:172,i]
        X_test = sampletest
        y_test = labels[172:,i]

    posterior = np.empty([100,72,6])
    box = np.zeros([6,6])
    for j in range(4,5):
        for k in range(1,2):
            accuracy = np.zeros(100)
            for m in range(0,100):
                ncc = NearestCentroid()
                ncc.fit(X_train, y_train)
                y_pred = ncc.predict(X_test)
                
                n = 0
                for i in range(0, len(y_pred)):
                    if y_pred[i] == y_test[i]:
                        # print(i, y_pred[i], y_test[i])
                        n = n + 1
                        accuracy[m] = accuracy[m] + 1
                    box[y_test[i]-1, y_pred[i]-1] = box[y_test[i]-1, y_pred[i]-1] + 1
                # posterior[m] = knc.predict_proba(X_test)
            print(j, k, np.mean(accuracy)/0.72, np.std(accuracy)/0.72)
            #print 30, 20, sum(accuracy[0:8])/8.0, sum(accuracy[8:18])/10.0, sum(accuracy[18:30])/12.0, sum(accuracy[56:72])/16.0, sum(accuracy[30:43])/13.0, sum(accuracy[43:56])/13.0, sum(accuracy)/72.0
        '''
    means = np.empty([72,6])
    stds = np.empty([72,6])
# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features. We could
# avoid this ugly slicing by using a two-dim dataset
y = iris.target

h = 0.02  # step size in the mesh

# Create color maps
cmap_light = ListedColormap(["#FFAAAA", "#AAFFAA", "#AAAAFF"])
cmap_bold = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])

for shrinkage in [None, 0.1]:
    # create an instance of the NearestCentroid classifier and fit the data
    clf = NearestCentroid(shrink_threshold=shrinkage)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(shrinkage, np.mean(y == y_pred))
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    pl.figure()
    pl.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
# Nearest Centroid
from sklearn import datasets
from sklearn import metrics
from sklearn.neighbors import NearestCentroid
# load the iris datasets
dataset = datasets.load_iris()
# fit a nearest centroid model to the data
model = NearestCentroid()
model.fit(dataset.data, dataset.target)
print(model)
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
def test_precomputed():
    clf = NearestCentroid(metric='precomputed')
    with assert_raises(ValueError):
        clf.fit(X, y)
def test_precomputed():
    clf = NearestCentroid(metric='precomputed')
    with assert_raises(ValueError) as context:
        clf.fit(X, y)
    assert_equal(ValueError, type(context.exception))
def get_results(city,no):
	processing.preprocessing()
	pre=open('preprocess1.txt')
	train_set=[]
	line=pre.readline()
	while line != '':
		train_set.append(line)
		#print line
		line=pre.readline()
	#print train_set	
	pos=open('positive-words.txt')
	neg=open('negative-words.txt')
	positive=[]
	negative=[]
	for i in pos.read().split():
		positive.append(i)	

	for j in neg.read().split():
		negative.append(j)

	stopWords = stopwords.words('english')
	vectorizer = CountVectorizer(stop_words = stopWords)
	transformer = TfidfTransformer()

	#train_set=get_traindata()
	

	#l=[]
	#l.append(test_set)
	#l.append(test_set1)

	#trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
	#print vectorizer.get_feature_names()
	#testVectorizerArray = vectorizer.transform(test_set).toarray()
	#testVectorizerArray1 = vectorizer.transform(l[1]).toarray()
	#print 'Fit Vectorizer to train set', trainVectorizerArray
	#print 'Transform Vectorizer to test set', testVectorizerArray
	#print testVectorizerArray1[0]

	#transformer.fit(trainVectorizerArray)
	v= vectorizer.fit_transform(train_set)
	#print v.toarray()
	tfidf= transformer.fit_transform(v)
	#transformer.fit(testVectorizerArray)

	#tfidf = transformer.transform(trainVectorizerArray)
	#print tfidf.todense()

	#print("done in %0.3fs." % (time() - t0))
	#print nmf.components_
	# Inverse the vectorizer vocabulary to be able
	feature_names = vectorizer.get_feature_names()
	#print (feature_names)
	#if 'area' in feature_names: 
	print (feature_names)

	print ("\n")
	#-------

	nmf = decomposition.NMF(n_components=3, init='random',random_state=0).fit(tfidf.todense())
	topic_list=[]
	l= int(len(feature_names)/5)
	#print l
	for topic_idx, topic in enumerate(nmf.components_):
		topic_list.append(topic.argsort()[:-l-1:-1])
    
    #print("Topic #%d:" % topic_idx)
    #print (topic)
	#print "Hello----"
	#print topic_list


	train_target=[]	
	for arr in v.toarray():
		train_target.append(calculate_Topic(arr,topic_list))
	#print train_target
	#clf = MultinomialNB()
	#clf2= LinearSVC()
	#clf1=NearestCentroid()
	#clf.fit(tfidf.todense(),train_target)
	#clf1.fit(tfidf.todense(),train_target)
	#clf2.fit(tfidf.todense(),train_target)
	#print (clf.predict(X_test))
	#print (clf1.predict(X_test))
	#print (clf2.predict(X_test))
	#print "Hello"
	ch2 = SelectKBest(chi2, k=l*2)
	X_train = ch2.fit_transform(tfidf.todense(), train_target)

	cs= ch2.scores_.argsort()[::-1]
	cs_featurenames=[]
	cs=cs[:l*2]
	for x in cs:
		cs_featurenames.append(feature_names[x])

	print (cs_featurenames)
	print "\n"

	nmf1 = decomposition.NMF(n_components=3, init='random',random_state=0).fit(X_train)
	topic_list=[]
	l= int(len(feature_names)/5)
	#print l
	for topic_idx, topic in enumerate(nmf1.components_):
		z=topic.argsort()[:-l-1:-1]
		topic_list.append(z)
		print("Topic #%d:---------------------------------------" % topic_idx)
		for y in z:
			print(cs_featurenames[y])
    #print (topic)
	#print "Hello----"    
	#print topic_list
	train_target=[]	
	for arr in X_train:
		train_target.append(calculate_Topic(arr,topic_list))

	#---------
	#print "hello"
	#print train_target
	#print ch2.get_feature_names()
	#print X_train
	#print train_target
	#print "=--------------"
	#print ta
	#print X_test
	train_count=[0]*4
	#print train_target
	for x in train_target:
		train_count[x]=train_count[x]+1
	#print "hello"
	#print train_count	


	clf = MultinomialNB()
	clf2= LinearSVC()
	clf1=NearestCentroid()
	clf.fit(X_train,train_target)
	clf1.fit(X_train,train_target)
	clf2.fit(X_train,train_target)	
	dic={}
	hotels=read_hotels(city,dic)
	temp=[]
	for each in hotels:
		temp.append(calculate(vectorizer,transformer,train_count,ch2,each,clf,clf1,clf2,positive,negative,train_set))
	res=[]	
	temp1=numpy.array(temp).argsort()[::-1]
	#print temp1
	print "Top %d recommendations are as follows[in the FORMAT Index,(Hotel name,Location),Score]:\n" %no

	for g in temp1[:no]:
		print(g, dic[g], temp[g])
		res.append(dic[g])

	return res	
writeToDisk(pred,"KNeighborsClassifier")

clf5=RandomForestClassifier(n_estimators=100)   #RandomForest Classifier
clf5.fit(X_train, y_train)
pred = clf5.predict(X_test)
writeToDisk(pred,"RandomForestClassifier")

clf6=Pipeline([('feature_selection',            #LinearSVC with L2-based feature selection
    LinearSVC(penalty="l2", dual=False, tol=1e-3)),
    ('classification', LinearSVC())])
clf6.fit(X_train, y_train)
pred = clf6.predict(X_test)
writeToDisk(pred,"LinearSVC")

clf7=NearestCentroid()                          #NearestCentroid (aka Rocchio classifier), no threshold 
clf7.fit(X_train, y_train)
pred = clf7.predict(X_test)
writeToDisk(pred,"NearestCentroid")

clf8=SVC(C=1.0, class_weight=None, coef0=0.0,   #SVC
    decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=1, shrinking=True,
    tol=0.001, verbose=False)
clf8.fit(X_train, y_train)
pred = clf8.predict(X_test)
writeToDisk(pred,"SVC")
# clf9=VotingClassifier(estimators=[
#     ('Ridge',clf1),('MultiNB',clf2),('BernNB',clf3),('KNN',clf4),
#     ('RF',clf5),('LinearSVC',clf6),('NearC',clf7),('SVC',clf8)
#     ],voting='soft')
def test_precomputed():
    clf = NearestCentroid(metric="precomputed")
    clf.fit(X, y)
    S = pairwise_distances(T, clf.centroids_)
    assert_array_equal(clf.predict(S), true_result)