def exercicio7():
    utils.print_header(7)
    np.random.seed(constants.SEED)

    print('Spiral')
    spiral = load_database(constants.FILENAME_SPIRAL_DATABASE,
                           constants.URL_SPIRAL_DATABASE)
    print('\t- Nb samples: {}'.format(spiral.shape[0]))
    print('\n\tkMeans:')
    kmeans_clusters = k_means(spiral, 3, max_iters=300, db_name='spiral')
    print('\t\t- Purity (kMeans): {:.2f}%'.format(
        utils.purity(kmeans_clusters)))
    print('\t\t- Dist_intra_inter (kMeans): {:.2f}%'.format(
        utils.dist_intra_inter(kmeans_clusters)))
    print('\n\tAGNES:')
    agnes_clusters = agnes(spiral, 3, db_name='spiral')
    print('\t\t- Purity (AGNES): {:.2f}%'.format(utils.purity(agnes_clusters)))
    print('\t\t- Dist_intra_inter (AGNES): {:.2f}%'.format(
        utils.dist_intra_inter(agnes_clusters)))

    print('\nJain')
    jain = load_database(constants.FILENAME_JAIN_DATABASE,
                         constants.URL_JAIN_DATABASE)
    print('\t- Nb samples: {}'.format(jain.shape[0]))
    print('\n\tkMeans:')
    kmeans_clusters = k_means(jain, 2, max_iters=300, db_name='jain')
    print('\t\t- Purity (kMeans): {:.2f}%'.format(
        utils.purity(kmeans_clusters)))
    print('\t\t- Dist_intra_inter (kMeans): {:.2f}%'.format(
        utils.dist_intra_inter(kmeans_clusters)))
    print('\n\tAGNES:')
    agnes_clusters = agnes(jain, 2, db_name='jain')
    print('\t\t- Purity (AGNES): {:.2f}%'.format(utils.purity(agnes_clusters)))
    print('\t\t- Dist_intra_inter (AGNES): {:.2f}%'.format(
        utils.dist_intra_inter(agnes_clusters)))

    exit()
# feature extraction: resize and deskew the digit images, then describe them with HOG
digits = utils.resize(digits, 16)
digits = utils.get_deskew_imgs(digits)
# pix = utils.get_pix_features(digits)
hog = utils.get_hog_feature(digits)
# X = np.hstack([pix])
X = np.hstack([hog])
X = utils.normalize(X)

#-------------
# dimensionality reduction: keep the fewest principal components whose
# cumulative proportion of variance reaches the target `pov`
pca_model = pca.PCA(X)  # renamed so the `pca` module is not shadowed
n_component = 0
gained = 0
while gained < pov:
    gained += pca_model.pov(n_component)
    n_component += 1
X_reduced = pca_model.transform(X, n_component)

#-------------
# clustering
start_time = timeit.default_timer()
print("clustering with k-means with k =", classes, "...")
k_means = kmeans.kmeans(X_reduced, y)
clusters, centers = k_means.train(classes, iter)
print("K-means purity:", utils.purity(clusters, y, classes))
print("K-means rand-index:", utils.rand_index(clusters, y, classes))
elapsed = timeit.default_timer() - start_time
print("execution time: {}:{:02d}".format(int(elapsed // 60), int(elapsed % 60)))
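The loop above adds principal components one at a time until their cumulative proportion of variance reaches the `pov` threshold. As a rough NumPy-only sketch of the same selection rule (the function name and the `pov_target` default are illustrative, not part of the original `pca` module):

import numpy as np

def n_components_for_pov(X, pov_target=0.95):
    # eigenvalues of the covariance matrix, largest first
    eigvals = np.linalg.eigvalsh(np.cov(X, rowvar=False))[::-1]
    # cumulative share of variance explained by the first k components
    cumulative = np.cumsum(eigvals) / eigvals.sum()
    # smallest k whose cumulative share reaches the target
    return int(np.searchsorted(cumulative, pov_target) + 1)

# roughly equivalent to the while-loop above:
# n_component = n_components_for_pov(X, pov_target=pov)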
Example no. 3
import cProfile

import numpy as np

from rock import RockClustering
from utils import categorical_to_binary, purity

if __name__ == '__main__':
    data = np.loadtxt("data/agaricus-lepiota.data", dtype=str, delimiter=",", skiprows=0)
    # map the class column (first column of the mushroom data) to integer labels
    for i, class_name in enumerate(np.unique(data[:, 0]), 0):
        data[data[:, 0] == class_name, 0] = i
    labels = np.asarray(data[:, 0], dtype=int)
    data = np.asarray(data[:, 1:])

    profile = cProfile.Profile()
    profile.enable()
    clustering = RockClustering(categorical_to_binary(data), 20, theta=0.80)
    final_clusters = clustering.clusters()
    profile.disable()
    profile.print_stats(sort='cumulative')

    for i, cluster in enumerate(final_clusters, 1):
        print("Cluster no. {},\nlength = {}".format(i, len(cluster.points)))
        print(labels[cluster.points])

    print("Purity = {}".format(purity(final_clusters, labels[:])))
Example no. 4
import cProfile
import numpy as np

from rock import RockRealClustering
from utils import categorical_to_binary, purity
from sklearn.preprocessing import MinMaxScaler

if __name__ == '__main__':
    data = np.loadtxt("data/iris.data", dtype=str, delimiter=",", skiprows=0)
    labels = np.asarray(data[:, -1], dtype=str)
    integer_labels = np.zeros(labels.shape[0], dtype=int)
    for i, label in enumerate(np.unique(labels), 0):
        integer_labels[labels == label] = i
    data = np.asarray(data[:, :-1], dtype=float)
    data = MinMaxScaler().fit_transform(data)
    profile = cProfile.Profile()
    profile.enable()
    clustering = RockRealClustering(data, 3, theta=0.50, nbr_max_distance=0.20)
    final_clusters = clustering.clusters()
    profile.disable()

    profile.print_stats(sort='time')
    for i, cluster in enumerate(final_clusters, 1):
        print("Cluster no. {},\nlength = {}".format(i, len(cluster.points)))
        print(labels[cluster.points])

    print("Purity = {}".format(purity(final_clusters, integer_labels)))
Example no. 5
    # (imports, data loading, the `colors` palette and the folium `map` are created
    #  earlier in the original script and omitted from this excerpt)
    profile = cProfile.Profile()
    profile.enable()
    clustering = RockGeoClustering(data, 5, theta=0.5, nbr_max_distance=40)
    final_clusters = clustering.clusters()
    profile.disable()
    profile.print_stats(sort='time')

    for i, cluster in enumerate(final_clusters, 1):
        print("Cluster no. {},\nlength = {}".format(i, len(cluster.points)))
        print(labels[cluster.points])
        # dominant ground-truth label in this cluster (computed for inspection only)
        counts = np.bincount(np.asarray(labels, dtype=int)[cluster.points])
        dominant = np.argmax(counts)
        if len(cluster.points) != 1:
            # multi-point clusters get a random colour from the palette
            cluster_color = colors[random.randint(0, len(colors) - 1)]
            for point in cluster.points:
                folium.Marker(location=data[point, :],
                              icon=folium.Icon(color=cluster_color,
                                               icon='circle',
                                               prefix='fa')).add_to(map)
        else:
            # singleton clusters are drawn in purple
            for point in cluster.points:
                folium.Marker(location=data[point, :],
                              icon=folium.Icon(color="purple",
                                               icon='circle',
                                               prefix='fa')).add_to(map)

    print("Purity = {}".format(
        purity(final_clusters, np.asarray(labels, dtype=int))))
    map.save('docs/nyc_clustering.html')
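Example no. 5 is truncated: the imports, the point data, the `colors` palette and the folium `map` are created earlier in the original script. A minimal sketch of the setup it appears to assume (the file path, column layout and colour list below are placeholders, not taken from the source):

import cProfile
import random

import folium
import numpy as np

from rock import RockGeoClustering
from utils import purity

# hypothetical input: latitude/longitude pairs plus an integer district label per row
raw = np.loadtxt("data/nyc_points.csv", dtype=float, delimiter=",")  # placeholder path
data, labels = raw[:, :2], raw[:, 2]

# folium icon colours used for multi-point clusters
colors = ['red', 'blue', 'green', 'orange', 'darkred', 'cadetblue']
# `map` shadows the builtin, but matches the variable name used in the snippet above
map = folium.Map(location=data.mean(axis=0).tolist(), zoom_start=11)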