def exercicio7():
    """Exercise 7: compare k-means and AGNES on the Spiral and Jain datasets.

    For each dataset the function prints the sample count, then the purity
    and intra/inter-cluster distance ratio for both algorithms.
    Terminates the process via exit() when done (preserved from original).
    """
    utils.print_header(7)
    np.random.seed(constants.SEED)

    # --- Spiral dataset: 3 clusters ---
    print('Spiral')
    spiral = load_database(constants.FILENAME_SPIRAL_BATABASE,
                           constants.URL_SPIRAL_BATABASE)
    print('\t- Nb samples: {}'.format(spiral.shape[0]))

    print('\n\tkMeans:')
    km_spiral = k_means(spiral, 3, max_iters=300, db_name='spiral')
    print('\t\t- Purity (kMeans): {:.2f}%'.format(
        utils.purity(km_spiral)))
    print('\t\t- Dist_intra_inter (kMeans): {:.2f}%'.format(
        utils.dist_intra_inter(km_spiral)))

    print('\n\tAGNES:')
    ag_spiral = agnes(spiral, 3, db_name='spiral')
    print('\t\t- Purity (AGNES): {:.2f}%'.format(utils.purity(ag_spiral)))
    print('\t\t- Dist_intra_inter (AGNES): {:.2f}%'.format(
        utils.dist_intra_inter(ag_spiral)))

    # --- Jain dataset: 2 clusters ---
    print('\nJain')
    jain = load_database(constants.FILENAME_JAIN_DATABASE,
                         constants.URL_JAIN_DATABASE)
    print('\t- Nb samples: {}'.format(jain.shape[0]))

    print('\n\t- kMeans:')
    km_jain = k_means(jain, 2, max_iters=300, db_name='jain')
    print('\t\t- Purity (kMeans): {:.2f}%'.format(utils.purity(km_jain)))
    print('\t\t- Dist_intra_inter (kMeans): {:.2f}%'.format(
        utils.dist_intra_inter(km_jain)))

    print('\n\tAGNES:')
    ag_jain = agnes(jain, 2, db_name='jain')
    print('\t\t- Purity (AGNES): {:.2f}%'.format(utils.purity(ag_jain)))
    print('\t\t- Dist_intra_inter (AGNES): {:.2f}%'.format(
        utils.dist_intra_inter(ag_jain)))

    exit()
# --- feature extraction ---
digits = utils.resize(digits, 16)
digits = utils.get_deskew_imgs(digits)
# Alternative raw-pixel features (unused here):
# pix = utils.get_pix_features(digits)
# X = np.hstack([pix])
hog = utils.get_hog_feature(digits)
X = np.hstack([hog])
X = utils.normalize(X)

# --- data reduction ---
# Keep adding principal components until the accumulated proportion of
# variance reaches the target `pov` (NOTE: rebinds the `pca` module name
# to the fitted model, preserved from original).
pca = pca.PCA(X)
n_component = 0
gained = 0
while gained < pov:
    gained += pca.pov(n_component)
    n_component += 1
X_reduced = pca.transform(X, n_component)

# --- clustering ---
start_time = timeit.default_timer()
print("clustring with k-means with k =", classes, "...")
k_means = kmeans.kmeans(X_reduced, y)
clusters, centers = k_means.train(classes, iter)
print("K-means purity:", utils.purity(clusters, y, classes))
print("K-means rand-index:", utils.rand_index(clusters, y, classes))
elapsed = timeit.default_timer() - start_time
print("execution time: ",
      str(int((elapsed / 60))) + ':' + str(int(elapsed % 60)))
import cProfile
import numpy as np
from rock import RockClustering
from utils import categorical_to_binary, purity

if __name__ == '__main__':
    # Mushroom dataset: first column is the class label, the rest are
    # categorical attributes.
    data = np.loadtxt("data/agaricus-lepiota.data", dtype=str,
                      delimiter=",", skiprows=0)

    # Map each distinct class value in column 0 to a consecutive integer.
    for idx, class_value in enumerate(np.unique(data[:, 0]), 0):
        data[data[:, 0] == class_value, 0] = idx
    labels = np.asarray(data[:, 0], dtype=int)
    data = np.asarray(data[:, 1:])

    # Profile the clustering run and dump a cumulative-time report.
    profile = cProfile.Profile()
    profile.enable()
    clustering = RockClustering(categorical_to_binary(data[:, :]), 20,
                                theta=0.80)
    final_clusters = clustering.clusters()
    profile.disable()
    profile.print_stats(sort='cumulative')

    for i, cluster in enumerate(final_clusters, 1):
        print("Cluster no. {},\nlength = {}".format(i, len(cluster.points)))
        print(labels[cluster.points])

    print("Purity = {}".format(purity(final_clusters, labels[:])))
import cProfile
import numpy as np
from rock import RockRealClustering
from utils import categorical_to_binary, purity
from sklearn.preprocessing import MinMaxScaler

if __name__ == '__main__':
    # Iris dataset: last column holds the species name, the rest are
    # real-valued measurements.
    data = np.loadtxt("data/iris.data", dtype=str, delimiter=",", skiprows=0)

    labels = np.asarray(data[:, -1], dtype=str)
    # Encode species names as consecutive integers for the purity metric.
    integer_labels = np.zeros(labels.shape[0], dtype=int)
    for idx, species in enumerate(np.unique(labels), 0):
        integer_labels[labels == species] = idx

    # Scale features to [0, 1] before distance-based clustering.
    data = np.asarray(data[:, :-1], dtype=float)
    data = MinMaxScaler().fit_transform(data)

    # Profile the clustering run and dump a per-function time report.
    profile = cProfile.Profile()
    profile.enable()
    clustering = RockRealClustering(data, 3, theta=0.50,
                                    nbr_max_distance=0.20)
    final_clusters = clustering.clusters()
    profile.disable()
    profile.print_stats(sort='time')

    for i, cluster in enumerate(final_clusters, 1):
        print("Cluster no. {},\nlength = {}".format(i, len(cluster.points)))
        print(labels[cluster.points])

    print("Purity = {}".format(purity(final_clusters, integer_labels)))
# Profile the geographic ROCK clustering run and dump a per-function
# time report.
profile = cProfile.Profile()
profile.enable()
clustering = RockGeoClustering(data, 5, theta=0.5, nbr_max_distance=40)
final_clusters = clustering.clusters()
profile.disable()
profile.print_stats(sort='time')

# Render each cluster on the folium map: multi-point clusters get a
# random color, singleton clusters are flagged in purple.
# (Removed dead code: a np.bincount/np.argmax "dominant label" was
# computed per cluster but never used.)
for i, cluster in enumerate(final_clusters, 1):
    print("Cluster no. {},\nlength = {}".format(i, len(cluster.points)))
    print(labels[cluster.points])
    if len(cluster.points) != 1:
        # random.randint is only consumed for non-singleton clusters,
        # matching the original call sequence exactly.
        cluster_color = colors[random.randint(0, len(colors) - 1)]
    else:
        cluster_color = "purple"
    for point in cluster.points:
        folium.Marker(location=data[point, :],
                      icon=folium.Icon(color=cluster_color,
                                       icon='circle',
                                       prefix='fa-')).add_to(map)

print("Purity = {}".format(
    purity(final_clusters, np.asarray(labels, dtype=int))))
map.save('docs/nyc_clustering.html')