plot.xlabel('k') plot.ylabel('silhouette score') plot.show() # And run k medoids with the highest silhouette score k = 6 dataset_kmed = clusteringNH.k_medoids_over_instances(copy.deepcopy(dataset), ['Bx', 'By', 'Bz'], k, 'default', 20, n_inits=50) DataViz.plot_clusters_3d(dataset_kmed, ['Bx', 'By', 'Bz'], 'cluster', ['label']) DataViz.plot_silhouette(dataset_kmed, 'cluster', 'silhouette') util.print_latex_statistics_clusters(dataset_kmed, 'cluster', ['Bx', 'By', 'Bz'], 'label') # And the hierarchical clustering is the last one we try clusteringH = HierarchicalClustering() k_values = range(2, 10) silhouette_values = [] # Do some initial runs to determine the right number for the mBximum number of clusters. print '===== agglomaritive clustering =====' for k in k_values: print 'k = ', k dataset_cluster, l = clusteringH.agglomerative_over_instances(copy.deepcopy(dataset), ['Bx', 'By', 'Bz'], k, 'euclidean', use_prev_linkage=True, link_function='ward') silhouette_score = dataset_cluster['silhouette'].mean() print 'silhouette = ', silhouette_score silhouette_values.append(silhouette_score) if k == k_values[0]: DataViz.plot_dendrogram(dataset_cluster, l)
# plot.xlabel('k') # plot.ylabel('silhouette score') # plot.show() # # # And run k medoids with the highest silhouette score # # k = 6 # # dataset_kmed = clusteringNH.k_medoids_over_instances(copy.deepcopy(dataset), ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], k, 'default', 20, n_inits=50) # DataViz.plot_clusters_3d(dataset_kmed, ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], 'cluster', ['label']) # DataViz.plot_silhouette(dataset_kmed, 'cluster', 'silhouette') # util.print_latex_statistics_clusters(dataset_kmed, 'cluster', ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], 'label') # # And the hierarchical clustering is the last one we try clusteringH = HierarchicalClustering() k_values = range(2, 10) silhouette_values = [] # Do some initial runs to determine the right number for the maximum number of clusters. print '===== agglomaritive clustering =====' # for k in k_values: # print 'k = ', k # linkage # single # complete # average # weighted
def main(): # Read the result from the previous chapter convert the index to datetime try: dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0) dataset.index = pd.to_datetime(dataset.index) except IOError as e: print( 'File not found, try to run previous crowdsignals scripts first!') raise e # Create an instance of visualization class to plot the results DataViz = VisualizeDataset(__file__) # Create objects for clustering clusteringNH = NonHierarchicalClustering() clusteringH = HierarchicalClustering() if FLAGS.mode == 'kmeans': # Do some initial runs to determine the right number for k k_values = range(2, 10) silhouette_values = [] print('Running k-means clustering') for k in k_values: print(f'k = {k}') dataset_cluster = clusteringNH.k_means_over_instances( dataset=copy.deepcopy(dataset), cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k=k, distance_metric='default', max_iters=20, n_inits=10) silhouette_score = dataset_cluster['silhouette'].mean() print(f'silhouette = {silhouette_score}') silhouette_values.append(silhouette_score) DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k', ylabel='silhouette score', ylim=[0, 1], line_styles=['b-']) # Run the knn with the highest silhouette score k = k_values[np.argmax(silhouette_values)] print(f'Highest K-Means silhouette score: k = {k}') print('Use this value of k to run the --mode=final --k=?') if FLAGS.mode == 'kmediods': # Do some initial runs to determine the right number for k k_values = range(2, 10) silhouette_values = [] print('Running k-medoids clustering') for k in k_values: print(f'k = {k}') dataset_cluster = clusteringNH.k_medoids_over_instances( dataset=copy.deepcopy(dataset), cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k=k, distance_metric='default', max_iters=20, n_inits=10) silhouette_score = dataset_cluster['silhouette'].mean() print(f'silhouette = {silhouette_score}') silhouette_values.append(silhouette_score) DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k', ylabel='silhouette score', ylim=[0, 1], line_styles=['b-']) # Run k medoids with the highest silhouette score k = k_values[np.argmax(silhouette_values)] print(f'Highest K-Medoids silhouette score: k = {k}') dataset_kmed = clusteringNH.k_medoids_over_instances( dataset=copy.deepcopy(dataset), cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k=k, distance_metric='default', max_iters=20, n_inits=50) DataViz.plot_clusters_3d( data_table=dataset_kmed, data_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], cluster_col='cluster', label_cols=['label']) DataViz.plot_silhouette(data_table=dataset_kmed, cluster_col='cluster', silhouette_col='silhouette') util.print_latex_statistics_clusters( dataset=dataset_kmed, cluster_col='cluster', input_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], label_col='label') # Run hierarchical clustering if FLAGS.mode == 'agglomerative': k_values = range(2, 10) silhouette_values = [] # Do some initial runs to determine the right number for the maximum number of clusters print('Running agglomerative clustering') for k in k_values: print(f'k = {k}') dataset_cluster, link = clusteringH.agglomerative_over_instances( dataset=dataset, cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], max_clusters=k, distance_metric='euclidean', use_prev_linkage=True, link_function='ward') silhouette_score = dataset_cluster['silhouette'].mean() print(f'silhouette = {silhouette_score}') silhouette_values.append(silhouette_score) if k == k_values[0]: DataViz.plot_dendrogram(dataset_cluster, link) # Plot the clustering results DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k', ylabel='silhouette score', ylim=[0, 1], line_styles=['b-']) if FLAGS.mode == 'final': # Select the outcome dataset of the knn clustering clusteringNH = NonHierarchicalClustering() dataset = clusteringNH.k_means_over_instances( dataset=dataset, cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k=FLAGS.k, distance_metric='default', max_iters=50, n_inits=50) # Plot the results DataViz.plot_clusters_3d(dataset, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'cluster', ['label']) DataViz.plot_silhouette(dataset, 'cluster', 'silhouette') # Print table statistics util.print_latex_statistics_clusters( dataset, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 'label') del dataset['silhouette'] # Store the final dataset dataset.to_csv(DATA_PATH / RESULT_FNAME)
plot.xlabel('k') plot.ylabel('silhouette score') plot.show() # And run k medoids with the highest silhouette score k = 2 dataset_kmed = clusteringNH.k_medoids_over_instances(copy.deepcopy(dataset), ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], k, 'default', 20, n_inits=50) DataViz.plot_clusters_3d(dataset_kmed, ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], 'cluster', ['label']) DataViz.plot_silhouette(dataset_kmed, 'cluster', 'silhouette') util.print_latex_statistics_clusters(dataset_kmed, 'cluster', ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], 'label') # And the hierarchical clustering is the last one we try ''' clusteringH = HierarchicalClustering() k_values = range(2, 10) silhouette_values = [] def plot_dendrogram(dataset, linkage, k): sys.setrecursionlimit(40000) plot.title('Hierarchical Clustering Dendrogram') plot.xlabel('time points') plot.ylabel('distance') times = dataset.index.strftime('%H:%M:%S') dendrogram(linkage,truncate_mode='lastp',p=k, show_leaf_counts=True, leaf_rotation=45.,leaf_font_size=8.,show_contracted=True, labels=times) plot.show() # Do some initial runs to determine the right number for the maximum number of clusters.