# Finish the k-means silhouette plot started above this fragment.
plot.ylabel('silhouette score')
plot.ylim([0, 1])
plot.show()

# And run the knn with the highest silhouette score.
# The previous code iterated over the scores without doing anything and then
# hard-coded k = 6; select the k whose mean silhouette score is largest
# instead. max() returns the first maximal pair, so ties go to the smallest k.
best_silhouette_score, k = max(zip(silhouette_values, k_values),
                               key=lambda pair: pair[0])

cluster_columns = ['Bx', 'By', 'Bz']

dataset_knn = clusteringNH.k_means_over_instances(
    copy.deepcopy(dataset), cluster_columns, k, 'default', 50, 50)
DataViz.plot_clusters_3d(dataset_knn, cluster_columns, 'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(
    dataset_knn, 'cluster', cluster_columns, 'label')
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k
# Single-argument parenthesised prints emit the same text under Python 2 and
# are also valid Python 3.
print('===== k medoids clustering =====')
for k in k_values:
    print('k =  %s' % (k,))
    dataset_cluster = clusteringNH.k_medoids_over_instances(
        copy.deepcopy(dataset), cluster_columns, k, 'default', 20, n_inits=10)
    silhouette_score = dataset_cluster['silhouette'].mean()
y=[silhouette_values], xlabel='k', ylabel='silhouette score', ylim=[0, 1], line_styles=['b-']) # And run the knn with the highest silhouette score # k = 6 # todo: replaced with np.argmax call over silhouette scores k = k_values[np.argmax(silhouette_values)] print(f'Highest K-Means silhouette score: k = {k}') dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), attributes_to_cluster, k, 'default', 50, 50) DataViz.plot_clusters_3d(dataset_knn, attributes_to_cluster, 'cluster', ['label']) DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette') util.print_latex_statistics_clusters(dataset_knn, 'cluster', attributes_to_cluster, 'label') del dataset_knn['silhouette'] k_values = range(2, 10) silhouette_values = [] print('===== k medoids clustering =====') for k in k_values: print(f'k = {k}') dataset_cluster = clusteringNH.k_medoids_over_instances( copy.deepcopy(dataset), attributes_to_cluster, k,
def _silhouette_sweep(cluster_fn, dataset, cols, k_values):
    """Cluster a copy of *dataset* once per k and collect mean silhouettes.

    Args:
        cluster_fn: Clustering callable (the k-means or k-medoids method of
            the non-hierarchical clustering object) accepting the keyword
            arguments used below.
        dataset: Table to cluster. It is deep-copied for every run, so the
            clustering routine never receives the caller's object.
        cols: Names of the columns to cluster on.
        k_values: Iterable of cluster counts to try.

    Returns:
        A list holding the mean of the ``'silhouette'`` column of each
        clustering result, in the order of *k_values*.
    """
    silhouette_values = []
    for k in k_values:
        print(f'k = {k}')
        dataset_cluster = cluster_fn(
            dataset=copy.deepcopy(dataset), cols=cols, k=k,
            distance_metric='default', max_iters=20, n_inits=10)
        silhouette_score = dataset_cluster['silhouette'].mean()
        print(f'silhouette = {silhouette_score}')
        silhouette_values.append(silhouette_score)
    return silhouette_values


def _plot_silhouette_scores(viz, k_values, silhouette_values):
    """Plot the mean silhouette score per k with the y-axis fixed to [0, 1]."""
    viz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k',
                ylabel='silhouette score', ylim=[0, 1], line_styles=['b-'])


def main():
    """Run the clustering experiment selected by ``FLAGS.mode``.

    Modes handled:
        * ``kmeans``: sweep k over 2..9 with k-means, plot the mean
          silhouette per k and print the best-scoring k.
        * ``kmediods``: the same sweep with k-medoids, followed by a rerun
          with the best k whose clusters, silhouette and statistics are
          plotted/printed.
        * ``agglomerative``: sweep the maximum number of clusters over 2..9,
          plot the dendrogram of the first run and the silhouette curve.
        * ``final``: k-means with ``FLAGS.k``; plots and prints the result,
          drops the ``'silhouette'`` column and writes the table to
          ``DATA_PATH / RESULT_FNAME``.

    Raises:
        IOError: Re-raised, after printing a hint, when the input CSV cannot
            be read.
    """
    # Read the result from the previous chapter and convert the index to
    # datetime.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        # Bare raise re-raises the active exception with its traceback intact.
        raise

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Create objects for clustering
    clusteringNH = NonHierarchicalClustering()
    clusteringH = HierarchicalClustering()

    # Shared by every mode: the columns to cluster on and the k range to try.
    cols = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z']
    k_values = range(2, 10)

    if FLAGS.mode == 'kmeans':
        # Do some initial runs to determine the right number for k
        print('Running k-means clustering')
        silhouette_values = _silhouette_sweep(
            clusteringNH.k_means_over_instances, dataset, cols, k_values)
        _plot_silhouette_scores(DataViz, k_values, silhouette_values)

        # Report the k with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Means silhouette score: k = {k}')
        print('Use this value of k to run the --mode=final --k=?')

    # NOTE(review): 'kmediods' is compared literally against FLAGS.mode; the
    # flag definition is not visible here, so the spelling is left untouched.
    elif FLAGS.mode == 'kmediods':
        # Do some initial runs to determine the right number for k
        print('Running k-medoids clustering')
        silhouette_values = _silhouette_sweep(
            clusteringNH.k_medoids_over_instances, dataset, cols, k_values)
        _plot_silhouette_scores(DataViz, k_values, silhouette_values)

        # Run k medoids with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Medoids silhouette score: k = {k}')

        dataset_kmed = clusteringNH.k_medoids_over_instances(
            dataset=copy.deepcopy(dataset), cols=cols, k=k,
            distance_metric='default', max_iters=20, n_inits=50)
        DataViz.plot_clusters_3d(
            data_table=dataset_kmed, data_cols=cols,
            cluster_col='cluster', label_cols=['label'])
        DataViz.plot_silhouette(data_table=dataset_kmed,
                                cluster_col='cluster',
                                silhouette_col='silhouette')
        util.print_latex_statistics_clusters(
            dataset=dataset_kmed, cluster_col='cluster',
            input_cols=cols, label_col='label')

    # Run hierarchical clustering
    elif FLAGS.mode == 'agglomerative':
        silhouette_values = []

        # Do some initial runs to determine the right number for the maximum
        # number of clusters
        print('Running agglomerative clustering')
        for k in k_values:
            print(f'k = {k}')
            # Unlike the sweeps above, the table is passed without copying
            # and use_prev_linkage=True is requested on every run.
            dataset_cluster, link = clusteringH.agglomerative_over_instances(
                dataset=dataset, cols=cols, max_clusters=k,
                distance_metric='euclidean', use_prev_linkage=True,
                link_function='ward')
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)
            # Only the first run's linkage is drawn as a dendrogram.
            if k == k_values[0]:
                DataViz.plot_dendrogram(dataset_cluster, link)

        # Plot the clustering results
        _plot_silhouette_scores(DataViz, k_values, silhouette_values)

    elif FLAGS.mode == 'final':
        # Select the outcome dataset of the knn clustering
        dataset = clusteringNH.k_means_over_instances(
            dataset=dataset, cols=cols, k=FLAGS.k,
            distance_metric='default', max_iters=50, n_inits=50)

        # Plot the results
        DataViz.plot_clusters_3d(dataset, cols, 'cluster', ['label'])
        DataViz.plot_silhouette(dataset, 'cluster', 'silhouette')

        # Print table statistics
        util.print_latex_statistics_clusters(
            dataset, 'cluster', cols, 'label')

        # The silhouette column is removed before the table is stored.
        del dataset['silhouette']

        # Store the final dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
# And run the knn with the highest silhouette score
# Silhouette scores lie in [-1, 1]. The previous hand-rolled search started
# from highest_score = 0, so when no score was positive k stayed 0 and the
# clustering below was asked for zero clusters. max() has no such floor and,
# like the old strict '>' comparison, keeps the first k on ties.
k, highest_score = max(zip(k_values, silhouette_values),
                       key=lambda pair: pair[1])
# Single-argument parenthesised prints emit the same text under Python 2 and
# are also valid Python 3.
print("k: \t" + str(k))

dataset_knn = clusteringNH.k_means_over_instances(
    copy.deepcopy(dataset), selected_columns, k, 'default', 50, 50)
DataViz.plot_clusters_3d(dataset_knn, selected_columns, 'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(
    dataset_knn, 'cluster', selected_columns, 'label')
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k
print('===== k medoids clustering =====')
for k in k_values:
    print('k =  %s' % (k,))
    dataset_cluster = clusteringNH.k_medoids_over_instances(
        copy.deepcopy(dataset), selected_columns, k, 'default', 20,
        n_inits=10)
# Show the mean silhouette score obtained for every k tried so far.
plot.plot(k_values, silhouette_values, 'b-')
plot.xlabel('k')
plot.ylabel('silhouette score')
plot.ylim([0, 1])
plot.show()

# And run the knn with the highest silhouette score
# k used to be hard-coded to 6; derive it from the sweep so it follows the
# data. max() returns the first maximal pair, so ties go to the smallest k.
k = max(zip(k_values, silhouette_values), key=lambda pair: pair[1])[0]

phone_acc_columns = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z']

dataset_knn = clusteringNH.k_means_over_instances(
    copy.deepcopy(dataset), phone_acc_columns, k, 'default', 50, 50)
DataViz.plot_clusters_3d(dataset_knn, phone_acc_columns, 'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(
    dataset_knn, 'cluster', phone_acc_columns, 'label')
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k
# Single-argument parenthesised prints emit the same text under Python 2 and
# are also valid Python 3.
print('===== k medoids clustering =====')
for k in k_values:
    print('k =  %s' % (k,))
xlabel='k', ylabel='silhouette score', ylim=[0, 1], line_styles=['b-']) # And run the knn with the highest silhouette score # k = 6 # todo: replaced with np.argmax call over silhouette scores k = k_values[np.argmax(silhouette_values)] print(f'Highest K-Means silhouette score: k = {k}') dataset_knn = clusteringNH.k_means_over_instances( copy.deepcopy(dataset), ['gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z'], k, 'default', 50, 50) DataViz.plot_clusters_3d(dataset_knn, ['gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z'], 'cluster', ['label']) DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette') util.print_latex_statistics_clusters( dataset_knn, 'cluster', ['gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z'], 'label') del dataset_knn['silhouette'] k_values = range(2, 10) silhouette_values = [] # Do some initial runs to determine the right number for k print('===== k medoids clustering =====') for k in k_values: print(f'k = {k}')
dataset_cluster = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), ['acc_x', 'acc_y', 'acc_z'], k, 'default', 20, 10) silhouette_score = dataset_cluster['silhouette'].mean() print(f'silhouette = {silhouette_score}') silhouette_values.append(silhouette_score) DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k', ylabel='silhouette score', ylim=[0,1], line_styles=['b-']) # And run the knn with the highest silhouette score # k = 6 # todo: replaced with np.argmax call over silhouette scores k = k_values[np.argmax(silhouette_values)] print(f'Highest K-Means silhouette score: k = {k}') dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), ['acc_x', 'acc_y', 'acc_z'], k, 'default', 50, 50) DataViz.plot_clusters_3d(dataset_knn, ['acc_x', 'acc_y', 'acc_z'], 'cluster', ['label']) DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette') util.print_latex_statistics_clusters(dataset_knn, 'cluster', ['acc_x', 'acc_y', 'acc_z'], 'label') del dataset_knn['silhouette'] #k_values = range(2, 10) #silhouette_values = [] # Do some initial runs to determine the right number for k #print('===== k medoids clustering =====') #for k in k_values: # print(f'k = {k}') # dataset_cluster = clusteringNH.k_medoids_over_instances(copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k, 'default', 20, n_inits=10) # silhouette_score = dataset_cluster['silhouette'].mean() # print(f'silhouette = {silhouette_score}')
copy.deepcopy(dataset), attributes_to_cluster, k, 'default', 20, 50) silhouette_score = dataset_cluster['silhouette'].mean() print(f'silhouette = {silhouette_score}') silhouette_values.append(silhouette_score) # And run the knn with the highest silhouette score # k = 6 # todo: replaced with np.argmax call over silhouette scores k = k_values[np.argmax(silhouette_values)] print(f'Highest K-Means silhouette score: k = {k}') dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), attributes_to_cluster, k, 'default', 20, 50) DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette') DataViz.plot_clusters_3d(dataset_knn, [ 'acc_x', 'acc_y', 'acc_z', ], 'cluster', ['label']) DataViz.plot_clusters_3d(dataset_knn, [ "gyr_x", "gyr_y", "gyr_z", ], 'cluster', ['label']) util.print_latex_statistics_clusters(dataset_knn, 'cluster', attributes_to_cluster, 'label') del dataset_knn['silhouette'] # And we select the outcome dataset of the knn clustering.... dataset_knn.to_csv(cluster_watch_data)
xlabel='k', ylabel='silhouette score', ylim=[0, 1], line_styles=['b-']) # And run the knn with the highest silhouette score # k = 6 # todo: replaced with np.argmax call over silhouette scores k = k_values[np.argmax(silhouette_values)] print(f'Highest K-Means silhouette score: k = {k}') dataset_knn = clusteringNH.k_means_over_instances( copy.deepcopy(dataset), ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z'], k, 'default', 50, 50) DataViz.plot_clusters_3d(dataset_knn, ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z'], 'cluster', ['label']) DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette') util.print_latex_statistics_clusters( dataset_knn, 'cluster', ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z'], 'label') del dataset_knn['silhouette'] k_values = range(2, 10) silhouette_values = [] # Do some initial runs to determine the right number for k print('===== k medoids clustering =====') for k in k_values: print(f'k = {k}')
y=[silhouette_values], xlabel='k', ylabel='silhouette score kmeans', ylim=[0, 1], line_styles=['b-']) # And run the knn with the highest silhouette score # k = 6 # todo: replaced with np.argmax call over silhouette scores k = k_values[np.argmax(silhouette_values)] print(f'Highest K-Means silhouette score: k = {k}') dataset_knn = clusteringNH.k_means_over_instances( copy.deepcopy(dataset), ['gravity.x', 'gravity.y', 'gravity.z'], k, 'default', 50, 50) DataViz.plot_clusters_3d(dataset_knn, ['gravity.x', 'gravity.y', 'gravity.z'], 'cluster', ['label']) DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette') util.print_latex_statistics_clusters(dataset_knn, 'cluster', ['gravity.x', 'gravity.y', 'gravity.z'], 'label') del dataset_knn['silhouette'] k_values = range(2, 10) silhouette_values = [] # Do some initial runs to determine the right number for k print('===== k medoids clustering =====') for k in k_values: print(f'k = {k}') dataset_cluster = clusteringNH.k_medoids_over_instances(