# Plotting helper for this script.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of
# the type datetime.
dataset_path = './intermediate_datafiles/'
try:
    dataset = pd.read_csv(dataset_path + 'chapter4_our_result.csv', index_col=0)
except IOError:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise
# Index.to_datetime() is not available in current pandas releases;
# pd.to_datetime() is the supported way to convert the index.
dataset.index = pd.to_datetime(dataset.index)

# First let us use non hierarchical clustering.
clusteringNH = NonHierarchicalClustering()

# Let us look at k-means first.
k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k: cluster a fresh
# copy of the data for every candidate k and record the mean silhouette.
print('===== kmeans clustering =====')
for k in k_values:
    print('k = ', k)
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset), ['Bx', 'By', 'Bz'], k, 'default', 20, 10)
    silhouette_score = dataset_cluster['silhouette'].mean()
    print('silhouette = ', silhouette_score)
    silhouette_values.append(silhouette_score)
# Plotting helper for this script.
DataViz = VisualizeDataset()

# Read the result from the previous chapter, and make sure the index is of
# the type datetime.
dataset_path = './intermediate_datafiles/'
try:
    dataset = pd.read_csv(dataset_path + 'chapter4_result.csv', index_col=0)
except IOError:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise
# Index.to_datetime() is not available in current pandas releases;
# pd.to_datetime() is the supported way to convert the index.
dataset.index = pd.to_datetime(dataset.index)

# NOTE: the k-means exploration below was commented out in the original and
# is kept disabled here; only its print statements were updated to the
# function form so it can be re-enabled as-is.
#
# # First let us use non hierarchical clustering.
# clusteringNH = NonHierarchicalClustering()
#
# # Let us look at k-means first.
#
# k_values = range(2, 10)
# silhouette_values = []
#
# # Do some initial runs to determine the right number for k
#
# print('===== kmeans clustering =====')
# for k in k_values:
#     print('k = ', k)
#     dataset_cluster = clusteringNH.k_means_over_instances(
#         copy.deepcopy(dataset),
#         ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], k, 'default', 20, 10)
#     silhouette_score = dataset_cluster['silhouette'].mean()
#     print('silhouette = ', silhouette_score)
#     silhouette_values.append(silhouette_score)
def _sweep_silhouette(cluster_fn, k_values):
    """Cluster once per candidate k and collect the mean silhouette scores.

    Args:
        cluster_fn: Callable that takes k and returns a table with a
            'silhouette' column.
        k_values: Iterable of candidate cluster counts.

    Returns:
        List with the mean silhouette score for each k, in input order.
    """
    silhouette_values = []
    for k in k_values:
        print(f'k = {k}')
        dataset_cluster = cluster_fn(k)
        silhouette_score = dataset_cluster['silhouette'].mean()
        print(f'silhouette = {silhouette_score}')
        silhouette_values.append(silhouette_score)
    return silhouette_values


def _plot_silhouette_sweep(data_viz, k_values, silhouette_values):
    """Plot the mean silhouette score against the number of clusters."""
    data_viz.plot_xy(x=[k_values], y=[silhouette_values],
                     xlabel='k', ylabel='silhouette score',
                     ylim=[0, 1], line_styles=['b-'])


def main():
    """Run the clustering experiment selected by FLAGS.mode.

    Modes handled here:
        'kmeans':        sweep k for k-means and report the best k.
        'kmediods':      sweep k for k-medoids, then re-run and plot the best k.
        'agglomerative': sweep the maximum cluster count for Ward linkage.
        'final':         run k-means with FLAGS.k, plot, and store the result.

    Raises:
        IOError: if the input dataset cannot be read.
    """
    # Columns clustered on in every mode. A fresh list is handed to each call
    # so the callee always receives its own list, as in the original code.
    acc_cols = ('acc_phone_x', 'acc_phone_y', 'acc_phone_z')
    k_values = range(2, 10)

    # Read the result from the previous chapter.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
    except IOError:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise
    # Convert the index to datetime.
    dataset.index = pd.to_datetime(dataset.index)

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Create objects for clustering
    clusteringNH = NonHierarchicalClustering()
    clusteringH = HierarchicalClustering()

    if FLAGS.mode == 'kmeans':
        # Do some initial runs to determine the right number for k
        print('Running k-means clustering')
        silhouette_values = _sweep_silhouette(
            lambda k: clusteringNH.k_means_over_instances(
                dataset=copy.deepcopy(dataset), cols=list(acc_cols), k=k,
                distance_metric='default', max_iters=20, n_inits=10),
            k_values)
        _plot_silhouette_sweep(DataViz, k_values, silhouette_values)

        # Report the k with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Means silhouette score: k = {k}')
        print('Use this value of k to run the --mode=final --k=?')

    if FLAGS.mode == 'kmediods':
        # Do some initial runs to determine the right number for k
        print('Running k-medoids clustering')
        silhouette_values = _sweep_silhouette(
            lambda k: clusteringNH.k_medoids_over_instances(
                dataset=copy.deepcopy(dataset), cols=list(acc_cols), k=k,
                distance_metric='default', max_iters=20, n_inits=10),
            k_values)
        _plot_silhouette_sweep(DataViz, k_values, silhouette_values)

        # Run k medoids with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Medoids silhouette score: k = {k}')

        dataset_kmed = clusteringNH.k_medoids_over_instances(
            dataset=copy.deepcopy(dataset), cols=list(acc_cols), k=k,
            distance_metric='default', max_iters=20, n_inits=50)
        DataViz.plot_clusters_3d(data_table=dataset_kmed,
                                 data_cols=list(acc_cols),
                                 cluster_col='cluster', label_cols=['label'])
        DataViz.plot_silhouette(data_table=dataset_kmed, cluster_col='cluster',
                                silhouette_col='silhouette')
        util.print_latex_statistics_clusters(
            dataset=dataset_kmed, cluster_col='cluster',
            input_cols=list(acc_cols), label_col='label')

    # Run hierarchical clustering
    if FLAGS.mode == 'agglomerative':
        silhouette_values = []

        # Do some initial runs to determine the right number for the maximum
        # number of clusters
        print('Running agglomerative clustering')
        for k in k_values:
            print(f'k = {k}')
            # NOTE(review): unlike the other modes, `dataset` is passed
            # without a deep copy here, together with use_prev_linkage=True.
            # Kept as in the original; confirm this is intended.
            dataset_cluster, link = clusteringH.agglomerative_over_instances(
                dataset=dataset, cols=list(acc_cols), max_clusters=k,
                distance_metric='euclidean', use_prev_linkage=True,
                link_function='ward')
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)
            # The dendrogram is only drawn for the first candidate k.
            if k == k_values[0]:
                DataViz.plot_dendrogram(dataset_cluster, link)

        # Plot the clustering results
        _plot_silhouette_sweep(DataViz, k_values, silhouette_values)

    if FLAGS.mode == 'final':
        # Select the outcome dataset of the k-means clustering, using the k
        # given on the command line. The clustering object is re-created
        # here, as in the original code.
        clusteringNH = NonHierarchicalClustering()
        dataset = clusteringNH.k_means_over_instances(
            dataset=dataset, cols=list(acc_cols), k=FLAGS.k,
            distance_metric='default', max_iters=50, n_inits=50)

        # Plot the results
        DataViz.plot_clusters_3d(dataset, list(acc_cols), 'cluster', ['label'])
        DataViz.plot_silhouette(dataset, 'cluster', 'silhouette')

        # Print table statistics
        util.print_latex_statistics_clusters(
            dataset, 'cluster', list(acc_cols), 'label')

        # NOTE(review): nesting was reconstructed from a flattened source.
        # The two steps below are kept inside the 'final' branch because this
        # is the only branch that assigns the clustered table (the one whose
        # 'silhouette' column is read above) to `dataset` - confirm.
        del dataset['silhouette']

        # Store the final dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
# Read the result from the previous chapter, and make sure the index is of
# the type datetime.
dataset_path = './intermediate_datafiles/'
try:
    dataset = pd.read_csv(dataset_path + 'processed_data.csv', index_col=0)
except IOError:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise
# Index.to_datetime() is not available in current pandas releases;
# pd.to_datetime() is the supported way to convert the index.
dataset.index = pd.to_datetime(dataset.index)

# Columns used as clustering input.
selected_columns = ['gFx', 'gFy', 'gFz', 'ax', 'ay', 'az', 'Gain']

# First let us use non hierarchical clustering.
clusteringNH = NonHierarchicalClustering()

# Let us look at k-means first.
k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k: cluster a fresh
# copy of the data for every candidate k and print the mean silhouette.
print('===== kmeans clustering =====')
for k in k_values:
    print('k = ', k)
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset), selected_columns, k, 'default', 20, 10)
    silhouette_score = dataset_cluster['silhouette'].mean()
    print('silhouette = ', silhouette_score)
    # NOTE(review): the visible code never appends to silhouette_values;
    # left unchanged because the rest of this loop is not in view.
'gyr_Gyroscope z (rad/s)', 'gyr_Gyroscope' ], [ 'linacc_Linear Acceleration x (m/s^2)', 'linacc_Linear Acceleration y (m/s^2)', 'linacc_Linear Acceleration z (m/s^2)', 'linacc_Linear Acceleration' ], [ 'mag_Magnetic field x (muT)', 'mag_Magnetic field y (muT)', 'mag_Magnetic field z (muT)', 'mag_Magnetic field' ]] # First let us use non hierarchical clustering. clusteringNH = NonHierarchicalClustering() # Let us look at k-means first. # ## Do some initial runs to determine the right number for k # for printing in params: # print '===== kmeans clustering =====' k_values = range(2, 10) silhouette_values = [] for k in k_values: # print 'k = ', k dataset_cluster = clusteringNH.k_means_over_instances( copy.deepcopy(dataset), [printing[0], printing[1], printing[2]], k,
# Program constants. The input and output file names can be overridden by the
# first and second command-line argument respectively.
DATA_PATH = Path('./intermediate_datafiles/Assignment3')
_cli_args = sys.argv[1:]
DATASET_FNAME = _cli_args[0] if _cli_args else 'chapter4_result.csv'
RESULT_FNAME = _cli_args[1] if len(_cli_args) > 1 else 'chapter5_result.csv'

# Load the input file and give it a datetime index.
try:
    dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
    dataset.index = pd.to_datetime(dataset.index)
except IOError as e:
    print('File not found, try to run previous crowdsignals scripts first!')
    raise e

# Visualization object used for plotting.
DataViz = VisualizeDataset(__file__)

# Non-hierarchical clustering comes first, starting with k-means.
clusteringNH = NonHierarchicalClustering()

k_values = range(2, 4)
silhouette_values = []

# Initial runs to determine the right number for k: each candidate k is
# clustered on a deep copy of the dataset and its mean silhouette is stored.
print('===== kmeans clustering =====')
for k in k_values:
    print(f'k = {k}')
    dataset_cluster = clusteringNH.k_means_over_instances(
        copy.deepcopy(dataset),
        ['acc_x', 'acc_y', 'acc_z'],
        k,
        'default',
        20,
        10,
    )
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values.append(silhouette_score)
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
from Load import *

# Load the input file, drop rows with missing values and give the table a
# datetime index.
dataset = pd.read_csv(outlier_watch_data, index_col=[time_col])
dataset = dataset.dropna()
dataset.index = pd.to_datetime(dataset.index)

# Visualization object, created with show=False.
DataViz = VisualizeDataset(__file__, show=False)

# Non-hierarchical clustering comes first, starting with k-means.
clusteringNH = NonHierarchicalClustering()

# Candidate values for k and the list that collects their silhouette scores.
k_values = range(2, 25)
silhouette_values = []

# Attributes the initial runs (to determine the right number for k) cluster on.
attributes_to_cluster = [
    'acc_x',
    'acc_y',
    'acc_z',
    'gyr_x',
    'gyr_y',
    'gyr_z',
]

print('===== kmeans clustering =====')
DataViz = VisualizeDataset() # Read the result from the previous chapter, and make sure the index is of the type datetime. dataset_path = './intermediate_datafiles-own/' try: dataset = pd.read_csv(dataset_path + 'chapter4_result.csv', index_col=0) except IOError as e: print('File not found, try to run previous crowdsignals scripts first!') raise e dataset.index = dataset.index.to_datetime() # First let us use non hierarchical clustering. clusteringNH = NonHierarchicalClustering() # Let us look at k-means first. ''' k_values = range(2, 10) silhouette_values = [] # ## Do some initial runs to determine the right number for k # print '===== kmeans clustering =====' for k in k_values: print 'k = ', k dataset_cluster = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], k, 'default', 20, 10) silhouette_score = dataset_cluster['silhouette'].mean() print 'silhouette = ', silhouette_score silhouette_values.append(silhouette_score)