コード例 #1
0
plot.ylabel('silhouette score')
plot.ylim([0,1])
plot.show()

# Run k-means with the k that achieved the highest silhouette score.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)
# silhouette_values and k_values are expected to be filled in by the part of
# the script that precedes this section.

# Fallback: the value that used to be hard-coded here. It is only kept when
# no score in silhouette_values compares greater than -inf (empty list or
# NaN-only scores).
k = 6
best_silhouette_score = float('-inf')

# Keep the first k that reaches the maximum; later ties do not replace it.
for score, candidate_k in zip(silhouette_values, k_values):
    if score > best_silhouette_score:
        best_silhouette_score = score
        k = candidate_k

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 50, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), ['Bx', 'By', 'Bz'], k, 'default', 50, 50)
DataViz.plot_clusters_3d(dataset_knn, ['Bx', 'By', 'Bz'], 'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(dataset_knn, 'cluster', ['Bx', 'By', 'Bz'], 'label')
# Remove the 'silhouette' entry again once it has been plotted and reported.
del dataset_knn['silhouette']


k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k

print '===== k medoids clustering ====='
for k in k_values:
    print 'k = ', k
    dataset_cluster = clusteringNH.k_medoids_over_instances(copy.deepcopy(dataset), ['Bx', 'By', 'Bz'], k, 'default', 20, n_inits=10)
    silhouette_score = dataset_cluster['silhouette'].mean()
コード例 #2
0
                y=[silhouette_values],
                xlabel='k',
                ylabel='silhouette score',
                ylim=[0, 1],
                line_styles=['b-'])

# Run k-means with the k that has the highest silhouette score.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)

# k was previously hard-coded to 6; it is now the entry of k_values at the
# position of the largest value in silhouette_values (np.argmax returns the
# first such position when several values tie).
k = k_values[np.argmax(silhouette_values)]
print(f'Highest K-Means silhouette score: k = {k}')

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 50, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset),
                                                  attributes_to_cluster, k,
                                                  'default', 50, 50)
# Plot the clusters and the silhouette, then print the cluster statistics.
DataViz.plot_clusters_3d(dataset_knn, attributes_to_cluster, 'cluster',
                         ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(dataset_knn, 'cluster',
                                     attributes_to_cluster, 'label')
# Remove the 'silhouette' entry again once it has been plotted and reported.
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

print('===== k medoids clustering =====')
for k in k_values:
    print(f'k = {k}')
    dataset_cluster = clusteringNH.k_medoids_over_instances(
        copy.deepcopy(dataset),
        attributes_to_cluster,
        k,
コード例 #3
0
ファイル: crowdsignals_ch5.py プロジェクト: NilsHMeier/ML4QS
def _sweep_silhouette_scores(cluster_fn, dataset, cols, k_values):
    """Return the mean silhouette score obtained for every k in *k_values*.

    Args:
        cluster_fn: Clustering callable invoked with the keyword arguments
            ``dataset``, ``cols``, ``k``, ``distance_metric``, ``max_iters``
            and ``n_inits``; its result must expose a 'silhouette' column.
        dataset: Table to cluster. Every run receives its own deep copy, so
            this object is never passed to ``cluster_fn`` directly.
        cols: Names of the columns to cluster on.
        k_values: Iterable of cluster counts to try.

    Returns:
        A list holding one mean silhouette score per entry of *k_values*, in
        the same order.
    """
    silhouette_values = []
    for k in k_values:
        print(f'k = {k}')
        dataset_cluster = cluster_fn(dataset=copy.deepcopy(dataset),
                                     cols=cols,
                                     k=k,
                                     distance_metric='default',
                                     max_iters=20,
                                     n_inits=10)
        silhouette_score = dataset_cluster['silhouette'].mean()
        print(f'silhouette = {silhouette_score}')
        silhouette_values.append(silhouette_score)
    return silhouette_values


def _plot_silhouette_scores(viz, k_values, silhouette_values):
    """Plot the silhouette score per k as a blue line with the y-axis fixed to [0, 1]."""
    viz.plot_xy(x=[k_values],
                y=[silhouette_values],
                xlabel='k',
                ylabel='silhouette score',
                ylim=[0, 1],
                line_styles=['b-'])


def main():
    """Run the clustering experiment selected by ``FLAGS.mode``.

    Modes handled here:
        'kmeans'        -- sweep k over range(2, 10) with k-means, plot the
                           mean silhouette per k and print the best k.
        'kmediods'      -- same sweep with k-medoids, then re-run with the
                           best k and plot/print the resulting clusters.
        'agglomerative' -- sweep the maximum number of clusters, plot a
                           dendrogram for the first k and the mean silhouette
                           per k.
        'final'         -- run k-means with ``FLAGS.k``, plot/print the
                           result and write the clustered dataset to
                           ``DATA_PATH / RESULT_FNAME``.

    Any other mode only loads the dataset.

    Raises:
        IOError: re-raised, after printing a hint, when the input CSV cannot
            be read.
    """
    # Read the result from the previous chapter
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
    except IOError:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        # A bare raise re-raises the active exception unchanged.
        raise
    # Convert the index to datetime
    dataset.index = pd.to_datetime(dataset.index)

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Create objects for clustering
    clusteringNH = NonHierarchicalClustering()
    clusteringH = HierarchicalClustering()

    # Every mode clusters on the same three columns.
    cluster_cols = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z']
    k_values = range(2, 10)

    if FLAGS.mode == 'kmeans':
        # Do some initial runs to determine the right number for k
        print('Running k-means clustering')
        silhouette_values = _sweep_silhouette_scores(
            clusteringNH.k_means_over_instances, dataset, cluster_cols,
            k_values)
        _plot_silhouette_scores(DataViz, k_values, silhouette_values)

        # Report the k with the highest silhouette score; the actual k-means
        # run happens in 'final' mode.
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Means silhouette score: k = {k}')
        print('Use this value of k to run the --mode=final --k=?')

    # NOTE(review): 'kmediods' (sic) presumably matches the spelling of the
    # --mode choice defined elsewhere - confirm before renaming it.
    if FLAGS.mode == 'kmediods':
        # Do some initial runs to determine the right number for k
        print('Running k-medoids clustering')
        silhouette_values = _sweep_silhouette_scores(
            clusteringNH.k_medoids_over_instances, dataset, cluster_cols,
            k_values)
        _plot_silhouette_scores(DataViz, k_values, silhouette_values)

        # Run k medoids with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Medoids silhouette score: k = {k}')

        dataset_kmed = clusteringNH.k_medoids_over_instances(
            dataset=copy.deepcopy(dataset),
            cols=cluster_cols,
            k=k,
            distance_metric='default',
            max_iters=20,
            n_inits=50)
        DataViz.plot_clusters_3d(data_table=dataset_kmed,
                                 data_cols=cluster_cols,
                                 cluster_col='cluster',
                                 label_cols=['label'])
        DataViz.plot_silhouette(data_table=dataset_kmed,
                                cluster_col='cluster',
                                silhouette_col='silhouette')
        util.print_latex_statistics_clusters(dataset=dataset_kmed,
                                             cluster_col='cluster',
                                             input_cols=cluster_cols,
                                             label_col='label')

    # Run hierarchical clustering
    if FLAGS.mode == 'agglomerative':
        silhouette_values = []

        # Do some initial runs to determine the right number for the maximum number of clusters
        print('Running agglomerative clustering')
        for k in k_values:
            print(f'k = {k}')
            # NOTE(review): unlike the other modes, the dataset is passed
            # without a deep copy - confirm the callee may modify it.
            dataset_cluster, link = clusteringH.agglomerative_over_instances(
                dataset=dataset,
                cols=cluster_cols,
                max_clusters=k,
                distance_metric='euclidean',
                use_prev_linkage=True,
                link_function='ward')
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)
            # The dendrogram is drawn once, for the first k only.
            if k == k_values[0]:
                DataViz.plot_dendrogram(dataset_cluster, link)

        # Plot the clustering results
        _plot_silhouette_scores(DataViz, k_values, silhouette_values)

    if FLAGS.mode == 'final':
        # Cluster the dataset itself (no copy) with k-means and the k given
        # on the command line; the clustering object created above is reused.
        dataset = clusteringNH.k_means_over_instances(
            dataset=dataset,
            cols=cluster_cols,
            k=FLAGS.k,
            distance_metric='default',
            max_iters=50,
            n_inits=50)
        # Plot the results
        DataViz.plot_clusters_3d(dataset, cluster_cols, 'cluster', ['label'])
        DataViz.plot_silhouette(dataset, 'cluster', 'silhouette')
        # Print table statistics
        util.print_latex_statistics_clusters(dataset, 'cluster', cluster_cols,
                                             'label')
        # The silhouette column is dropped before the result is stored.
        del dataset['silhouette']

        # Store the final dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
コード例 #4
0
# Run k-means with the k that has the highest silhouette score.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)

# Start below every possible score: silhouette scores can be negative, and
# starting at 0 would leave k at 0 whenever no score is positive.
highest_score = float('-inf')
k = 0

# Keep the first k that reaches the maximum; later ties do not replace it.
for candidate_k, score in zip(k_values, silhouette_values):
    if score > highest_score:
        k = candidate_k
        highest_score = score

# Single parenthesised argument: prints the same text under Python 2 and 3.
print("k: \t" + str(k))

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 50, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset),
                                                  selected_columns, k,
                                                  'default', 50, 50)
DataViz.plot_clusters_3d(dataset_knn, selected_columns, 'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(dataset_knn, 'cluster', selected_columns,
                                     'label')
# Remove the 'silhouette' entry again once it has been plotted and reported.
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k

print '===== k medoids clustering ====='
for k in k_values:
    print 'k = ', k
    dataset_cluster = clusteringNH.k_medoids_over_instances(
        copy.deepcopy(dataset), selected_columns, k, 'default', 20, n_inits=10)
コード例 #5
0
# Plot silhouette_values against k_values as a blue line, with the y-axis
# fixed to [0, 1].
plot.plot(k_values, silhouette_values, 'b-')
plot.xlabel('k')
plot.ylabel('silhouette score')
plot.ylim([0, 1])
plot.show()

# Run k-means with a fixed k.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)

# NOTE(review): k is hard-coded rather than derived from silhouette_values -
# presumably read off the plot above; confirm it still matches the data.
k = 6

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 50, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(
    copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k,
    'default', 50, 50)
# Plot the clusters and the silhouette, then print the cluster statistics.
DataViz.plot_clusters_3d(dataset_knn,
                         ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                         'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(
    dataset_knn, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
    'label')
# Remove the 'silhouette' entry again once it has been plotted and reported.
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k

print '===== k medoids clustering ====='
for k in k_values:
    print 'k = ', k
コード例 #6
0
                xlabel='k',
                ylabel='silhouette score',
                ylim=[0, 1],
                line_styles=['b-'])

# Run k-means with the k that has the highest silhouette score.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)

# k was previously hard-coded to 6; it is now the entry of k_values at the
# position of the largest value in silhouette_values (np.argmax returns the
# first such position when several values tie).
k = k_values[np.argmax(silhouette_values)]
print(f'Highest K-Means silhouette score: k = {k}')

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 50, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(
    copy.deepcopy(dataset), ['gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z'], k,
    'default', 50, 50)
# Plot the clusters and the silhouette, then print the cluster statistics.
DataViz.plot_clusters_3d(dataset_knn,
                         ['gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z'],
                         'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(
    dataset_knn, 'cluster', ['gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z'],
    'label')
# Remove the 'silhouette' entry again once it has been plotted and reported.
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k

print('===== k medoids clustering =====')
for k in k_values:
    print(f'k = {k}')
コード例 #7
0
ファイル: crowdsignals_ch5.py プロジェクト: Mick-IJzer/ML4QS
    dataset_cluster = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), ['acc_x', 'acc_y', 'acc_z'], k, 'default', 20, 10)
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values.append(silhouette_score)

# Plot silhouette_values against k_values as a blue line, with the y-axis
# fixed to [0, 1].
DataViz.plot_xy(x=[k_values], y=[silhouette_values], xlabel='k', ylabel='silhouette score',
                ylim=[0,1], line_styles=['b-'])

# Run k-means with the k that has the highest silhouette score.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)

# k was previously hard-coded to 6; it is now the entry of k_values at the
# position of the largest value in silhouette_values (np.argmax returns the
# first such position when several values tie).
k = k_values[np.argmax(silhouette_values)]
print(f'Highest K-Means silhouette score: k = {k}')

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 50, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset), ['acc_x', 'acc_y', 'acc_z'], k, 'default', 50, 50)
# Plot the clusters and the silhouette, then print the cluster statistics.
DataViz.plot_clusters_3d(dataset_knn, ['acc_x', 'acc_y', 'acc_z'], 'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(dataset_knn, 'cluster', ['acc_x', 'acc_y', 'acc_z'], 'label')
# Remove the 'silhouette' entry again once it has been plotted and reported.
del dataset_knn['silhouette']

#k_values = range(2, 10)
#silhouette_values = []

# Do some initial runs to determine the right number for k

#print('===== k medoids clustering =====')
#for k in k_values:
#    print(f'k = {k}')
#    dataset_cluster = clusteringNH.k_medoids_over_instances(copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], k, 'default', 20, n_inits=10)
#    silhouette_score = dataset_cluster['silhouette'].mean()
#    print(f'silhouette = {silhouette_score}')
コード例 #8
0
        copy.deepcopy(dataset), attributes_to_cluster, k, 'default', 20, 50)
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values.append(silhouette_score)

# Run k-means with the k that has the highest silhouette score.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)

# k was previously hard-coded to 6; it is now the entry of k_values at the
# position of the largest value in silhouette_values (np.argmax returns the
# first such position when several values tie).
k = k_values[np.argmax(silhouette_values)]
print(f'Highest K-Means silhouette score: k = {k}')

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 20, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(copy.deepcopy(dataset),
                                                  attributes_to_cluster, k,
                                                  'default', 20, 50)
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
# Two 3D views of the same clustering: one over the acc_* columns and one
# over the gyr_* columns.
DataViz.plot_clusters_3d(dataset_knn, [
    'acc_x',
    'acc_y',
    'acc_z',
], 'cluster', ['label'])
DataViz.plot_clusters_3d(dataset_knn, [
    "gyr_x",
    "gyr_y",
    "gyr_z",
], 'cluster', ['label'])
util.print_latex_statistics_clusters(dataset_knn, 'cluster',
                                     attributes_to_cluster, 'label')
# Remove the 'silhouette' entry so it is not part of the stored result.
del dataset_knn['silhouette']

# Store the outcome of the k-means clustering at cluster_watch_data.
dataset_knn.to_csv(cluster_watch_data)
コード例 #9
0
ファイル: crowdsignals_ch5.py プロジェクト: ML4QS-2/ML4QS
                xlabel='k',
                ylabel='silhouette score',
                ylim=[0, 1],
                line_styles=['b-'])

# Run k-means with the k that has the highest silhouette score.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)

# k was previously hard-coded to 6; it is now the entry of k_values at the
# position of the largest value in silhouette_values (np.argmax returns the
# first such position when several values tie).
k = k_values[np.argmax(silhouette_values)]
print(f'Highest K-Means silhouette score: k = {k}')

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 50, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(
    copy.deepcopy(dataset), ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z'], k,
    'default', 50, 50)
# Plot the clusters and the silhouette, then print the cluster statistics.
DataViz.plot_clusters_3d(dataset_knn,
                         ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z'],
                         'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(
    dataset_knn, 'cluster', ['gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z'],
    'label')
# Remove the 'silhouette' entry again once it has been plotted and reported.
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k

print('===== k medoids clustering =====')
for k in k_values:
    print(f'k = {k}')
コード例 #10
0
                y=[silhouette_values],
                xlabel='k',
                ylabel='silhouette score kmeans',
                ylim=[0, 1],
                line_styles=['b-'])

# Run k-means with the k that has the highest silhouette score.
# (The result is called dataset_knn, but the call below is
# k_means_over_instances, i.e. k-means rather than k-nearest-neighbours.)

# k was previously hard-coded to 6; it is now the entry of k_values at the
# position of the largest value in silhouette_values (np.argmax returns the
# first such position when several values tie).
k = k_values[np.argmax(silhouette_values)]
print(f'Highest K-Means silhouette score: k = {k}')

# Cluster a deep copy so `dataset` itself is never handed to the clustering call.
# NOTE(review): the trailing 50, 50 are presumably max_iters and n_inits -
# confirm against the signature of k_means_over_instances.
dataset_knn = clusteringNH.k_means_over_instances(
    copy.deepcopy(dataset), ['gravity.x', 'gravity.y', 'gravity.z'], k,
    'default', 50, 50)
# Plot the clusters and the silhouette, then print the cluster statistics.
DataViz.plot_clusters_3d(dataset_knn, ['gravity.x', 'gravity.y', 'gravity.z'],
                         'cluster', ['label'])
DataViz.plot_silhouette(dataset_knn, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(dataset_knn, 'cluster',
                                     ['gravity.x', 'gravity.y', 'gravity.z'],
                                     'label')
# Remove the 'silhouette' entry again once it has been plotted and reported.
del dataset_knn['silhouette']

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for k

print('===== k medoids clustering =====')
for k in k_values:
    print(f'k = {k}')
    dataset_cluster = clusteringNH.k_medoids_over_instances(