예제 #1
0
DataViz.plot_silhouette(dataset_kmed, 'cluster', 'silhouette')
util.print_latex_statistics_clusters(dataset_kmed, 'cluster', ['Bx', 'By', 'Bz'], 'label')

# And the hierarchical clustering is the last one we try

clusteringH = HierarchicalClustering()

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for the mBximum number of clusters.

print '===== agglomaritive clustering ====='
for k in k_values:
    print 'k = ', k
    dataset_cluster, l = clusteringH.agglomerative_over_instances(copy.deepcopy(dataset), ['Bx', 'By', 'Bz'], k, 'euclidean', use_prev_linkage=True, link_function='ward')
    silhouette_score = dataset_cluster['silhouette'].mean()
    print 'silhouette = ', silhouette_score
    silhouette_values.append(silhouette_score)
    if k == k_values[0]:
        DataViz.plot_dendrogram(dataset_cluster, l)

plot.plot(k_values, silhouette_values, 'b-')
plot.ylim([0,1])
plot.xlabel('mBx number of clusters')
plot.ylabel('silhouette score')
plot.show()

# And we select the outcome dataset of the knn clustering....

dataset_knn.to_csv(dataset_path + 'chapter5_our_result.csv')
예제 #2
0
def main():
    # Read the result from the previous chapter convert the index to datetime
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print(
            'File not found, try to run previous crowdsignals scripts first!')
        raise e

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(__file__)

    # Create objects for clustering
    clusteringNH = NonHierarchicalClustering()
    clusteringH = HierarchicalClustering()

    if FLAGS.mode == 'kmeans':
        # Do some initial runs to determine the right number for k
        k_values = range(2, 10)
        silhouette_values = []

        print('Running k-means clustering')
        for k in k_values:
            print(f'k = {k}')
            dataset_cluster = clusteringNH.k_means_over_instances(
                dataset=copy.deepcopy(dataset),
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                k=k,
                distance_metric='default',
                max_iters=20,
                n_inits=10)
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)

        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

        # Run the knn with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Means silhouette score: k = {k}')
        print('Use this value of k to run the --mode=final --k=?')

    if FLAGS.mode == 'kmediods':
        # Do some initial runs to determine the right number for k
        k_values = range(2, 10)
        silhouette_values = []
        print('Running k-medoids clustering')

        for k in k_values:
            print(f'k = {k}')
            dataset_cluster = clusteringNH.k_medoids_over_instances(
                dataset=copy.deepcopy(dataset),
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                k=k,
                distance_metric='default',
                max_iters=20,
                n_inits=10)
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)

        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

        # Run k medoids with the highest silhouette score
        k = k_values[np.argmax(silhouette_values)]
        print(f'Highest K-Medoids silhouette score: k = {k}')

        dataset_kmed = clusteringNH.k_medoids_over_instances(
            dataset=copy.deepcopy(dataset),
            cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            k=k,
            distance_metric='default',
            max_iters=20,
            n_inits=50)
        DataViz.plot_clusters_3d(
            data_table=dataset_kmed,
            data_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            cluster_col='cluster',
            label_cols=['label'])
        DataViz.plot_silhouette(data_table=dataset_kmed,
                                cluster_col='cluster',
                                silhouette_col='silhouette')
        util.print_latex_statistics_clusters(
            dataset=dataset_kmed,
            cluster_col='cluster',
            input_cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            label_col='label')

    # Run hierarchical clustering
    if FLAGS.mode == 'agglomerative':
        k_values = range(2, 10)
        silhouette_values = []

        # Do some initial runs to determine the right number for the maximum number of clusters
        print('Running agglomerative clustering')
        for k in k_values:
            print(f'k = {k}')
            dataset_cluster, link = clusteringH.agglomerative_over_instances(
                dataset=dataset,
                cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                max_clusters=k,
                distance_metric='euclidean',
                use_prev_linkage=True,
                link_function='ward')
            silhouette_score = dataset_cluster['silhouette'].mean()
            print(f'silhouette = {silhouette_score}')
            silhouette_values.append(silhouette_score)
            if k == k_values[0]:
                DataViz.plot_dendrogram(dataset_cluster, link)

        # Plot the clustering results
        DataViz.plot_xy(x=[k_values],
                        y=[silhouette_values],
                        xlabel='k',
                        ylabel='silhouette score',
                        ylim=[0, 1],
                        line_styles=['b-'])

    if FLAGS.mode == 'final':
        # Select the outcome dataset of the knn clustering
        clusteringNH = NonHierarchicalClustering()
        dataset = clusteringNH.k_means_over_instances(
            dataset=dataset,
            cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            k=FLAGS.k,
            distance_metric='default',
            max_iters=50,
            n_inits=50)
        # Plot the results
        DataViz.plot_clusters_3d(dataset,
                                 ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
                                 'cluster', ['label'])
        DataViz.plot_silhouette(dataset, 'cluster', 'silhouette')
        # Print table statistics
        util.print_latex_statistics_clusters(
            dataset, 'cluster', ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
            'label')
        del dataset['silhouette']

        # Store the final dataset
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
예제 #3
0
                                     attributes_to_cluster, 'label')

# And the hierarchical clustering is the last one we try

clusteringH = HierarchicalClustering()

k_values = range(2, 10)
silhouette_values = []

print('===== agglomerative clustering =====')
for k in k_values:
    print(f'k = {k}')
    dataset_cluster, l = clusteringH.agglomerative_over_instances(
        copy.deepcopy(dataset),
        attributes_to_cluster,
        k,
        'euclidean',
        use_prev_linkage=True,
        link_function='ward')
    silhouette_score = dataset_cluster['silhouette'].mean()
    print(f'silhouette = {silhouette_score}')
    silhouette_values.append(silhouette_score)
    if k == k_values[0]:
        DataViz.plot_dendrogram(dataset_cluster, l)

DataViz.plot_xy(x=[k_values],
                y=[silhouette_values],
                xlabel='k',
                ylabel='silhouette score',
                ylim=[0, 1],
                line_styles=['b-'])
# ward

# dataset_cluster, l = clusteringH.agglomerative_over_instances(copy.deepcopy(dataset), ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], 5, 'euclidean', use_prev_linkage=True, link_function='ward')
# dataset_cluster, l = clusteringH.agglomerative_over_instances(copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'], 5, 'minkowski', use_prev_linkage=True, link_function='single')

# silhouette_score = dataset_cluster['silhouette'].mean()
# print 'silhouette = ', silhouette_score
# silhouette_values.append(silhouette_score)
# if k == k_values[0]:
#     DataViz.plot_dendrogram(dataset_cluster, l)

#
# plot.plot(k_values, silhouette_values, 'b-')
# plot.ylim([0,1])
# plot.xlabel('max number of clusters')
# plot.ylabel('silhouette score')
# plot.show()

# And we select the outcome dataset of the knn clustering....

# dataset_knn.to_csv(dataset_path + 'chapter5_result.csv')
dataset_cluster, l = clusteringH.agglomerative_over_instances(
    copy.deepcopy(dataset), ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'],
    5,
    'minkowski',
    use_prev_linkage=True,
    link_function='complete')

util.print_latex_statistics_clusters(
    dataset_cluster, 'cluster', ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'],
    'label')
예제 #5
0
def plot_dendrogram(dataset, linkage, k):
    sys.setrecursionlimit(40000)
    plot.title('Hierarchical Clustering Dendrogram')
    plot.xlabel('time points')
    plot.ylabel('distance')
    times = dataset.index.strftime('%H:%M:%S')
    dendrogram(linkage,truncate_mode='lastp',p=k, show_leaf_counts=True, leaf_rotation=45.,leaf_font_size=8.,show_contracted=True, labels=times)
    plot.show()

# Do some initial runs to determine the right number for the maximum number of clusters.

print '===== agglomaritive clustering ====='

for k in k_values:
    print 'k = ', k
    dataset_cluster, l = clusteringH.agglomerative_over_instances(copy.deepcopy(dataset), ['gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z'], k, 'euclidean', use_prev_linkage=True, link_function='ward')
    silhouette_score = dataset_cluster['silhouette'].mean()
    print 'silhouette = ', silhouette_score
    silhouette_values.append(silhouette_score)
    if k == k_values[0]:
        plot_dendrogram(dataset_cluster, l, k)

plot.plot(k_values, silhouette_values, 'b-')
plot.ylim([0,1])
plot.xlabel('max number of clusters')
plot.ylabel('silhouette score')
plot.show()

silhouette_values = []
print '===== parameter testing ====='
print '2p = ', "inf"
예제 #6
0
# And the hierarchical clustering is the last one we try

clusteringH = HierarchicalClustering()

k_values = range(2, 10)
silhouette_values = []

# Do some initial runs to determine the right number for the maximum number of clusters.

print '===== agglomaritive clustering ====='
for k in k_values:
    print 'k = ', k
    dataset_cluster, l = clusteringH.agglomerative_over_instances(
        copy.deepcopy(dataset),
        selected_columns,
        k,
        'euclidean',
        use_prev_linkage=True,
        link_function='ward')
    silhouette_score = dataset_cluster['silhouette'].mean()
    print 'silhouette = ', silhouette_score
    silhouette_values.append(silhouette_score)
    if k == k_values[0]:
        DataViz.plot_dendrogram(dataset_cluster, l)

plot.plot(k_values, silhouette_values, 'b-')
plot.ylim([0, 1])
plot.xlabel('max number of clusters')
plot.ylabel('silhouette score')
plot.show()