def clustering(the_image_autoencoded, the_image_shape, number_of_clusters, extra_parameters=""):
    """Run mean-shift on the encoded samples and return the labels as a 2-D map.

    The estimator is fitted with a fixed ``bandwidth=2``. The first
    ``rows * cols`` labels it produces are laid out row by row, so entry
    ``[r, c]`` of the result is the label of sample ``r * cols + c``.

    Args:
        the_image_autoencoded: Samples passed straight to ``MeanShift.fit``.
        the_image_shape: Sequence whose first two entries are the number of
            rows and columns of the returned map.
        number_of_clusters: Not used by this function; kept so existing
            callers keep working.
        extra_parameters: Not used by this function; kept so existing
            callers keep working.

    Returns:
        A float NumPy array of shape ``(rows, cols)`` holding the cluster
        label of each sample.

    Raises:
        ValueError: If the fit yields fewer than ``rows * cols`` labels, so
            they cannot be reshaped into the requested map.
    """
    print()
    print("***   Mean-Shift clustering   ***")
    print("---------------------------------")
    # https://towardsdatascience.com/the-5-clustering-algorithms-data-scientists-need-to-know-a36d136ef68
    # https://scikit-learn.org/stable/modules/generated/sklearn.cluster.MeanShift.html
    print("Image shape: ", the_image_shape)

    print("Running fit function for mean-shift clustering")
    clust = MeanShift(bandwidth=2).fit(the_image_autoencoded)

    print("Creating list for clustered data")
    rows, cols = the_image_shape[0], the_image_shape[1]
    # Only the first rows * cols labels are used; any surplus is ignored.
    labels = np.asarray(clust.labels_)[:rows * cols]
    # Row-major reshape places label i at [i // cols, i % cols]; the cast
    # keeps the float dtype callers have always received.
    clustered_data = labels.reshape(rows, cols).astype(float)
    print("Clustered data shape:  ", np.shape(clustered_data))

    # Parameters start
    print("Parameters for this estimation: ", clust.get_params())
    if labels.size:
        # Count distinct labels directly: max - min under-reports by one
        # (a single cluster would be reported as zero labels).
        print("Labels from", labels.min(), " ,to", labels.max(),
              ". Number of labels: ", np.unique(labels).size)
    # Parameters stop

    return clustered_data
# Example #2
0
def _mean_shift(table,
                input_cols,
                prediction_col='prediction',
                bandwidth=None,
                bin_seeding=False,
                min_bin_freq=1,
                cluster_all=True):
    """Fit mean-shift on selected columns and build a labelled table plus report.

    Args:
        table: DataFrame holding the data; it is not modified.
        input_cols: List of column names in ``table`` used as features.
        prediction_col: Name of the column that receives the cluster labels
            in the output table.
        bandwidth: Forwarded to ``MeanShift``.
        bin_seeding: Forwarded to ``MeanShift``.
        min_bin_freq: Forwarded to ``MeanShift``.
        cluster_all: Forwarded to ``MeanShift``.

    Returns:
        A dict with two entries: ``'out_table'``, a copy of ``table`` with
        the labels stored under ``prediction_col``; and ``'model'``, the
        dict from ``_model_dict('mean_shift')`` extended with the fitted
        estimator (``'model'``), ``'input_cols'`` and the markdown report
        (``'_repr_brtc_'``).
    """
    inputarr = table[input_cols]

    ms = MeanShift(bandwidth=bandwidth,
                   bin_seeding=bin_seeding,
                   min_bin_freq=min_bin_freq,
                   cluster_all=cluster_all,
                   n_jobs=1)

    ms.fit(inputarr)

    label_name = {
        'bandwidth': 'Bandwidth',
        'bin_seeding': 'Bin Seeding',
        'min_bin_freq': 'Minimum Bin Frequency',
        'cluster_all': 'Cluster All'
    }
    get_param = ms.get_params()
    # DataFrame.from_items was removed in pandas 1.0; the plain constructor
    # with an explicit column order builds the same two-column table.
    param_table = pd.DataFrame(
        {'Parameter': list(label_name.values()),
         'Value': [get_param[key] for key in label_name]},
        columns=['Parameter', 'Value'])

    cluster_centers = ms.cluster_centers_
    n_clusters = len(cluster_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = ms.labels_

    # The 2-component PCA view is only produced when there is more than one
    # input column.
    has_pca_view = len(input_cols) > 1
    if has_pca_view:
        pca2_model = PCA(n_components=2).fit(inputarr)
        pca2 = pca2_model.transform(inputarr)

    fig_centers = _mean_shift_centers_plot(input_cols, cluster_centers, colors)
    # Tables with more than 100 rows pass 100 as the third argument of the
    # samples plot; smaller tables pass None.
    sample_limit = 100 if len(table.index) > 100 else None
    fig_samples = _mean_shift_samples_plot(
        table, input_cols, sample_limit, cluster_centers, colors)

    rb = BrtcReprBuilder()
    if has_pca_view:
        fig_pca = _mean_shift_pca_plot(labels, cluster_centers, pca2_model,
                                       pca2, colors)
        rb.addMD(
            strip_margin("""
        | ## Mean Shift Result
        | - Coordinates of cluster centers
        | {fig_cluster_centers} 
        | - Samples
        | {fig_pca}
        | {fig_samples}
        | ### Parameters
        | {params}
        """.format(fig_cluster_centers=fig_centers,
                   fig_pca=fig_pca,
                   fig_samples=fig_samples,
                   params=pandasDF2MD(param_table))))
    else:
        rb.addMD(
            strip_margin("""
        | ## Mean Shift Result
        | - Coordinates of cluster centers
        | {fig_cluster_centers} 
        | - Samples
        | {fig_samples}
        | ### Parameters
        | {params}
        """.format(fig_cluster_centers=fig_centers,
                   fig_samples=fig_samples,
                   params=pandasDF2MD(param_table))))

    model = _model_dict('mean_shift')
    model['model'] = ms
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table': out_table, 'model': model}
    # NOTE(review): every statement from here to the end follows the
    # unconditional `return` just above, so none of it can execute. It also
    # reads names (`clustered_data`, `clust`, `img_dir`, `start_time`) that
    # are never assigned in this function — presumably a fragment of a
    # different script whose beginning is missing; confirm before reusing.
    print("Clustered data shape:  ", np.shape(clustered_data))

    # Copy the flat label sequence into the 2-D array: the first index
    # advances on every step and wraps to 0 when it reaches 100, at which
    # point the second index advances.
    # NOTE(review): the wrap length 100 is hard-coded rather than taken from
    # the array's shape — verify it matches the intended dimensions.
    x = 0
    y = 0
    for i in range(np.shape(clustered_data)[0] * np.shape(clustered_data)[1]):
        clustered_data[x][y] = clust.labels_[i]
        x = x + 1
        if x == 100:
            x = 0
            y = y + 1

    # Render the label array as an image and save it under `img_dir`.
    plt.imshow(clustered_data)
    name = img_dir + 'img_mean_shift_clustering.png'
    plt.savefig(name, bbox_inches='tight')

    # Report the estimator parameters and scan the labels for their range.
    print("Parameters for this estimation: ", clust.get_params())
    # The scan is seeded with min=1 / max=0 rather than with the first label.
    label_min = 1
    label_max = 0
    for i in range(np.shape(clustered_data)[0] * np.shape(clustered_data)[1]):
        if clust.labels_[i] > label_max:
            label_max = clust.labels_[i]
        if clust.labels_[i] < label_min:
            label_min = clust.labels_[i]
    # NOTE(review): the value printed as "Number of labels" is max - min,
    # which is one less than the count when labels are consecutive integers.
    print("Labels from", label_min, " ,to", label_max, ". Number of labels: ",
          label_max - label_min)

    # Print the current timestamp, the end time, and the whole seconds
    # elapsed since `start_time`.
    print()
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    end_time = time.time()
    print("End time:  ", time.ctime(end_time))
    print("Duration:  ", int(end_time - start_time), " seconds")