Esempio n. 1
0
    def fit_predict(self, data):
        """Cluster ``data`` with the external WSBM tool and return the labels.

        A graph is induced from ``data`` using ``self.metric`` and written to
        a temporary file. The Java ``wsbm`` tool is then run once for every
        block count from 2 to 14, and every resulting labelling is scored
        with the silhouette score. The labelling with the highest score is
        returned; on a tie the one produced with fewer blocks is kept.

        The tool is run from ``YWWTools/target``. The original working
        directory is restored and the temporary graph file is removed even
        when a run fails.

        Args:
            data: the samples to cluster; passed to ``induce_graph`` and to
                ``silhouette_score`` unchanged.

        Returns:
            numpy.ndarray: the best-scoring labelling as read from the tool's
            output file.

        Raises:
            ValueError: if no run produced more than one distinct label, so
                nothing could be scored.
        """
        graph = induce_graph(data, distance=self.metric)
        fname = save_to_file(graph, force_integer=True)

        # NOTE(review): nothing below refers to this directory explicitly;
        # presumably `fname` lives inside it - confirm against save_to_file.
        output_directory = '_labellings'
        if not exists(output_directory):
            mkdirs(output_directory)

        result_blocks = []

        cwd = os.getcwd()
        try:
            # The classpath and the `../../` paths below are relative to the
            # tool's build directory.
            os.chdir('YWWTools/target')

            for num_blocks in range(2, 15):
                result_fname = '../../%s_blocks_%d' % (fname, num_blocks)
                call(' '.join([
                    'java',
                    '-cp YWWTools.jar:deps.jar yang.weiwei.Tools',
                    '--tool wsbm',
                    '--nodes %d' % graph.num_vertices(),
                    '--blocks %d' % num_blocks,
                    '--graph ../../%s' % fname,
                    '--output %s' % result_fname,
                    '--no-verbose',
                ]),
                     shell=True)

                blocks = np.genfromtxt(result_fname)
                # Drop the tool output right away so a scoring failure does
                # not leave it behind.
                os.remove(result_fname)

                # Silhouette doesn't work if there's only one cluster label
                if len(np.unique(blocks)) > 1:
                    score = silhouette_score(data, blocks)
                    result_blocks.append((score, blocks))
        finally:
            # Always undo the chdir and clean up the temporary graph file,
            # otherwise one failed run leaves the process in the wrong cwd.
            os.chdir(cwd)
            os.remove(fname)

        if not result_blocks:
            raise ValueError(
                'WSBM produced no labelling with more than one block')

        # Compare on the score only: comparing whole tuples would fall back
        # to comparing the label arrays whenever two scores are equal.
        best_blocks = max(result_blocks, key=lambda scored: scored[0])[1]
        return np.array(best_blocks)
Esempio n. 2
0
def plot_blocks_wsbm_evaluation_metrics(dataset,
                                        test_range=tuple(range(2, 15))):
    """Plot silhouette score and NMI of WSBM clusterings against block count.

    The data set named ``dataset`` is loaded, a graph is induced from its
    features and written to a temporary file, and the external Java ``wsbm``
    tool is run once for every block count in ``test_range``. Each labelling
    is scored with the silhouette score (against the features) and with the
    normalized mutual information (against the data set's ``Y`` values).
    Both curves are saved to ``<RESULTS_DIR>/wsbm_blocks_<dataset>.png``.

    The tool is run from ``YWWTools/target``. The original working directory
    is restored and the temporary graph file is removed even when a run
    fails.

    Args:
        dataset: name of the data set; passed to ``Table`` and used in the
            plot title and output file name.
        test_range: the block counts to evaluate.
    """
    dataset_ = Table(dataset)
    data, y = dataset_.X, dataset_.Y

    graph = induce_graph(data)
    fname = save_to_file(graph, force_integer=True)

    # NOTE(review): nothing below refers to this directory explicitly;
    # presumably `fname` lives inside it - confirm against save_to_file.
    output_directory = '_labellings'
    if not exists(output_directory):
        mkdirs(output_directory)

    nmi_scores, silhouettes_scores = [], []

    cwd = os.getcwd()
    try:
        # The classpath and the `../../` paths below are relative to the
        # tool's build directory.
        os.chdir('YWWTools/target')

        for num_blocks in test_range:
            result_fname = '../../%s_blocks_%d' % (fname, num_blocks)
            call(' '.join([
                'java',
                '-cp YWWTools.jar:deps.jar yang.weiwei.Tools',
                '--tool wsbm',
                '--nodes %d' % graph.num_vertices(),
                '--blocks %d' % num_blocks,
                '--graph ../../%s' % fname,
                '--output %s' % result_fname,
                '--no-verbose',
            ]),
                 shell=True)

            blocks = np.genfromtxt(result_fname)
            os.remove(result_fname)
            silhouettes_scores.append(silhouette_score(data, blocks))
            nmi_scores.append(normalized_mutual_info_score(y, blocks))
    finally:
        # Always undo the chdir and clean up the temporary graph file,
        # otherwise one failed run leaves the process in the wrong cwd.
        os.chdir(cwd)
        os.remove(fname)

    plt.plot(test_range, nmi_scores, label='NMI')
    plt.plot(test_range, silhouettes_scores, label='Silhouette')
    plt.title('Clustering with WSBM on %s' % dataset.title())
    plt.xlabel('Blocks')
    plt.ylabel('Scores')
    plt.legend()
    plt.savefig('%s/wsbm_blocks_%s.png' % (RESULTS_DIR, dataset))
Esempio n. 3
0
    def fit_predict(self, data):
        """Cluster ``data`` with an SBM on a thresholded graph.

        A graph is induced from ``data`` using ``self.metric``. For each of
        ``self.cutoff_interval`` evenly spaced thresholds between 0 and the
        largest edge weight, the graph is pruned in place with ``cutoff`` and
        a block model is fitted with ``minimize_blockmodel_dl``. Every
        labelling with more than one distinct label is scored with the
        silhouette score, and the highest-scoring one is returned; on a tie
        the labelling from the lower threshold is kept.

        Args:
            data: the samples to cluster; passed to ``induce_graph`` and to
                ``silhouette_score`` unchanged.

        Returns:
            numpy.ndarray: the best-scoring block labels.

        Raises:
            ValueError: if every threshold produced a single block, so no
                labelling could be scored.
        """
        graph = induce_graph(data, distance=self.metric)

        result_blocks = []

        weights = graph.edge_properties['weights'].get_array()
        # The threshold range is fixed here, before any pruning takes place.
        for threshold in np.linspace(0, weights.max(), self.cutoff_interval):
            working_graph = cutoff(graph, threshold, inplace=True)
            # Apply the sbm to the pruned graph
            state = minimize_blockmodel_dl(working_graph)
            blocks = state.get_blocks().get_array()

            # Silhouette doesn't work if there's only one cluster label
            if len(np.unique(blocks)) > 1:
                cutoff_score = silhouette_score(data, blocks)
                # Store a copy: `blocks` is a view onto the state's property
                # map, and the graph keeps being pruned on later iterations.
                result_blocks.append((cutoff_score, np.array(blocks)))

        if not result_blocks:
            raise ValueError(
                'no threshold produced a labelling with more than one block')

        # Compare on the score only: comparing whole tuples would fall back
        # to comparing the label arrays whenever two scores are equal.
        return max(result_blocks, key=lambda scored: scored[0])[1]
Esempio n. 4
0
def plot_threshold_sbm_distance_metrics(dataset, split_into=20):
    """Plot the silhouette score of the threshold SBM per distance metric.

    For every entry of ``DISTANCES`` a graph is induced from the data set
    with the ``<metric>_invexp`` distance, and ``split_into`` evenly spaced
    thresholds between 0 and that graph's largest edge weight are evaluated
    with ``sbm_clustering_nmi_silhouette``. One curve per metric is drawn and
    the figure is saved to
    ``<RESULTS_DIR>/threshold_clustering_metrics_<dataset>.png``.

    NOTE(review): the metric only determines the threshold range here; it is
    not passed on to ``sbm_clustering_nmi_silhouette`` - confirm that this is
    intended.
    """
    table = Table(dataset)
    data, y = table.X, table.Y

    for metric in DISTANCES:
        graph = induce_graph(data, distance='%s_invexp' % metric)
        edge_weights = graph.edge_properties['weights'].get_array()
        thresholds = np.linspace(0, edge_weights.max(), split_into)

        # Only the silhouette half of each (nmi, silhouette) pair is plotted.
        outcomes = (sbm_clustering_nmi_silhouette(data, y, cut)
                    for cut in thresholds)
        silhouette_scores = [silhouette for _, silhouette in outcomes]

        plt.plot(thresholds, silhouette_scores, label=metric)
        plt.title('Clustering with SBM after thresholding on %s' %
                  dataset.title())
        plt.xlabel('Distance threshold')
        plt.ylabel('Silhouette score')

    plt.legend()
    figure_path = '%s/threshold_clustering_metrics_%s.png' % (RESULTS_DIR,
                                                              dataset)
    plt.savefig(figure_path)
Esempio n. 5
0
def plot_threshold_sbm_components(dataset, split_into=10):
    """Plot NMI and silhouette score of the threshold SBM per threshold.

    A graph is induced from the data set's features, and ``split_into``
    evenly spaced thresholds between 0 and the largest edge weight are
    evaluated with ``sbm_clustering_nmi_silhouette``. Both score curves are
    drawn against the threshold and saved to
    ``<RESULTS_DIR>/threshold_clustering_<dataset>.png``.
    """
    table = Table(dataset)
    data, y = table.X, table.Y

    graph = induce_graph(data)
    edge_weights = graph.edge_properties['weights'].get_array()
    thresholds = np.linspace(0, edge_weights.max(), split_into)

    # One (nmi, silhouette) pair per threshold, in threshold order.
    outcomes = [
        sbm_clustering_nmi_silhouette(data, y, cut) for cut in thresholds
    ]
    nmi_scores = [nmi for nmi, _ in outcomes]
    silhouettes_scores = [silhouette for _, silhouette in outcomes]

    for curve, name in ((nmi_scores, 'NMI'),
                        (silhouettes_scores, 'Silhouette')):
        plt.plot(thresholds, curve, label=name)

    plt.title('Clustering with SBM with thresholding on %s' % dataset.title())
    plt.xlabel('Distance threshold')
    plt.ylabel('Scores')
    plt.legend()

    figure_path = '%s/threshold_clustering_%s.png' % (RESULTS_DIR, dataset)
    plt.savefig(figure_path)
Esempio n. 6
0
import matplotlib
from Orange.data import Table

matplotlib.use('Agg')

from graph_tool.draw import sfdp_layout, graph_draw

from induce_graph import induce_graph, cutoff

# Induce a graph from the iris data set and prune it at three thresholds.
iris = Table('iris')
graph = induce_graph(iris)

graph_1, graph_2, graph_3 = [
    cutoff(graph, level) for level in (0.1, 0.2, 0.3)
]

# The layout is computed on the 0.2 graph but used to draw the 0.1 graph.
vertex_layout = sfdp_layout(graph_2)

# One fill colour per vertex, scaled from 0-255 to 0-1.
# Assumes the vertices come as three consecutive groups of 50 - TODO confirm.
rgb_colors = [
    [channel / 255 for channel in rgb]
    for rgb in ([70, 190, 250], [237, 70, 47], [170, 242, 43])
    for _ in range(50)
]

colors = graph_1.new_vertex_property('vector<double>')
for vertex, fill in zip(graph_1.vertices(), rgb_colors):
    colors[vertex] = fill

graph_draw(
    graph_1,
    vertex_layout,
    vertex_fill_color=colors,
    output='iris_threshold_01.png',
    output_size=(600, 400),
)