def fit_predict(self, data):
    """Cluster ``data`` with the external WSBM tool and return the best labelling.

    A graph is induced from ``data`` using ``self.metric`` as the distance,
    written to disk, and handed to the ``yang.weiwei.Tools`` Java program once
    for every block count in ``range(2, 15)``.  Each resulting labelling is
    scored with ``silhouette_score`` and the highest-scoring one is returned.

    Parameters
    ----------
    data
        Sample matrix passed to ``induce_graph`` and ``silhouette_score``.

    Returns
    -------
    numpy.ndarray
        The block labelling with the highest silhouette score.

    Raises
    ------
    ValueError
        If no block count produced a labelling with more than one label.
    """
    graph = induce_graph(data, distance=self.metric)
    fname = save_to_file(graph, force_integer=True)

    result_blocks = []
    cwd = os.getcwd()
    try:
        # NOTE(review): nothing in this method writes into this directory;
        # the creation is kept because other code may rely on it existing.
        output_directory = '_labellings'
        if not exists(output_directory):
            mkdirs(output_directory)

        # The classpath below is relative, so the tool has to be launched
        # from its own build directory.
        os.chdir('YWWTools/target')
        num_vertices = graph.num_vertices()

        for num_blocks in range(2, 15):
            result_fname = '../../%s_blocks_%d' % (fname, num_blocks)
            # An argument list (no shell) keeps paths containing spaces or
            # shell metacharacters intact.
            call([
                'java',
                '-cp', 'YWWTools.jar:deps.jar',
                'yang.weiwei.Tools',
                '--tool', 'wsbm',
                '--nodes', '%d' % num_vertices,
                '--blocks', '%d' % num_blocks,
                '--graph', '../../%s' % fname,
                '--output', result_fname,
                '--no-verbose',
            ])
            try:
                blocks = np.genfromtxt(result_fname)
            finally:
                # Always drop the per-run output, even when parsing fails.
                if exists(result_fname):
                    os.remove(result_fname)

            # Silhouette doesn't work if there's only one cluster label
            if len(np.unique(blocks)) > 1:
                score = silhouette_score(data, blocks)
                result_blocks.append((score, blocks))
    finally:
        # Restore the caller's working directory and remove the graph file
        # regardless of how the loop ended.
        os.chdir(cwd)
        os.remove(fname)

    if not result_blocks:
        raise ValueError(
            'WSBM produced no labelling with more than one block; '
            'cannot select a clustering by silhouette score')

    # Select on the score only: comparing whole (score, array) tuples falls
    # through to comparing the arrays whenever two scores are equal, which
    # raises "truth value of an array is ambiguous".
    _, best_blocks = max(result_blocks, key=lambda entry: entry[0])
    return np.array(best_blocks)
def plot_blocks_wsbm_evaluation_metrics(dataset, test_range=tuple(range(2, 15))):
    """Plot silhouette score and NMI against the WSBM block count.

    The dataset is loaded with ``Table``, a graph is induced from its ``X``
    matrix and written to disk, and the external ``yang.weiwei.Tools`` WSBM
    program is run once per entry of ``test_range``.  Both the silhouette
    score and the normalized mutual information against the dataset's ``Y``
    are plotted as separate curves and saved to
    ``<RESULTS_DIR>/wsbm_blocks_<dataset>.png``.

    Parameters
    ----------
    dataset : str
        Dataset name given to ``Table``; also used in the title and file name.
    test_range : sequence of int
        Block counts to evaluate.
    """
    dataset_ = Table(dataset)
    data, y = dataset_.X, dataset_.Y

    graph = induce_graph(data)
    fname = save_to_file(graph, force_integer=True)

    nmi_scores, silhouettes_scores = [], []
    cwd = os.getcwd()
    try:
        # NOTE(review): nothing in this function writes into this directory;
        # the creation is kept because other code may rely on it existing.
        output_directory = '_labellings'
        if not exists(output_directory):
            mkdirs(output_directory)

        # The classpath below is relative, so the tool has to be launched
        # from its own build directory.
        os.chdir('YWWTools/target')
        num_vertices = graph.num_vertices()

        for num_blocks in test_range:
            result_fname = '../../%s_blocks_%d' % (fname, num_blocks)
            # An argument list (no shell) keeps paths containing spaces or
            # shell metacharacters intact.
            call([
                'java',
                '-cp', 'YWWTools.jar:deps.jar',
                'yang.weiwei.Tools',
                '--tool', 'wsbm',
                '--nodes', '%d' % num_vertices,
                '--blocks', '%d' % num_blocks,
                '--graph', '../../%s' % fname,
                '--output', result_fname,
                '--no-verbose',
            ])
            try:
                blocks = np.genfromtxt(result_fname)
            finally:
                # Always drop the per-run output, even when parsing fails.
                if exists(result_fname):
                    os.remove(result_fname)

            silhouettes_scores.append(silhouette_score(data, blocks))
            nmi_scores.append(normalized_mutual_info_score(y, blocks))
    finally:
        # Restore the caller's working directory and remove the graph file
        # regardless of how the loop ended.
        os.chdir(cwd)
        os.remove(fname)

    # Draw on a dedicated figure so curves from earlier plotting calls in the
    # same process do not end up in this image.
    fig, ax = plt.subplots()
    try:
        ax.plot(test_range, nmi_scores, label='NMI')
        ax.plot(test_range, silhouettes_scores, label='Silhouette')
        ax.set_title('Clustering with WSBM on %s' % dataset.title())
        ax.set_xlabel('Blocks')
        ax.set_ylabel('Scores')
        ax.legend()
        fig.savefig('%s/wsbm_blocks_%s.png' % (RESULTS_DIR, dataset))
    finally:
        plt.close(fig)
def fit_predict(self, data):
    """Cluster ``data`` with an SBM over progressively thresholded graphs.

    A graph is induced from ``data`` using ``self.metric`` as the distance.
    For ``self.cutoff_interval`` evenly spaced thresholds between 0 and the
    largest edge weight, the graph is cut off at that threshold, a block model
    is fitted with ``minimize_blockmodel_dl`` and the labelling is scored with
    ``silhouette_score``.  The highest-scoring labelling is returned.

    Parameters
    ----------
    data
        Sample matrix passed to ``induce_graph`` and ``silhouette_score``.

    Returns
    -------
    numpy.ndarray
        The block labelling with the highest silhouette score.

    Raises
    ------
    ValueError
        If no threshold produced a labelling with more than one label.
    """
    graph = induce_graph(data, distance=self.metric)
    weights = graph.edge_properties['weights'].get_array()

    result_blocks = []
    for threshold in np.linspace(0, weights.max(), self.cutoff_interval):
        # NOTE(review): inplace=True presumably prunes `graph` itself, so each
        # pass starts from the previous (lower) threshold's result - confirm
        # against `cutoff`.
        working_graph = cutoff(graph, threshold, inplace=True)

        # Apply the sbm to the pruned graph
        state = minimize_blockmodel_dl(working_graph)
        blocks = state.get_blocks().get_array()

        # Silhouette doesn't work if there's only one cluster label
        if len(np.unique(blocks)) > 1:
            cutoff_score = silhouette_score(data, blocks)
            result_blocks.append((cutoff_score, blocks))

    if not result_blocks:
        raise ValueError(
            'no threshold produced a labelling with more than one block; '
            'cannot select a clustering by silhouette score')

    # Select on the score only: comparing whole (score, array) tuples falls
    # through to comparing the arrays whenever two scores are equal, which
    # raises "truth value of an array is ambiguous".
    _, best_blocks = max(result_blocks, key=lambda entry: entry[0])
    return np.array(best_blocks)
def plot_threshold_sbm_distance_metrics(dataset, split_into=20):
    """Plot the silhouette score of the threshold SBM for every distance metric.

    For each entry of ``DISTANCES`` a graph is induced with the
    ``'<metric>_invexp'`` distance, ``split_into`` evenly spaced thresholds
    between 0 and its largest edge weight are evaluated with
    ``sbm_clustering_nmi_silhouette``, and the silhouette scores are drawn as
    one curve per metric.  The figure is saved to
    ``<RESULTS_DIR>/threshold_clustering_metrics_<dataset>.png``.

    Parameters
    ----------
    dataset : str
        Dataset name given to ``Table``; also used in the title and file name.
    split_into : int
        Number of thresholds evaluated per metric.
    """
    dataset_ = Table(dataset)
    data, y = dataset_.X, dataset_.Y

    # Draw on a dedicated figure so curves from earlier plotting calls in the
    # same process do not end up in this image.
    fig, ax = plt.subplots()
    try:
        for metric in DISTANCES:
            graph = induce_graph(data, distance='%s_invexp' % metric)
            weights = graph.edge_properties['weights'].get_array()
            thresholds = np.linspace(0, weights.max(), split_into)

            # NOTE(review): neither `graph` nor `metric` is handed to
            # sbm_clustering_nmi_silhouette, so the metric only influences the
            # threshold range here - confirm this is intended.
            silhouette_scores = [
                sbm_clustering_nmi_silhouette(data, y, threshold)[1]
                for threshold in thresholds
            ]
            ax.plot(thresholds, silhouette_scores, label=metric)

        ax.set_title(
            'Clustering with SBM after thresholding on %s' % dataset.title())
        ax.set_xlabel('Distance threshold')
        ax.set_ylabel('Silhouette score')
        ax.legend()
        fig.savefig(
            '%s/threshold_clustering_metrics_%s.png' % (RESULTS_DIR, dataset))
    finally:
        plt.close(fig)
def plot_threshold_sbm_components(dataset, split_into=10):
    """Plot silhouette score and NMI of the threshold SBM against the threshold.

    A graph is induced from the dataset's ``X`` matrix and ``split_into``
    evenly spaced thresholds between 0 and its largest edge weight are
    evaluated with ``sbm_clustering_nmi_silhouette``.  Both returned scores
    are drawn as separate curves and the figure is saved to
    ``<RESULTS_DIR>/threshold_clustering_<dataset>.png``.

    Parameters
    ----------
    dataset : str
        Dataset name given to ``Table``; also used in the title and file name.
    split_into : int
        Number of thresholds to evaluate.
    """
    dataset_ = Table(dataset)
    data, y = dataset_.X, dataset_.Y

    graph = induce_graph(data)
    weights = graph.edge_properties['weights'].get_array()
    thresholds = np.linspace(0, weights.max(), split_into)

    nmi_scores, silhouettes_scores = [], []
    for threshold in thresholds:
        nmi, silhouette = sbm_clustering_nmi_silhouette(data, y, threshold)
        nmi_scores.append(nmi)
        silhouettes_scores.append(silhouette)

    # Draw on a dedicated figure so curves from earlier plotting calls in the
    # same process do not end up in this image.
    fig, ax = plt.subplots()
    try:
        ax.plot(thresholds, nmi_scores, label='NMI')
        ax.plot(thresholds, silhouettes_scores, label='Silhouette')
        ax.set_title(
            'Clustering with SBM with thresholding on %s' % dataset.title())
        ax.set_xlabel('Distance threshold')
        ax.set_ylabel('Scores')
        ax.legend()
        fig.savefig('%s/threshold_clustering_%s.png' % (RESULTS_DIR, dataset))
    finally:
        plt.close(fig)
# Draw the iris graph cut off at 0.1, coloured in three groups of 50 vertices,
# and write it to iris_threshold_01.png.
import matplotlib
from Orange.data import Table

# The backend is selected before graph_tool.draw is imported; keep this order.
matplotlib.use('Agg')

from graph_tool.draw import sfdp_layout, graph_draw
from induce_graph import induce_graph, cutoff

iris = Table('iris')
graph = induce_graph(iris)

# One pruned graph per cutoff value.
graph_1 = cutoff(graph, 0.1)
graph_2 = cutoff(graph, 0.2)
graph_3 = cutoff(graph, 0.3)

# Vertex positions are computed on the 0.2 graph and reused for the drawing.
vertex_layout = sfdp_layout(graph_2)

# Three 0-255 RGB colours, each repeated for 50 consecutive vertices and
# scaled to the 0-1 range.
# NOTE(review): assumes the vertices come in three runs of 50 - confirm.
palette = [(70, 190, 250), (237, 70, 47), (170, 242, 43)]
rgb_colors = [
    [channel / 255 for channel in rgb]
    for rgb in palette
    for _ in range(50)
]

colors = graph_1.new_vertex_property('vector<double>')
for node, color in zip(graph_1.vertices(), rgb_colors):
    colors[node] = color

graph_draw(
    graph_1,
    vertex_layout,
    vertex_fill_color=colors,
    output='iris_threshold_01.png',
    output_size=(600, 400),
)