def __init__(self, data, linkage='ward', num_clusters=1):
    """
    Prepare the clusterer for the distance matrix C{data}.

    @param data: the distance matrix. NOTE: its diagonal is overwritten
        with infinity IN PLACE, and the very same object is then stored
        on the instance.
    @param linkage: identifier of the linkage method; it is resolved to
        the object stored in C{self.linkage} by calling C{linkage_fn}.
    @param num_clusters: number of clusters, stored on the instance.
    @type num_clusters: C{int}
    """
    # Every item starts out as a singleton cluster that holds only its
    # own index.
    self._dendrogram = Dendrogram([[idx] for idx in range(len(data))])
    # Presumably this keeps an item's distance to itself from ever being
    # picked as the smallest distance -- TODO confirm against the caller.
    numpy.fill_diagonal(data, numpy.inf)
    self._dist_matrix = data
    self.linkage = linkage_fn(linkage)
    self._num_clusters = num_clusters
def __init__(self, data, dist_metric=euclidean_distance,
             linkage=ward_link, num_clusters=1):
    """
    Prepare the clusterer for the distance matrix C{data}.

    @param data: the distance matrix; stored on the instance as is
        (no copy is made here).
    @param dist_metric: accepted but not used by this constructor.
    @param linkage: the linkage function, stored as C{self.linkage}.
    @param num_clusters: number of clusters, stored on the instance.
    @type num_clusters: C{int}
    """
    # Every item starts out as a singleton cluster that holds only its
    # own index.
    self._dendrogram = Dendrogram([[idx] for idx in range(len(data))])
    self._dist_matrix = data
    self.linkage = linkage
    self._num_clusters = num_clusters
def __init__(self, data, dist_metric=euclidean_distance,
             linkage=ward_link, num_clusters=1):
    """
    Store the clustering inputs and build the initial dendrogram.

    @param data: the distance matrix; kept by reference, not copied.
    @param dist_metric: accepted but not used by this constructor.
    @param linkage: the linkage function, kept as C{self.linkage}.
    @param num_clusters: number of clusters, kept on the instance.
    @type num_clusters: C{int}
    """
    self._num_clusters = num_clusters
    self.linkage = linkage
    self._dist_matrix = data
    # One singleton cluster per item, each holding just that item's index.
    singleton_ids = [[position] for position in range(len(data))]
    self._dendrogram = Dendrogram(singleton_ids)
def cluster(self):
    """
    Run the merge loop and return the resulting dendrogram.

    Every point in C{self.points} starts as its own cluster. The best
    merge candidates are merged repeatedly until no more than C{self.k}
    clusters remain, or until a merge attempt leaves the number of
    clusters unchanged. After each successful merge a level is added to
    the dendrogram, keyed by the string form of the value returned by
    C{merge_best_candidates()}.

    @return: the dendrogram holding one level per successful merge, plus
        the initial level (keyed by C{float('inf')}) of singleton clusters.
    """
    singletons = [Cluster(point) for point in self.points]

    dnd = Dendrogram("Goodness")
    dnd.add_level(float('inf'), singletons)

    measure = MergeGoodnessMeasure(self.th)
    all_clusters = RockClusters(singletons, self.link_matrix, measure)

    remaining = all_clusters.size()
    while remaining > self.k:
        goodness = all_clusters.merge_best_candidates()
        previous, remaining = remaining, all_clusters.size()
        if remaining == previous:
            # No linked clusters to merge.
            break
        dnd.add_level(str(goodness), all_clusters.get_all_clusters())
    return dnd

# Ruby reference implementation kept alongside the Python version above:
#
# require 'jaccard_coefficient'
# require 'link_matrix'
# require 'cluster'
# require 'dendrogram'
# require 'merge_goodness_measure'
# require 'rock_clusters'
#
# class RockAlgorithm
#   attr_reader :similarity_measure, :points, :th, :link_matrix, :k
#
#   def initialize(points, k, th)
#     @points = points
#     @k = k
#     @th = th
#     similarity_measure = JaccardCoefficient.new
#     @link_matrix = LinkMatrix.new points, similarity_measure, th
#   end
#
#   def cluster
#     initial_clusters = []
#     points.each do |point|
#       initial_clusters.push Cluster.new(point)
#     end
#     dnd = Dendrogram.new 'Goodness'
#     dnd.add_level('inf', initial_clusters)
#     goodness_measure = MergeGoodnessMeasure.new th
#     all_clusters = RockClusters.new initial_clusters, link_matrix, goodness_measure
#     n_clusters = all_clusters.size
#     while n_clusters > k
#       n_clusters_before_merge = n_clusters
#       g = all_clusters.merge_best_candidates
#       n_clusters = all_clusters.size
#       # No linked clusters to merge
#       break if (n_clusters == n_clusters_before_merge)
#       dnd.add_level(g.to_s, all_clusters.get_all_clusters)
#       puts "Number of clusters: #{all_clusters.get_all_clusters.size}"
#     end
#     #all_clusters.cluster_map.each do |k, c|
#     #  puts c.get_elements.inspect
#     #end
#     dnd
#   end
# end
class Clusterer(AbstractClusterer):
    """
    The Hierarchical Agglomerative Clusterer starts with each of the N
    vectors as singleton clusters. It then iteratively merges pairs of
    clusters which have the smallest distance according to function LINKAGE.
    This continues until only C{num_clusters} clusters are left (a single
    cluster by default).
    """

    def __init__(self, data, dist_metric=euclidean_distance,
                 linkage=ward_link, num_clusters=1):
        """
        @param data: the distance matrix to cluster. It is stored by
            reference; L{cluster} works on a C{copy.copy} of it.
        @param dist_metric: accepted but not used by this class.
        @param linkage: the linkage function. It is called as
            C{linkage(clusters, i, j, dendrogram)} and must return the
            updated distance matrix.
        @param num_clusters: the number of clusters at which L{cluster}
            stops merging (values below 1 are treated as 1).
        @type num_clusters: C{int}
        """
        self._num_clusters = num_clusters
        # Every vector starts out as a singleton cluster holding its index.
        vector_ids = [[i] for i in range(len(data))]
        self._dendrogram = Dendrogram(vector_ids)
        self._dist_matrix = data
        self.linkage = linkage

    def smallest_distance(self, clusters):
        """
        Return the smallest distance in the distance matrix.
        NaN entries are ignored when looking for the minimum.

        @param clusters: an object of the class L{DistanceMatrix} holding
            the clusters at a specific state in the clustering procedure.
        @type clusters: L{DistanceMatrix}
        @return: a tuple containing the smallest distance and the indexes
            of the clusters yielding the smallest distance.
        """
        # nanargmin yields an index into the flattened matrix; convert it
        # back into a (row, column) pair.
        flat_index = numpy.nanargmin(clusters)
        i, j = numpy.unravel_index(flat_index, clusters.shape)
        return clusters[i, j], i, j

    def cluster(self, verbose=0, sum_ess=False):
        """
        Cluster all clusters hierarchically until the level of
        num_clusters is obtained.

        @param verbose: how much output is produced during the clustering
            (0-2): from 1 on the current number of clusters is printed,
            at exactly 2 the distance matrix is printed as well.
        @type verbose: C{int}
        @param sum_ess: if true, the distance recorded for a merge is the
            running sum of all smallest distances so far; otherwise it is
            the smallest distance of that merge only.
        @type sum_ess: C{bool}
        @return: None, destructive method (the dendrogram is modified).
        """
        ## if sum_ess and self.linkage.__name__ != "ward_link":
        ##     raise ValueError(
        ##         "Summing for method other than Ward makes no sense...")
        clusters = copy.copy(self._dist_matrix)
        # The stopping level never changes while merging; compute it once.
        target = max(self._num_clusters, 1)
        summed_ess = 0.0
        while len(clusters) > target:
            # Parenthesised single-argument print behaves identically as a
            # Python 2 statement and as a Python 3 function call.
            if verbose >= 1:
                print('k=%s' % len(clusters))
            if verbose == 2:
                print(clusters)
            best, i, j = self.smallest_distance(clusters)
            # In Ward (1963) ess is summed at each iteration
            # in R's hclust and Python's hcluster and some text books it
            # is not. Here it is optional...
            if sum_ess:
                summed_ess += best
            else:
                summed_ess = best
            clusters = self.update_distmatrix(i, j, clusters)
            self._dendrogram.merge(i, j)
            self._dendrogram._items[i].distance = summed_ess
            # Cluster j has been merged into cluster i; drop it from the
            # distance matrix.
            clusters = clusters.remove(j)

    def update_distmatrix(self, i, j, clusters):
        """
        Update the distance matrix using the specified linkage method so
        that it represents the correct distances to the newly formed
        cluster.

        @return: whatever the linkage function returns; L{cluster} uses it
            as the new distance matrix.
        """
        return self.linkage(clusters, i, j, self._dendrogram)

    def dendrogram(self):
        """Return the dendrogram object."""
        return self._dendrogram

    def num_clusters(self):
        """Return the number of clusters given at construction time."""
        return self._num_clusters

    def __repr__(self):
        return ("<Hierarchical Agglomerative Clusterer(linkage method: %r, "
                "n=%d clusters>") % (self.linkage.__name__,
                                     self._num_clusters)
class Clusterer(AbstractClusterer):
    """
    The Hierarchical Agglomerative Clusterer starts with each of the N
    vectors as singleton clusters. It then iteratively merges pairs of
    clusters which have the smallest distance according to function LINKAGE.
    This continues until only C{num_clusters} clusters are left (a single
    cluster by default).
    """

    def __init__(self, data, linkage='ward', num_clusters=1):
        """
        @param data: the distance matrix to cluster. NOTE: its diagonal is
            overwritten with infinity IN PLACE and the same object is kept
            on the instance; pass a copy if the original must stay intact.
        @param linkage: identifier of the linkage method; it is resolved
            by C{linkage_fn} into the function called as
            C{linkage(clusters, i, j, dendrogram)}.
        @param num_clusters: the number of clusters at which L{cluster}
            stops merging (values below 1 are treated as 1).
        @type num_clusters: C{int}
        """
        self._num_clusters = num_clusters
        # Every vector starts out as a singleton cluster holding its index.
        vector_ids = [[i] for i in range(len(data))]
        self._dendrogram = Dendrogram(vector_ids)
        # An infinite diagonal keeps L{smallest_distance} (a plain argmin)
        # from selecting the distance of a cluster to itself.
        numpy.fill_diagonal(data, numpy.inf)
        self._dist_matrix = data
        self.linkage = linkage_fn(linkage)

    def smallest_distance(self, clusters):
        """
        Return the smallest distance in the distance matrix.

        @param clusters: the distance matrix holding the clusters at a
            specific state in the clustering procedure.
        @return: a tuple containing the smallest distance and the indexes
            of the clusters yielding the smallest distance.
        """
        # argmin yields an index into the flattened matrix; convert it
        # back into a (row, column) pair.
        flat_index = numpy.argmin(clusters)
        i, j = numpy.unravel_index(flat_index, clusters.shape)
        return clusters[i, j], i, j

    def cluster(self, verbose=0, sum_ess=False):
        """
        Cluster all clusters hierarchically until the level of
        num_clusters is obtained.

        @param verbose: how much output is produced during the clustering
            (0-2): from 1 on the current number of clusters is printed,
            at exactly 2 the distance matrix is printed as well.
        @type verbose: C{int}
        @param sum_ess: if true, the distance recorded for a merge is the
            running sum of all smallest distances so far; otherwise it is
            the smallest distance of that merge only.
        @type sum_ess: C{bool}
        @return: None, destructive method (the dendrogram is modified).
        """
        ## if sum_ess and self.linkage.__name__ != "ward_link":
        ##     raise ValueError(
        ##         "Summing for method other than Ward makes no sense...")
        # Work on a copy.copy() of the stored matrix rather than on the
        # stored object itself.
        clusters = copy.copy(self._dist_matrix)
        # The stopping level never changes while merging; compute it once.
        target = max(self._num_clusters, 1)
        summed_ess = 0.0
        while len(clusters) > target:
            # Parenthesised single-argument print behaves identically as a
            # Python 2 statement and as a Python 3 function call.
            if verbose >= 1:
                print('k=%s' % len(clusters))
            if verbose == 2:
                print(clusters)
            best, i, j = self.smallest_distance(clusters)
            # In Ward (1963) ess is summed at each iteration
            # in R's hclust and Python's hcluster and some text books it
            # is not. Here it is optional...
            if sum_ess:
                summed_ess += best
            else:
                summed_ess = best
            clusters = self.update_distmatrix(i, j, clusters)
            self._dendrogram.merge(i, j)
            self._dendrogram[i].distance = summed_ess
            # Cluster j has been merged into cluster i: keep every row and
            # column except j.
            indices = numpy.arange(clusters.shape[0])
            indices = indices[indices != j]
            clusters = clusters.take(indices, axis=0).take(indices, axis=1)

    def update_distmatrix(self, i, j, clusters):
        """
        Update the distance matrix using the specified linkage method so
        that it represents the correct distances to the newly formed
        cluster.

        @return: whatever the linkage function returns; L{cluster} uses it
            as the new distance matrix.
        """
        return self.linkage(clusters, i, j, self._dendrogram)

    def dendrogram(self):
        """Return the dendrogram object."""
        return self._dendrogram

    def num_clusters(self):
        """Return the number of clusters given at construction time."""
        return self._num_clusters

    def __repr__(self):
        return ("<Hierarchical Agglomerative Clusterer(linkage method: %r, "
                "n=%d clusters>") % (self.linkage.__name__,
                                     self._num_clusters)
from modularity import find_best_splits

if __name__ == "__main__":
    # Finds the best clustering for each of the given ego network files.
    # Stores output to file called submission.csv
    ego_nets = read_data(argv[1])

    # Process the ego networks in order of increasing size.
    # items() is used instead of the Python 2-only iteritems() so that the
    # script also runs on Python 3.
    # NOTE(review): assumes read_data returns a dict-like mapping -- confirm.
    tup_ls = sorted(ego_nets.items(), key=lambda t: t[1].size)

    # Good sets (small) to test on are 25708, and 1310
    # To start from a particular ego network instead of the first one, use
    #     start = 8338
    #     index = [k for k, v in tup_ls].index(start)
    # and open the output below with open("out.txt", "a") (append) instead.
    index = 0

    # The with-block closes the output file even when processing one of
    # the ego networks raises.
    with open("submission.csv", "w") as out:
        for uid, ego_net in tup_ls[index:]:
            print("Analyzing ego network {0}".format(uid))
            dendrogram = Dendrogram(ego_net)
            size = ego_net.size
            best_split = find_best_splits(dendrogram.levels, size)
            circles = dendrogram.convert_to_circles()[best_split]
            # One output line per ego network:
            # "<uid>,<number of circles>,<circle>;<circle>;..." where every
            # circle is a space-separated list of its members.
            circle_fields = [" ".join(str(fid) for fid in circle)
                             for circle in circles]
            circ_str = (str(uid) + "," + str(len(circles)) + ","
                        + ";".join(circle_fields))
            print(circ_str, file=out)
            print("Best split level for ego network {0} is {1}".format(
                uid, best_split))
def cluster(self):
    """
    Run the merge loop and return the resulting dendrogram.

    Every point in C{self.points} starts as its own cluster. The best
    merge candidates are merged repeatedly until no more than C{self.k}
    clusters remain, or until a merge attempt leaves the number of
    clusters unchanged. After each successful merge a level is added to
    the dendrogram, keyed by the string form of the value returned by
    C{merge_best_candidates()}.

    @return: the dendrogram holding one level per successful merge, plus
        the initial level (keyed by C{float('inf')}) of singleton clusters.
    """
    singletons = [Cluster(point) for point in self.points]

    dnd = Dendrogram("Goodness")
    dnd.add_level(float('inf'), singletons)

    measure = MergeGoodnessMeasure(self.th)
    all_clusters = RockClusters(singletons, self.link_matrix, measure)

    remaining = all_clusters.size()
    while remaining > self.k:
        goodness = all_clusters.merge_best_candidates()
        previous, remaining = remaining, all_clusters.size()
        if remaining == previous:
            # No linked clusters to merge.
            break
        dnd.add_level(str(goodness), all_clusters.get_all_clusters())
    return dnd

# Ruby reference implementation kept alongside the Python version above:
#
# require 'jaccard_coefficient'
# require 'link_matrix'
# require 'cluster'
# require 'dendrogram'
# require 'merge_goodness_measure'
# require 'rock_clusters'
#
# class RockAlgorithm
#   attr_reader :similarity_measure, :points, :th, :link_matrix, :k
#
#   def initialize(points, k, th)
#     @points = points
#     @k = k
#     @th = th
#     similarity_measure = JaccardCoefficient.new
#     @link_matrix = LinkMatrix.new points, similarity_measure, th
#   end
#
#   def cluster
#     initial_clusters = []
#     points.each do |point|
#       initial_clusters.push Cluster.new(point)
#     end
#     dnd = Dendrogram.new 'Goodness'
#     dnd.add_level('inf', initial_clusters)
#     goodness_measure = MergeGoodnessMeasure.new th
#     all_clusters = RockClusters.new initial_clusters, link_matrix, goodness_measure
#     n_clusters = all_clusters.size
#     while n_clusters > k
#       n_clusters_before_merge = n_clusters
#       g = all_clusters.merge_best_candidates
#       n_clusters = all_clusters.size
#       # No linked clusters to merge
#       break if (n_clusters == n_clusters_before_merge)
#       dnd.add_level(g.to_s, all_clusters.get_all_clusters)
#       puts "Number of clusters: #{all_clusters.get_all_clusters.size}"
#     end
#     #all_clusters.cluster_map.each do |k, c|
#     #  puts c.get_elements.inspect
#     #end
#     dnd
#   end
# end