def cluster_vectorspace(self, vectors, trace=False):
    """Agglomeratively merge ``vectors`` down to the requested cluster count.

    Every vector starts out as its own cluster.  On each pass the pair
    selected by ``self.array_max`` from the matrix of pairwise dot products
    of the normalised cluster sums is merged, and the merge is recorded
    (together with its similarity) in ``self._dendogram`` and, when
    ``self._names`` is truthy, in ``self._name_dendogram``.  Merging stops
    once ``max(self._num_clusters, 1)`` clusters remain, or early when
    ``array_max`` reports the same index twice.

    :param vectors: non-empty sequence of equal-length vectors.  It must
        support ``copy.copy``, item assignment and ``del`` (presumably a
        list of numpy arrays -- confirm against callers).
    :param trace: when true, print the indices of each merged pair.
    """
    # One singleton cluster per input vector.
    clusters = [[vec] for vec in vectors]

    # Running (unnormalised) sum of every cluster's members and its
    # normalised counterpart.  copy.copy is the standard-library shallow copy.
    vector_sum = copy.copy(vectors)
    norm_sum = [normalize(total) for total in vector_sum]

    # Column k of cluster_matrix holds the normalised sum of cluster k.
    cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
    for column, unit in enumerate(norm_sum):
        cluster_matrix[:, column] = unit
    dot_store_matrix = numpy.dot(cluster_matrix.T, cluster_matrix)

    while len(clusters) > max(self._num_clusters, 1):
        # array_max yields (first index, second index, similarity, ...).
        best = self.array_max(dot_store_matrix)
        i, j, similarity = best[0], best[1], best[2]
        if i == j:
            print("got stuck when at " + str(len(clusters)) + " clusters")
            break

        merged_members = clusters[i] + clusters[j]
        if trace:
            print('merging %d and %d' % (i, j))
        clusters[i] = merged_members
        del clusters[j]

        # Fold cluster j into slot i, then drop slot j everywhere.
        merged_sum = vector_sum[i] + vector_sum[j]
        merged_unit = normalize(merged_sum)
        vector_sum[i] = merged_sum
        norm_sum[i] = merged_unit
        cluster_matrix[:, i] = merged_unit
        del vector_sum[j]
        del norm_sum[j]
        cluster_matrix = numpy.delete(cluster_matrix, j, axis=1)

        # The whole similarity matrix is rebuilt after every merge.
        dot_store_matrix = numpy.dot(cluster_matrix.T, cluster_matrix)

        self._dendogram.merge(similarity, i, j)
        if self._names:
            self._name_dendogram.merge(similarity, i, j)

        # Progress report every time the cluster count hits a multiple of 50.
        remaining = len(clusters)
        if remaining % 50 == 0:
            print(remaining)
            if self.log_it is not None:
                self.log_it(str(remaining))

    if self._iterative_reassign:
        for count in range(self._max_reassign):
            self.get_iteratively_reassigned_clusters(count, saveit=True)
    self.update_clusters(len(clusters))
def cluster_vectorspace(self, vectors, trace=False):
    """Agglomeratively merge ``vectors`` down to the requested cluster count.

    Every vector starts out as its own cluster.  On each pass the pair
    selected by ``self.array_max`` from the matrix of pairwise dot products
    of the normalised cluster sums is merged and the merge is recorded in
    ``self._dendogram`` (and in ``self._name_dendogram`` when
    ``self._names`` is truthy).  Merging stops once
    ``max(self._num_clusters, 1)`` clusters remain, or early when
    ``array_max`` reports the same index twice.

    Progress (the current cluster count) is reported at most once every
    five seconds: through ``self.msg_handle.dm`` when a handler is set,
    otherwise on standard output.

    :param vectors: non-empty sequence of equal-length vectors.  It must
        support ``copy.copy``, item assignment and ``del`` (presumably a
        list of numpy arrays -- confirm against callers).
    :param trace: when true, print the indices of each merged pair.
    """
    # One singleton cluster per input vector.
    clusters = [[vec] for vec in vectors]

    # Running (unnormalised) sum of every cluster's members and its
    # normalised counterpart.  copy.copy is the standard-library shallow copy.
    vector_sum = copy.copy(vectors)
    norm_sum = [normalize(total) for total in vector_sum]

    # Column k of cluster_matrix holds the normalised sum of cluster k.
    cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
    for column, unit in enumerate(norm_sum):
        cluster_matrix[:, column] = unit

    if self.msg_handle is not None:
        self.msg_handle.dm("initializing dot_store_matrix")
    dot_store_matrix = numpy.dot(cluster_matrix.T, cluster_matrix)

    last_report = time.time()
    while len(clusters) > max(self._num_clusters, 1):
        # array_max yields (first index, second index, ...).
        best = self.array_max(dot_store_matrix)
        i, j = best[0], best[1]
        if i == j:
            print("got stuck when at " + str(len(clusters)) + " clusters")
            break

        merged_members = clusters[i] + clusters[j]
        if trace:
            print('merging %d and %d' % (i, j))
        clusters[i] = merged_members
        del clusters[j]

        # Fold cluster j into slot i, then drop slot j everywhere.
        merged_sum = vector_sum[i] + vector_sum[j]
        merged_unit = normalize(merged_sum)
        vector_sum[i] = merged_sum
        norm_sum[i] = merged_unit
        cluster_matrix[:, i] = merged_unit
        del vector_sum[j]
        del norm_sum[j]
        cluster_matrix = numpy.delete(cluster_matrix, j, axis=1)

        # The whole similarity matrix is rebuilt after every merge.
        dot_store_matrix = numpy.dot(cluster_matrix.T, cluster_matrix)

        self._dendogram.merge(i, j)
        if self._names:
            self._name_dendogram.merge(i, j)

        now = time.time()
        if now - last_report > 5:
            if self.msg_handle is not None:
                self.msg_handle.dm(str(len(clusters)))
            else:
                print(len(clusters))
            # Restart the interval whichever sink was used; otherwise the
            # stdout fallback would fire on every merge once five seconds
            # have elapsed.
            last_report = now

    self.update_clusters(len(clusters))
def get_iteratively_reassigned_clusters(self, num_clusters, max_iterations=100, saveit=True):
    """Refine the ``num_clusters``-way partition by nearest-centroid reassignment.

    Starting from the dendrogram's grouping into ``num_clusters`` clusters
    and the centroids produced by ``self.update_clusters``, every vector is
    moved to the cluster whose centroid has the largest dot product with it,
    after which the centroids are rebuilt from the new membership.  This is
    repeated for at most ``max_iterations`` passes and stops early as soon
    as a pass moves no vector.  After each pass the cluster sizes and the
    number of moved vectors are printed.

    :param num_clusters: number of clusters to start from; also the key
        under which the result is cached.
    :param max_iterations: upper bound on the number of reassignment passes.
    :param saveit: when true, store the final partition in
        ``self._reassigned_clusters`` so that later calls with the same
        ``num_clusters`` return it without recomputing.
    :return: the refined clusters, one list of vectors per centroid (or the
        cached value when one exists for ``num_clusters``).
    :raises AssertionError: if a pass leaves one of the clusters empty.
    """
    if num_clusters in self._reassigned_clusters:
        return self._reassigned_clusters[num_clusters]

    self.update_clusters(num_clusters)
    centroids = self._centroids
    clusters = self._dendogram.groups(num_clusters)

    for _ in range(max_iterations):
        moved = 0
        regrouped = [[] for _ in range(len(self._centroids))]
        for current_index, members in enumerate(clusters):
            for vec in members:
                scores = [numpy.dot(numpy.transpose(centre), vec)
                          for centre in centroids]
                # list.index returns the first maximum, so ties go to the
                # lowest-numbered cluster.
                target = scores.index(max(scores))
                regrouped[target].append(vec)
                if target != current_index:
                    moved += 1

        # Rebuild one centroid per regrouped cluster.
        centroids = []
        for members in regrouped:
            assert len(members) > 0
            if self._should_normalise:
                total = self._normalise(members[0])
                for member in members[1:]:
                    total += self._normalise(member)
            else:
                total = numpy.array(members[0])
                for member in members[1:]:
                    total += member
            # The sum is not divided by the cluster size; it is passed to
            # normalize() instead (presumably a rescale to unit length,
            # which would make the division redundant -- confirm).
            centroids.append(normalize(total))

        print([len(members) for members in regrouped])
        print("Number reassigned = %i" % moved)

        # Compare the next pass against this pass's membership so that
        # "moved" really measures change between consecutive passes.
        clusters = regrouped
        if moved == 0:
            break

    if saveit:
        self._reassigned_clusters[num_clusters] = clusters
    return clusters
def cluster_vectorspace(self, vectors, trace=False):
    """Agglomeratively merge ``vectors`` down to the requested cluster count.

    Every vector starts out as its own cluster.  On each pass the pair
    selected by ``self.array_max`` from the matrix of pairwise dot products
    of the normalised cluster sums is merged and the merge is recorded in
    ``self._dendogram`` (and in ``self._name_dendogram`` when
    ``self._names`` is truthy).  Merging stops once
    ``max(self._num_clusters, 1)`` clusters remain, or early when
    ``array_max`` reports the same index twice.

    :param vectors: non-empty sequence of equal-length vectors.  It must
        support ``copy.copy``, item assignment and ``del`` (presumably a
        list of numpy arrays -- confirm against callers).
    :param trace: when true, print the indices of each merged pair.
    """
    # One singleton cluster per input vector.
    clusters = [[vec] for vec in vectors]

    # Running (unnormalised) sum of every cluster's members and its
    # normalised counterpart.  copy.copy is the standard-library shallow copy.
    vector_sum = copy.copy(vectors)
    norm_sum = [normalize(total) for total in vector_sum]

    # Column k of cluster_matrix holds the normalised sum of cluster k.
    cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
    for column, unit in enumerate(norm_sum):
        cluster_matrix[:, column] = unit
    dot_store_matrix = numpy.dot(cluster_matrix.T, cluster_matrix)

    while len(clusters) > max(self._num_clusters, 1):
        # array_max yields (first index, second index, ...).
        best = self.array_max(dot_store_matrix)
        i, j = best[0], best[1]
        if i == j:
            print("got stuck when at " + str(len(clusters)) + " clusters")
            break

        merged_members = clusters[i] + clusters[j]
        if trace:
            print('merging %d and %d' % (i, j))
        clusters[i] = merged_members
        del clusters[j]

        # Fold cluster j into slot i, then drop slot j everywhere.
        merged_sum = vector_sum[i] + vector_sum[j]
        merged_unit = normalize(merged_sum)
        vector_sum[i] = merged_sum
        norm_sum[i] = merged_unit
        cluster_matrix[:, i] = merged_unit
        del vector_sum[j]
        del norm_sum[j]
        cluster_matrix = numpy.delete(cluster_matrix, j, axis=1)

        # The whole similarity matrix is rebuilt after every merge.
        dot_store_matrix = numpy.dot(cluster_matrix.T, cluster_matrix)

        self._dendogram.merge(i, j)
        if self._names:
            self._name_dendogram.merge(i, j)

        # Progress report every time the cluster count hits a multiple of 50.
        remaining = len(clusters)
        if remaining % 50 == 0:
            print(remaining)

    self.update_clusters(len(clusters))
def cluster_vectorspace(self, vectors, trace=False):
    """Agglomeratively merge ``vectors`` down to the requested cluster count.

    Every vector starts out as its own cluster.  On each pass all pairs of
    clusters are compared by the dot product of their normalised sums, the
    best-scoring pair is merged, and the merge is recorded in
    ``self._dendogram`` (and in ``self._name_dendogram`` when
    ``self._names`` is truthy).  Merging stops once
    ``max(self._num_clusters, 1)`` clusters remain; the centroids are then
    rebuilt for ``self._num_clusters`` clusters.

    :param vectors: sequence of vectors.  It must support ``copy.copy``,
        item assignment and ``del`` (presumably a list of numpy arrays --
        confirm against callers).
    :param trace: when true, print the indices of each merged pair.
    """
    # One singleton cluster per input vector.
    clusters = [[vec] for vec in vectors]

    # Running (unnormalised) sum of every cluster's members and its
    # normalised counterpart.  copy.copy is the standard-library shallow copy.
    vector_sum = copy.copy(vectors)
    norm_sum = [normalize(total) for total in vector_sum]

    while len(clusters) > max(self._num_clusters, 1):
        # Score every unordered pair (a < b).  max() keeps the first entry
        # whose score is strictly greater than all earlier ones, so ties go
        # to the earliest pair in scan order.
        count = len(clusters)
        scored_pairs = (
            (numpy.dot(norm_sum[a], norm_sum[b]), a, b)
            for a in range(count)
            for b in range(a + 1, count)
        )
        best = max(scored_pairs, key=lambda entry: entry[0])

        # Merge the winning pair into slot i and drop slot j.
        i, j = best[1:]
        merged_members = clusters[i] + clusters[j]
        if trace:
            print('merging %d and %d' % (i, j))
        clusters[i] = merged_members
        del clusters[j]

        merged_sum = vector_sum[i] + vector_sum[j]
        vector_sum[i] = merged_sum
        norm_sum[i] = normalize(merged_sum)
        del vector_sum[j]
        del norm_sum[j]

        self._dendogram.merge(i, j)
        if self._names:
            self._name_dendogram.merge(i, j)

        # Progress report every time the cluster count hits a multiple of 50.
        remaining = len(clusters)
        if remaining % 50 == 0:
            print(remaining)

    self.update_clusters(self._num_clusters)
def compute_centroids(self, clusters):
    """Return one centroid per cluster in ``clusters``.

    A centroid is the sum of the cluster's vectors (each passed through
    ``self._normalise`` first when ``self._should_normalise`` is set),
    handed to ``normalize``.

    :param clusters: iterable of non-empty, sliceable sequences of vectors.
    :return: list of centroids, in the same order as ``clusters``.
    :raises AssertionError: if any cluster is empty.
    """
    result = []
    for members in clusters:
        assert len(members) > 0
        if self._should_normalise:
            total = self._normalise(members[0])
            for member in members[1:]:
                total += self._normalise(member)
        else:
            # numpy.array copies, so the in-place adds below leave the
            # cluster's first vector untouched.
            total = numpy.array(members[0])
            for member in members[1:]:
                total += member
        # The sum is not divided by the cluster size; it is passed to
        # normalize() instead (presumably a rescale to unit length, which
        # would make the division redundant -- confirm).
        result.append(normalize(total))
    return result
def update_clusters(self, num_clusters):
    """Rebuild ``self._centroids`` for a ``num_clusters``-way grouping.

    The grouping comes from ``self._dendogram.groups(num_clusters)``.  Each
    centroid is the sum of its cluster's vectors (each passed through
    ``self._normalise`` first when ``self._should_normalise`` is set),
    handed to ``normalize``.  ``self._num_clusters`` is then set to the
    number of centroids actually produced.

    :param num_clusters: number of groups to request from the dendrogram.
    :raises AssertionError: if the dendrogram returns an empty group.
    """
    groups = self._dendogram.groups(num_clusters)
    self._centroids = []
    for members in groups:
        assert len(members) > 0
        if self._should_normalise:
            total = self._normalise(members[0])
            for member in members[1:]:
                total += self._normalise(member)
        else:
            # numpy.array copies, so the in-place adds below leave the
            # group's first vector untouched.
            total = numpy.array(members[0])
            for member in members[1:]:
                total += member
        # The sum is not divided by the group size; it is passed to
        # normalize() instead (presumably a rescale to unit length, which
        # would make the division redundant -- confirm).
        self._centroids.append(normalize(total))
    self._num_clusters = len(self._centroids)
def update_clusters(self, num_clusters):
    """Rebuild ``self._centroids`` for a ``num_clusters``-way grouping.

    Prints a one-line entry message, then takes the grouping from
    ``self._dendogram.groups(num_clusters)``.  Each centroid is the sum of
    its cluster's vectors (each passed through ``self._normalise`` first
    when ``self._should_normalise`` is set), handed to ``normalize``.
    ``self._num_clusters`` is then set to the number of centroids actually
    produced.

    :param num_clusters: number of groups to request from the dendrogram.
    :raises AssertionError: if the dendrogram returns an empty group.
    """
    print("entering update clusters with num_clusters = " + str(num_clusters))
    groups = self._dendogram.groups(num_clusters)
    self._centroids = []
    for members in groups:
        assert len(members) > 0
        if self._should_normalise:
            total = self._normalise(members[0])
            for member in members[1:]:
                total += self._normalise(member)
        else:
            # numpy.array copies, so the in-place adds below leave the
            # group's first vector untouched.
            total = numpy.array(members[0])
            for member in members[1:]:
                total += member
        # The sum is not divided by the group size; it is passed to
        # normalize() instead (presumably a rescale to unit length, which
        # would make the division redundant -- confirm).
        self._centroids.append(normalize(total))
    self._num_clusters = len(self._centroids)