def update_clustroid(cluster):
    """Move the clustroid of ``cluster`` to the last position, in place.

    The clustroid is the element whose sum of squared ``edit_dist`` values to
    all the other elements of the cluster is the smallest. The chosen element
    is removed from its current position and appended at the end, so that
    ``cluster[-1]`` is the clustroid afterwards.

    Args:
        cluster: mutable list of elements accepted by ``edit_dist``.

    Returns:
        None. ``cluster`` is reordered in place.
    """
    size = len(cluster)
    if size < 2:
        # An empty or single-element cluster has nothing to compare.
        return

    sum_of_squares = [0] * size
    for i in range(size):
        for j in range(i + 1, size):
            # Each unordered pair is evaluated once and credited to BOTH
            # endpoints; otherwise the last element would always score 0.
            # NOTE(review): this assumes edit_dist is symmetric - confirm.
            squared = pow(edit_dist(cluster[i], cluster[j]), 2)
            sum_of_squares[i] += squared
            sum_of_squares[j] += squared

    # NOTE(review): argmin is presumably numpy.argmin (index of the first
    # smallest entry) - confirm against the file's imports.
    c_id = argmin(sum_of_squares)
    cluster.append(cluster[c_id])
    del cluster[c_id]
def set_clustroid_mate(listoclusters):
    """Put the element closest to the clustroid at index -2 of each cluster.

    For every cluster, the last element (``cluster[-1]``) is taken as the
    clustroid. Among all the other elements, the one with the smallest
    ``edit_dist`` to the clustroid is swapped into position ``-2``. On ties
    the element with the lowest index wins.

    Args:
        listoclusters: iterable of mutable lists; each list is one cluster
            with its clustroid stored last.

    Returns:
        None. Each cluster is reordered in place.
    """
    for cluster in listoclusters:
        if len(cluster) < 2:
            # No mate can exist without at least one non-clustroid element.
            continue

        clustroid = cluster[-1]
        # The search state is reset for every cluster so that the result of
        # one cluster can never leak into the next one.
        min_dist = float('inf')
        mate_idx = len(cluster) - 2  # fallback: leave position -2 untouched
        for i, member in enumerate(cluster[:-1]):
            dist = edit_dist(member, clustroid)
            if dist < min_dist:
                min_dist = dist
                mate_idx = i

        cluster[-2], cluster[mate_idx] = cluster[mate_idx], cluster[-2]
def clusterize(listoclusters):
    """Merge clusters in place until one reaches the configured maximum size.

    On every pass the two clusters whose clustroids (their last elements)
    have the smallest ``edit_dist`` are merged: the second cluster is appended
    to the first and removed from ``listoclusters``, and the clustroid of the
    merged cluster is recomputed with ``update_clustroid``. Merging stops as
    soon as the largest cluster has ``settings.CLUSTER_MAX_SIZE`` elements or
    fewer than two clusters remain. Finally ``set_clustroid_mate`` is applied
    to the resulting clusters.

    Args:
        listoclusters: mutable list of non-empty lists. The size counter
            starts at 1, i.e. the input is expected to hold single-element
            clusters.

    Returns:
        None. ``listoclusters`` and its clusters are modified in place.
    """
    cluster_size = 1
    while (len(listoclusters) > 1
           and cluster_size < settings.CLUSTER_MAX_SIZE):
        min_dist = float('inf')
        to_merge = None
        for i in range(len(listoclusters)):
            for j in range(i + 1, len(listoclusters)):
                dist = edit_dist(listoclusters[i][-1], listoclusters[j][-1])
                if dist < min_dist:
                    min_dist = dist
                    to_merge = (i, j)

        if to_merge is None:
            # No pair produced a finite distance; nothing sensible to merge.
            break

        keep, absorb = to_merge
        listoclusters[keep].extend(listoclusters[absorb])
        # Drop the absorbed cluster so its elements do not exist twice.
        # keep < absorb always holds, so the index of the kept cluster is
        # not shifted by this deletion.
        del listoclusters[absorb]
        update_clustroid(listoclusters[keep])
        cluster_size = max(len(members) for members in listoclusters)

    set_clustroid_mate(listoclusters)