def hcluster(blogwords):
    """Build a hierarchical clustering of the rows by repeated pairwise merging.

    Every row of ``blogwords`` starts as its own leaf ``bicluster`` whose id is
    the row index. Each pass merges the two clusters with the smallest
    ``pearson_distance`` into a new node whose vector is the element-wise mean
    of the pair. Merged nodes receive negative ids (-1, -2, ...) in creation
    order.

    Args:
        blogwords: Sequence of equal-length numeric vectors, one per item.

    Returns:
        The root ``bicluster`` of the merge tree.

    Raises:
        IndexError: If ``blogwords`` is empty (there is no cluster to return).
    """
    clusters = [bicluster(vec=vec, id=idx) for idx, vec in enumerate(blogwords)]

    # Pairwise distances are cached by cluster id so surviving pairs are not
    # recomputed on later passes.
    distances = {}
    next_id = -1

    while len(clusters) > 1:
        # Reset the selection on every pass and accept the first pair
        # unconditionally. A fixed upper bound with a strict comparison would
        # leave no pair selected (or a stale pair from the previous pass)
        # whenever every remaining distance reaches that bound.
        closest = None
        closest_dist = None
        count = len(clusters)
        for i in range(count - 1):
            for j in range(i + 1, count):
                key = (clusters[i].id, clusters[j].id)
                if key not in distances:
                    distances[key] = pearson_distance(
                        clusters[i].vec, clusters[j].vec)
                d = distances[key]
                if closest is None or d < closest_dist:
                    closest_dist = d
                    closest = (i, j)

        left_idx, right_idx = closest
        left = clusters[left_idx]
        right = clusters[right_idx]
        # Divide by 2.0 so integer vectors are not floor-divided on Python 2.
        merged_vec = [(a + b) / 2.0 for a, b in zip(left.vec, right.vec)]
        merged = bicluster(merged_vec, left=left, right=right,
                           distance=closest_dist, id=next_id)
        next_id -= 1

        # right_idx > left_idx, so remove the higher index first to keep the
        # lower one valid.
        del clusters[right_idx]
        del clusters[left_idx]
        clusters.append(merged)

    return clusters[0]
def kmeans(blogwords, k):
    """Partition the rows of ``blogwords`` into ``k`` clusters with k-means.

    Centroids start at random positions inside the per-column [min, max] range
    of the data. Each round assigns every row to the centroid with the
    smallest ``pearson_distance`` and then moves each centroid to the mean of
    its members. Iteration stops when the assignment no longer changes or
    after 100 rounds. The round counter and both the current and previous
    assignments are printed every round.

    Args:
        blogwords: Non-empty sequence of equal-length numeric vectors.
        k: Number of clusters to form.

    Returns:
        A list of ``k`` lists; entry ``j`` holds the row indices assigned to
        cluster ``j`` in the last round that ran.
    """
    dims = len(blogwords[0])
    ranges = [
        (min(row[i] for row in blogwords), max(row[i] for row in blogwords))
        for i in range(dims)
    ]

    # Draw order is cluster-major, column-minor, so a seeded ``random``
    # produces the same starting centroids as before.
    clusters = [
        [random.random() * (high - low) + low for low, high in ranges]
        for _ in range(k)
    ]

    matchs = [[] for _ in range(k)]
    lastmatchs = [[] for _ in range(k)]
    rounds = 100
    while rounds > 0:
        matchs = [[] for _ in range(k)]
        # Single-argument form prints the same text on Python 2 and 3.
        print('round \t %s' % rounds)

        for row_idx, row in enumerate(blogwords):
            best_cluster = None
            best_distance = None
            for j in range(k):
                dis = pearson_distance(clusters[j], row)
                if best_cluster is None or dis < best_distance:
                    best_distance = dis
                    best_cluster = j
            matchs[best_cluster].append(row_idx)

        print_matchs(matchs)
        print_matchs(lastmatchs)
        if matchs == lastmatchs:
            break
        lastmatchs = [list(members) for members in matchs]

        # Move each centroid to the average of its members.
        for j, members in enumerate(matchs):
            if not members:
                # An empty cluster keeps its centroid instead of collapsing
                # to the all-zero vector.
                continue
            sums = [0.0] * dims
            for m in members:
                vec = blogwords[m]
                for i in range(dims):
                    sums[i] += vec[i]
            # The mean is taken over the member count, not the column count.
            clusters[j] = [total / len(members) for total in sums]
        rounds -= 1

    return matchs
def totalcost(blogwords, costf, medoids_idx):
    """Assign every row to its nearest medoid and sum the assignment costs.

    Distances are memoised in the module-level ``distances_cache`` under the
    key ``(medoid_index, row_index)``. The key does not include the distance
    function, so the cache should only be shared between calls that use the
    same ``costf``.

    Args:
        blogwords: Sequence of numeric vectors, one per item.
        costf: Distance function taking two vectors. When it is not callable
            (for example ``None``), ``pearson_distance`` is used instead.
        medoids_idx: Iterable of row indices that act as medoids.

    Returns:
        A ``(total_cost, medoids)`` tuple. ``total_cost`` is the sum of each
        row's distance to its nearest medoid; ``medoids`` maps each medoid
        index to the list of row indices assigned to it.

    Raises:
        KeyError: If ``medoids_idx`` is empty while ``blogwords`` is not.
    """
    # Honour the caller's distance function rather than silently ignoring it.
    distance = costf if callable(costf) else pearson_distance

    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []

    total_cost = 0.0
    for row_idx, row in enumerate(blogwords):
        choice = None
        min_cost = None
        for m in medoids:
            key = (m, row_idx)
            cost = distances_cache.get(key)
            if cost is None:
                cost = distance(blogwords[m], row)
                distances_cache[key] = cost
            # The first candidate is always accepted, so no magic upper bound
            # on the distance is needed.
            if choice is None or cost < min_cost:
                choice = m
                min_cost = cost
        medoids[choice].append(row_idx)
        total_cost += min_cost

    return total_cost, medoids
def totalcost(blogwords, costf, medoids_idx):
    """Assign every row to its nearest medoid and sum the assignment costs.

    Distances are memoised in the module-level ``distances_cache`` under the
    key ``(medoid_index, row_index)``. The key does not include the distance
    function, so the cache should only be shared between calls that use the
    same ``costf``.

    Args:
        blogwords: Sequence of numeric vectors, one per item.
        costf: Distance function taking two vectors. When it is not callable
            (for example ``None``), ``pearson_distance`` is used instead.
        medoids_idx: Iterable of row indices that act as medoids.

    Returns:
        A ``(total_cost, medoids)`` tuple. ``total_cost`` is the sum of each
        row's distance to its nearest medoid; ``medoids`` maps each medoid
        index to the list of row indices assigned to it.

    Raises:
        KeyError: If ``medoids_idx`` is empty while ``blogwords`` is not.
    """
    # Honour the caller's distance function rather than silently ignoring it.
    distance = costf if callable(costf) else pearson_distance

    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []

    total_cost = 0.0
    for row_idx, row in enumerate(blogwords):
        choice = None
        min_cost = None
        for m in medoids:
            key = (m, row_idx)
            cost = distances_cache.get(key)
            # Identity test: ``None`` marks a cache miss.
            if cost is None:
                cost = distance(blogwords[m], row)
                distances_cache[key] = cost
            # The first candidate is always accepted, so no magic upper bound
            # on the distance is needed.
            if choice is None or cost < min_cost:
                choice = m
                min_cost = cost
        medoids[choice].append(row_idx)
        total_cost += min_cost

    return total_cost, medoids
def kmeans(blogwords, k):
    """Partition the rows of ``blogwords`` into ``k`` clusters with k-means.

    Centroids start at random positions inside the per-column [min, max] range
    of the data. Each round assigns every row to the centroid with the
    smallest ``pearson_distance`` and then moves each centroid to the mean of
    its members. Iteration stops when the assignment no longer changes or
    after 100 rounds. The round counter and both the current and previous
    assignments are printed every round.

    Args:
        blogwords: Non-empty sequence of equal-length numeric vectors.
        k: Number of clusters to form.

    Returns:
        A list of ``k`` lists; entry ``j`` holds the row indices assigned to
        cluster ``j`` in the last round that ran.
    """
    dims = len(blogwords[0])
    ranges = [
        (min(row[i] for row in blogwords), max(row[i] for row in blogwords))
        for i in range(dims)
    ]

    # Draw order is cluster-major, column-minor, so a seeded ``random``
    # produces the same starting centroids as before.
    clusters = [
        [random.random() * (high - low) + low for low, high in ranges]
        for _ in range(k)
    ]

    matchs = [[] for _ in range(k)]
    lastmatchs = [[] for _ in range(k)]
    rounds = 100
    while rounds > 0:
        matchs = [[] for _ in range(k)]
        # Single-argument form prints the same text on Python 2 and 3.
        print('round \t %s' % rounds)

        for row_idx, row in enumerate(blogwords):
            best_cluster = None
            best_distance = None
            for j in range(k):
                dis = pearson_distance(clusters[j], row)
                if best_cluster is None or dis < best_distance:
                    best_distance = dis
                    best_cluster = j
            matchs[best_cluster].append(row_idx)

        print_matchs(matchs)
        print_matchs(lastmatchs)
        if matchs == lastmatchs:
            break
        lastmatchs = [list(members) for members in matchs]

        # Move each centroid to the average of its members.
        for j, members in enumerate(matchs):
            if not members:
                # An empty cluster keeps its centroid instead of collapsing
                # to the all-zero vector.
                continue
            sums = [0.0] * dims
            for m in members:
                vec = blogwords[m]
                for i in range(dims):
                    sums[i] += vec[i]
            # The mean is taken over the member count, not the column count.
            clusters[j] = [total / len(members) for total in sums]
        rounds -= 1

    return matchs
def hcluster(blogwords, blognames):
    """Build a hierarchical clustering of the rows by repeated pairwise merging.

    Every row of ``blogwords`` starts as its own leaf ``bicluster`` whose id is
    the row index. Each pass merges the two clusters with the smallest
    ``pearson_distance`` into a new node whose vector is the element-wise mean
    of the pair. Merged nodes receive negative ids (-1, -2, ...) in creation
    order.

    Args:
        blogwords: Sequence of equal-length numeric vectors, one per item.
        blognames: Not used by this function; accepted so existing callers
            keep working.

    Returns:
        The root ``bicluster`` of the merge tree.

    Raises:
        IndexError: If ``blogwords`` is empty (there is no cluster to return).
    """
    clusters = [bicluster(vec=vec, id=idx) for idx, vec in enumerate(blogwords)]

    # Pairwise distances are cached by cluster id so surviving pairs are not
    # recomputed on later passes.
    distances = {}
    next_id = -1

    while len(clusters) > 1:
        # Reset the selection on every pass and accept the first pair
        # unconditionally. A fixed upper bound with a strict comparison would
        # leave no pair selected (or a stale pair from the previous pass)
        # whenever every remaining distance reaches that bound.
        closest = None
        closest_dist = None
        count = len(clusters)
        for i in range(count - 1):
            for j in range(i + 1, count):
                key = (clusters[i].id, clusters[j].id)
                if key not in distances:
                    distances[key] = pearson_distance(
                        clusters[i].vec, clusters[j].vec)
                d = distances[key]
                if closest is None or d < closest_dist:
                    closest_dist = d
                    closest = (i, j)

        left_idx, right_idx = closest
        left = clusters[left_idx]
        right = clusters[right_idx]
        # Divide by 2.0 so integer vectors are not floor-divided on Python 2.
        merged_vec = [(a + b) / 2.0 for a, b in zip(left.vec, right.vec)]
        merged = bicluster(merged_vec, left=left, right=right,
                           distance=closest_dist, id=next_id)
        next_id -= 1

        # right_idx > left_idx, so remove the higher index first to keep the
        # lower one valid.
        del clusters[right_idx]
        del clusters[left_idx]
        clusters.append(merged)

    return clusters[0]