def hcluster(blogwords):
    """Build an agglomerative (bottom-up) cluster tree over ``blogwords``.

    Every row starts out as its own ``bicluster`` whose id is the row index.
    The two clusters with the smallest ``pearson_distance`` are then merged
    repeatedly until one cluster is left.  A merged node stores the
    element-wise mean of its two children as its vector, the distance at
    which they were merged, and a fresh negative id (-1, -2, ...).

    Args:
        blogwords: sequence of numeric rows, one per item to cluster.

    Returns:
        The root ``bicluster`` of the merge tree.

    Raises:
        IndexError: if ``blogwords`` is empty (there is no root to return).
    """
    clusters = [
        bicluster(vec=blogwords[i], id=i) for i in range(len(blogwords))
    ]
    # Distances are cached by the pair of cluster ids, so pairs that survive
    # a merge round are not recomputed.
    distances = {}
    next_id = -1
    while len(clusters) > 1:
        # Reset the best pair every round: indices from a previous round are
        # meaningless once clusters have been deleted.
        closest_pair = None
        closest = None
        count = len(clusters)
        for i in range(count - 1):
            for j in range(i + 1, count):
                key = (clusters[i].id, clusters[j].id)
                d = distances.get(key)
                if d is None:
                    d = pearson_distance(clusters[i].vec, clusters[j].vec)
                    distances[key] = d
                # Seed the minimum with the first pair instead of a magic
                # upper bound, so a pair is always selected even when every
                # distance is large.
                if closest_pair is None or d < closest:
                    closest = d
                    closest_pair = (i, j)
        first, second = closest_pair
        left, right = clusters[first], clusters[second]
        # Divide by 2.0 so integer vectors are not truncated by Python 2
        # integer division.
        merged_vec = [(value + right.vec[pos]) / 2.0
                      for pos, value in enumerate(left.vec)]
        merged = bicluster(merged_vec,
                           left=left,
                           right=right,
                           distance=closest,
                           id=next_id)
        next_id -= 1
        # second > first, so delete the higher index first to keep the lower
        # one valid.
        del clusters[second]
        del clusters[first]
        clusters.append(merged)
    return clusters[0]
Exemple #2
0
def kmeans(blogwords, k):
    """Cluster the rows of ``blogwords`` into ``k`` groups with k-means.

    Centroids start at random positions inside the value range of every
    column and are refined for at most 100 rounds; iteration stops early as
    soon as the assignment of rows to clusters no longer changes.  Each round
    prints the round counter and calls ``print_matchs`` on the current and
    on the previous assignment.

    Args:
        blogwords: sequence of numeric rows; every row is indexed over the
            length of the first row.
        k: number of clusters to build.

    Returns:
        A list of ``k`` lists; entry ``j`` holds the indices of the rows
        assigned to cluster ``j`` in the last round.  An empty ``blogwords``
        yields ``k`` empty lists.
    """
    if len(blogwords) == 0:
        # Nothing to cluster; also avoids indexing blogwords[0] below.
        return [[] for _ in range(k)]

    width = len(blogwords[0])
    # Value range of every column, used to place the initial centroids.
    column_ranges = []
    for col in range(width):
        column = [row[col] for row in blogwords]
        column_ranges.append((min(column), max(column)))

    # generate k clusters randomly
    clusters = [
        [random.random() * (high - low) + low for low, high in column_ranges]
        for _ in range(k)
    ]

    lastmatchs = [[] for _ in range(k)]
    matchs = lastmatchs
    rounds = 100
    while rounds > 0:
        matchs = [[] for _ in range(k)]
        # One pre-formatted string prints identically as a print statement
        # and as a print function call.
        print('round \t%s' % rounds)
        for row_idx in range(len(blogwords)):
            best_cluster = None
            best_distance = float('inf')
            for j in range(k):
                dis = pearson_distance(clusters[j], blogwords[row_idx])
                if dis < best_distance:
                    best_distance = dis
                    best_cluster = j
            matchs[best_cluster].append(row_idx)
        print_matchs(matchs)
        print_matchs(lastmatchs)
        if matchs == lastmatchs:
            break
        lastmatchs = [list(members) for members in matchs]
        # move the centroids to the average of their members
        for j in range(k):
            members = matchs[j]
            if not members:
                # An empty cluster keeps its previous centroid instead of
                # collapsing to an all-zero vector.
                continue
            # Average over the number of members, not the row width.
            size = float(len(members))
            clusters[j] = [
                sum(blogwords[m][col] for m in members) / size
                for col in range(width)
            ]
        rounds -= 1
    return matchs
Exemple #3
0
def totalcost(blogwords, costf, medoids_idx) :
    """Assign every row to its nearest medoid and sum the distances.

    Distances come from ``pearson_distance`` and are memoised in the shared
    ``distances_cache`` mapping under the key ``(medoid_index, row_index)``.

    Args:
        blogwords: sequence of numeric rows.
        costf: accepted for interface compatibility; not used here, the
            distance is always ``pearson_distance``.
        medoids_idx: iterable of row indices acting as medoids.

    Returns:
        A tuple ``(total_cost, medoids)`` where ``medoids`` maps each medoid
        index to the list of row indices assigned to it.
    """
    assignments = {}
    for medoid in medoids_idx :
        assignments[medoid] = []
    cost_sum = 0.0
    for row in range(len(blogwords)) :
        # 2.1 serves as the initial "no candidate yet" bound for a row.
        nearest, nearest_cost = None, 2.1
        for medoid in assignments :
            key = (medoid, row)
            dist = distances_cache.get(key, None)
            if dist is None :
                dist = pearson_distance(blogwords[medoid], blogwords[row])
                distances_cache[key] = dist
            if dist < nearest_cost :
                nearest, nearest_cost = medoid, dist
        assignments[nearest].append(row)
        cost_sum += nearest_cost
    return cost_sum, assignments
Exemple #4
0
def totalcost(blogwords, costf, medoids_idx):
    """Assign every row to its nearest medoid and sum the distances.

    Distances come from ``pearson_distance`` and are memoised in the shared
    ``distances_cache`` mapping under the key ``(medoid_index, row_index)``.

    Args:
        blogwords: sequence of numeric rows.
        costf: accepted for interface compatibility; not used here, the
            distance is always ``pearson_distance``.
        medoids_idx: iterable of row indices acting as medoids.

    Returns:
        A tuple ``(total_cost, medoids)`` where ``medoids`` maps each medoid
        index to the list of row indices assigned to it.
    """
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
    total_cost = 0.0
    for i in range(len(blogwords)):
        choice = None
        # 2.1 serves as the initial "no candidate yet" bound for a row.
        min_cost = 2.1
        for m in medoids:
            key = (m, i)
            tmp = distances_cache.get(key)
            # Identity test: a cache miss is signalled by None only.
            if tmp is None:
                tmp = pearson_distance(blogwords[m], blogwords[i])
                distances_cache[key] = tmp
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        medoids[choice].append(i)
        total_cost += min_cost
    return total_cost, medoids
Exemple #5
0
def kmeans(blogwords, k) :
    """Cluster the rows of ``blogwords`` into ``k`` groups with k-means.

    Centroids start at random positions inside the value range of every
    column and are refined for at most 100 rounds; iteration stops early as
    soon as the assignment of rows to clusters no longer changes.  Each round
    prints the round counter and calls ``print_matchs`` on the current and
    on the previous assignment.

    Args:
        blogwords: sequence of numeric rows; every row is indexed over the
            length of the first row.
        k: number of clusters to build.

    Returns:
        A list of ``k`` lists; entry ``j`` holds the indices of the rows
        assigned to cluster ``j`` in the last round.  An empty ``blogwords``
        yields ``k`` empty lists.
    """
    if len(blogwords) == 0:
        # Nothing to cluster; also avoids indexing blogwords[0] below.
        return [[] for _ in range(k)]

    width = len(blogwords[0])
    # Value range of every column, used to place the initial centroids.
    column_ranges = []
    for col in range(width):
        column = [row[col] for row in blogwords]
        column_ranges.append((min(column), max(column)))

    # generate k clusters randomly
    clusters = [
        [random.random() * (high - low) + low for low, high in column_ranges]
        for _ in range(k)
    ]

    lastmatchs = [[] for _ in range(k)]
    matchs = lastmatchs
    rounds = 100
    while rounds > 0:
        matchs = [[] for _ in range(k)]
        # One pre-formatted string prints identically as a print statement
        # and as a print function call.
        print('round \t%s' % rounds)
        for row_idx in range(len(blogwords)):
            best_cluster = None
            best_distance = float('inf')
            for j in range(k):
                dis = pearson_distance(clusters[j], blogwords[row_idx])
                if dis < best_distance:
                    best_distance = dis
                    best_cluster = j
            matchs[best_cluster].append(row_idx)
        print_matchs(matchs)
        print_matchs(lastmatchs)
        if matchs == lastmatchs:
            break
        lastmatchs = [list(members) for members in matchs]
        # move the centroids to the average of their members
        for j in range(k):
            members = matchs[j]
            if not members:
                # An empty cluster keeps its previous centroid instead of
                # collapsing to an all-zero vector.
                continue
            # Average over the number of members, not the row width.
            size = float(len(members))
            clusters[j] = [
                sum(blogwords[m][col] for m in members) / size
                for col in range(width)
            ]
        rounds -= 1
    return matchs
def hcluster(blogwords,blognames) :
    """Build an agglomerative (bottom-up) cluster tree over ``blogwords``.

    Every row starts out as its own ``bicluster`` whose id is the row index.
    The two clusters with the smallest ``pearson_distance`` are then merged
    repeatedly until one cluster is left.  A merged node stores the
    element-wise mean of its two children as its vector, the distance at
    which they were merged, and a fresh negative id (-1, -2, ...).

    Args:
        blogwords: sequence of numeric rows, one per item to cluster.
        blognames: accepted for interface compatibility; not used here.

    Returns:
        The root ``bicluster`` of the merge tree.

    Raises:
        IndexError: if ``blogwords`` is empty (there is no root to return).
    """
    clusters = [
        bicluster(vec=blogwords[i], id=i) for i in range(len(blogwords))
    ]
    # Distances are cached by the pair of cluster ids, so pairs that survive
    # a merge round are not recomputed.
    distances = {}
    next_id = -1
    while len(clusters) > 1:
        # Reset the best pair every round: indices from a previous round are
        # meaningless once clusters have been deleted.
        closest_pair = None
        closest = None
        count = len(clusters)
        for i in range(count - 1):
            for j in range(i + 1, count):
                key = (clusters[i].id, clusters[j].id)
                d = distances.get(key)
                if d is None:
                    d = pearson_distance(clusters[i].vec, clusters[j].vec)
                    distances[key] = d
                # Seed the minimum with the first pair instead of a magic
                # upper bound, so a pair is always selected even when every
                # distance is large.
                if closest_pair is None or d < closest:
                    closest = d
                    closest_pair = (i, j)
        first, second = closest_pair
        left, right = clusters[first], clusters[second]
        # Divide by 2.0 so integer vectors are not truncated by Python 2
        # integer division.
        merged_vec = [(value + right.vec[pos]) / 2.0
                      for pos, value in enumerate(left.vec)]
        merged = bicluster(merged_vec,
                           left=left,
                           right=right,
                           distance=closest,
                           id=next_id)
        next_id -= 1
        # second > first, so delete the higher index first to keep the lower
        # one valid.
        del clusters[second]
        del clusters[first]
        clusters.append(merged)
    return clusters[0]