def main():
    """Read the SwedishLeaf training file, build distance tables and cluster.

    Each line of ``data/SwedishLeaf_TRAIN`` is split on commas and every
    field after the first is stored in one row of a ``(len(data), 128)``
    feature matrix.  Empty lines are skipped, so their rows stay all-zero.
    The matrix is passed to ``calc`` and ``calcCF``, ``CID`` is evaluated
    for the pair (500, 501), and everything is handed to ``hcluster``.
    """
    filename = "data/SwedishLeaf_TRAIN"
    # Read through a context manager so the handle is always closed; the
    # original left the file object open.
    with open(filename, 'r') as handle:
        data = handle.read().split('\n')
    print(len(data))

    features = np.zeros(shape=(len(data), 128))
    for i, x in enumerate(data):
        if not x:
            # Blank line (e.g. after the final newline): keep the zero row.
            continue
        # Drop the leading field (presumably the class label -- TODO confirm).
        features[i] = x.split(',')[1:]

    distancesED = calc(features)
    CFdistances = calcCF(features)
    CID(500, 501, distancesED, CFdistances)

    print("Now clustering")
    # Imported locally, as in the original, so the name is bound only here.
    from hcluster import hcluster
    hcluster(features, distancesED, CFdistances)
def __init__(self, **kwargs):
    """Create the initial cluster nodes from a hierarchical clustering.

    Keyword arguments are looked up through the module constants:
      PARAM_OBS        required; each item is indexed with [0] to obtain
                       the observation vector.
      PARAM_THRESHOLD  required; converted with float().  It becomes
                       max_radius, and one third of it becomes min_dist.
      PARAM_ADAPTIVE   optional; coerced with bool(), False when absent.

    Raises Exception when either required argument is missing.
    """
    if PARAM_OBS not in kwargs:
        raise Exception(
            "Cluster initialization requires initial observations")
    if PARAM_THRESHOLD not in kwargs:
        raise Exception(
            "Hierarchical cluster initialization requires a distance threshold."
        )

    raw_observations = kwargs[PARAM_OBS]
    vectors = numpy.array([entry[0] for entry in raw_observations])

    self.adaptive = False
    threshold = float(kwargs[PARAM_THRESHOLD])
    self.min_dist = threshold / 3
    self.max_radius = threshold

    # Extract clusters from the tree using max_radius; the vec attribute of
    # every extracted cluster is kept as a centroid.
    hierarchy = hcluster(vectors)
    extracted = hierarchy.extract_clusters(self.max_radius)
    self.centroids = numpy.array([cluster.vec for cluster in extracted])

    nodes = []
    for centroid in self.centroids:
        nodes.append(ClusterNode(reduceDim(centroid), 0.001))
    self.clusterNodes = nodes

    if PARAM_ADAPTIVE in kwargs:
        self.adaptive = bool(kwargs[PARAM_ADAPTIVE])
    return
def get_folder_clusters(path, imlist):
    """Cluster images by the mean value of their first three channels.

    path   -- prefix concatenated (with +) in front of every file name.
    imlist -- image file names; a name containing 'average' is skipped and
              keeps an all-zero feature row.

    Returns the tree built by hcluster.hcluster from the (n, 3) features.
    """
    features = np.zeros((len(imlist), 3))

    for row, name in enumerate(imlist):
        print('%s %s' % ('working on ', name))
        if 'average' in name:
            continue

        pixels = np.array(Image.open(path + name))
        if pixels.shape[-1] > 3:
            # More than three entries on the last axis: convert first.
            pixels = to_rgb3b(pixels)

        # Mean of channel 0, 1 and 2, in that order.
        channel_means = [
            np.mean(pixels[:, :, channel].flatten()) for channel in range(3)
        ]
        features[row] = np.array(channel_means)

    tree = hcluster.hcluster(features)
    print('finished generating image clusters')
    return tree
# -*- coding: utf-8 -*-
# Command-line script: look up images for the path given as the first
# argument, compute their hashes and cluster the hashes.
import sys
import hcluster
import find_files
import compute_hashes
import time

# Only runs as a script, and only when an argument was supplied.
if __name__ == '__main__' and len(sys.argv) > 1:
    print("searching for images...")
    images = find_files.find_images(sys.argv[1])
    images_length = len(images)
    print("%s %s" % (images_length, "images found"))

    if images_length:
        hash_start_time = time.time()
        print("computing hashes and vectors...")
        hashes = compute_hashes.get_hashes(images)
        hash_end_time = time.time()
        elapsed = hash_end_time - hash_start_time
        print("%s %s" % ("all hashes computed in", elapsed))

        print("clustering...")
        hcluster.hcluster(hashes)
#!/usr/bin/env python3
# Build a lower-triangular Jaccard similarity matrix for the id lists in
# list1, print it, save it to jaccard.txt and cluster it with hcluster.
import numpy
import pdb
import hcluster

list1 = [
    [292, 1203, 32, 17, 917],
    [830, 1079, 494],
    [46, 565, 262, 74, 21, 17, 539],
    [1050, 58, 52],
    [1079, 494],
    [16, 52],
    [16, 17],
    [29, 52],
    [405, 958, 292, 345, 97, 58, 17, 1061, 4, 494],
    [94, 446, 32, 1592, 194, 9392, 9, 687, 1540, 3029, 32, 52],
    [4687, 1741, 97, 32, 17, 238],
    [14, 15, 21, 52],
    [292, 1203, 32, 17, 917],
    [491, 48, 494],
    [806, 17],
    [1622, 16, 17],
    [16, 17],
    [46, 350, 97, 58, 494],
    [16, 17, 127, 2954],
    [379, 529, 15, 1516, 1531, 793, 17, 284],
    [29, 18708, 2687, 44, 1060, 17],
]

length = len(list1)
# Convert every list to a set once instead of twice per pair.
item_sets = [set(items) for items in list1]

# Only the lower triangle is filled; the diagonal is marked with -1 and the
# upper triangle keeps the zeros it was created with.
jc_matrix = numpy.zeros((length, length))
for i in range(length):
    for j in range(i):
        shared = len(item_sets[i] & item_sets[j])
        combined = len(item_sets[i] | item_sets[j])
        # |A & B| / |A | B|, rounded to 3 decimal places.
        jc_matrix[i][j] = round(float(shared) / float(combined), 3)
    jc_matrix[i][i] = -1

# print() call: the shebang selects Python 3, where the original
# `print jc_matrix` statement is a SyntaxError.
print(jc_matrix)
numpy.savetxt('jaccard.txt', jc_matrix, delimiter=" ", fmt="%s")
tree = hcluster.hcluster(jc_matrix, list1)
# path = 'D:\\GitHub\\PCV-translation-to-Chinese\\data\\sunsets\\flickr-sunsets-small\\'
path = '../data/flickr-sunsets-small/'

# Collect every .jpg file found directly in `path`.
imlist = []
for f in os.listdir(path):
    if f.endswith('.jpg'):
        imlist.append(os.path.join(path, f))

# One 512-value row per image: a 3-D histogram with 8 bins per axis,
# flattened (8 * 8 * 8 = 512).
features = zeros([len(imlist), 512])
for i, f in enumerate(imlist):
    im = array(Image.open(f))
    h, edges = histogramdd(
        im.reshape(-1, 3),
        8,
        normed=True,
        range=[(0, 255)] * 3,
    )
    features[i] = h.flatten()

tree = hcluster.hcluster(features)

# Extract clusters at an (arbitrary) threshold of 23% of the root distance.
clusters = tree.extract_clusters(0.23 * tree.distance)

# Show one figure per cluster that has more than 3 elements, with at most
# 20 of its images laid out on a 4 x 5 grid.
for c in clusters:
    elements = c.get_cluster_elements()
    nbr_elements = len(elements)
    if nbr_elements <= 3:
        continue

    figure()
    subplot(1, 2, 2)
    shown = minimum(nbr_elements, 20)
    for p in range(shown):
        subplot(4, 5, p + 1)
        im = array(Image.open(imlist[elements[p]]))
        imshow(im)
# Sort the edge list so that all edges sharing the same first field (the
# node the row belongs to) are adjacent.
edgelist.sort()

# Build the feature matrix from the edge list: one row per distinct first
# field, holding the second field of each of that node's edges.
featurematrix = []
fmline = []
temp = edgelist[0][0]
for item in edgelist:
    if item[0] != temp:
        # A new node starts: store the finished row and begin another.
        featurematrix.append(fmline)
        fmline = []
        temp = item[0]
    fmline.append(item[1])
# Store the row of the last node as well.  The original loop appended a row
# only when the next node began, so the final node was silently dropped.
featurematrix.append(fmline)

tree = hcluster.hcluster(featurematrix)
def main(jobName, inputfile):
    """Read the clustering input, run hcluster and write the result.

    jobName   -- accepted but not used inside this function.
    inputfile -- path of the file opened for reading and given to readInput.
    """
    # The with-block closes the file even when readInput raises; the
    # original closed it only on the success path.
    with open(inputfile) as f:
        natoms, SDdata, DistAvg, labels = readInput(f)
    tree = hcluster.hcluster(natoms, labels, SDdata, DistAvg)
    writeOutput(natoms, tree)
# Run PCA over the flattened pixels of every thumbnail.
imlist = imtools.get_imlist('/Users/thakis/Downloads/data/a_thumbs')
imcount = len(imlist)
immatrix = array([array(Image.open(im)).flatten() for im in imlist], 'f')
V, S, immean = pca.pca(immatrix)

# From here on only the selected thumbnails are used.
imlist = imtools.get_imlist('/Users/thakis/Downloads/data/a_selected_thumbs')
imcount = len(imlist)
immatrix = array([array(Image.open(im)).flatten() for im in imlist], 'f')

# Project every mean-centred image onto the first 40 rows of V.
projected = array([dot(V[:40], row - immean) for row in immatrix])

# Hierarchical clustering, drawn as a dendrogram.
tree = hcluster.hcluster(projected)
hcluster.draw_dendrogram(tree, imlist, filename='out_font.png')

# k-means on the whitened projection.
K = 4
projected = whiten(projected)
centroids, variance = kmeans(projected, K)
code, distance = vq(projected, centroids)

# One figure per cluster, with up to 40 cells on a 4 x 10 grid.
for k in range(K):
    ind = where(code == k)[0]
    figure()
    gray()
    cells = minimum(len(ind), 40)
    for i in range(cells):
        subplot(4, 10, i + 1)
# -*- coding: utf-8 -*- """ Created on Fri Sep 02 16:39:06 2016 @author: user """ import hcluster from numpy import * from PIL import Image import os #create the dataset class1 = 1.5 * randn(100, 2) class2 = randn(100, 2) + array([5, 5]) features = vstack((class1, class2)) tree = hcluster.hcluster(features) clusters = tree.extract_clusters(5) print len(clusters) for c in clusters: print c.get_cluster_elements() #use the images features named sunset # create a list of images path = '../pcv_data/data/sunsets/flickr-sunsets-small/' imlist = [ os.path.join(path, f) for f in os.listdir(path) if f.endswith('.jpg') ] # extract feature vector (8 bins per color channel) features = zeros([len(imlist), 512]) for i, f in enumerate(imlist):
def draw_tree(observations, filename):
    """Cluster the observations and draw the resulting dendrogram.

    observations -- items indexed with [0] for the value that is clustered
                    and with [1] for the label passed to draw_dendrogram.
    filename     -- forwarded to draw_dendrogram and returned unchanged.
    """
    vectors = [item[0] for item in observations]
    root = hcluster(vectors)

    labels = [item[1] for item in observations]
    draw_dendrogram(root, labels, filename)
    return filename
# Detect activities in every recording and collect them in activitiesofAll.
for i, record in enumerate(dataset):
    feaArr = record[:, 3:9]
    filtfea = filtSensordata(feaArr)

    # Column 0 holds the timestamps; turn them into a float column vector
    # and put it in front of the filtered feature columns.
    timestamp = [float(x) for x in record[:, 0]]
    timestampArr = np.array([timestamp]).T
    sensordata = np.append(timestampArr, filtfea, 1)

    activities = detectActivity(sensordata)
    print("the %dth file contains:%d activities" % (i, len(activities)))
    activitiesofAll += activities

print("activity number:%d" % len(activitiesofAll))

activitiesofMini = divideMiniActivity(activitiesofAll)
activitiesfea = feaExtraction(activitiesofMini)
n_clusters = 3
k, l = hcluster(activitiesfea, 4)
print(l)

# The original continues with plotting code disabled inside a triple-quoted
# string whose end is not visible here; the visible part is kept as comments:
#     rotationArr = getRotationArr(timestamp,filtfea)
#     plt.figure(1)
#     ax1 = plt.subplot(311)
#     plt.ylabel(u'rotationx')
#     plt.xlabel(u'sample')
#     plt.plot(range(len(rotationArr)),rotationArr[:,0],color = 'red',linewidth = 1.0)
#     plt.xlim(0,len(rotationArr))
#     plt.ylim(-200,200)
#     ax2 = plt.subplot(312)
#     plt.ylabel(u'rotationy')
# Build an image from the cluster labels and show it.
codeim = imresize(code.reshape(steps, steps), im.shape[:2], interp='nearest')
plt.figure()
plt.imshow(codeim)

# projected and imlist are needed again later, so keep them under different
# names for the time being.
projected2, imlist2 = projected, imlist

# 6.2 Hierarchical clustering
class1 = 1.5 * np.random.randn(100, 2)
class2 = np.random.randn(100, 2) + np.array([5, 5])
features = np.vstack([class1, class2])

tree = hcluster.hcluster(features)
clusters = tree.extract_clusters(5)
print('%s %s' % ('number of clusters', len(clusters)))
for c in clusters:
    print(c.get_cluster_elements())

# 6.2.1 Clustering images
# Create the list of images.
path = 'flickr-sunsets/'
imlist = []
for f in os.listdir(path):
    if f.endswith('.jpg'):
        imlist.append(os.path.join(path, f))

# Extract the feature vectors (8 bins per color channel).
features = np.zeros([len(imlist), 512])
from PIL import Image
from pylab import *
import imtools
import pickle
from scipy.cluster.vq import *

# Get the list of images.
imlist = imtools.get_imlist('selected_fontimages/')
imnbr = len(imlist)

# Load the model file: the mean image first, then the matrix V.
with open('font_pca_modes.pkl','rb') as f:
    immean = pickle.load(f)
    V = pickle.load(f)

# Build the matrix that holds the flattened images, one per row.
immatrix = array([array(Image.open(im)).flatten() for im in imlist], 'f')

# Project onto the first 40 principal components.
immean = immean.flatten()
projected = array([dot(V[:40], row - immean) for row in immatrix])

import hcluster

# Cluster the projections and draw the dendrogram.
tree = hcluster.hcluster(projected)
hcluster.draw_dendrogram(tree, imlist, filename='fonts.jpg')
from pylab import *

# Two groups of 100 two-dimensional points; the second is shifted by (5, 5).
class1 = 1.5 * randn(100, 2)
class2 = randn(100, 2) + array([5, 5])
features = vstack([class1, class2])

import hcluster

# Cluster the points and list the members of every cluster extracted at
# threshold 5.
tree = hcluster.hcluster(features)
clusters = tree.extract_clusters(5)
print('%s %s' % ('number of clusters', len(clusters)))
for c in clusters:
    print(c.get_cluster_elements())
def _append_relation_row(row):
    """Append one row to Relation.csv; print I/O errors instead of raising."""
    try:
        with open('Relation.csv', 'a') as myfile:
            csv.writer(myfile, dialect='excel').writerow(row)
    except IOError as ioe:
        print('Error: ' + str(ioe))


def jcsimilarity(ne1, ne2, conveclist, docs):
    """Cluster context vectors by Jaccard similarity and record the relations.

    A lower-triangular matrix of pairwise Jaccard similarities (rounded to
    3 decimal places, 0 on the diagonal) is built from ``conveclist``, saved
    to ``jaccard2.txt`` and passed, together with ``conveclist``, to
    ``hcluster.hcluster``.  Every distinct non-empty relation it returns is
    translated through the inverted ``wordlist.pkl`` mapping and appended to
    ``Relation.csv`` as ``[name of ne1, name of ne2, words, docs]``.

    ne1, ne2   -- values looked up in the inverted ``NamedEntity.pkl`` mapping.
    conveclist -- sequence of iterables of hashable items.
    docs       -- written unchanged as the last CSV field.

    Returns None.  Nothing is written when hcluster returns fewer than two
    relations.  Raises KeyError for ids missing from the inverted mappings
    and ZeroDivisionError when two compared vectors are both empty.
    """
    length = len(conveclist)
    # Build each set once instead of twice per pair.
    vec_sets = [set(vec) for vec in conveclist]

    jc_matrix = []
    for i in range(length):
        row = [
            round(float(len(vec_sets[i] & vec_sets[j]))
                  / float(len(vec_sets[i] | vec_sets[j])), 3)
            for j in range(i)
        ]
        row.append(0)  # similarity of a vector with itself is stored as 0
        jc_matrix.append(row)
    numpy.savetxt('jaccard2.txt', jc_matrix, delimiter=" ", fmt="%s")

    relations = hcluster.hcluster(jc_matrix, conveclist)
    relations.sort()
    if not relations:
        return

    # Pickle files must be opened in binary mode, and the with-blocks close
    # them even when loading fails.
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open('wordlist.pkl', 'rb') as fp:
        wordlist = pickle.load(fp)
    inv_wordlist = {v: k for k, v in wordlist.items()}
    with open('NamedEntity.pkl', 'rb') as fp:
        ne = pickle.load(fp)
    inv_ne = {v: k for k, v in ne.items()}

    # As in the original, a single relation on its own is not recorded.
    if len(relations) <= 1:
        return

    prev = []  # copy of the relation whose words are currently held in r
    r = []
    # relations is sorted, so popping from the end visits equal relations
    # consecutively; each distinct one is written exactly once.
    while relations:
        rel = relations.pop()
        if rel == prev:
            continue
        if prev:
            _append_relation_row([inv_ne[ne1], inv_ne[ne2], r, docs])
        prev = rel[:]
        # Same word order as the original pop() loop (last id first), but the
        # relation list is read rather than emptied, so lists that hcluster
        # may share with conveclist are left intact.
        r = [inv_wordlist[word_id] for word_id in reversed(rel)]

    # Write the last pending relation.
    if prev:
        _append_relation_row([inv_ne[ne1], inv_ne[ne2], r, docs])