Example #1
0
File: train.py Project: psdh/SaTS
def main():
    """Load the SwedishLeaf training file and run hierarchical clustering.

    Reads "data/SwedishLeaf_TRAIN", splits it on newlines and prints the
    number of resulting lines.  Every non-empty line is split on commas and
    everything after the first field is stored as one 128-wide row of
    ``features``; empty lines (e.g. the one produced by a trailing newline)
    leave their row as zeros.  The feature matrix is then passed to ``calc``
    and ``calcCF``, and finally to ``hcluster`` together with both results.
    """
    filename = "data/SwedishLeaf_TRAIN"

    # Use a context manager so the file handle is closed deterministically,
    # even if read() raises; the original leaked the handle.
    with open(filename, 'r') as handle:
        data = handle.read().split('\n')

    # Single-argument print written with parentheses behaves the same under
    # Python 2 and Python 3.
    print(len(data))
    features = np.zeros(shape=(len(data), 128))

    for i, x in enumerate(data):
        if not x:
            # Blank line: keep the pre-allocated all-zero row.
            continue
        # Drop the first comma-separated field and keep the rest.
        # NOTE(review): the first field is presumably the class label - confirm.
        features[i] = x.split(',')[1:]

    distancesED = calc(features)

    CFdistances = calcCF(features)

    # Result is discarded here.
    # NOTE(review): looks like a leftover spot check for rows 500/501 - confirm.
    CID(500, 501, distancesED, CFdistances)

    print("Now clustering")

    from hcluster import hcluster
    hcluster(features, distancesED, CFdistances)
Example #2
0
def main():
    """Load the SwedishLeaf training file and run hierarchical clustering.

    Reads "data/SwedishLeaf_TRAIN", splits it on newlines and prints the
    number of resulting lines.  Every non-empty line is split on commas and
    everything after the first field is stored as one 128-wide row of
    ``features``; empty lines (e.g. the one produced by a trailing newline)
    leave their row as zeros.  The feature matrix is then passed to ``calc``
    and ``calcCF``, and finally to ``hcluster`` together with both results.
    """
    filename = "data/SwedishLeaf_TRAIN"

    # Use a context manager so the file handle is closed deterministically,
    # even if read() raises; the original leaked the handle.
    with open(filename, 'r') as handle:
        data = handle.read().split('\n')

    # Single-argument print written with parentheses behaves the same under
    # Python 2 and Python 3.
    print(len(data))
    features = np.zeros(shape=(len(data), 128))

    for i, x in enumerate(data):
        if not x:
            # Blank line: keep the pre-allocated all-zero row.
            continue
        # Drop the first comma-separated field and keep the rest.
        # NOTE(review): the first field is presumably the class label - confirm.
        features[i] = x.split(',')[1:]

    distancesED = calc(features)

    CFdistances = calcCF(features)

    # Result is discarded here.
    # NOTE(review): looks like a leftover spot check for rows 500/501 - confirm.
    CID(500, 501, distancesED, CFdistances)

    print("Now clustering")

    from hcluster import hcluster
    hcluster(features, distancesED, CFdistances)
Example #3
0
 def __init__(self, **kwargs):
     """Seed the cluster nodes from a hierarchical clustering of the observations.

     Keyword arguments are looked up by the PARAM_* constants:

     * PARAM_OBS (required): sequence whose items are indexed with ``[0]``
       to obtain the vector that is clustered.
     * PARAM_THRESHOLD (required): converted with ``float``; used as
       ``max_radius`` and, divided by 3, as ``min_dist``.
     * PARAM_ADAPTIVE (optional): converted with ``bool``; defaults to False.

     Raises:
         Exception: if PARAM_OBS or PARAM_THRESHOLD is missing.
     """
     if PARAM_OBS not in kwargs:
         raise Exception(
             "Cluster initialization requires initial observations")
     if PARAM_THRESHOLD not in kwargs:
         raise Exception(
             "Hierarchical cluster initialization requires a distance threshold."
         )

     raw_observations = kwargs[PARAM_OBS]
     vectors = numpy.array([entry[0] for entry in raw_observations])

     self.adaptive = False
     self.min_dist = float(kwargs[PARAM_THRESHOLD]) / 3
     self.max_radius = float(kwargs[PARAM_THRESHOLD])

     # Cut the cluster tree at max_radius and keep one centroid per cluster.
     dendrogram = hcluster(vectors)
     extracted = dendrogram.extract_clusters(self.max_radius)
     self.centroids = numpy.array([cluster.vec for cluster in extracted])

     # One ClusterNode per centroid, built on the reduced vector.
     nodes = []
     for centroid in self.centroids:
         nodes.append(ClusterNode(reduceDim(centroid), 0.001))
     self.clusterNodes = nodes

     if PARAM_ADAPTIVE in kwargs:
         self.adaptive = bool(kwargs[PARAM_ADAPTIVE])
     # Disabled in the original: labelling nodes from a _cluster() pass.
     #trials, dist = self._cluster(obs_raw)
     #for idx in range(len(self.clusterNodes)):
     #    self.clusterNodes[trials[idx]].label = 'group {}'.format(idx)
Example #4
0
def get_folder_clusters(path, imlist):
    n = len(imlist)
    #extract feature vector for each image
    features = np.zeros((n, 3))
    for i in range(n):
        print 'working on ', imlist[i]
        if 'average' in imlist[i]: continue
        pathf = path + imlist[i]
        im = np.array(Image.open(pathf))
        if im.shape[-1] > 3:
            im = to_rgb3b(im)
        R = np.mean(im[:, :, 0].flatten())
        G = np.mean(im[:, :, 1].flatten())
        B = np.mean(im[:, :, 2].flatten())
        features[i] = np.array([R, G, B])
    tree = hcluster.hcluster(features)
    print 'finished generating image clusters'
    return tree
Example #5
0
# -*- coding: utf-8 -*-
import sys
import hcluster
import find_files
import compute_hashes
import time

if __name__ == '__main__' and len(sys.argv) > 1:
    print "searching for images..."
    images = find_files.find_images(sys.argv[1])
    images_length = len(images)
    print images_length, "images found"
    if images_length > 0:
        hash_start_time = time.time()
        print "computing hashes and vectors..."
        hashes = compute_hashes.get_hashes(images)
        hash_end_time = time.time()
        print "all hashes computed in", hash_end_time - hash_start_time
        print "clustering..."
        hcluster.hcluster(hashes)
Example #6
0
#!/usr/bin/env python3
import numpy
import pdb
import hcluster
# Id lists to compare; each inner list is treated as a set below.
list1 = [[292, 1203, 32, 17, 917], [830, 1079, 494], [46, 565, 262, 74, 21, 17, 539], [1050, 58, 52], [1079, 494], [16, 52], [16, 17], [29, 52], [405, 958, 292, 345, 97, 58, 17, 1061, 4, 494], [94, 446, 32, 1592, 194, 9392, 9, 687, 1540, 3029, 32, 52], [4687, 1741, 97, 32, 17, 238], [14, 15, 21, 52], [292, 1203, 32, 17, 917], [491, 48, 494], [806, 17], [1622, 16, 17], [16, 17], [46, 350, 97, 58, 494], [16, 17, 127, 2954], [379, 529, 15, 1516, 1531, 793, 17, 284], [29, 18708, 2687, 44, 1060, 17]]

length = len(list1)

# Build each set once instead of rebuilding both sets for every pair.
id_sets = [set(ids) for ids in list1]

jc_matrix = numpy.zeros((length, length))

# Fill the lower triangle with Jaccard coefficients |A & B| / |A | B|,
# rounded to 3 decimal places.  The diagonal is marked with -1 and the
# upper triangle keeps its initial zeros.
for i in range(length):
    jc_matrix[i][i] = -1
    for j in range(i):
        shared = len(id_sets[i] & id_sets[j])
        combined = len(id_sets[i] | id_sets[j])
        jc_matrix[i][j] = round(float(shared) / float(combined), 3)

# The shebang selects python3, where print is a function; the original
# `print jc_matrix` statement is a SyntaxError there.
print(jc_matrix)
numpy.savetxt('jaccard.txt', jc_matrix, delimiter=" ", fmt="%s")

tree = hcluster.hcluster(jc_matrix, list1)
Example #7
0
# path = 'D:\\GitHub\\PCV-translation-to-Chinese\\data\\sunsets\\flickr-sunsets-small\\'
path = '../data/flickr-sunsets-small/'
# Every .jpg file directly inside `path`.
imlist = [
    os.path.join(path, f) for f in os.listdir(path) if f.endswith('.jpg')
]
# Extract one feature vector per image: a joint color histogram with
# 8 bins per channel, flattened to 8 * 8 * 8 = 512 values.
features = zeros([len(imlist), 512])
for i, f in enumerate(imlist):
    im = array(Image.open(f))
    # Multi-dimensional histogram over the pixels (one row per pixel).
    # `density=True` replaces `normed=True`: NumPy deprecated the `normed`
    # keyword of histogramdd and later removed it, so the old spelling
    # raises TypeError on current releases.
    # NOTE(review): reshape(-1, 3) assumes 3-channel images - confirm inputs.
    h, edges = histogramdd(im.reshape(-1, 3),
                           8,
                           density=True,
                           range=[(0, 255), (0, 255), (0, 255)])
    features[i] = h.flatten()
tree = hcluster.hcluster(features)

# Visualize clusters with some (arbitrary) threshold relative to the
# distance stored on the root node.
clusters = tree.extract_clusters(0.23 * tree.distance)
# Plot images for clusters with more than 3 elements, at most 20 per
# figure on a 4 x 5 grid.
# figure()
for c in clusters:
    elements = c.get_cluster_elements()
    nbr_elements = len(elements)
    if nbr_elements > 3:
        figure()
        subplot(1, 2, 2)
        for p in range(minimum(nbr_elements, 20)):
            subplot(4, 5, p + 1)
            im = array(Image.open(imlist[elements[p]]))
            imshow(im)
Example #8
0
# Sort the edge list so that all edges sharing the same first element
# (the source node) are adjacent.
edgelist.sort()

# Group the second element of every edge by its first element: each row of
# featurematrix holds the item[1] values of one run of equal item[0].
featurematrix = []
fmline = []
# Guarded so that an empty edge list yields an empty matrix instead of an
# IndexError on edgelist[0][0].
temp = edgelist[0][0] if edgelist else None
for item in edgelist:
    newtemp = item[0]
    if newtemp != temp:
        # A new source node starts: store the finished row.
        featurematrix.append(fmline)
        fmline = []
        temp = newtemp
    fmline.append(item[1])

# Flush the last group.  The original never appended it, so the row for the
# final source node was silently dropped.
if fmline:
    featurematrix.append(fmline)

# print(featurematrix)
# print(len(featurematrix))


# Build the cluster tree from the per-node feature rows collected above.
# tree = hclusttest.hcluster(featurematrix)
tree = hcluster.hcluster(featurematrix)
Example #9
0
def main(jobName, inputfile):
    """Read a clustering input file, cluster it and write the result.

    Args:
        jobName: not used by this function; kept so existing callers
            keep working.
        inputfile: path of the file handed to ``readInput``.
    """
    # The context manager closes the file even when readInput raises; the
    # original only closed it on the success path.
    with open(inputfile) as f:
        natoms, SDdata, DistAvg, labels = readInput(f)
    tree = hcluster.hcluster(natoms, labels, SDdata, DistAvg)
    writeOutput(natoms, tree)
Example #10
0
# PCA on all images: every thumbnail is flattened into one row of
# immatrix (typecode 'f', single-precision float) before calling pca.pca.
# NOTE(review): V, S, immean are presumably the principal components,
# the variances and the mean image - confirm against pca.pca.
imlist = imtools.get_imlist('/Users/thakis/Downloads/data/a_thumbs')
imcount = len(imlist)
immatrix = array([array(Image.open(im)).flatten() for im in imlist], 'f')
V, S, immean = pca.pca(immatrix)

# Visualize only selected images: imlist, imcount and immatrix are
# rebound to the selected subset; V and immean from the full set are kept.
imlist = imtools.get_imlist('/Users/thakis/Downloads/data/a_selected_thumbs')
imcount = len(imlist)
immatrix = array([array(Image.open(im)).flatten() for im in imlist], 'f')

# Project on 40 first PCs (mean image subtracted from each row first).
projected = array([dot(V[:40], immatrix[i] - immean) for i in range(imcount)])

# Hierarchical clustering of the projected images; the dendrogram is
# written to out_font.png.
tree = hcluster.hcluster(projected)
hcluster.draw_dendrogram(tree, imlist, filename='out_font.png')

# k means with K clusters on the whitened projection.
# NOTE(review): whiten/kmeans/vq are presumably scipy.cluster.vq - the
# import is outside this snippet.
K = 4
projected = whiten(projected)
centroids, variance = kmeans(projected, K)
code, distance = vq(projected, centroids)

# Plot clusters: one gray-scale figure per cluster with at most 40 slots
# on a 4 x 10 grid.
# NOTE(review): the snippet ends after subplot(); the call that draws the
# image is not visible here.
for k in range(K):
  ind = where(code == k)[0]
  figure()
  gray()
  for i in range(minimum(len(ind), 40)):
    subplot(4, 10, i + 1)
Example #11
0
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 02 16:39:06 2016

@author: user
"""
import hcluster
from numpy import *
from PIL import Image
import os
# Create the toy dataset: two 2-D point clouds of 100 samples each, one
# scaled by 1.5 around the origin and one shifted by (5, 5), stacked into
# a single (200, 2) feature matrix.
class1 = 1.5 * randn(100, 2)
class2 = randn(100, 2) + array([5, 5])
features = vstack((class1, class2))

tree = hcluster.hcluster(features)

# Cut the tree at distance 5, then print the number of clusters followed
# by the elements of each cluster.
clusters = tree.extract_clusters(5)
print len(clusters)
for c in clusters:
    print c.get_cluster_elements()
# Second part: use features computed from the sunset images.
# Create a list of all .jpg images in `path`.
path = '../pcv_data/data/sunsets/flickr-sunsets-small/'
imlist = [
    os.path.join(path, f) for f in os.listdir(path) if f.endswith('.jpg')
]

# Extract feature vector (8 bins per color channel, 8 * 8 * 8 = 512);
# `features` is rebound here, replacing the toy dataset above.
features = zeros([len(imlist), 512])
for i, f in enumerate(imlist):
Example #12
0
def draw_tree(observations, filename):
    """Cluster the observations and render the resulting dendrogram.

    Args:
        observations: sequence of items where ``item[0]`` is the value to
            cluster and ``item[1]`` is the label handed to the drawing routine.
        filename: target passed through to ``draw_dendrogram``.

    Returns:
        ``filename``, unchanged.
    """
    vectors = [obs[0] for obs in observations]
    root = hcluster(vectors)
    labels = [obs[1] for obs in observations]
    draw_dendrogram(root, labels, filename)
    return filename
Example #13
0
# Detect activities in every recording of `dataset` and collect them in
# activitiesofAll (which must already exist before this loop).
for i in range(len(dataset)):
    # Columns 3..8 are taken as the sensor features, column 0 as timestamp.
    feaArr = dataset[i][:,3:9]
    timestamp = dataset[i][:,0]
    filtfea = filtSensordata(feaArr)
    # Convert every timestamp to float (a list under Python 2, which the
    # print statements below require).
    timestamp = map(lambda x:float(x),timestamp)
    # Turn the timestamps into a column vector and prepend it to the
    # filtered features along axis 1.
    timestampArr = np.array([timestamp]).T
    sensordata = np.append(timestampArr,filtfea,1)
    activities = detectActivity(sensordata)
    print "the %dth file contains:%d activities"%(i,len(activities))
    activitiesofAll += activities
print "activity number:%d" % len(activitiesofAll)
# Split the activities into mini activities and extract their features.
activitiesofMini = divideMiniActivity(activitiesofAll)
activitiesfea = feaExtraction(activitiesofMini)
# NOTE(review): n_clusters is not used in the visible code; the hcluster
# call below passes the literal 4 instead - confirm which value is intended.
n_clusters = 3
#x=array([[[1,1,1],[2,2,2],[3,3,3]],[[1,1,1],[2,2,2],[3,3,3]],[[4,4,4],[5,5,5],[6,6,6]],[[4,4,4],[5,5,5],[6,6,6]]])
# hcluster returns a pair here; only the second value is printed.
k,l = hcluster(activitiesfea,4)
print l
#print len(timestampArr),len(feaArr)
'''
rotationArr = getRotationArr(timestamp,filtfea)

plt.figure(1)
ax1 = plt.subplot(311)
plt.ylabel(u'rotationx')
plt.xlabel(u'sample')
plt.plot(range(len(rotationArr)),rotationArr[:,0],color = 'red',linewidth = 1.0)
plt.xlim(0,len(rotationArr))
plt.ylim(-200,200)

ax2 = plt.subplot(312)
plt.ylabel(u'rotationy')
Example #14
0
File: 0605.py Project: ta-oyama/PCV
# Generate an image from the cluster labels: reshape the codes to a
# steps x steps grid and resize it to the image size with
# nearest-neighbour interpolation.
codeim = code.reshape(steps,steps)
codeim = imresize(codeim,im.shape[:2],interp='nearest')
plt.figure()
plt.imshow(codeim)

# projected and imlist are needed again later, so keep them under
# different names for now (both names are rebound further down).
projected2=projected
imlist2=imlist

# 6.2 Hierarchical clustering
# Toy data: two 2-D point clouds of 100 samples each, the second one
# shifted by (5, 5), stacked into a single feature matrix.
class1 = 1.5 * np.random.randn(100,2)
class2 = np.random.randn(100,2) + np.array([5,5])
features = np.vstack((class1,class2))

tree = hcluster.hcluster(features)

# Cut the tree at distance 5.
clusters = tree.extract_clusters(5)

print 'number of clusters', len(clusters)
for c in clusters:
    print c.get_cluster_elements()


# 6.2.1 Clustering images
# Build the list of images (all .jpg files in `path`).
path = 'flickr-sunsets/'
imlist = [os.path.join(path,f) for f in os.listdir(path) if f.endswith('.jpg')]

# Extract feature vectors (8 bins per color channel, 8 * 8 * 8 = 512).
features = np.zeros([len(imlist), 512])
from PIL import Image
from pylab import *
import imtools
import pickle
from scipy.cluster.vq import *

# Get the list of images.
imlist = imtools.get_imlist('selected_fontimages/')
imnbr = len(imlist)

# Load the model file: the mean image is read first, then the matrix V,
# in the order they were pickled.
with open('font_pca_modes.pkl','rb') as f:
  immean = pickle.load(f)
  V = pickle.load(f)

# Build a matrix holding the flattened images, one row per image
# (typecode 'f', single-precision float).
immatrix = array([array(Image.open(im)).flatten()
                  for im in imlist],'f')

# Project onto the first 40 principal components (mean image subtracted
# from each row first).
immean = immean.flatten()
projected = array([dot(V[:40],immatrix[i]-immean)
                   for i in range(imnbr)])

# Hierarchical clustering of the projections; the dendrogram is written
# to fonts.jpg.
import hcluster
tree = hcluster.hcluster(projected) 
hcluster.draw_dendrogram(tree,imlist,filename='fonts.jpg')



Example #16
0
from pylab import *

# Toy data: two 2-D point clouds of 100 samples each, one scaled by 1.5
# around the origin and one shifted by (5, 5), stacked into a single
# (200, 2) feature matrix.
class1 = 1.5 * randn(100,2)
class2 = randn(100,2) + array([5,5])
features = vstack((class1,class2))

import hcluster

# Build the cluster tree over all 200 points.
tree = hcluster.hcluster(features)

# Cut the tree at distance 5.
clusters = tree.extract_clusters(5)

# Report the number of clusters, then the elements of each cluster.
print 'number of clusters', len(clusters)
for c in clusters:
  print c.get_cluster_elements()
Example #17
0
def jcsimilarity(ne1,ne2,conveclist,docs):
  """Cluster context vectors by Jaccard similarity and log the relations.

  Builds a lower-triangular list-of-lists matrix over ``conveclist``: row i
  holds one entry for each j in 0..i, with 0 on the diagonal and
  ``len(A & B) / len(A | B)`` rounded to 3 places elsewhere.  The matrix is
  written to 'jaccard2.txt' and passed with ``conveclist`` to
  ``hcluster.hcluster``; the returned relations are sorted in place.

  If no relations come back the function returns None.  Otherwise
  'wordlist.pkl' and 'NamedEntity.pkl' are loaded and inverted
  (value -> key), and rows ``[inv_ne[ne1], inv_ne[ne2], r, docs]`` are
  appended to 'Relation.csv', where ``r`` lists the inverted-wordlist
  entries of one relation.

  Args:
      ne1, ne2: keys of the inverted NamedEntity mapping.
      conveclist: sequence of iterables that are converted to sets.
      docs: written unchanged as the last CSV column.

  NOTE(review): all CSV output sits under ``len(relations) > 1``, so a
  single relation is never written - confirm this is intended.
  NOTE(review): the pickle files are opened in text mode; Python 3 would
  need 'rb' - confirm the target interpreter.
  """

  length = len(conveclist)


  temp_list = []
  jc_matrix=[]

  # One row per vector; row i only covers columns 0..i (lower triangle).
  for i in range(length):
       for j in range(i+1):
          if i == j:
              temp_list.extend([0])
          else:
              temp_list.extend([round(float(len(set(conveclist[i]) & set(conveclist[j])))/float(len(set(conveclist[i]) | set( conveclist[j]))),3)])

       jc_matrix.append(temp_list)
       temp_list=[]    

  numpy.savetxt('jaccard2.txt',jc_matrix, delimiter=" ", fmt="%s" )       
  # jcc_matrix is not referenced again in the visible code.
  jcc_matrix = [[0],[10,0],[2,4,0],[3,6,5,0],[1,2,3,9,0],[6,7,8,20,2,0]]
  
 # print ('Enter hcluster')
  relations =  hcluster.hcluster(jc_matrix, conveclist)
 # print ('Exit hcluster')
  # Sorting makes equal relations adjacent, which the de-duplication
  # against `prev` below relies on.
  relations.sort()  
  if not relations:
      return
  # relation is not referenced again outside commented-out code.
  relation = []
  fp = open('wordlist.pkl')
  wordlist = pickle.load(fp)
   
  # Inverted mapping: value -> key.
  inv_wordlist = {v : k for k, v in wordlist.items()}
  fp.close()

  fp = open('NamedEntity.pkl')
  ne = pickle.load(fp)
  inv_ne = {v : k for k , v in ne.items()}

  fp.close()
  # prev holds a copy of the last distinct relation; [] means none yet.
  prev = []
 # count = 0
  if (len(relations)) > 1:
     # Consume the relations from the end of the sorted list.
     while len(relations) > 0 : 
         rel = relations.pop()
         if rel != prev:
             
           # A new distinct relation starts: first write out the row built
           # for the previous one (r was filled on an earlier iteration).
           if prev != []:
                    try:
                      with open('Relation.csv', 'a') as myfile:
                         if len(prev) > 0:
                           writer = csv.writer(myfile, dialect='excel')
                           row = [inv_ne[ne1],inv_ne[ne2],r,docs]    # change inv_wordlist with relation
                           writer.writerow(row)
                    except IOError as ioe:
                         print('Error: ' + str(ioe))
                   #relation = []  
                   # count = 0
                     

           # Rebuild r for the current relation.  prev keeps a copy because
           # rel is emptied by pop(), which also reverses the order in r.
           r = []
           prev = rel[:]
           while  len(rel) > 0:
             r.append(inv_wordlist[rel.pop()])
          #   relation.append(r) 
          #   count += 1
        # else:
         #  r = []
         #  prev = rel[:]
         #  while len(rel) > 0:
         #    r.append(inv_wordlist[rel.pop()])
         #    relation.append(r)
         #    count += 1




     # After the loop, write the row of the last distinct relation, which
     # the loop body itself never writes.
     try:
        with open('Relation.csv', 'a') as myfile:
          if len(prev) > 0 : 
            writer = csv.writer(myfile, dialect='excel')
            row = [inv_ne[ne1],inv_ne[ne2], r,docs]
            writer.writerow(row)
     except IOError as ioe:
         print('Error: ' + str(ioe))