Example #1
def getKmeans(a, k, threshold=1, iter=40, thresh=1e-05, minit="random", missing="warn"):
    """input : a, k threshold
        output : atk
        """
    if minit == "matrix":
        seeds, k = k, len(k)
    a.k = k  # initialise (could move it to __init__ but not bothered for the moment)
    height, width = a.matrix.shape
    pixels = a.matrix > threshold
    print "width, height:", width, height  # debug
    print "sum of relevant pixels:", sum(sum(pixels))  # debug
    dataPoints = [[(i, j) for i in range(width) if pixels[j, i]] for j in range(height)]
    dataPoints = sum(dataPoints, [])
    dataPoints = np.array(dataPoints)
    print(dataPoints[:20])  # debug
    if minit == "matrix":
        a.centroids = kmeans2(data=dataPoints, k=seeds, iter=iter, thresh=thresh, minit=minit, missing=missing)
    else:
        a.centroids = kmeans2(data=dataPoints, k=k, iter=iter, thresh=thresh, minit=minit, missing=missing)
    a.data = dataPoints

    resultPattern = ma.zeros((height, width))
    resultPattern.mask = True
    resultPattern.fill_value = -999
    for i in range(len(dataPoints)):
        resultPattern[dataPoints[i][1], dataPoints[i][0]] = a.centroids[1][i]
    resultPattern = dbz(
        name="Clustering for %s with %d clusters" % (a.name, k + 1), matrix=resultPattern, vmin=0, vmax=k
    )

    atk = {"centroids": a.centroids, "data": a.data, "pattern": resultPattern}
    return atk
Example #2
    def _discover_centroids(self, dataset_input):
        self.centroids, labels = kmeans2(dataset_input, self.n_centroids)
        while np.unique(labels).shape[0] != self.n_centroids:
            # print "Empty cluster found. Retrying kmeans.."
            self.centroids, labels = kmeans2(dataset_input, self.n_centroids)

        return (self.centroids, labels)
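Example #2 guards against empty clusters by re-running kmeans2 until every label appears. A minimal alternative sketch of the same retry idea, leaning on SciPy's own empty-cluster signalling (missing='raise' makes kmeans2 raise ClusterError, as the test examples further down also exercise):

from scipy.cluster.vq import kmeans2, ClusterError

def kmeans2_retry(data, k, max_tries=10):
    # missing='raise' aborts with ClusterError whenever a cluster ends up
    # with no points, so each failure simply triggers a fresh random init
    for _ in range(max_tries):
        try:
            return kmeans2(data, k, missing='raise')
        except ClusterError:
            continue
    raise ClusterError("no run produced %d non-empty clusters" % k)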
Example #3
 def RunClustering(self,N,vector,K0):
     data = vector.reshape(N**2,3) 
     import scipy.cluster.vq as vq
     resmap,indexmap = vq.kmeans2(data,K0,iter=50,minit='random') 
     newresmap,indexmap = vq.kmeans2(data,resmap,iter=50,minit='matrix')
     self.indexmap = indexmap.reshape(N,N)
     self.CheckTopology(N)
Example #4
    def _init_responsibilities( self, data ):
        '''
        Initialise responsibilities via k-means clustering.
        '''
        a_1 = np.asarray( data.a['normal'], dtype=np.float64 )
        b_1 = np.asarray( data.b['normal'], dtype=np.float64 )
        p_1 = a_1 / ( a_1 + b_1 )
              
        a_2 = np.asarray( data.a['tumour'], dtype=np.float64 )
        b_2 = np.asarray( data.b['tumour'], dtype=np.float64 )
        p_2 = a_2 / ( a_2 + b_2 )

        shape = ( data.nrows, 9 )
        
        responsibilities = np.zeros( shape )
        
        init_centers = np.array( ( 1., 0.5, 0. ) )
        
        cluster_centers_1, labels_1 = kmeans2( p_1, init_centers, minit='matrix' )
        cluster_centers_2, labels_2 = kmeans2( p_2, init_centers, minit='matrix' )

        labels = 3 * labels_1 + labels_2

        for id in range( 9 ):
            index = labels == id
            
            responsibilities[index, id] = 1.
        
        self.responsibilities = responsibilities
Example #5
def kMeansCluster(x, k, trials):
    """kMeansCluster performs k means clustering on a dataset

    :param x: a data object (must contain field 'data')
    :type x: dict
    :param k: the number of centroids to cluster to
    :type k: int
    :param trials: the number of times to run kmeans2 (it is run with both 'random'
        and 'points' initialisation; the better of the two runs is used)
    :type trials: int
    :returns: a dictionary with keys idx and cents.
        idx is the group number for each protein (in the order given in the x data object);
        cents is a list of rowVectors with the centroids for each cluster

    """
    data = x['data']

    centsR, idxR = scv.kmeans2(data.copy(), k, iter=trials, minit='random')
    centsP, idxP = scv.kmeans2(data.copy(), k, iter=trials, minit='points')
    distR = calcDistortion(centsR, idxR, data)
    distP = calcDistortion(centsP, idxP, data)

    if distR > distP:
        centsR = centsP
        idxR = idxP
        distR = distP
    return {'idx': idxR, 'cents':centsR}
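The snippet above (Example #5) assumes a calcDistortion helper that scores a clustering. A minimal sketch of one plausible definition (an assumption, not necessarily the original) that sums squared distances from each point to its assigned centroid:

import numpy as np

def calcDistortion(cents, idx, data):
    # vector from every point to the centroid of its assigned cluster
    diffs = data - cents[idx]
    return float(np.sum(diffs ** 2))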
Example #6
def cluster(dataArray):
	warnings.filterwarnings('error')
	bestKmeans=None

	#Crude retry loop: filterwarnings('error') above turns the numpy
	#empty-cluster warning into an exception, so a failed run is retried
	while bestKmeans is None:
		try:
			bestKmeans, bestMapping=kmeans2(dataArray, 5)
		except Warning:
			pass
	minDB=DaviesBouldinIndex(bestKmeans, bestMapping, dataArray).getDBindex()
	for numClusters in range(5,11):
		kmeans=None
		while kmeans is None:
			try:
				kmeans, mapping=kmeans2(dataArray, numClusters)
			except Warning:
				pass

		#print "Valid cluster created with numClusters:%i." % numClusters

		db=DaviesBouldinIndex(kmeans, mapping, dataArray).getDBindex()
		if db<minDB:
			minDB=db
			bestKmeans=kmeans
			bestMapping=mapping

	return bestKmeans, minDB, bestMapping
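Example #6 depends on a DaviesBouldinIndex helper class that is not shown. A hypothetical function-style sketch computing the standard Davies-Bouldin score (lower is better), offered only to make the selection loop self-contained:

import numpy as np

def davies_bouldin(centroids, mapping, data):
    # DB = (1/k) * sum_i max_{j != i} (s_i + s_j) / d_ij, where s_i is the
    # mean distance of cluster i's points to its centroid and d_ij is the
    # distance between centroids i and j
    k = len(centroids)
    s = np.array([np.linalg.norm(data[mapping == i] - centroids[i], axis=1).mean()
                  for i in range(k)])
    worst = [max((s[i] + s[j]) / np.linalg.norm(centroids[i] - centroids[j])
                 for j in range(k) if j != i)
             for i in range(k)]
    return sum(worst) / k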
Example #7
    def test_kmeans2_simple(self):
        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
        code = initc.copy()
        code1 = kmeans2(X, code, iter=1)[0]
        code2 = kmeans2(X, code, iter=2)[0]

        assert_array_almost_equal(code1, CODET1)
        assert_array_almost_equal(code2, CODET2)
Example #8
    def test_kmeans2_simple(self):
        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
        for tp in np.array, np.matrix:
            code1 = kmeans2(tp(X), tp(initc), iter=1)[0]
            code2 = kmeans2(tp(X), tp(initc), iter=2)[0]

            assert_array_almost_equal(code1, CODET1)
            assert_array_almost_equal(code2, CODET2)
Example #9
    def test_kmeans2_rank1(self):
        data = TESTDATA_2D
        data1 = data[:, 0]

        initc = data1[:3]
        code = initc.copy()
        kmeans2(data1, code, iter=1)[0]
        kmeans2(data1, code, iter=2)[0]
Example #10
	def train(self,white=False):
		'''
			each call to train() recomputes the centroids and labels from scratch
		'''
		if (white):
			self.centroids,self.labels=kmeans2(whiten(self.X),self.K,minit='random', missing='warn')
		else:
			self.centroids,self.labels=kmeans2(self.X,self.K,minit='random', missing='warn')
Example #11
 def test_kmeans2_empty(self):
     """Ticket #505."""
     try:
         kmeans2([], 2)
         raise AssertionError("This should not succeed.")
     except ValueError:
         # OK, that's what we expect
         pass
Example #12
    def test_kmeans2_simple(self):
        """Testing simple call to kmeans2 and its results."""
        initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
        code = initc.copy()
        code1 = kmeans2(X, code, iter=1)[0]
        code2 = kmeans2(X, code, iter=2)[0]

        assert_array_almost_equal(code1, CODET1)
        assert_array_almost_equal(code2, CODET2)
Example #13
    def test_kmeans2_rank1(self):
        data = np.fromfile(DATAFILE1, sep=", ")
        data = data.reshape((200, 2))
        data1 = data[:, 0]

        initc = data1[:3]
        code = initc.copy()
        kmeans2(data1, code, iter=1)[0]
        kmeans2(data1, code, iter=2)[0]
Example #14
    def test_kmeans2_rank1(self):
        """Testing simple call to kmeans2 with rank 1 data."""
        data = np.fromfile(DATAFILE1, sep=", ")
        data = data.reshape((200, 2))
        data1 = data[:, 0]
        data2 = data[:, 1]

        initc = data1[:3]
        code = initc.copy()
        code1 = kmeans2(data1, code, iter=1)[0]
        code2 = kmeans2(data1, code, iter=2)[0]
Example #15
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = np.fromfile(DATAFILE1, sep=", ")
        data = data.reshape((200, 2))
        initk = np.array([[-1.8127404, -0.67128041], [2.04621601, 0.07401111], [-2.31149087, -0.05160469]])

        kmeans(data, initk)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            kmeans2(data, initk, missing="warn")

        assert_raises(ClusterError, kmeans2, data, initk, missing="raise")
Example #16
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = TESTDATA_2D
        initk = np.array([[-1.8127404, -0.67128041],
                         [2.04621601, 0.07401111],
                         [-2.31149087,-0.05160469]])

        kmeans(data, initk)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', UserWarning)
            kmeans2(data, initk, missing='warn')

        assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
Example #17
def gencode(image, k, oldcenters=None):
  t1 = time.time()
  npix = image.size // 3
  P = np.reshape(image, (npix, 3), order='F')
  Pw = vq.whiten(P)
  if oldcenters is None:
    (centers, label) = vq.kmeans2(Pw, k, iter=30)
  else:
    (centers, label) = vq.kmeans2(Pw, oldcenters, iter=5)
  (code, distortion) = vq.vq(Pw, centers)
  code = np.reshape(code, image.shape[0:2], order='F')
  print(time.time() - t1)
  return code, centers
Example #18
    def kmeans(self, id, k=5, is_row=True):
        """
        K-means clustering. http://en.wikipedia.org/wiki/K-means_clustering

        Clusterizes the (cols) values of a given row, or vice versa

        :param id: row (or col) id to cluster its values
        :param k: number of clusters
        :param is_row: is param *id* a row (or a col)?
        :type is_row: Boolean
        """
        # TODO: switch to Pycluster?
        # http://pypi.python.org/pypi/Pycluster
        if VERBOSE:
            sys.stdout.write('Computing k-means, k=%s, for id %s\n' % (k, id))
        point = None
        if is_row:
            point = self.get_matrix().get_row(id)
        else:
            point = self.get_matrix().get_col(id)
        points = []
        points_id = []
        for i in point.nonzero_entries():
            label = point.label(i)
            points_id.append(label)
            if not is_row:
                points.append(self.get_matrix().get_row(label))
            else:
                points.append(self.get_matrix().get_col(label))
        #return kmeans(array(points), k)
        if VERBOSE:
            sys.stdout.write('id %s has %s points\n' % (id, len(points)))
        M = array(points)

        MAX_POINTS = 150
        # Only apply Matrix initialization if num. points is not that big!
        if len(points) <= MAX_POINTS:
            centers = self._kinit(array(points), k)
            centroids, labels = kmeans2(M, centers, minit='matrix')
        else:
            centroids, labels = kmeans2(M, k, minit='random')
        i = 0
        clusters = dict()
        for cluster in labels:
            if cluster not in clusters:
                clusters[cluster] = dict()
                clusters[cluster]['centroid'] = centroids[cluster]
                clusters[cluster]['points'] = []
            clusters[cluster]['points'].append(points_id[i])
            i += 1
        return clusters
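Example #18 assumes a self._kinit helper that returns the k seed centers fed to kmeans2 via minit='matrix'. A minimal k-means++-style seeding sketch (a hypothetical stand-in, not necessarily the original helper):

import numpy as np

def kinit(X, k):
    # first center uniform at random; each further center drawn with
    # probability proportional to squared distance from the nearest
    # already-chosen center
    centers = [X[np.random.randint(len(X))]]
    while len(centers) < k:
        d2 = np.min([((X - c) ** 2).sum(axis=1) for c in centers], axis=0)
        centers.append(X[np.random.choice(len(X), p=d2 / d2.sum())])
    return np.array(centers)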
Example #19
    def test_kmeans_lost_cluster(self):
        # This will cause kmeans to have a cluster with no points.
        data = TESTDATA_2D
        initk = np.array([[-1.8127404, -0.67128041],
                         [2.04621601, 0.07401111],
                         [-2.31149087,-0.05160469]])

        with suppress_warnings() as sup:
            sup.filter(UserWarning,
                       "One of the clusters is empty. Re-run kmean with a different initialization")
            kmeans(data, initk)
            kmeans2(data, initk, missing='warn')

        assert_raises(ClusterError, kmeans2, data, initk, missing='raise')
Example #20
def seq_cluster(traj,seq_len,stride,K):
    '''
    Put several frames together to be clustered as a sequence
    
    Args: 
        - traj: the trajectory object
        - seq_len: length of each sequence
        - stride: steps for moving the sequence window
        - K: number of clusters
    Return: labels
    '''
    
    # get flattened coordinates of each frame
    coords = traj.xyz
    coords = np.reshape(coords,(len(coords),-1))
#    print(np.shape(coords))
    # compute the covariance of each sequence as the features
    seqs = []
    for i in range(0,len(coords),stride):
        # covariance matrix of the coordinates
        covm = np.cov(np.transpose(coords[i:i+seq_len]))
#        print(np.shape(covm))
        seqs.append(np.diag(covm))
#    print(np.shape(seqs))

    centroids, labels = kmeans2(np.asarray(seqs),K,iter=100)

    # test clustering consistency using classification
    clf = KNeighborsClassifier() # knn works best for alanine coords
    data_scores = cross_val_score(clf,seqs,labels,cv=5)
    print("Accuracy with 5 folds: %0.2f (+/- %0.2f)" % 
        (data_scores.mean(), data_scores.std()))

    return labels
Example #21
    def _init_responsibilities( self, data ):
        '''
        Initialise responsibilities via k-means clustering.
        '''
        shape = ( data.nrows, self.ncomponents )
        
        responsibilities = np.zeros( shape )
        
        labels = {}
        for genome in constants.genomes:
            a = np.asarray( data.a[genome], dtype=np.float64 )
            b = np.asarray( data.b[genome], dtype=np.float64 )
            d = a + b
            p = a / d
              
            init_centers = np.linspace( 1, 0, self.nclass[genome] )                    
            
            clustering_result = kmeans2( p, init_centers, minit='matrix' )
            
            labels[genome] = clustering_result[1]
            
            print "Initial class ceneters : ", clustering_result[0]

        labels = self.nclass['normal'] * labels['normal'] + labels['tumour']

        for id in range( self.ncomponents ):
            indices = ( labels == id )
            
            responsibilities[indices, id] = 1.
        
        self.responsibilities = responsibilities
Example #22
 def _init_kmeans(self, num_comp):
     """Initialize using k-means
     """
     (init_mean, labels) = kmeans2(self.data, num_comp)
     init_covar = self._get_covar(self.data, labels)
     init_mixweights = element_weights(labels)
     return (init_mean, labels, init_covar, init_mixweights)
Example #23
def kmeans(dataset, n_cluster = 625):
    from scipy.cluster.vq import kmeans2, whiten
    feature_matrix = numpy.asarray(dataset)
    whitened = whiten(feature_matrix)
    # use the n_cluster argument instead of shadowing it with a hardcoded value
    _, cluster_labels = kmeans2(whitened, n_cluster, iter = 100)
    return cluster_labels
Example #24
def find_freq_clusters(freqs):
    # first make a histogram
    minf, maxf = freqs.min(), freqs.max()
    maxbins = 8  # related to the max colors defined...
    df = 4.0 # MHz
    if ((maxf - minf) < df):  # Only a single freq to our resolution
        return [[0.0, 'inf']]
    numbins = int((maxf - minf) / df) + 2
    lobound = minf - 0.5 * df
    hibound = lobound + numbins * df
    hist, edges = _np.histogram(freqs, numbins, [lobound, hibound])
    # Now choose the maxbins biggest bins where there are TOAs
    hibins = hist.argsort()[::-1]
    hibins = hibins[hist[hibins] > 0]
    if len(hibins) > maxbins:
        hibins = hibins[:maxbins]
    ctrs = edges[hibins] + 0.5 * df
    ctrs.sort()
    # and use these as starting points for kmeans
    kmeans, indices = kmeans2(freqs, ctrs)
    if len(kmeans)==1:
        return [[0.0, 'inf']]
    elif len(kmeans)==2:
        return [[0.0, kmeans.mean()], [kmeans.mean(), 'inf']]
    else:
        freqbands = [[0.0, kmeans[0:2].mean()]]
        for ii in range(len(kmeans)-2):
            freqbands.append([kmeans[ii:ii+2].mean(), kmeans[ii+1:ii+3].mean()])
        freqbands.append([kmeans[-2:].mean(), 'inf'])
        return freqbands
Example #25
def clustering2(img,clusters):
    "another clustering method - no major differences"
    #Reshaping image in list of pixels to allow kmean Algorithm
    #From 1792x1792x3 to 1792^2x3
    pixels = np.reshape(img,(img.shape[0]*img.shape[1],3))
    centroids,_ = kmeans2(pixels,clusters,iter=3,minit='random')
    #print ("Centroids : ",centroids.dtype,centroids.shape,type(centroids))
    #print centroids
    # quantization
    #Assigns a code from a code book to each observation
    #code : A length N array holding the code book index for each observation.
    #dist : The distortion (distance) between the observation and its nearest code.
    code,_ = vq(pixels,centroids)
    #print ("Code : ",code.dtype,code.shape,type(code))
    #print code

    # reshaping the result of the quantization
    reshaped = np.reshape(code,(img.shape[0],img.shape[1]))
    #print ("reshaped : ",reshaped.dtype,reshaped.shape,type(reshaped))

    clustered = centroids[reshaped]
    #print ("clustered : ",clustered.dtype,clustered.shape,type(clustered))
    
    #scatter3D(centroids)
    return clustered
Example #26
def train_classifier(train_inds, dict_size=300, shuffle=False):
    # load OFH descriptors from training videos from all-but-two classes
    train_action_n, train_video_n, train_descs, train_labels = load_actions(actions[train_inds])

    # cluster and quantize to produce BoW descriptors
    print('clustering...')
    print('train_descs:', train_descs.shape)
    if path.exists(path.join(savedir, 'clusters.npy')):
        clusters = np.load(path.join(savedir, 'clusters.npy'))
        cluster_inds = np.load(path.join(savedir, 'cluster_inds.npy'))
    else:
        clusters, cluster_inds = vq.kmeans2(train_descs, dict_size, iter=20, minit='points')
        np.save(path.join(savedir, 'clusters.npy'), clusters)
        np.save(path.join(savedir, 'cluster_inds.npy'), cluster_inds)

    if shuffle:
        random.shuffle(train_labels)

    # produce quantized histograms for each training video
    print('quantizing...')
    f = path.join(savedir, 'train_hists.npy')
    if path.exists(f):
        train_hists = np.load(f)
    else:
        train_hists = get_desc_hists(clusters, train_descs, train_video_n)
        np.save(f, train_hists)

    # linearly regress for each attribute based on manually produced labels
    print('training regressors...')
    cls = lin_reg.train(train_hists, train_labels)

    return clusters, cls
Example #27
    def test_kmeans2_rank1_2(self):
        """Testing simple call to kmeans2 with rank 1 data."""
        data = np.fromfile(DATAFILE1, sep=", ")
        data = data.reshape((200, 2))
        data1 = data[:, 0]

        code1 = kmeans2(data1, 2, iter=1)
Example #28
def find_color(image, rargs):
    MAX_SIZE = 250
    priority = (1, 1.7, 1.8)
    NUM_CLUSTERS = 4
    if "p1" in rargs:
        priority = (float(rargs["p1"]), float(rargs["p2"]), float(rargs["p3"]))
    if "clusters" in rargs:
        NUM_CLUSTERS = int(rargs["clusters"])

    if image.size[0] > MAX_SIZE:
        image = image.resize((MAX_SIZE // 8, MAX_SIZE // 8), Image.BICUBIC)

    image_data = list(image.getdata())
    image_data = [rgb_to_hsv(*x) for x in image_data]

    np_array = np.asarray(image_data) * priority
    clusters = vq.kmeans2(np_array, NUM_CLUSTERS, minit="points")[0]
    clusters /= priority
    out_colors = []
    for color in clusters:
        rgb = colorsys.hsv_to_rgb(*color)
        rgb = tuple([255 * x for x in rgb])
        out_colors.append("%02x%02x%02x" % rgb)
    return out_colors
Example #29
def scipy_labels(data, clusters, nReps):
    # run scipy.cluster.vq.kmeans on data using an initial clusters
    # number of iterations is one less than used for mpi, since the
    # starting clusters are the result after one mpi iteration
    codebook, _ = kmeans2(data, clusters, nReps, 1e-6)  # second value is labels, unused
    labels, dist = vq(data, codebook)
    return labels, codebook
Example #30
def computeClustering(data, k=k, textureFolder=textureFolder):
    t0 = time.time()
    outputFolder=textureFolder  #self reminding alias
    height, width, depth = data.shape
    data = data.reshape(height*width, depth)
    clust = kmeans2(data=data, k=k, iter=10, thresh=1e-05,\
                     minit='random', missing='warn')
    # output to textureFolder
    try:
        os.makedirs(textureFolder)
    except OSError:
        # don't crash: fail gracefully or not at all
        print('folder already exists!')
        os.rename(textureFolder, textureFolder[:-1]+ 'pre'+ str(timestamp))
        os.makedirs(textureFolder)
    texturelayer= []
    for i in range(k):
        print(i)
        texturelayer.append( (clust[1]==i).reshape(881,921) )
        #plt.imshow(cluster[i])
        #plt.show()
        if texturelayer[i].sum()==0:
            continue
        pic = dbz(  name='texture layer'+str(i),
                  matrix= np.flipud(texturelayer[i]), vmin=-2, vmax=1,
               imagePath= textureFolder+ '/texturelayer'+ str(i) + '.png')
        #pic.show()
        pic.saveImage()

    timespent = time.time()-t0;  print("time spent:", timespent)

    pickle.dump({'content':texturelayer, 'notes':"%d texture layers from 'armor/filter/gaborFilterVectorField.pydump' " %k}, open(textureFolder+'/texturelayer.pydump','wb'))
    return clust, texturelayer
Example #31
def initkmeans(data, k):
    d = data.shape[1]

    # XXX: This initialization can be better
    (code, label) = kmeans2(data, data[:k], 5, minit='matrix')

    w = np.ones(k) / k
    mu = code.copy()
    va = np.zeros((k, d))
    for c in range(k):
        for i in range(d):
            va[c, i] = np.cov(data[np.where(label == c), i], rowvar=0)

    return w, mu, va
Example #32
def classify_embeddings(embeddings, support_labels, support_embeddings, actual_labels=None):
    num_centroids = len(set(support_labels))
    centroid, labels = kmeans2(embeddings, num_centroids, minit='points')

    # labeling the centroids with knn on test embeddings
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(support_embeddings, support_labels)
    predicted_centroid_labels = knn.predict(centroid)

    if actual_labels is not None:
        translated_labels = translate_labels(actual_labels, labels, predicted_centroid_labels)
        draw_embeddings_cluster('comp_num_vs_text.png', embeddings, translated_labels, centroid)

    return [predicted_centroid_labels[label] for label in labels]
Example #33
def kmeans_clust(vecs, words, K):
    if VERBOSE:
        print('Running kmeans!')
    if np.mean(vecs[:,-1] == 1) == 1:
        # exclude the column of 1s
        vex = deepcopy(vecs[:, :-1])
    else:
        vex = deepcopy(vecs)
    # normalise (kmeans does euclidean distance, so this is required for cosine)
    vex /= np.linalg.norm(vex, axis=1).reshape(-1, 1)
    centroids, cluster_assignments = kmeans2(vex, K)
    assignments = pd.DataFrame({'word':words, 'cluster':cluster_assignments})
    csizes, indices = eval_assignments(assignments, K, None)
    return assignments, csizes, indices
Example #34
def init_inducing_points(X, m):
    """
    initialize m inducing points by using k-means on X

    inputs:
    X   :   data points
    m   :   number of clusters
    """
    seed = int(np.abs(X.flatten()[0]))
    numpy_rand_state = np.random.get_state()
    np.random.seed(seed)
    Z_init = kmeans2(X, k=m)[0]
    np.random.set_state(numpy_rand_state)
    return Z_init
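Example #34 makes the k-means initialization reproducible by saving and restoring NumPy's global RNG state around the call. On recent SciPy releases (an assumption about the installed version), kmeans2 also accepts a seed argument directly, e.g. Z_init = kmeans2(X, k=m, seed=seed)[0], which achieves the same effect without touching global state.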
Example #35
 def _init_posterior(self, obs):
     """
     Initialize posterior parameters
     """
     nmix = self._nstates
     nobs, ndim = obs.shape
     # initialize hidden states
     self.z = np.ones((nobs, nmix)) / float(nmix)
     # initialize mixing coefficients
     self.pi = np.ones(nmix) / float(nmix)
     # initialize mean vectors with K-Means clustering
     self.mu, temp = vq.kmeans2(obs, nmix)
     # initialize covariance matrices with sample covariance matrix
     self.cv = np.tile(np.atleast_2d(np.cov(obs.T)), (nmix, 1, 1))
Example #36
def gen_codebook(k_means, codebook_file):

    out_vector = run_get_train_vector()
    logger.info('Finished get train vector')

    out_vector = vq.whiten(out_vector)
    codebook, _ = vq.kmeans2(out_vector, k=k_means, minit='++')  # second value is labels, unused

    # with open('C:\\Users\\TienHai\\Desktop\\iDT\\run_LLC\\output\\test_codebook.txt','wb') as f:
    #   np.savetxt(f, codebook, fmt='%7f', delimiter='\t')
    logger.info('Finished gen codebook')

    with open(codebook_file, 'wb') as f:
        np.savetxt(f, codebook, fmt='%7f', delimiter='\t')
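Note that minit='++' requests k-means++-style seeding; unlike 'random' and 'points' it only exists in newer SciPy releases, so older installs will reject this call.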
Example #37
    def __init__(self, kern, outputs, n_inducing, fixed_mean, X):
        self.inputs, self.outputs, self.kernel = kern.input_dim, outputs, kern
        self.M, self.fixed_mean = n_inducing, fixed_mean

        self.Z = tf.Variable(kmeans2(X, self.M, minit='points')[0], dtype=tf.float64, name='Z')
        if self.inputs == outputs:
            self.mean = np.eye(self.inputs)
        elif self.inputs < self.outputs:
            self.mean = np.concatenate([np.eye(self.inputs), np.zeros((self.inputs, self.outputs - self.inputs))], axis=1)
        else:
            _, _, V = np.linalg.svd(X, full_matrices=False)
            self.mean = V[:self.outputs, :].T

        self.U = tf.Variable(np.zeros((self.M, self.outputs)), dtype=tf.float64, trainable=False, name='U')
Example #38
    def __init__(self, kernel, d_out, n_inducing, X):
        self.d_in, self.d_out = kernel.d, d_out
        self.kernel = kernel
        self.n_inducing = n_inducing

        self.Z = tf.Variable(kmeans2(X, self.n_inducing)[0], dtype=tf.float64)

        self.mean = np.zeros((self.d_in, self.d_out))
        for i in range(min(self.d_in, self.d_out)):
            self.mean[i, i] = 1

        self.U = tf.Variable(np.zeros((self.n_inducing, self.d_out)),
                             dtype=tf.float64,
                             trainable=False)
Example #39
    def classifyChunks(self, chType):
        if chType == Chunk.LFH:
            chunks = [i for i in self.fileHeaders]
        elif chType == Chunk.CD:
            chunks = [i for i in self.CDHeaders]

        number = len(self.zipFiles)

        # Eliminate chunks with invalid datetime.
        chunks = [c for c in chunks if c.last_mod_datetime is not None]

        centroids,classes = kmeans2([i.sig_vector() for i in chunks], number, minit='points')
        silos = [[j[1] for j in zip(classes,chunks) if j[0]==i] for i in range(number)]
        return list(zip(silos, centroids))
Example #40
 def subcluster(cluster):
     data = getvalidrows(cluster.waves)
     try:
         c,l = vq.kmeans2(data,k,it)
     except Exception:
         l = np.array([i % k for i in range(np.shape(data)[0])])
     result = []
     for i in range(k):
         mask = np.tile(l != i,(np.shape(cluster.waves)[1],1)).T
         fmask = np.tile(True,np.shape(cluster.waves))
         fmask[~cluster.waves.mask[:,0]] = mask
         waves = np.ma.masked_array(cluster.waves,fmask)
         result.append(wavecluster(waves,cluster,cluster.label))
     cluster.subclusters = result
Example #41
def select_Z_mbs(nZ, mbs, XP_tr):
    """Select inducing point locations with kmeans from training data XP_tr, and  minibatch size.
    n_tr = number of training points.
    If nZ<1, there will be nZ * n_tr inducing points. Otherwise there will be nZ training points. 
    Same applies for the minibatch size mbs, except that if mbs=0 or mbs > n_tr, mbs is set to n_tr"""
    n_tr = XP_tr.shape[0]
    if nZ < 1:
        nZ = int(np.ceil(nZ * n_tr))
    Z = kmeans2(XP_tr, nZ, minit='points')[0]
    if mbs == 0 or mbs > n_tr:
        mbs = n_tr  # use all data
    elif mbs < 1:
        mbs = int(np.ceil(mbs * n_tr))
    return Z, mbs
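For instance, select_Z_mbs(0.1, 0, XP_tr) would place ceil(0.1 * n_tr) inducing points via k-means and train full-batch, since mbs == 0 falls back to n_tr.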
Example #42
 def _set_posterior(self,obs,use_emgmm=False):
   nobs = len(obs)
   nmix = self._nstates
   # hidden states
   self.z = dirichlet(np.tile(1.0/nmix,nmix),nobs)
   # mixing coefficients
   self.u = np.tile(self._u0,nmix)
   # posterior mean vector
   self.m, temp = vq.kmeans2(obs,nmix)
   self.beta = np.tile(self._beta0,nmix)
   # posterior degree of freedom
   self.nu = np.tile(float(nobs)/nmix,nmix)
   # posterior precision
   self.s = np.tile(self._s0,nmix)
Example #43
    def kMeansCluster(self):
        """ Creates a simple 5-segment k-means cluster based on a 
		    small number of league parameters. The top-ranked teams 
		    are then given a k-means score of 1.0, the middle rank 0.5 
		    and the losers are assigned 0.0. Due to random fluctuation 
		    the same team may fall into a nearby group. 
		"""
        # Put the teams into a numpy array
        clusterList = []

        # List to record the order of the teams
        kMeansTeams = []

        # Check each team has more than min games
        for index, team in enumerate(self.leagueTable):

            points = team.getPoints()
            recentForm = team.getForm(5)
            probWin = team.getProbWin()
            goalsFor = team.getGF()

            kMeansTeams.append(team.getTeamName())
            row = [points, recentForm, probWin, goalsFor]
            clusterList.append(row)

        # Get the cluster array
        clusterArray = np.array(clusterList)

        # Normalize this array
        rows, cols = clusterArray.shape
        for col in range(cols):
            clusterArray[:, col] /= abs(clusterArray[:, col]).max()

        # Randomize over several kmeans
        res, groupIDs = kmeans2(clusterArray, 5)

        # Set each team's k-means score from its group
        index = 0
        topGroup = groupIDs[0]
        bottomGroup = groupIDs[-1]

        for groupID in groupIDs:
            if groupID == topGroup:
                self.setKMeans(kMeansTeams[index], 1.0)
            elif groupID == bottomGroup:
                self.setKMeans(kMeansTeams[index], 0.0)
            else:
                self.setKMeans(kMeansTeams[index], 0.5)
            index += 1
Example #44
def get_walkers_cluster(N, bounds_fcn, samples=None, data=None):
    walkers = []
    if samples is not None:
        # Get a bunch of samples at random
        cand_samp_idx = np.random.randint(low=0,
                                          high=samples.shape[0],
                                          size=50000)
        cand_samp = samples[cand_samp_idx]

        # Cluster this subset of samples and evaluate likelihood
        k = N
        cents, lab = kmeans2(cand_samp, k)
        csps = []
        for cs in cents:
            csp = lnlike(cs, data)
            csps.append(csp)

        # Then sort by lnlike
        srt_idx = sorted(range(len(csps)), key=csps.__getitem__, reverse=True)
        srt_samp = [cents[i] for i in srt_idx]

        # Keep first N (implicitly, dropping N-k clusters)
        walkers = srt_samp[0:N // 2]
        i = 0
        M = samples.shape[1]
        for w in srt_samp[0:N // 2]:
            w2 = np.zeros(M)
            for j in range(M):
                w2[j] = w[j] + np.random.normal(scale=np.abs(w[j]) * .01)
            walkers.append(w2)

        for w in walkers:
            pp = lnlike(w, data)
            print "Walker ", i, " at ", w, " lnlike = ", pp
            i += 1
    else:
        lb, ub = bounds_fcn()
        print "D0", lb[0], ub[0]
        print "K0", lb[1], ub[1]
        print "D1", lb[2], ub[2]
        print "K1", lb[3], ub[3]
        for i in range(N):
            this_walker = []
            for l, u in zip(lb, ub):
                t = np.random.uniform()
                this_walker.append(l + t * (u - l))
            walkers.append(np.array(this_walker))
    return walkers
Example #45
def process_template(template, k=3, use_kmeans=True):
    """Process timecourse template into time bins."""
    df = pd.read_csv(template)
    df_copy = df.copy(deep=True)
    df.drop('plate_well_neuron', axis=1, inplace=True)
    ordered_columns = np.argsort(np.asarray([int(x) for x in df.columns]))
    mat_df = df.values  # .as_matrix() was removed in modern pandas
    mat_df = mat_df[:, ordered_columns]
    raveled_mat = mat_df.ravel()
    # raveled_mat = raveled_mat[np.isnan(raveled_mat) == 0]
    raveled_mat[raveled_mat == 0] = np.nan
    masked_data = raveled_mat[np.isnan(raveled_mat) == 0]
    if use_kmeans:
        bin_lengths, groups = kmeans2(masked_data, k, iter=10000)
        fixed_groups = np.zeros((raveled_mat.shape)) * np.nan
        fixed_groups[np.isnan(raveled_mat) == 0] = groups
        groups = fixed_groups
    else:
        sorted_inds = np.argsort(masked_data)
        group_ids = np.array_split(sorted_inds, k)
        groups = np.zeros((len(raveled_mat)))
        for idx, g in enumerate(group_ids):
            for gr in g:
                groups[gr] = idx
        bin_lengths = np.asarray(
            [raveled_mat[sorted_inds == x] for x in range(k)]).ravel()
    sort_idx = np.argsort(bin_lengths)
    sorted_groups = np.zeros((len(groups)), dtype=int) * np.nan
    for idx, g in enumerate(groups):
        if not np.isnan(g):
            sorted_groups[idx] = sort_idx[int(g)]
    print('Timecourse group means: %s' % np.sort(bin_lengths))
    group_maps = {
        k: v
        for k, v in zip(raveled_mat, sorted_groups) if not np.isnan(v)
    }
    group_maps[0.0] = np.nan
    proc_mat = np.zeros((mat_df.shape))
    for r in range(proc_mat.shape[0]):
        for c in range(proc_mat.shape[1]):
            entry = mat_df[r, c]
            if np.isnan(entry):
                proc_mat[r, c] = entry
            else:
                proc_mat[r, c] = group_maps[entry]
    proc_columns = [str(x) for x in ordered_columns]
    proc_df = pd.DataFrame(proc_mat, columns=proc_columns)
    proc_df['plate_well_neuron'] = df_copy['plate_well_neuron']
    return proc_df
Example #46
def svgp(args, dataloader, test_x, kernel=None):
    N = len(dataloader.dataset)

    inducing_points, _ = kmeans2(dataloader.dataset.train_x.numpy(),
                                 args.n_inducing,
                                 minit='points')
    inducing_points = torch.from_numpy(inducing_points).squeeze(-1)

    model = SVGP(inducing_points, kernel)
    # p(y|f)
    likelihood = GaussianLikelihood()

    model.train()
    likelihood.train()

    optimizer = optim.Adam([{
        'params': model.parameters()
    }, {
        'params': likelihood.parameters()
    }],
                           lr=args.learning_rate)

    mll = VariationalELBO(likelihood, model, N, combine_terms=False)

    for epoch in range(args.n_iters):
        for train_x, train_y in dataloader:
            train_x, train_y = train_x.squeeze(), train_y.squeeze()
            optimizer.zero_grad()
            output = model(train_x)

            log_ll, kl_div, log_prior = mll(output, train_y)
            loss = -(log_ll - kl_div + log_prior)
            loss.backward()
            optimizer.step()
        if epoch % 50 == 0:
            print("Iter {}, lower bound = {:.4f}, obs_var = {:.4f}".format(
                epoch, -loss.item(), likelihood.noise.item()))

    test_stats = TestStats(None, None)
    model.eval()
    likelihood.eval()
    with torch.no_grad():
        observed_pred = likelihood(model(test_x))
        test_y_mean = observed_pred.mean
        test_y_var = observed_pred.variance
    test_stats = test_stats._replace(test_y_mean=test_y_mean,
                                     test_y_var=test_y_var)

    return test_stats
Example #47
def kmeans2(d, headers=None, K=None, whiten=True):
    if 'numpy' in str(type(d)):
        A = d
    else:
        A = d.get_data(headers)

    if whiten:
        W = vq.whiten(A)
    else:
        W = A

    codebook, bookerror = vq.kmeans2(W, K)
    codes, errors = vq.vq(W, codebook)

    return codebook, codes, errors
Example #48
def surf_img(img):

    imgray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    surf = cv2.SURF()
    surf.extended = True
    surf.hessianThreshold = 1000
    kp, des = surf.detectAndCompute(imgray, None)
    features = np.asarray(des)
    centroid, label = kmeans2(features,
                              cluster_n,
                              iter=10,
                              thresh=1e-05,
                              minit='random',
                              missing='warn')
    return features, centroid, label, kp
Example #49
def PFA(df):
    corrMat = df.corr()
    eigen_values, eigen_vectors = np.linalg.eig(corrMat)

    # Use kmeans2 to get the cluster centroids and an array assigning each row to a cluster
    centroids, labels = kmeans2(eigen_vectors[:, :7], 7)
    clusterVectors = [[] for i in range(7)]
    count = 0
    for i in labels:
        clusterVectors[i].append(count)
        count = count + 1

    # Getting vectors closest to each cluster centroid
    closest, _ = pairwise_distances_argmin_min(centroids, eigen_vectors[:, :7])
    return closest, clusterVectors
Example #50
def kmeans(img):
    height, width, channels = img.shape
    lab_img = color.rgb2lab(img.astype(np.float32) / 255)
    ab_img = lab_img[:, :, 1:3].flatten()
    ab_img.shape = (ab_img.size // 2, 2)
    cluster_count = 2
    centroid, clusters = kmeans2(whiten(ab_img), cluster_count)
    clusters = 255 * clusters
    clusters.shape = (height, width)
    sumBorders = sum(clusters[0, :]) + sum(clusters[:, 0]) + sum(
        clusters[-1, :]) + sum(clusters[:, -1])
    if sumBorders / (2 * (height + width)) > 127:
        clusters = 255 - clusters
    mask = np.array(clusters, dtype=np.uint8)
    return mask
Example #51
def form_groups(points, estimated_size=10, iter=1):
    if len(points) < 1:
        return []
    points = array(points)
    # kmeans2 returns (centroids, labels); the labels are recomputed via vq below
    centroids, _ = kmeans2(points,
                           estimated_size,
                           iter=iter,
                           minit='points')
    group_indicies, dist = vq(points, centroids)
    group = {}
    for i, index in enumerate(group_indicies):
        if index not in group:
            group[index] = []
        group[index].append(points[i])
    return list(group.values())
Example #52
def surf_img(img1):
    print "-> calculating SURF"
    #Calculate surf desciptors, and apply Kmeans algo to create clusters
    surf = cv2.SURF()
    surf.extended = True
    #kp = surf.detect(img1)
    kp, descript = surf.detectAndCompute(img1, None)
    descriptors = np.asarray(descript)
    centroid, label = kmeans2(descriptors,
                              cluster_n,
                              iter=10,
                              thresh=1e-05,
                              minit='random',
                              missing='warn')
    return descript, centroid, label, kp
Example #53
def init_layers(X, dims_in, dims_out, M, final_inducing_points,
                share_inducing_inputs):
    q_mus, q_sqrts, mean_functions, Zs = [], [], [], []
    X_running = X.copy()

    for dim_in, dim_out in zip(dims_in[:-1], dims_out[:-1]):
        if dim_in == dim_out:  # identity for same dims
            W = np.eye(dim_in)
        elif dim_in > dim_out:  # use PCA mf for stepping down
            _, _, V = np.linalg.svd(X_running, full_matrices=False)
            W = V[:dim_out, :].T
        elif dim_in < dim_out:  # identity + pad with zeros for stepping up
            I = np.eye(dim_in)
            zeros = np.zeros((dim_out - dim_in, dim_in))
            W = np.concatenate([I, zeros], 0).T

        mean_functions.append(Linear(A=W))
        Zs.append(kmeans2(X_running, M, minit='points')[0])
        if share_inducing_inputs:
            q_mus.append([np.zeros((M, dim_out))])
            q_sqrts.append([np.eye(M)[:, :, None] * np.ones((1, 1, dim_out))])
        else:
            q_mus.append([np.zeros((M, 1))] * dim_out)
            q_sqrts.append([np.eye(M)[:, :, None] * np.ones(
                (1, 1, 1))] * dim_out)

        X_running = X_running.dot(W)

    # final layer (as before but no mean function)
    mean_functions.append(Zero())
    Zs.append(kmeans2(X_running, final_inducing_points, minit='points')[0])
    q_mus.append([np.zeros((final_inducing_points, 1))])
    q_sqrts.append(
        [np.eye(final_inducing_points)[:, :, None] * np.ones((1, 1, 1))])

    return q_mus, q_sqrts, Zs, mean_functions
Example #54
def spectral_clustering(G, k):
    GU = G.to_undirected()
    A = nx.adjacency_matrix(GU).toarray()
    # Create degree matrix
    D = np.diag(np.sum(A, axis=0))
    # Create Laplacian matrix
    L = D - A
    eigval, eigvec = np.linalg.eig(L)  # Calculate eigenvalues and eigenvectors
    eigval = eigval.real  # Keep the real part
    eigvec = eigvec.real  # Keep the real part
    idx = eigval.argsort()  # Get indices of sorted eigenvalues
    eigvec = eigvec[:, idx]  # Sort eigenvectors according to eigenvalues
    Y = eigvec[:, :k]  # Keep the first k vectors
    centroids, labels = kmeans2(Y, k)
    return labels
Example #55
def spectral(X, n_clusters=3, verbose=False):
    m = len(X)
    labels = np.zeros((m, 1))
    simi_matrix = build_simi_matrix(X)
    d_matrix = np.sum(simi_matrix, axis=1)
    d2 = np.sqrt(1 / d_matrix)
    d2 = np.diag(d2)
    lap_matrix = np.dot((np.dot(d2, simi_matrix)), d2)

    U, s, V = np.linalg.svd(lap_matrix, full_matrices=True)
    kerN = U[:, m - n_clusters + 1:]
    for i in range(m):
        kerN[i, :] = kerN[i, :] / np.linalg.norm(kerN[i, :])
    _, labels = kmeans2(kerN, n_clusters, iter=100)
    return labels
Example #56
    def run_and_publish_kmeans(self, data):
        kmeans_output = clusteralgos.kmeans2(data, self.num_agents)
        center_points = kmeans_output[0]
        '''
        The following two lines are for debugging.
        They make sure node0 gets all the features.
        '''
        # data_mean = np.average(data, 0)
        # center_points = np.array([data_mean, np.array([-999 for i in range(128)]), np.array([-999 for i in range(128)])])

        center_points_flattened = center_points.flatten()
        msg = Feature()
        msg.data = center_points_flattened
        msg.header.frame_id = str(self.node_id)
        self.pub_kmeans.publish(msg)
Example #57
 def _init_posterior(self, obs):
     """
     Initialize posterior parameters
     """
     nmix = self._nstates
     nobs, ndim = obs.shape
     avr_N = float(nobs) / float(nmix)
     # parameters of posterior mixing coefficients
     self._u = np.ones(nmix) * (self._u0 + avr_N)
     # parameters of posterior precision matrices
     self._nu = np.ones(nmix) * (self._nu0 + avr_N)
     self._V = np.tile(np.array(self._V0), (nmix, 1, 1))
     # parameters of posterior mean vectors
     self._beta = np.ones(nmix) * (self._beta0 + avr_N)
     self._m, temp = vq.kmeans2(obs, nmix)  # initialize by K-Means
Example #58
def _find_orientation(x, y, eye_color_index=4):
    """Find the orientation of the face."""
    old = np.seterr(all='raise')
    try:
        eyes, _ = vq.kmeans2(x[y == eye_color_index], 2)
    except Exception:
        return 0
    finally:
        np.seterr(**old)
    eye_line = eyes[0] - eyes[1]
    rad = np.arctan2(*eye_line) + np.pi / 2
    eyes_rot = _rot_mat(rad).dot(eyes.T).T
    if eyes_rot[0, 1] < 0:
        rad += np.pi
    return rad
Example #59
 def performKMeansClustering(vector_matrix):
     kmeans_results = dict()
     whitened = whiten(vector_matrix)
     std_devs = numpy.std(vector_matrix, axis=0)
     for k in range(2, 11):
         centroids, labels = kmeans2(whitened, k, minit='points')
         kmeans_results[k] = {
             'centroids': centroids.tolist(),
             'labels': labels.tolist()
         }
         for i, centroid in enumerate(kmeans_results[k]['centroids']):
             for j, val in enumerate(centroid):
                 kmeans_results[k]['centroids'][i][j] = \
                     val * std_devs[j]
     return kmeans_results
Example #60
    def fit(self, X, Y):
        Z = kmeans2(X, self.ARGS.num_inducing, minit='points'
                    )[0] if X.shape[0] > self.ARGS.num_inducing else X.copy()

        if not self.model:
            # NB mb_size does not change once the model is created
            mb_size = self.ARGS.minibatch_size if X.shape[
                0] >= self.ARGS.minibatch_size else None

            if self.K == 2:
                lik = gpflow.likelihoods.Bernoulli()
                num_latent = 1
            else:
                lik = gpflow.likelihoods.MultiClass(self.K)
                num_latent = self.K

            kern = gpflow.kernels.RBF(X.shape[1],
                                      lengthscales=float(X.shape[1])**0.5)
            self.model = gpflow.models.SVGP(X,
                                            Y,
                                            kern,
                                            lik,
                                            feat=Z,
                                            whiten=False,
                                            num_latent=num_latent,
                                            minibatch_size=mb_size)

            self.opt = gpflow.train.AdamOptimizer(self.ARGS.adam_lr)

            self.sess = self.model.enquire_session()
            iters = self.ARGS.iterations

        else:
            iters = self.ARGS.small_iterations

        # we might have new data
        self.model.X.assign(X, session=self.sess)
        self.model.Y.assign(Y, session=self.sess)
        self.model.feature.Z.assign(Z, session=self.sess)

        num_outputs = self.model.q_sqrt.shape[0]
        self.model.q_mu.assign(np.zeros((self.ARGS.num_inducing, num_outputs)),
                               session=self.sess)
        self.model.q_sqrt.assign(np.tile(
            np.eye(self.ARGS.num_inducing)[None], [num_outputs, 1, 1]),
                                 session=self.sess)

        self.opt.minimize(self.model, maxiter=iters, session=self.sess)