Example #1
def bic_kmeans(features, **kwargs):
    '''
    Run kmeans on features with **kwargs given to scipy.cluster.vq.kmeans for
    different numbers of clusters, k.  Choose, finally, the clustering that
    minimizes the Bayesian Information Criterion, or BIC.
    '''
    max_k = int(2*numpy.log(len(features)))

    base_distances = vq(features, 
            numpy.array([numpy.average(features, axis=0)]))[1]
    base_std = numpy.std(base_distances)

    centers_list = []
    bic_list = []
    distances_list = []
    for k in range(1, max_k+1):
        centers = kmeans(features, k, **kwargs)[0]
        clusters, distances = vq(features, centers)
        bic = calculate_bic(clusters, distances, base_std)
        centers_list.append(centers)
        distances_list.append(distances)
        bic_list.append(bic)

    best_index = numpy.argmin(bic_list)
    return centers_list[best_index], distances_list[best_index]
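The helper calculate_bic is not shown in this example. A minimal sketch of what it could look like, assuming a spherical-Gaussian data term scaled by the single-cluster base_std plus the usual k*ln(n) complexity penalty (the exact criterion used by the original code is unknown):

import numpy

def calculate_bic(clusters, distances, base_std):
    # hypothetical BIC: quantization-error data term plus k*ln(n) penalty
    n = len(distances)
    k = len(numpy.unique(clusters))
    data_term = numpy.sum(distances**2) / (base_std**2)
    return data_term + k * numpy.log(n)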
Example #2
def _get_centroid_mask(overtones):
    flat = overtones.reshape((len(overtones) * 48, overtones.shape[2]))
    f0flat = flat[np.argmax(flat, 1) == 0]
    f0flat = f0flat[np.max(f0flat, 1) > 0]
    f0flat = (f0flat.T / np.max(f0flat, 1)).T

    centroids, distortion = kmeans(f0flat, 24)
    codes, dists = vq(f0flat, centroids)
    #centroids = centroids[np.bincount(codes) > np.median(np.bincount(codes))]    
    flat_norm = (flat.T / np.max(flat, 1)).T
    codes, dists = vq(flat_norm, centroids)

    flat_filtered = np.copy(flat)

    for i, (s, c) in enumerate(zip(flat, codes)):
        if c < 0 or c >= len(centroids):
            continue

        centroid = centroids[c]
        centroid_denorm = centroid * np.max(s)
        flat_filtered[i, 1:] -= centroid_denorm[1:]
        flat_filtered[i, 1:] = np.maximum(flat_filtered[i, 1:], 0)

    overtones_filtered = flat_filtered.reshape(overtones.shape)

    return overtones_filtered
Example #3
def getImageDescriptor(model, im, conf):
	im = standardizeImage(im)
	height, width = im.shape[:2]
	numWords = model.vocab.shape[1]
	frames, descrs = getPhowFeatures(im, conf.phowOpts)
	# quantize appearance
	if model.quantizer == 'vq':
		binsa, _ = vq(descrs.T, model.vocab.T)
	elif model.quantizer == 'kdtree':
		raise ValueError('quantizer kdtree not implemented')
	else:
		raise ValueError('quantizer {0} not known or understood'.format(model.quantizer))
	hist = []
	for n_spatial_bins_x, n_spatial_bins_y in zip(model.numSpatialX, model.numSpatialY):
		binsx, distsx = vq(frames[0, :], linspace(0, width, n_spatial_bins_x))
		binsy, distsy = vq(frames[1, :], linspace(0, height, n_spatial_bins_y))
		# binsx and binsy record which spatial bin each feature point belongs to
		if (numpy.any(distsx < 0)) | (numpy.any(distsx > (width/n_spatial_bins_x+0.5))):
			print ("something went wrong")
			import pdb; pdb.set_trace()
		if (numpy.any(distsy < 0)) | (numpy.any(distsy > (height/n_spatial_bins_y+0.5))):
			print ("something went wrong")
			import pdb; pdb.set_trace()
		# combined quantization
		number_of_bins = n_spatial_bins_x * n_spatial_bins_y * numWords
		temp = arange(number_of_bins)
		# update using this: http://stackoverflow.com/questions/15230179/how-to-get-the-linear-index-for-a-numpy-array-sub2ind
		temp = temp.reshape([n_spatial_bins_x, n_spatial_bins_y, numWords])
		bin_comb = temp[binsx, binsy, binsa]
		hist_temp, _ = histogram(bin_comb, bins=range(number_of_bins+1), density=True)
		hist.append(hist_temp)
	
	hist = hstack(hist)
	hist = array(hist, 'float32') / sum(hist)
	return hist
Example #4
def read_unclustered_data(filename, num_clusters, cl_type="kMeans"):
    """Return dictionary of cluster id to array of points.

    Given a filename in the format of lat, lng
    generate k clusters based on arguments. Outputs a dictionary with
    the cluster id as the key mapped to a list of lat, lng pts
    """
    request_points = []
    with open(filename, 'rb') as input_file:
        input_file.next()  # Skip the header row
        for line in input_file:
            lat, lng = line.split(',')
            request_points.append((float(lat), float(lng)))
    request_points = array(request_points)

    if cl_type == "kMeans":
        # computing K-Means with K = num_clusters
        centroids, _ = kmeans(request_points, int(num_clusters))
        # assign each sample to a cluster
        idx, _ = vq(request_points, centroids)

    else:
        # computing kMedoids using distance matrix
        centroids = get_kmedoids(request_points, int(num_clusters))
        # assign each sample to a cluster
        idx, _ = vq(request_points, centroids)

    # map cluster lat, lng to cluster index
    cluster_points = defaultdict(list)
    for i in xrange(len(request_points)):
        lat, lng = request_points[i]
        cluster_points[idx[i]].append((lat, lng))
    return cluster_points
Example #5
def _get_larger_chroms(ref_file):
    """Retrieve larger chromosomes, avoiding the smaller ones for plotting.
    """
    from scipy.cluster.vq import kmeans, vq
    all_sizes = []
    for c in ref.file_contigs(ref_file):
        all_sizes.append(float(c.size))
    all_sizes.sort()
    # separate out smaller chromosomes and haplotypes with kmeans
    centroids, _ = kmeans(np.array(all_sizes), 2)
    idx, _ = vq(np.array(all_sizes), centroids)
    little_sizes = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx, all_sizes)))
    little_sizes = [x[1] for x in little_sizes]
    # create one more cluster with the smaller, removing the haplotypes
    centroids2, _ = kmeans(np.array(little_sizes), 2)
    idx2, _ = vq(np.array(little_sizes), centroids2)
    little_sizes2 = tz.first(tz.partitionby(lambda xs: xs[0], zip(idx2, little_sizes)))
    little_sizes2 = [x[1] for x in little_sizes2]
    # get any chromosomes not in haplotype/random bin
    thresh = max(little_sizes2)
    larger_chroms = []
    for c in ref.file_contigs(ref_file):
        if c.size > thresh:
            larger_chroms.append(c.name)
    return larger_chroms
Example #6
 def run(self, features, number_of_clusters='3', restarts=10):
     if number_of_clusters != 'Use BIC':
         k = int(number_of_clusters)
         if k == 1:
             result = numpy.zeros(len(features), dtype=numpy.int32)
             return [result]
         return [vq(features, kmeans(features, k, iter=restarts)[0])[0]]
     else:
         return [vq(features, bic_kmeans(features, iter=restarts)[0])[0]]
Example #7
def sphere_tissue_image(size=100, n_points=12):

    center = np.array([size/2,size/2,size/2],float)
    radius = size/4.

    points = {}
    for p in range(n_points):
        theta = np.random.rand()*2.*np.pi
        phi = np.random.rand()*np.pi - np.pi/2.
        
        points[p+3] = center + radius*np.array([np.cos(theta)*np.cos(phi),np.sin(theta)*np.cos(phi),np.sin(phi)])
    points = array_dict(points)

    point_target_area = 4.*np.pi*np.power(radius,2.)/float(n_points)
    point_target_distance = np.power(point_target_area/np.pi,0.5)

    sigma_deformation = (size/100.)*(20./n_points)
    omega_forces = dict(distance=0.1*size/100., repulsion=100.0*np.power(size/100.,2))

    for iterations in xrange(100):
        point_vectors = np.array([points[p]- points.values() for p in points.keys()])
        point_distances = np.array([vq(points.values(),np.array([points[p]]))[1] for p in points.keys()])
        point_vectors = point_vectors/(point_distances[...,np.newaxis]+1e-7)

        point_distance_forces = omega_forces['distance']*((point_distances-point_target_distance)[...,np.newaxis]*point_vectors/point_target_distance).sum(axis=1)
        
        point_repulsion_forces = omega_forces['repulsion']*np.power(point_target_distance,2)*(point_vectors/(np.power(point_distances,2)+1e-7)[...,np.newaxis]).sum(axis=1)
        
        point_forces = np.zeros((len(points),3))
        point_forces += point_distance_forces
        point_forces += point_repulsion_forces
        
        point_forces = np.minimum(1.0,sigma_deformation/np.linalg.norm(point_forces,axis=1))[:,np.newaxis] * point_forces
        
        new_points = points.values() + point_forces
        
        new_points = center+ radius*((new_points-center)/np.linalg.norm((new_points-center),axis=1)[:,np.newaxis])
        
        points = array_dict(new_points,points.keys())
    points[2] = center

    coords = np.transpose(np.mgrid[0:size,0:size,0:size],(1,2,3,0)).reshape((np.power(size,3),3)).astype(int)
    labels = points.keys()[vq(coords,points.values())[0]]

    ext_coords = coords[vq(coords,np.array([center]))[1]>size/3.]

    img = np.ones((size,size,size),np.uint8)
    img[tuple(np.transpose(coords))] = labels
    img[tuple(np.transpose(ext_coords))] = 1
    img = SpatialImage(img,resolution=(60./size,60./size,60./size))

    return img
Example #8
def performance_measure(reference_set,experimental_set,measure='jaccard_index'):
    VP = (vq(experimental_set,reference_set)[1]==0).sum()
    FP = (vq(experimental_set,reference_set)[1]>0).sum()
    FN = (vq(reference_set,experimental_set)[1]>0).sum()

    if measure == 'true_positive':
        return VP
    elif measure == 'precision':
        return VP/float(VP+FP) 
    elif measure == 'recall':
        return VP/float(VP+FN) 
    elif measure == 'dice_index':
        return 2*VP / float(2*VP+FP+FN)
    elif measure == 'jaccard_index':
        return VP/float(VP+FP+FN)
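A hypothetical usage on two small 2-D point sets, where a detected point counts as a true positive only when it coincides exactly (vq distance of zero) with a reference point:

import numpy as np
from scipy.cluster.vq import vq

reference_set = np.array([[0., 0.], [1., 1.], [2., 2.]])
experimental_set = np.array([[0., 0.], [1., 1.], [5., 5.]])

print(performance_measure(reference_set, experimental_set, 'precision'))      # 2/3
print(performance_measure(reference_set, experimental_set, 'recall'))         # 2/3
print(performance_measure(reference_set, experimental_set, 'jaccard_index'))  # 2/(2+1+1) = 0.5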
Example #9
def kmeans(iData, clustNumber, oPrefix, norm=False):
    '''Perform k-means cluster analysis and return MAP of zones'''
    print 'Run K-Means'
    
    height, width = iData.shape[1:3]
    #reshape 3D cube of data into 2D matrix and get indices of valid pixels
    iData, notNanDataI = cube2flat(iData)
    if norm:
        #center and norm
        iDataMean = iData[:, notNanDataI].mean(axis=1)
        iDataStd  = iData[:, notNanDataI].std(axis=1)
        iData = np.subtract(iData.T, iDataMean).T
        iData = np.divide(iData.T, iDataStd).T

    #perform kmeans on valid data and return codebook
    codeBook = vq.kmeans(iData[:, notNanDataI].astype('f8').T, clustNumber)[0]
    #perform vector quantization of input data using the codebook
    #return vector of labels (for each valid pixel)
    labelVec = vq.vq(iData[:, notNanDataI].astype('f8').T, codeBook)[0]+1
    #create and fill MAP of zones
    zoneMap = np.zeros(width*height) + np.nan
    zoneMap[notNanDataI] = labelVec
    zoneMap = zoneMap.reshape(height, width)
    
    #visualize map of zones
    plt.imsave(oPrefix + 'zones.png', zoneMap)
    
    return zoneMap
Example #10
def clustering_scipy_kmeans(features, n_clust = 8):
  """
  """
  whitened = whiten(features)
  print whitened.shape
  
  initial = [kmeans(whitened,i) for i in np.arange(1,12)]
  plt.plot([var for (cent,var) in initial])
  plt.show()
  
  #cent, var = initial[3]
  ##use vq() to get as assignment for each obs.
  #assignment,cdist = vq(whitened,cent)
  #plt.scatter(whitened[:,0], whitened[:,1], c=assignment)
  #plt.show()
  
  codebook, distortion = kmeans(whitened, n_clust)
  print codebook, distortion
  assigned_label, dist = vq(whitened, codebook)
  for ii in range(8):
    plt.subplot(4,2,ii+1)
    plt.plot(codebook[ii])
  plt.show()
  
  centroid, label = kmeans2(whitened, n_clust, minit = 'points')
  print centroid, label
  for ii in range(8):
    plt.subplot(4,2,ii)
    plt.plot(centroid[ii])
  plt.show()
Example #11
def new_labelled_page(no_of_samples:int, window_size:int, page_scale:int or tuple, labelled_centroids:[tuple], page_paths:[str]):
    ### Duplication from above
    weighter = gaussian_weighter(window_size)
    windower = f.partial(win_centred_on, window=window_size)
    shifter = f.partial(point_shift, window=window_size)
    scaler = img_scaler(page_scale)
    make_observations = compose(prepare_features, real_fft)
    img, label = open_image_label(*page_paths)
    img, label = scaler(img, label)
    f_img = prepare_fft_image(img, window_size)
    access_img = img_accessor(img, identity)
    access_label = img_accessor(label, identity)
    access_f_img = img_accessor(f_img, compose(windower, shifter))
    ### End of duplication
    labels = [a[0] for a in labelled_centroids]
    centroids = np.asarray([a[1] for a in labelled_centroids])
    new_label = np.zeros_like(label)
    for s in img_slices(new_label.shape, 80):
        unlabelled_samples = sample_all_in_area(s, applier(identity, compose(make_observations, access_f_img)))   
        coords = [a[0] for a in unlabelled_samples]
        observations = np.asarray([a[1] for a in unlabelled_samples])
        codes, dist = vq.vq(observations, centroids)
        for i, code in zip(coords, codes):
            new_label[i] = labels[code]
    return new_label
Example #12
def computeHistograms(codebook, descriptors):
	code, dist = vq.vq(descriptors, codebook)
	bins=range(codebook.shape[0] + 1)
	#print "bins:", bins
	histogram_of_words, bin_edges = np.histogram(code, bins, normed=True)
	#print histogram_of_words
	return histogram_of_words
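A hypothetical end-to-end call: build a codebook from training descriptors with kmeans, then turn one image's descriptors into a bag-of-words histogram (note that the normed argument to np.histogram only exists in older NumPy releases):

import numpy as np
from scipy.cluster import vq

train_descriptors = np.random.rand(500, 128).astype(np.float32)
codebook, _ = vq.kmeans(train_descriptors, 50)

image_descriptors = np.random.rand(40, 128).astype(np.float32)
hist = computeHistograms(codebook, image_descriptors)
print(hist.shape)   # one bin per codebook entry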
Example #13
def kmeans(features, projection, ite = 50, k = 4, threshold = 1e-5):    
    """ perform k_keamns clustering and return a the result as a subsapce clustering object """
    from scipy.cluster.vq import kmeans, vq
    import datetime

    from measures import spatial_coherence    
   
    centroids, distance = kmeans(features, k, iter=ite, thresh=threshold)
    code, _ = vq(features, centroids)
    
    run_ = datetime.datetime.now().strftime("%y_%m_%d_%H_%M")
    
    params = "projection_size=%d, k=%d" %(len(projection), k)
    clusters = clusters_from_code(code, k, projection)
  
    clustering_id = "(%s)_(%s)_(%s)_(%s)" %("exhaustive_kmeans", params, run_, projection)
    #print clustering_id
    km_clt = KMClustering(algorithm ="exhaustive_kmeans", parameters = params, run = run_,
                          clustering_id = clustering_id, clusters = clusters, ccontains_noise = False, cclustering_on_dimension = True)

   
    measures = {'spatial_coherence': spatial_coherence(km_clt, len(features))[0], 'distortion': distance}
    km_clt.update_measures(measures)
    
    return  km_clt 
Example #14
def vectorQuantization (features, bits, debug=False):
	from scipy.cluster.vq import vq
	D = len(features[0])
	np_features = np.array(features)
	nom_features = np.empty(np_features.shape, dtype=str)
	for i in range(D):
		column = np_features[:,i]
		max_val = np.max(column)
		min_val = np.min(column)
		bits = bits
		denom = bits
		step = (max_val - min_val)/denom
		partition = [0]*(denom+1)
		codebook = [0]*(denom+1)
		for j in range(denom+1):
			partition[j] = (min_val+(step*j))
			codebook[j] = j
		column = np.array(column)
		partition = np.array(partition)
		if debug:
			print('****')
			print(column)
			print(partition)
		tmp = vq(column,partition)
		nom_col = [str(int(x)+1) for x in tmp[0]]
		if debug:
			print(tmp[0])
			print(nom_col)
			print('****')
		nom_features[:,i] = nom_col
	return nom_features
Example #15
def bow(images,codebook,clusters):
	out = images
	temp = []

	print "-"*60
	print "Creating the pseudo database."
	for im in images:
		c = Counter()
		bag,dist = vq(whiten(im[1]),codebook)
		
		for word in bag:
			c[word]+=1

		#Creating histograms
		for i in range(clusters):
			if i in c.iterkeys():
				c[i] = c[i]/sum(c.values())
			if i not in c.iterkeys():
				c[i] = 0
		
		temp.append(c)
		
	for i in range(len(temp)):
		out[i].append(temp[i])

	print "Done.\n"
	return out
Example #16
def classify_kmeans(infile, clusternumber):
    '''
    apply kmeans
    '''
    
    #Load infile in data array    
    driver = gdal.GetDriverByName('GTiff')
    driver.Register()
    ds = gdal.Open(infile, gdal.GA_Update)
    databand = ds.GetRasterBand(1)
    
    #Read input raster into array
    data = ds.ReadAsArray() 
    #replace no data value with numpy.nan
    #data[data==-999.0]=numpy.nan 
    
    pixel = numpy.reshape(data,(data.shape[0]*data.shape[1]))
    centroids, variance = kmeans(pixel, clusternumber)
    code, distance = vq(pixel,centroids)
    centers_idx = numpy.reshape(code,(data.shape[0],data.shape[1]))
    clustered = centroids[centers_idx]
    
    # Write outraster to file
    databand.WriteArray(clustered)
    databand.FlushCache()        
    
    #Close file
    databand = None
    clustered = None
    ds = None  
Example #17
def select(file, output, clusters=None):
    """
    Select clusters containing real motifs and discard the rest

    Parameters
    ----------
    file : An hdf5 file containing clustered motif matches as generated by birdwerdz.hdf.classify
    output : Name of output file which will contain only motifs from selected
             clusters.  If same as input file, will delete motifs from the file
    clusters : Clusters to select 

    """
    if file == output:
        mode = 'r+'
    else:
        mode = 'w-'
    with h5py.File(output, mode) as out:
        if file != output:
            with h5py.File(file, 'r+') as src:
                for entry in src.values():
                    out['/'].copy(entry,entry.name)
        for entry in out.values():
            if not isinstance(entry,h5py.Group) or 'motifs' not in entry.keys():
                continue

            amp_vecs= entry['motifs']['spectrogram'].sum(1) 

            cluster_path = 'cluster_mean_spectrograms'
            id,_ = vq(amp_vecs, out[cluster_path][:].sum(1))

            new_motifs=np.delete(entry['motifs'], np.where(
                [i not in clusters for i in id])[0])

            del entry['motifs']
            entry.create_dataset('motifs',data=new_motifs)
Example #18
def performMCCAlgorithm(dataSet, specificDataPointIndex, numIterations = 200, numClusters = 4, subDataRatio = 0.5):
	periodsAhead = np.array([1, 2, 3, 4, 5, 6, 9, 12, 18, 24, 36, 60, 120])
	strippedDataSet = dataSet
	dataLength = strippedDataSet.shape[0]
	dataWidth = strippedDataSet.shape[1]
	specificDataPoint = strippedDataSet[specificDataPointIndex,:]

	numPeriods = len(periodsAhead)

	statisticWeightsbyIteration = np.empty(shape=(numIterations, 4),dtype=float)

	# Perform Bootstrapped Clustering
	for i in range(0,numIterations):
		# Perform Bootstrapped Clustering / Chooose Data Subset
		subDataSetIndexes = np.random.choice(range(0,dataLength),size=int(dataLength*subDataRatio),replace=True)
		subDataSet = strippedDataSet[subDataSetIndexes,:]
		# Perform Bootstrapped Clustering / Find Data Clusters for Subset of Data
		kMeansClusters = spc.kmeans(subDataSet, numClusters)
		clusterCenters = kMeansClusters[0]
		# Perform Bootstrapped Clustering / Record Clustering Cost for Weighting Scheme
		clusteringCost = kMeansClusters[1]
		statisticWeightsbyIteration[i,0] = clusteringCost
		# Perform Bootstrapped Clustering / Apply Found Data Clusters to All Data
		allClusters = spc.vq(strippedDataSet, clusterCenters)
		clusterAssignments = allClusters[0]
		clusterDistortions = allClusters[1]
		display = 1 #TEST
		if display: #TEST
			plt.scatter(dataSet[0:60,0],dataSet[0:60,1],c=clusterAssignments[0:60]) #TEST	
			plt.show()	
		statisticWeightsbyIteration[i,1] = max(clusterDistortions)
		statisticWeightsbyIteration[i,2] = np.mean(clusterDistortions)
		statisticWeightsbyIteration[i,3] = np.std(clusterDistortions)
	return statisticWeightsbyIteration
Example #19
def scipy_labels(data, clusters, nReps):
    # run scipy.cluster.vq.kmeans on data using an initial clusters
    # number of iterations is one less than used for mpi, since the
    # starting clusters are the result after one mpi iteration
    codebook, dist = kmeans2(data, clusters, nReps, 1e-6)
    labels, dist = vq(data, codebook)
    return labels, codebook
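A hypothetical call with random data, using k rows of the data itself as the initial centers (kmeans2 accepts either a cluster count or an array of starting centroids):

import numpy as np
from scipy.cluster.vq import kmeans2, vq

data = np.random.rand(200, 3)
init_centers = data[np.random.choice(len(data), 4, replace=False)]
labels, codebook = scipy_labels(data, init_centers, nReps=10)
print(codebook.shape)   # (4, 3)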
Example #20
def connected_regions(image):
    """
    Converts image into grayscale, quantizes, counts connected regions
    """
    # render_image(image)

    colors = 2

    # Quantization into two colors
    image_rgb = np.dstack(image)
    pixels = np.reshape(
        image_rgb,
        (image_rgb.shape[0] * image_rgb.shape[1], image_rgb.shape[2])
    )
    centroids, _ = vq.kmeans(pixels, colors)
    quantized, _ = vq.vq(pixels, centroids)
    quantized_idx = quantized.reshape(
        (image_rgb.shape[0], image_rgb.shape[1])
    )

    if len(centroids) > 1:
        # for_render = (quantized_idx * 255).astype(np.uint8)
        # render_image(for_render)
        regions = len(region_sizes(quantized_idx))
        regions_inverted = len(region_sizes(1 - quantized_idx))
        # import pdb; pdb.set_trace()

        # if regions == 0:
        #     regions = image[0].shape[0] * image[0].shape[1]
        # print regions
        return max([regions, regions_inverted])
    else:
        return 0
Example #21
    def project(self, descriptors):
        imhist = np.zeros((self.nbr_word))
        words, distance = cluster.vq(descriptors, self.voc)
        for i in words:
            imhist[i] += 1

        return imhist
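The same bag-of-words projection written as a standalone sketch, for readers without the surrounding class (vocabulary here is any k-means codebook array; it is not part of the original code):

import numpy as np
from scipy.cluster import vq

def project_descriptors(descriptors, vocabulary):
    # assign each descriptor to its nearest visual word, then count occurrences
    words, _ = vq.vq(descriptors, vocabulary)
    return np.bincount(words, minlength=len(vocabulary))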
Example #22
def reduce_colors(image, k):
    '''Apply kmeans algorithm.
        Input:   image, number of clusters to use
        Returns: colors, 
                 counts per color, 
                 new image
    '''
    if k > 32:
        print "Setting colors to maximum allowed of 32"
        k = 32
    rows, cols, rgb = image.shape
    # reshape the image in a single row array of RGB pixels
    image_row = np.reshape(image,(rows * cols, 3))
    #HERE ADD CODE TO GET A GOOD GUESS OF COLORS AND PASS THAT AS
    #SECOND ARGUMENT TO kmeans
    #image_array_sample = shuffle(image_row, random_state=0)[:1000]
    #kguess = kmeans(image_array_sample, k)
    #colors,_ = kmeans(image_row, kguess)
    # perform the clustering
    colors,_ = kmeans(image_row, k)
    # vector quantization: assign to each pixel the index of the nearest centroid (0..k-1)
    qnt,_ = vq(image_row,colors)
    # reshape the qnt vector to the original image shape
    image_centers_id = np.reshape(qnt,(rows, cols))
    # assign the color value to each pixel
    newimage = colors[image_centers_id]
    #count number of pixels of each cluster color
    counts,bins = sp.histogram(qnt, len(colors))
    return colors, counts, newimage
Example #23
def main():
    args = get_args()
    # This catches files sent in with stdin
    if isinstance(args.infile, TextIOWrapper):
        data = JSONFile(args.infile, True)
    else:
        data = args.infile

    points = np.array([
        [point.get('lon'), point.get('lat')]
        for point in data
    ])

    # In testing, found that a higher number of iterations led to less
    # errors due to missing centroids (Note: whitening led to worse results)
    centroids, distortion = kmeans(points, args.number_of_vans, 2000)
    index, distortion = vq(points, centroids)

    vans = [[] for _ in range(args.number_of_vans)]

    for i, point in enumerate(data):
        vans[index[i]].append(point)

    vans = distribute(vans, len(data), centroids)


    create_output(args.outfile, vans)
Example #24
def clustering2(img,clusters):
    "another clustering method - no major differences"
    #Reshaping image in list of pixels to allow kmean Algorithm
    #From 1792x1792x3 to 1792^2x3
    pixels = np.reshape(img,(img.shape[0]*img.shape[1],3))
    centroids,_ = kmeans2(pixels,clusters,iter=3,minit='random')
    #print ("Centroids : ",centroids.dtype,centroids.shape,type(centroids))
    #print centroids
    # quantization
    #Assigns a code from a code book to each observation
    #code : A length N array holding the code book index for each observation.
    #dist : The distortion (distance) between the observation and its nearest code.
    code,_ = vq(pixels,centroids)
    #print ("Code : ",code.dtype,code.shape,type(code))
    #print code

    # reshaping the result of the quantization
    reshaped = np.reshape(code,(img.shape[0],img.shape[1]))
    #print ("reshaped : ",reshaped.dtype,reshaped.shape,type(reshaped))

    clustered = centroids[reshaped]
    #print ("clustered : ",clustered.dtype,clustered.shape,type(clustered))
    
    #scatter3D(centroids)
    return clustered
Example #25
def main():    
    gdal.AllRegister()
    infile = auxil.select_infile() 
    if infile:                  
        inDataset = gdal.Open(infile,GA_ReadOnly)     
        cols = inDataset.RasterXSize
        rows = inDataset.RasterYSize    
        bands = inDataset.RasterCount
    else:
        return    
    pos =  auxil.select_pos(bands)
    bands = len(pos)    
    x0,y0,rows,cols=auxil.select_dims([0,0,rows,cols])   
    K = auxil.select_integer(6,msg='Number clusters')        
    G = zeros((rows*cols,len(pos))) 
    k = 0                                   
    for b in pos:
        band = inDataset.GetRasterBand(b)
        G[:,k] = band.ReadAsArray(x0,y0,cols,rows)\
                              .astype(float).ravel()
        k += 1        
    centers, _ = kmeans(G,K)
    labels, _ = vq(G,centers)      
    outfile,fmt = auxil.select_outfilefmt() 
    if outfile:
        driver = gdal.GetDriverByName(fmt)   
        outDataset = driver.Create(outfile,
                        cols,rows,1,GDT_Byte)         
        outBand = outDataset.GetRasterBand(1)
        outBand.WriteArray(reshape(labels,(rows,cols))\
                                              ,0,0) 
        outBand.FlushCache() 
        outDataset = None    
    inDataset = None        
Example #26
def cluster_svm(x_data, y_data, kmean, xlab, ylab, show_graph):
	x =  vstack(x_data)
	y = vstack(y_data)
	#print data


	#dat = np.insert(x, 1, y, axis=1)
	dat = hstack((x,y))
	#print dat
	#data = vstack((t,c, d))
	# computing K-Means with K = 2 (2 clusters)
	centroids,_ = kmeans2(dat,kmean, iter=20)
	# assign each sample to a cluster
	idx,_ = vq(dat,centroids)

	# some plotting using numpy's logical indexing
	if show_graph:
		plt.figure()
		plt.plot(dat[idx==0,0],dat[idx==0,1],'ob',
		      dat[idx==1,0],dat[idx==1,1],'or',
		      dat[idx==2,0],dat[idx==2,1],'ok',)
		plt.plot(centroids[:,0],centroids[:,1],'sg',markersize=8)
		plt.xlabel(xlab)
		plt.ylabel(ylab)
		plt.show()
	return centroids
Example #27
def clustering(img,clusters):
    "use the kmean algo to cluster img colors"
    #Reshaping image in list of pixels to allow kmean Algorithm
    #From 1792x1792x3 to (1792^2)x3
    pixels = np.reshape(img,(img.shape[0]*img.shape[1],3))
    #clustering is done on hue value of a pixel color
    #performing the clustering
    centroids,_ = kmeans(pixels,clusters,iter=3)
    # quantization
    #Assigns a code from a code book to each observation
    #code : A length N array holding the code book index for each observation.
    code,_ = vq(pixels,centroids)
    #print ("Code : ",code.dtype,code.shape,type(code))

    # reshaping the result of the quantization
    reshaped = np.reshape(code,(img.shape[0],img.shape[1]))
    #print ("reshaped : ",reshaped.dtype,reshaped.shape,type(reshaped))
    #print reshaped
    #print nbrDiff(reshaped)
  
    clustered = centroids[reshaped]
    #print ("Centroids : ",centroids.dtype,centroids.shape,type(centroids))
    #print ("Clustered : ",clustered.dtype,clustered.shape,type(clustered))
    #print ("nbrDiff de Clustered 0 = " , nbrDiff(clustered[:,:,0]))
    #print ("nbrDiff de Clustered 1 = " ,nbrDiff(clustered[:,:,1]))
    #print ("nbrDiff de Clustered 2 = " ,nbrDiff(clustered[:,:,2]))

    #print nbrDiff(reshaped)
    return clustered,reshaped,code
Example #28
 def project(self, descriptors):
   """Project descriptors on the vocabulary to create a histogram of words."""
   imhist = numpy.zeros((self.word_count))
   words, distance = vq.vq(descriptors, self.voc)
   for w in words:
     imhist[w] += 1
   return imhist
Example #29
def Kmeans_map(obs, code_book):
    No = obs.shape[0]
    nc = code_book.shape[0]
    # nc is current number of clusters (may decrease if zero clusters last iteration)
    #
    # compute membership and distances between obs and code_book
    obs_code, distort = vq(obs, code_book)
    distortsum = np.sum(distort)
    distortmax = np.amax(distort)
    #
    # vq returns an indexing array obs_code mapping rows of obs (the points) to code_book (the centroids)
    # distort is an array of length No that has difference between observation and chosen centroid
    # vq stands for vector quantization and is provided in SciPy
    #
    VectorDimension = obs.shape[1]
    NewCode_Book = np.zeros([nc, VectorDimension])
    NumPointsinClusters = np.zeros([nc])
    for i in np.arange(nc):
        # 	Loop over clusters labelled with i
        cell_members = np.compress(np.equal(obs_code, i), obs, 0)
        NumPointsinClusters[i] = cell_members.shape[0]
        # 	Extract Points in this Cluster; extract points whose quantization label is i
        #
        NewCode_Book[i] = np.sum(cell_members, 0)
        # 	Calculate centroid of i'th cluster
    return NewCode_Book, NumPointsinClusters, distortsum, distortmax, No
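The example above is only the map half of a distributed k-means iteration. A hypothetical companion reduce step (not part of the original code) would combine the per-chunk sums into the next set of centroids:

import numpy as np

def Kmeans_reduce(map_outputs):
    # map_outputs: list of (NewCode_Book, NumPointsinClusters, distortsum, distortmax, No) tuples
    coord_sums = sum(out[0] for out in map_outputs)
    counts = sum(out[1] for out in map_outputs)
    nonempty = counts > 0
    # new centroid = summed coordinates / number of points, dropping empty clusters
    return coord_sums[nonempty] / counts[nonempty][:, np.newaxis]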
Example #30
def test_kmeans():
	obs = sp.random.uniform(0, 10, (1000, 2))
	# knum = 7
	obs = scvq.whiten(obs)

	# run kmeans with different numbers of clusters
	for knum in range(2, 8):
		codebook, dist = scvq.kmeans(obs, knum)
		ind, dist = scvq.vq(obs, codebook)

		# visualize
		# plt.ion()
		plt.ioff()
		plt.figure(knum)
		colors = ["b*", "g+", "ro", "yp", "ms", "ch", "wx"]

		for icluster in range(knum):
			x = (ind == icluster).nonzero()[0]
			plt.plot(obs[x, 0], obs[x, 1], colors[icluster])

			for iline in range(sp.size(x)):
				plt.plot([obs[x[iline], 0], codebook[icluster, 0]],
					[obs[x[iline], 1], codebook[icluster, 1]], "k--")

		# the cluster centroid
		plt.plot(codebook[:, 0], codebook[:, 1], "ko")

		# the plot size
		plt.xlim((-0.3, 3.8))
		plt.ylim((-0.3, 3.8))
	plt.show()
Example #31
		for v_num, v_id in enumerate(v_ids):
			print "Creating Self-Organized Map for " + vehicle_type + " with " + fuel_type + " consuption (ID " + str(v_num) + " of " + str(len(v_ids)) + ")\r",

			# Opens ID frame sampling analysis (NOT NORMALIZED)
			input_file_name = input_file_path_base + vehicle_type + '/' + fuel_type + '/' + v_id
			df = pd.read_csv(input_file_name)
			data = df.fillna(0).as_matrix().astype(float)

			# Starts K-Means analysis
			best_distortion = None
			best_code_book = None
			best_distance = None
			best_k = None
			for k_mean in k_means:
				centroids, distortion = kmeans(data, k_mean)									# Uses kmeans to clusterize data from actual map
				code_book, distance = vq(data, centroids)										# Gets codebook of ID's

				# Saves results if distortion is more than "elbow_rate" percent smaller than the best distortion so far
				if best_distortion == None or abs(distortion - best_distortion)/best_distortion > elbow_rate:
					best_distortion = distortion
					best_code_book = code_book
					best_distance = distance
					best_k = k_mean
				# If distortion is not "elbow_rate" percent smaller than the best distortion so far, quits the analysis
				else:
					break

			df['CODE_BOOK'] = best_code_book													# Saves codebook on result on dataframe
			df['DISTANCE'] = best_distance														# Saves distances from centroids on dataframe
			f.write(vehicle_type + ' ' + fuel_type + ' ' + v_id[:-4] + ' k=' + str(best_k) + '\n')
Example #32
'''We are going to continue the investigation into the sightings of legendary Pokémon from the previous exercise. Just like the previous exercise, we will use the same example of Pokémon sightings. In this exercise, you will form clusters of the sightings using k-means clustering.

x and y are columns of X and Y coordinates of the locations of sightings, stored in a Pandas data frame, df. The following are available for use: matplotlib.pyplot as plt, seaborn as sns, and pandas as pd.'''

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

x = [9, 6, 2, 3, 1, 7, 1, 6, 1, 7, 23, 26, 25, 23, 21, 23, 23, 20, 30, 23]
y = [8, 4, 10, 6, 0, 4, 10, 10, 6, 1, 29, 25, 30, 29, 29, 30, 25, 27, 26, 30]

df = pd.DataFrame({'x': x, 'y': y})

# Import kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Compute cluster centers
centroids, _ = kmeans(df, 2)

# Assign cluster labels
df['cluster_labels'], _ = vq(df, centroids)

# Plot the points with seaborn
sns.scatterplot(x='x', y='y', hue='cluster_labels', data=df)
plt.show()
Example #33
# K-means clustering: first exercise
# This exercise will familiarize you with the usage of k-means clustering on a dataset. Let us use the Comic Con dataset and check how k-means clustering works on it.

# Recall the two steps of k-means clustering:

# Define cluster centers through kmeans() function. It has two required arguments: observations and number of clusters.
# Assign cluster labels through the vq() function. It has two required arguments: observations and cluster centers.
# The data is stored in a Pandas data frame, comic_con. x_scaled and y_scaled are the column names of the standardized X and Y coordinates of people at a given point in time.

# Instructions
# 100 XP
# Import kmeans and vq functions in SciPy.
# Generate cluster centers using the kmeans() function with two clusters.
# Create cluster labels using these cluster centers.


# Import the kmeans and vq functions
from scipy.cluster.vq import kmeans, vq

# Generate cluster centers
cluster_centers, distortion = kmeans(comic_con[['x_scaled', 'y_scaled']], 2)

# Assign cluster labels
comic_con['cluster_labels'], distortion_list = vq(comic_con[['x_scaled', 'y_scaled']], cluster_centers)

# Plot clusters
sns.scatterplot(x='x_scaled', y='y_scaled', 
                hue='cluster_labels', data = comic_con)
plt.show()
Example #34
# NumPy indexing starts at 0: column 0 is the record number and column 10 is the label, both excluded, so the features are columns 1 to 9
points = dataset[:,1:10]
cancer_label = dataset[:,10]
print "points:\n",points
print "cancer_label:\n",cancer_label
# k-means clustering
#whiten (normalize) the raw data
data=whiten(points)
#run kmeans: the first argument is the data, the second is the number of clusters k.
#sometimes we do not know in advance how many clusters there should be; one option is to initialize from a hierarchical clustering result, or simply pass a fixed number.
#kmeans returns two things: the cluster centroids and the distortion (loss); here we only keep the centroids, hence the trailing [0]
#centroid = kmeans(data,max(cluster))[0]  
centroid = kmeans(data,2)[0]
print centroid
#use vq to assign every sample to its nearest centroid; vq also returns two things, and [0] is the label for each sample
label=vq(data,centroid)[0]
num = [0,0]
for i in label:
    if(i == 0):
        num[0] = num[0] + 1
    else:
        num[1] = num[1] + 1
print 'num =',num       
#np.savetxt('file.csv',label)
print "Final clustering by k-means:\n",label
result = np.subtract(label,cancer_label)
print "result:\n",result

count = [0,0]
for i in result:
    if(i == 0):
Example #35
from collections import defaultdict
from scipy.cluster.vq import kmeans, vq
from similar_words import load_vectors
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--basename',
                        help='base name of word vector files',
                        type=str)
    parser.add_argument('--maxwords',
                        help='maximum number of words to cluster',
                        type=int)
    parser.add_argument('--k', help='number of clusters', type=int)
    args = parser.parse_args()

    vectors, words = load_vectors(args.basename, args.maxwords)

    centroids, _ = kmeans(vectors, args.k)
    idx, _ = vq(vectors, centroids)

    clusters = defaultdict(set)
    for i, c in enumerate(idx):
        clusters[c].add(words[i])

    for c in range(args.k):
        print 'CLUSTER', c + 1,
        for word in clusters[c]:
            print word,
        print
        print
Example #36
      #print 'x ' + str(x) + ' y ' + str(y) + ' w ' + str(w) + ' h ' + str(h)
      #if horiz_aspect_ratio > 2 or vert_aspect_ratio > 2:
      #cv2.rectangle(img,(x,y),(x+w,y+h),(0,0,255),2)
      #draw in all contours to see how they fall
      #contour_sizes.append([float(x)*4,float(y)*4,max(float(w),float(h))])#,horiz_aspect_ratio,vert_aspect_ratio])
      contour_sizes.append([cx*8.0,cy*8.0,max(float(w),float(h))/8.0])#,horiz_aspect_ratio,vert_aspect_ratio])
      contour_lookup.append(c)
      #contour_sizes.append([float(x),float(w),float(h)])#,horiz_aspect_ratio])
    #cv2.drawContours(img,[c],0,(0,255,0),1)
 
  whitened_contour_sizes = clustering.whiten(contour_sizes)
  #print str(contour_sizes)

  # let scipy do its magic (k==3 groups)
  centers,dist = clustering.kmeans(whitened_contour_sizes,75,iter=100)
  code, distance = clustering.vq(whitened_contour_sizes,centers)
  #print str(centroid)
  #print str(code)

  #print 'contours is ' + str(len(contour_sizes)) + ' and code is ' + str(len(code))

  colors = [( int(random.uniform(0, 255)),int(random.uniform(0, 255)),int(random.uniform(0, 255))) for i in code ]
  #print str(colors)
  for i, label in enumerate(code):
    color = colors[label]
    x,y,w,h = cv2.boundingRect(contour_lookup[i])
    #box = contour_sizes[i]
    #x=int(box[0])
    #y=int(box[1])
    #w=int(box[2])
    #h=int(box[3])
Example #37
    def compress(self, node, idx, bits=None, min_qsnr=None, sparse=False):
        val = self.get(node, idx)
        flattened_val = val.flatten()
        if bits is not None:
            bins = int(math.pow(2, bits))
            if bins > val.size:
                raise Exception(
                    'More bins than values with {} bits'.format(bits))
            kmeans = KMeans(n_clusters=bins)
            kmeans.fit(flattened_val.reshape((-1, 1)))
            codebook = kmeans.cluster_centers_
            codebook = codebook.astype(val.dtype)
            codes = vq(flattened_val.reshape((-1, 1)), codebook)
            compressed_val = np.array([codebook[code] for code in codes[0]
                                       ]).reshape(val.shape)
        elif min_qsnr:
            cur_qsnr = -math.inf
            bits = 1
            while cur_qsnr < min_qsnr:
                bits += 1
                if bits > 7:
                    raise Exception(
                        'Cannot find a solution with less than 8 bits \
                                     for {} with min_qsnr = {}'.format(
                            node.name, min_qsnr))
                bins = int(math.pow(2, bits))
                if bins > val.size:
                    break
                kmeans = KMeans(n_clusters=bins)
                kmeans.fit(flattened_val.reshape((-1, 1)))
                codebook = kmeans.cluster_centers_
                codebook = codebook.astype(val.dtype)
                codes = vq(flattened_val.reshape((-1, 1)), codebook)
                compressed_val = np.array(
                    [codebook[code] for code in codes[0]]).reshape(val.shape)
                cur_qsnr = qsnr(compressed_val.astype(np.float32),
                                val.astype(np.float32))
        else:
            # automatic search of optimal k with inertia method
            silhouette = []
            inertia = []
            for bits in range(1, 8):
                bins = int(math.pow(2, bits))
                if bins > val.size:
                    break
                kmeans = KMeans(n_clusters=bins)
                kmeans.fit(flattened_val.reshape((-1, 1)))
                codebook = kmeans.cluster_centers_
                codebook = codebook.astype(val.dtype)
                codes = vq(flattened_val.reshape((-1, 1)), codebook)
                compressed_val = np.array(
                    [codebook[code] for code in codes[0]]).reshape(val.shape)
                inertia.append(kmeans.inertia_)
                silhouette.append(
                    silhouette_score(flattened_val.reshape(-1, 1),
                                     compressed_val.flatten().reshape(-1, 1)))
            elb_idx = np.argmax(np.diff(np.diff(
                np.array(inertia))))  # 2nd grade derivative to find the elbow
            elb_idx = 1 if elb_idx == 0 else elb_idx
            bits = np.argmax(
                np.array(silhouette[elb_idx - 1:elb_idx + 1])
            ) + 1  # take the three around the elbow and look at the silhouette
            bins = int(math.pow(2, bits))
            kmeans = KMeans(n_clusters=bins)
            kmeans.fit(flattened_val.reshape((-1, 1)))
            codebook = kmeans.cluster_centers_
            codebook = codebook.astype(val.dtype)
            codes = vq(flattened_val.reshape((-1, 1)), codebook)
            compressed_val = np.array([codebook[code] for code in codes[0]
                                       ]).reshape(val.shape)

        if sparse:
            freqs = np.unique(codes, return_counts=True)
            max_index = np.where(freqs[1] == freqs[1].max())[0][0]
            sparse_val = freqs[0][max_index]
        else:
            sparse_val = None
        self.set(node, idx, val, compressed_val, sparse_val)
        x = 1
Example #38
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 14 02:02:38 2019

@author: js
"""


import numpy as np
from scipy.cluster.vq import vq, kmeans, whiten
list1 = [88.0, 74.0, 96.0, 85.0]
list2 = [92.0, 99.0, 95.0, 94.0]
list3 = [91.0, 87.0, 99.0, 95.0]
list4 = [78.0, 99.0, 97.0, 81.0]
list5 = [88.0, 78.0, 98.0, 84.0]
list6 = [100.0, 95.0, 100.0, 92.0]
data = np.array([list1,list2,list3,list4,list5,list6])
whiten = whiten(data)
centroids,_ = kmeans(whiten, 2)
result,_= vq(whiten, centroids)
print(result) 
Example #39
# results of this approach)
# feat=whiten(feat)

# kmeans2 gives poor results here because it is only run once.
# centroids, labels=kmeans2(feat,K, minit='points', iter=10)

# The scipy kmeans function returns the centroids and the distortion; note that
# the iter parameter is not the number of iterations but the number of times to
# run kmeans
centroids, variance = kmeans(feat, K, iter=100)

log("Centroids and Variance Calculated... proceeding to calculate Labels and Distance Matrix"
    )
# But for this it is necessary to compute the labels from the centroids (the
# distances between the features and the centroids).
labels, distance = vq(feat, centroids)

log("Labels length: " + str(len(labels)))

outputCluster = open("cluster-" + Xfile, "w")
for k in range(len(labels)):
    outputCluster.write(str(k + 1) + " " + str(labels[k] + 1) + "\n")
outputCluster.close()

log("Clusters Exported to file clusters-...")

log("Starting Display")
# From the centroids we can compute the distance from each document to each of
# the centroids, each with its own colour.
# What if we plot the points horizontally, with X being the publication date
# and Y the cluster?
Example #40
def countpairs(src,
               lns,
               rnd,
               radii,
               rndtype='lens',
               srcweights=None,
               rndweights=None,
               numthreads=1):
    """
    Create annuli dictionary and run chunkcount
    for every inner/outer radius pair
    """
    srcpos = zip(src.data['RA'], src.data['DEC'])
    rndpos = zip(rnd.data['RA'], rnd.data['DEC'])

    if jackknife == True:
        jkresults = {}

        #separate d (src) and r (rnd) into SAME kmeans region, by position
        n_jk = njackknife
        centers, _ = kmeans(srcpos, n_jk)
        src_k_indices, _ = vq(srcpos, centers)
        rnd_k_indices, _ = vq(rndpos, centers)

        #count DD and DR for each radius for each sample
        for k in range(n_jk):
            #radii in increasing order
            annuli = {}

            #sources first
            src_k_mask = (src_k_indices != k)
            this_dpos = np.array(srcpos)[src_k_mask]
            this_srcweights = np.array(srcweights)[src_k_mask]
            annuli = loopradius(annuli, 'Psrcsum', 'srcpairs', radii, lns,
                                this_dpos, this_srcweights, numthreads)
            del this_dpos

            #again with randoms
            rnd_k_mask = (rnd_k_indices != k)
            this_rpos = np.array(rndpos)[rnd_k_mask]
            this_rndweights = np.array(rndweights)[rnd_k_mask]
            annuli = loopradius(annuli, 'Prndsum', 'rndpairs', radii, lns,
                                this_rpos, this_rndweights, numthreads)

            jkresults[k] = {}
            jkresults[k]['Psrcsum'] = [annuli[rad]['Psrcsum'] for rad in radii]
            jkresults[k]['Prndsum'] = [annuli[rad]['Prndsum'] for rad in radii]

            #get w for this sample
            tot_src = np.sum([annuli[m]['Psrcsum'] for m in annuli.keys()])
            tot_rnd = np.sum([annuli[m]['Prndsum'] for m in annuli.keys()])
            jkresults[k]['w'] = w([annuli[rad]['Psrcsum'] for rad in radii],
                                  [annuli[rad]['Prndsum'] for rad in radii],
                                  tot_src, tot_rnd)

        #get jackknife estimate from average of jackknife regions
        jkresults['w'] = jk([jkresults[k]['Psrcsum'] for k in range(n_jk)],
                            [jkresults[k]['Prndsum'] for k in range(n_jk)], w)

        #get jackknife variance from the results of jackknife regions
        jkresults['var'] = varjk(
            [jkresults[k]['Psrcsum'] for k in range(n_jk)],
            [jkresults[k]['Prndsum'] for k in range(n_jk)], w)

    #answer with full sample
    annuli = {}

    #sources first
    srcpos = zip(src.data['RA'], src.data['DEC'])
    annuli = loopradius(annuli, 'Psrcsum', 'srcpairs', radii, lns, srcpos,
                        srcweights, numthreads)
    #save space
    del src, srcpos, srcweights

    #again with randoms
    rndpos = zip(rnd.data['RA'], rnd.data['DEC'])
    annuli = loopradius(annuli, 'Prndsum', 'rndpairs', radii, lns, rndpos,
                        rndweights, numthreads)
    del rnd, rndpos, rndweights

    if jackknife == True:
        return annuli, jkresults
    else:
        return annuli
Example #41
            trainImages[i].append(filename)
            im = cv2.imread(os.path.join(trainImageDir, filename))
            imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
            kp, des = featureExtractor.get_keypoints_and_descriptors(imgray)
            desList.append(des)

descriptors = desList[0]
for des in desList:
    descriptors = np.vstack((descriptors, des))

if descriptors.dtype != "float32":
    descriptors = np.float32(descriptors)
voc, variance = kmeans(descriptors, numWords, 30)
sumNum = []
for i in range(len(gestureIDlist)):
    if i == 0:
        sumNum.append(0)
    else:
        val = sumNum[i - 1] + len(trainImages[i - 1])
        sumNum.append(val)

trainData = np.zeros((sumNum[-1] + len(trainImages[-1]), numWords), "float32")
trainLabels = np.zeros((sumNum[-1] + len(trainImages[-1])), "uint32")

for gestureID in range(len(gestureIDlist)):
    for numFrame in range(len(trainImages[gestureID])):
        words, distance = vq(desList[sumNum[gestureID] + numFrame], voc)
        for w in words:
            trainData[sumNum[gestureID] + numFrame][w] += 1
        trainLabels[sumNum[gestureID] + numFrame] = gestureID
Example #42
def kmeansFits(parser):
    (options,args)= parser.parse_args()
    if len(args) == 0:
        parser.print_help()
        return
    if options.outfilename is None:
        print "-o filename options needs to be set ..."
        print "Returning ..."
        return None
    numpy.random.seed(seed=options.seed)
    #Restore fits
    savefilename= args[0]
    if os.path.exists(savefilename):
        savefile= open(savefilename,'rb')
        params= pickle.load(savefile)
        type= pickle.load(savefile)
        band= pickle.load(savefile)
        savefile.close()
    else:
        print "Input file does not exist ..."
        print "Returning ..."
        return
    #Prepare params for K-means
    print "Preparing data ..."
    if type == 'powerlawSF':
        if len(band) > 1:
            nparams= 4
        else:
            nparams= 2
    elif type == 'DRW':
        if len(band) == 1:
            nparams= 2
        else:
            print "DRW for multi-band fits not implemented yet ..."
            print "Returning ..."
            return
    elif type == 'KS11':
        nparams= 3
    elif type == 'scatter':
        nparams= 1
    ndata= len(params)
    kIn= numpy.zeros((ndata,nparams))
    if type == 'powerlawSF':
        #Stack as A,g,Ac,gc
        kIn[:,0]= numpy.array([p['logA'] for p in params.values()]).reshape(ndata)
        kIn[:,1]= numpy.array([p['gamma'] for p in params.values()]).reshape(ndata)
        if len(band) > 1:
            kIn[:,2]= numpy.array([p['logAgr'] for p in params.values()]).reshape(ndata)
            kIn[:,3]= numpy.array([p['gammagr'] for p in params.values()]).reshape(ndata)
    elif type == 'DRW':
        print "type == 'DRW' not implemented yet ..."
        print "Returning ..."
        return
    elif type == 'KS11':
        #Stack as A,g,s
        kIn[:,0]= numpy.array([p['logA'] for p in params.values()]).reshape(ndata)
        kIn[:,1]= numpy.array([p['gamma'] for p in params.values()]).reshape(ndata)
        kIn[:,2]= numpy.array([p['s'] for p in params.values()]).reshape(ndata)
    #Whiten, i.e., give unit variance
    print "Whitening data ..."
    whitenFactors= numpy.zeros(nparams)
    for ii in range(nparams):
        whitenFactors[ii]= numpy.std(kIn[:,ii])
        kIn[:,ii]/= whitenFactors[ii]
    #Ready to run K-means
    print "Running K-means ..."
    book, dist= kmeans(kIn,options.k)
    assign, dist= vq(kIn,book)
    #De-whiten the codebook
    for ii in range(nparams):
        book[:,ii]*= whitenFactors[ii]
    #Prepare for saving
    print "Preparing output for saving ..."
    outparams= []
    weights= []
    for kk in range(options.k):
        if type == 'powerlawSF':
            if len(band) > 1:
                outparams.append({'logA':book[kk,0],
                                  'gamma':book[kk,1],
                                  'logAgr':book[kk,2],
                                  'gammagr':book[kk,3]})
            else:
                outparams.append({'logA':book[kk,0],
                                  'gamma':book[kk,1]})
        elif type == 'DRW':
            print "DRW not implemented yet ..."
            print "Returning ..."
            return
        if type == 'KS11':
                outparams.append({'logA':book[kk,0],
                                  'gamma':book[kk,1],
                                  's':book[kk,2]})
        thisassign= assign[(assign == kk)]
        weights.append(len(thisassign))
    #Save
    print "Saving ..."
    if os.path.exists(options.outfilename):
        print options.outfilename+" exists ..."
        print "*Not* overwriting ..."
        print "Remove file before running ..."
        return
    if options.savefits:
        import pyfits
        cols= []
        if type == 'powerlawSF':
            colA= []
            colg= []
            for kk in range(options.k):
                colA.append(outparams[kk]['logA'])
                colg.append(outparams[kk]['gamma'])
            colA= numpy.array(colA)
            colg= numpy.array(colg)
            colw= numpy.log(numpy.array(weights))
            cols.append(pyfits.Column(name='logA',format='E',
                                      array=colA))
            cols.append(pyfits.Column(name='gamma',format='E',
                                      array=colg))
        elif type == 'KS11':
            colA= []
            colg= []
            colS= []
            for kk in range(options.k):
                colA.append(outparams[kk]['logA'])
                colg.append(outparams[kk]['gamma'])
                colS.append(outparams[kk]['s'])
            colA= numpy.array(colA)
            colg= numpy.array(colg)
            colS= numpy.array(colS)
            cols.append(pyfits.Column(name='logA',format='E',
                                      array=colA))
            cols.append(pyfits.Column(name='gamma',format='E',
                                      array=colg))
            cols.append(pyfits.Column(name='s',format='E',
                                      array=colS))
        colw= numpy.log(numpy.array(weights))
        cols.append(pyfits.Column(name='logweight',format='E',
                                  array=colw))
        columns= pyfits.ColDefs(cols)
        tbhdu= pyfits.new_table(columns)
        tbhdu.writeto(options.outfilename)
    else:
        outfile= open(options.outfilename,'wb')
        pickle.dump(outparams,outfile)
        pickle.dump(weights,outfile)
        outfile.close()
    return
Example #43
def recolour(image, pal):
    palette_array = np.array(pal, dtype=np.uint8)
    im_array = np.reshape(np.array(image), (image.size[0] * image.size[1], 3))
    quant, _ = vq(im_array, palette_array)
    idx = np.reshape(quant, (image.size[1], image.size[0]))
    return Image.fromarray(palette_array[idx])
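A hypothetical call, assuming Pillow (PIL) is available and photo.jpg is any RGB image:

from PIL import Image

palette = [(0, 0, 0), (255, 255, 255), (255, 0, 0)]   # illustrative 3-colour palette
img = Image.open('photo.jpg').convert('RGB')
recoloured = recolour(img, palette)
recoloured.save('photo_recoloured.png')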
Example #44
    def fit(self, X, sample_weight=[]):
        """Fit a mixture of Gaussian model on X
        
        Parameters:
        -----------
        X : ndarray [n_samples, n_features]
            Each row is a sample.

        Returns:
        --------
        self : object.
            return the object itself.
        """
        #
        # Use uniform weights over samples if sample weights are not specified.
        #
        self.n_samples = X.shape[0]
        self.n_features = X.shape[1]
        if sample_weight == []:
            self.sample_weight = np.ones(
                (self.n_samples, 1), dtype=np.float64) / self.n_samples

        elif len(sample_weight.shape) == 1:
            self.sample_weight = np.reshape(sample_weight, (self.n_samples, 1))
            self.sample_weight.astype(np.float64)

        #
        # Run KMeans to find initial centroids and radiuses
        #
        #whitened_samples = whiten(X)
        whitened_samples = X
        centroids, distortion = kmeans(whitened_samples, self.n_components)
        indexes, distortion = vq(whitened_samples, centroids)

        if self.n_components != centroids.shape[0]:
            print 'KMean wasn\'t able to find as many clusters as you wanted'
            self.n_components = centroids.shape[0]

        coef = np.zeros((self.n_components, ), np.float64)

        #
        # Update mixing coefficients, mean and covariance
        #
        cim = np.zeros((self.n_samples, ), np.float64)
        for i in range(self.n_samples):
            cim[i] = np.argmin(np.sum(np.power(X[i, :] - centroids, 2),
                                      axis=1))

        mus = []
        sigmas = []
        for k in range(self.n_components):
            Xk = X[np.where(cim == k)[0], :]
            coef[k] = Xk.shape[0] / float(self.n_samples)
            Xkb = Xk - centroids[k, :]
            sigmak = np.dot(np.transpose(Xkb), Xkb) / float(self.n_samples - 1)
            if self.cov_type == 'diag':
                sigmak = np.diag(np.diag(sigmak))

            #
            # Check the component k is degenerate
            #
            # Quick check
            if np.mean(np.diag(sigmak)) >= self.dege_tole:
                mus.append(centroids[k, :])

                var = np.diag(sigmak)
                mvar = np.sum(var) / len(np.where(var > 0)[0])
                sigmak = sigmak * (1-self.sigma_corr_factor) + \
                         self.sigma_corr_factor*mvar*np.eye(sigmak.shape[0], dtype=np.float64)

                #sigmak += self.sigma_epsilon*np.eye(sigmak.shape[0], dtype=np.float64)
                sigmas.append(sigmak)

        self.components = self._pack_gaussian(mus, sigmas)
        self.n_components = len(mus)
        self.coef = coef
        self.staged_loss.append(self._loss(X))

        #
        # Main loop: Expectation Maximization
        #
        resp_mat = np.zeros((self.n_samples, self.n_components), np.float64)
        for iter in range(self.max_iters):

            #
            # E step. Compute responsibility.
            #
            for k in range(self.n_components):
                resp_mat[:, k] = self.components[k].pdf(X) * self.coef[k]
            resp_mat /= np.reshape(np.sum(resp_mat, axis=1),
                                   (self.n_samples, 1))

            #
            # M step. Re-estimate parameters
            #
            mus = []
            sigmas = []
            NK = np.sum(resp_mat, axis=0)
            self.coef = NK / float(self.n_samples)
            for k in range(self.n_components):
                r = np.reshape(
                    resp_mat[:, k],
                    (self.n_samples, 1)) / NK[k] * self.sample_weight
                r /= np.sum(r)
                muk = np.sum(r * X, axis=0)
                sigmak = 0.0
                for n in range(self.n_samples):
                    xkb = np.asmatrix(X[n, :] - muk)
                    sigmak += r[n, 0] * np.transpose(xkb) * xkb

                #
                # Check if the component k is degenerate
                #
                # Quick check
                if np.mean(np.diag(sigmak)) < self.dege_tole:
                    self._del_component(k)
                    k -= 1
                    continue

                #
                # Correct sigmak
                #
                var = np.diag(sigmak)
                mvar = np.sum(var) / len(np.where(var > 0)[0])
                sigmak = sigmak * (1-self.sigma_corr_factor) + \
                         self.sigma_corr_factor*mvar*np.eye(sigmak.shape[0], dtype=np.float64)

                if self.cov_type == 'diag':
                    sigmak = np.diag(np.diag(sigmak))

                mus.append(muk)
                sigmas.append(sigmak)

            self.components = self._pack_gaussian(mus, sigmas)
            self.staged_loss.append(self._loss(X))

            if self.verbose:
                print "Iteration: ", iter, "  Negative-Log-likelihood: ", self.staged_loss[
                    -1]

            #
            # Early stopping if the improvement is tiny.
            #
            if np.abs((self.staged_loss[-1] -
                       self.staged_loss[-2])) < self.tolerance:
                break

        self.components_ = self.components
        self.coef_ = self.coef
        self.n_components_ = self.n_components

        #
        # Remove very small components
        #
        #self._prune()

        return self
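#
# A minimal, self-contained sketch of the E step used above: each
# responsibility resp[n, k] is proportional to coef[k] * N(x_n | mu_k, sigma_k)
# and the rows are normalized over the components.  This assumes scipy.stats
# is available; the helper name em_responsibilities is illustrative only and
# is not part of the class above.
#
import numpy as np
from scipy.stats import multivariate_normal


def em_responsibilities(X, coefs, mus, sigmas):
    # One weighted Gaussian density column per component, then row-normalize.
    resp = np.column_stack([
        coef * multivariate_normal.pdf(X, mean=mu, cov=sigma)
        for coef, mu, sigma in zip(coefs, mus, sigmas)
    ])
    return resp / resp.sum(axis=1, keepdims=True)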
def main():
    try:
        dataset = sys.argv[1]
    except IndexError:
        print "enter one of these datasets: "
        print dataSets
        return 0
    if dataset not in dataSets:
        print 'dataset name is invalid: %s' % dataset
        return 0
    try:
        nCodewords = int(sys.argv[2])
    except (IndexError, ValueError):
        nCodewords = 1000
        print 'defaulting to %d codewords' % nCodewords
    try:
        nSamples = int(sys.argv[3])
    except (IndexError, ValueError):
        nSamples = 500000
        print 'defaulting to %d samples' % nSamples
    #
    dataPath = rootPath + dataset + dataDir
    catlist = os.listdir(dataPath)
    nCategories = len(catlist)
    nSamplesPerCat = int(np.round(nSamples / nCategories))
    count = 0
    for cat in catlist:
        catfilePath = dataPath + cat
        catname = cat.split('.')[0]
        catData = np.genfromtxt(catfilePath,
                                dtype='float',
                                usecols=np.arange(2, 15))
        if (catData.shape[0] <= nSamplesPerCat):
            catSample = catData
        else:
            rndsample = np.random.randint(0, catData.shape[0], nSamplesPerCat)
            catSample = catData[rndsample, :]
        if (count == 0):
            cumData = catSample
        else:
            cumData = np.concatenate((cumData, catSample), axis=0)
        count += 1
    # compute the codebook for the dataset
    [CodeBook, label] = kmeans2(cumData,
                                nCodewords,
                                iter=nIterKmeans,
                                minit='points',
                                missing='warn')  #@UnusedVariable
    # write codebook to file
    cbfilepath = rootPath + dataset + cbDir + dataset + str(
        nCodewords) + codebookext
    cbfile = open(cbfilepath, 'w')
    np.savetxt(
        cbfile,
        CodeBook,
        fmt='%f',
        delimiter=' ',
    )
    cbfile.close()
    # compute the bag-of-features histogram for each image
    for cat in catlist:
        catfilePath = dataPath + cat
        catname = cat.split('.')[0]
        catData = np.genfromtxt(catfilePath,
                                dtype='float',
                                usecols=np.arange(2, 15))
        [catLabel, catDist] = vq(catData, CodeBook)  #@UnusedVariable
        catImgId = np.genfromtxt(catfilePath,
                                 dtype=np.int,
                                 usecols=np.arange(15, 16))
        catId = np.genfromtxt(catfilePath,
                              dtype=np.int,
                              usecols=np.arange(16, 17))[0]
        ImgId = np.unique(catImgId)
        catboffilepath = rootPath + dataset + bofDir + catname + str(
            nCodewords) + bofext
        catboffile = open(catboffilepath, 'w')
        imgcount = 0
        for imgid in ImgId:
            imgLabel = catLabel[catImgId == imgid]
            [hist, edges] = np.histogram(imgLabel,
                                         nCodewords)  #@UnusedVariable
            if imgcount == 0:
                dataout = np.hstack((hist.T, imgid, catId))
            else:
                dataout = np.vstack((dataout, np.hstack(
                    (hist.T, imgid, catId))))
            imgcount += 1
            print('%s : %s' % (catname, imgid))
        np.savetxt(
            catboffile,
            dataout,
            fmt='%d',
            delimiter=' ',
        )
        catboffile.close()
    return 0
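#
# The main() above builds a k-means codebook with kmeans2 and then turns the
# quantized labels of each image's rows into a bag-of-features histogram.  A
# minimal sketch of that core pattern on synthetic data (the file layout and
# dataset-specific columns are omitted; all names below are illustrative):
#
import numpy as np
from scipy.cluster.vq import kmeans2, vq

features = np.random.rand(500, 13)                   # stand-in per-row descriptors
codebook, _ = kmeans2(features, 20, minit='points')  # 20-word codebook
word_ids, _ = vq(features[:40], codebook)            # quantize one "image" worth of rows
bof_hist, _ = np.histogram(word_ids, bins=range(len(codebook) + 1))
print(bof_hist)                                      # bag-of-features vector, length 20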
Ejemplo n.º 46
0
    def testCluster(self):
        print "< testCluster >"
        numVertices = 8
        graph = SparseGraph(GeneralVertexList(numVertices))

        graph.addEdge(0, 1)
        graph.addEdge(0, 2)
        graph.addEdge(1, 2)

        graph.addEdge(3, 4)
        graph.addEdge(3, 5)
        graph.addEdge(4, 5)

        graph.addEdge(0, 3)

        W = graph.getWeightMatrix()

        graphIterator = []
        graphIterator.append(W[0:6, 0:6].copy())
        W[1, 6] += 1
        W[6, 1] += 1
        graphIterator.append(W[0:7, 0:7].copy())
        W[4, 7] += 1
        W[7, 4] += 1
        graphIterator.append(W.copy())
        graphIterator = iter(graphIterator)

        k = 2
        clusterer = NingSpectralClustering(k)
        clustersList = clusterer.cluster(
            toSparseGraphListIterator(graphIterator))

        #Why are the bottom rows of Q still zero?

        #Try example in which only edges change
        numVertices = 7
        graph = SparseGraph(GeneralVertexList(numVertices))

        graph.addEdge(0, 1)
        graph.addEdge(0, 2)
        graph.addEdge(1, 2)

        graph.addEdge(3, 4)

        WList = []
        W = graph.getWeightMatrix()
        WList.append(W[0:5, 0:5].copy())

        graph.addEdge(3, 5)
        graph.addEdge(4, 5)
        W = graph.getWeightMatrix()
        WList.append(W[0:6, 0:6].copy())

        graph.addEdge(0, 6)
        graph.addEdge(1, 6)
        graph.addEdge(2, 6)
        W = graph.getWeightMatrix()
        WList.append(W[0:7, 0:7].copy())

        iterator = iter(WList)
        clustersList = clusterer.cluster(toSparseGraphListIterator(iterator))

        #Seems to work, amazingly
        #print(clustersList)

        #Try removing rows/cols
        W2 = W[0:5, 0:5]
        W3 = W[0:4, 0:4]
        WList = [W, W2, W3]
        iterator = iter(WList)
        clustersList = clusterer.cluster(toSparseGraphListIterator(iterator))

        #nptst.assert_array_equal(clustersList[0][0:5], clustersList[1])
        nptst.assert_array_equal(clustersList[1][0:4], clustersList[2])

        #Make sure 1st clustering (without updates) is correct
        L = GraphUtils.normalisedLaplacianRw(scipy.sparse.csr_matrix(W))
        numpy.random.seed(21)
        lmbda, Q = scipy.sparse.linalg.eigs(L,
                                            min(k, L.shape[0] - 1),
                                            which="SM",
                                            ncv=min(20 * k, L.shape[0]),
                                            v0=numpy.random.rand(L.shape[0]))

        V = VqUtils.whiten(Q)
        centroids, distortion = vq.kmeans(V, k, iter=20)
        clusters, distortion = vq.vq(V, centroids)

        #This should be equal but the eigenvector computation is unstable
        #even with repeated runs (and no way to set the seed)
        nptst.assert_array_equal(clusters, clustersList[0])
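#
# The test above clusters a graph by taking the smallest eigenvectors of a
# normalised Laplacian and running k-means on them.  A minimal, self-contained
# sketch of the same idea on the first test graph, using the unnormalised
# Laplacian and a dense eigendecomposition instead of GraphUtils/eigs so it
# runs on its own (labels may come out swapped between runs):
#
import numpy as np
from scipy.cluster.vq import kmeans, vq

W = np.zeros((6, 6))
for i, j in [(0, 1), (0, 2), (1, 2), (3, 4), (3, 5), (4, 5), (0, 3)]:
    W[i, j] = W[j, i] = 1.0                 # two triangles joined by one edge

L = np.diag(W.sum(axis=1)) - W              # unnormalised graph Laplacian D - W
eigvals, eigvecs = np.linalg.eigh(L)        # symmetric, so eigh applies
V = eigvecs[:, :2]                          # embed on the 2 smallest eigenvectors
centroids, _ = kmeans(V, 2)
clusters, _ = vq(V, centroids)
print(clusters)                             # e.g. [0 0 0 1 1 1] up to label swap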
Ejemplo n.º 47
0
# -*- coding: utf-8 -*-
"""
K-means clustering

@author: David André Rodríguez Méndez (AndreRdz7)
"""
# Import libraries
import numpy as np
from scipy.cluster.vq import vq, kmeans
# Create datasets
data = np.random.random(90).reshape(30, 3)
c1 = np.random.choice(range(len(data)))
c2 = np.random.choice(range(len(data)))
# Getting k
clust_centers = np.vstack([data[c1], data[c2]])
print(clust_centers)
print(vq(data, clust_centers))
# K-means
kmeans(data, clust_centers)
Ejemplo n.º 48
0
from scipy.cluster.vq import kmeans, vq, whiten
from numpy import vstack, array
from numpy.random import rand

# data generation with three features
data = vstack((rand(100, 3) + array([.5, .5, .5]), rand(100, 3)))
print(data)

# whitening of data
data = whiten(data)
print(data)

# computing K-Means with K = 3 (3 clusters)
print("-------------computing K-Means with K = 3 (3 clusters)--------------")
centroids, _ = kmeans(data, 3)
print(centroids)

# assign each sample to a cluster
clx, _ = vq(data, centroids)

# check clusters of observation
print(clx)
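#
# whiten() above simply rescales each feature (column) by its standard
# deviation across all observations so every feature has unit variance.  A
# short self-contained check of that equivalence, assuming only numpy/scipy:
#
import numpy as np
from scipy.cluster.vq import whiten

obs = np.random.rand(100, 3)
manual = obs / obs.std(axis=0)            # per-feature division by the std
assert np.allclose(whiten(obs), manual)   # whiten() performs exactly this rescaling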
Ejemplo n.º 49
0
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 12 16:48:24 2017

@author: Xinyu Li
"""

from scipy.cluster.vq import kmeans,vq
from numpy import vstack
from matplotlib.finance import quotes_historical_yahoo_ochl
from datetime import datetime
start = datetime(2014,7,1)
end = datetime(2014,9,30)
listDji = ['AXP','BA','CAT','CSCO','CVX','DD','DIS','GE','GS','HD','IBM',
'INTC','JNJ','JPM','KO','MCD','MMM','MRK','MSFT','NKE','PFE','PG','T','TRV',
'UNH','UTX','V','VZ','WMT','XOM']
quotes = [ [0 for col in range(90)] for row in range(30)]
listTemp = [ [0 for col in range(90)] for row in range(30)]
for i in range(30):
    quotes[i] = quotes_historical_yahoo_ochl(listDji[i], start, end)
days = len(quotes[0])
for i in range(30):
    for j in range(days-1):
        if (quotes[i][j][2] and quotes[i][j+1][2] and (quotes[i][j+1][2] >= quotes[i][j][2])):
            listTemp[i][j] = 1.0   
        else:
            listTemp[i][j] = -1.0
data = vstack(listTemp)
centroids,_ = kmeans(data,4)   #float or double is supported
result,_= vq(data,centroids)
print result
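#
# The cluster codes in result line up with the tickers in listDji, so the
# stocks in each cluster can be listed by index.  A small self-contained
# sketch of that grouping step on synthetic +1/-1 movement vectors (the
# ticker names below are only a stand-in for listDji):
#
import numpy as np
from scipy.cluster.vq import kmeans, vq

names = ['AXP', 'BA', 'CAT', 'CSCO']                       # illustrative tickers
moves = np.where(np.random.rand(4, 89) > 0.5, 1.0, -1.0)   # +1/-1 daily up/down moves
centroids, _ = kmeans(moves, 2)
codes, _ = vq(moves, centroids)
for c in range(len(centroids)):
    print([n for n, k in zip(names, codes) if k == c])     # tickers per cluster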
Ejemplo n.º 50
0
def sphere_tissue_image(size=100, n_points=12, n_layers=1):

    center = np.array([size / 2, size / 2, size / 2], float)

    coords = np.transpose(np.mgrid[0:size, 0:size, 0:size],
                          (1, 2, 3, 0)).reshape((np.power(size,
                                                          3), 3)).astype(int)
    coords_distances = np.linalg.norm(coords - center, axis=1)

    points = {}
    layer_img = {}

    for layer in xrange(n_layers):

        radius = (layer + 1) * size / float(2 * n_layers + 1)

        layer_n_points = n_points * np.power(layer + 1, 2)

        layer_points = {}

        for p in range(layer_n_points):
            theta = np.random.rand() * 2. * np.pi
            phi = np.random.rand() * np.pi - np.pi / 2.

            layer_points[p + np.power(layer, 2) * n_points +
                         3] = center + radius * np.array([
                             np.cos(theta) * np.cos(phi),
                             np.sin(theta) * np.cos(phi),
                             np.sin(phi)
                         ])
            layer_points = array_dict(layer_points)

            point_target_area = 4. * np.pi * np.power(
                radius, 2.) / float(layer_n_points)
            point_target_distance = np.power(point_target_area / np.pi, 0.5)

            sigma_deformation = (size / 100.) * (20. / layer_n_points)
            omega_forces = dict(distance=0.1 * size / 100.,
                                repulsion=100.0 * np.power(size / 100., 2))

            for iterations in xrange(100):
                point_vectors = np.array([
                    layer_points[p] - layer_points.values()
                    for p in layer_points.keys()
                ])
                point_distances = np.array([
                    vq(layer_points.values(), np.array([layer_points[p]]))[1]
                    for p in layer_points.keys()
                ])
                point_vectors = point_vectors / (
                    point_distances[..., np.newaxis] + 1e-7)

                point_distance_forces = omega_forces['distance'] * (
                    (point_distances - point_target_distance)[..., np.newaxis]
                    * point_vectors / point_target_distance).sum(axis=1)

                point_repulsion_forces = omega_forces['repulsion'] * np.power(
                    point_target_distance,
                    2) * (point_vectors / (np.power(point_distances, 2) +
                                           1e-7)[..., np.newaxis]).sum(axis=1)

                point_forces = np.zeros((len(layer_points), 3))
                point_forces += point_distance_forces
                point_forces += point_repulsion_forces

                point_forces = np.minimum(
                    1.0, sigma_deformation / np.linalg.norm(
                        point_forces, axis=1))[:, np.newaxis] * point_forces

                new_points = layer_points.values() + point_forces

                new_points = center + radius * (
                    (new_points - center) / np.linalg.norm(
                        (new_points - center), axis=1)[:, np.newaxis])

                layer_points = array_dict(new_points, layer_points.keys())

            for p in layer_points.keys():
                points[p] = layer_points[p]

            labels = layer_points.keys()[vq(coords, layer_points.values())[0]]

            layer_img[layer + 1] = np.ones((size, size, size), np.uint8)
            layer_img[layer + 1][tuple(np.transpose(coords))] = labels

    points[2] = center
    points = array_dict(points)

    # coords = np.transpose(np.mgrid[0:size,0:size,0:size],(1,2,3,0)).reshape((np.power(size,3),3)).astype(int)
    # labels = points.keys()[vq(coords,points.values())[0]]

    img = np.ones((size, size, size), np.uint8)

    for layer in xrange(n_layers):
        layer_coords = coords[
            (coords_distances > (2 * layer + 1) * size / float(4 *
                                                               (n_layers + 1)))
            & (coords_distances <=
               (2 * layer + 3) * size / float(4 * (n_layers + 1)))]
        img[tuple(np.transpose(layer_coords))] = layer_img[layer + 1][tuple(
            np.transpose(layer_coords))]

    center_coords = coords[coords_distances <= size / float(4 *
                                                            (n_layers + 1))]
    img[tuple(np.transpose(center_coords))] = 2

    ext_coords = coords[coords_distances > (n_layers + 1) * size /
                        float(2 * (n_layers + 2))]
    img[tuple(np.transpose(ext_coords))] = 1
    img = SpatialImage(img, voxelsize=(60. / size, 60. / size, 60. / size))

    return img
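#
# The vq(coords, layer_points.values()) calls above use vector quantization as
# a nearest-seed (Voronoi) labeller: every voxel coordinate receives the index
# of its closest seed point.  A minimal 2-D sketch of that idea, with
# arbitrary illustrative seed positions:
#
import numpy as np
from scipy.cluster.vq import vq

size = 8
coords = np.transpose(np.mgrid[0:size, 0:size],
                      (1, 2, 0)).reshape((-1, 2)).astype(float)
seeds = np.array([[1., 1.], [6., 2.], [3., 6.]])
nearest, _ = vq(coords, seeds)            # index of the closest seed per coordinate
print(nearest.reshape((size, size)))      # a tiny Voronoi labelling of the grid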
Ejemplo n.º 51
0
def pr07((xs, ys, ), var_number):
    template_vars = {}

    template_vars['var_number'] = var_number

    data = np.column_stack((
        xs,
        ys,
    )).astype('float')

    figure_filenames = []

    for n_clusters in [2, 3, 4]:
        centroids, __ = vq.kmeans(data, n_clusters)

        idx, __ = vq.vq(data, centroids)

        plt.figure()

        for i in xrange(n_clusters):
            plt.plot(
                data[idx == i, 0],
                data[idx == i, 1],
                'o',
                markersize=2,
                color=CLUSTER_COLORS[i],
            )

        plt.plot(centroids[:, 0],
                 centroids[:, 1],
                 '*',
Ejemplo n.º 52
0
    def KMEANS(self):
        # clusters
        K = 3

        data_arr = []
        meal_name_arr = []

        with open('./NewDataSet/Cluster_dataset/clusterisbnrate.csv',
                  'rb') as f:
            reader = csv.reader(f)
            for row in reader:
                if reader.line_num != 1:
                    '''for x in row[2:]:
		    		print x'''
                    data_arr.append([float(x) for x in row[1:]])
                    meal_name_arr.append([row[0]])

        data = vstack(data_arr)
        print "data  :"
        print data
        meal_name = vstack(meal_name_arr)

        # normalization
        data = whiten(data)

        # computing K-Means with K (clusters)
        centroids, distortion = kmeans(data, 3)
        print "distortion = " + str(distortion)

        # assign each sample to a cluster
        cntr = []
        print("Centroids:")
        print centroids
        cntr = centroids
        print("Cntr  :")
        print cntr
        print "---------------------------------------------------------"

        print("Centroids after sort:")
        #centroids=cntr.sort()
        #print centroids.sort()

        print "---------------------------------------------------------"
        idx, _ = vq(data, centroids)
        print "idx:"
        print idx
        print "-----------------------------------------------------------"
        '''# some plotting using numpy's logical indexing
	    plot(data[idx==0,0], data[idx==0,1],'ob',
		 data[idx==1,0], data[idx==1,1],'or',
		 data[idx==2,0], data[idx==2,1],'og')'''

        print meal_name
        print data

        for i in range(K):
            print centroids[i] * 3
            #print round(centroids[i])

        print "max value:"
        max1 = max(centroids)

        print "min value:"
        min1 = min(centroids)
        toprated = []
        lowrated = []
        medrated = []
        for i in range(K):
            result_names = meal_name[idx == i, 0]

            print "================================="
            vv = round(centroids[i])
            print vv
            name = ""
            print "Cluster " + str(i + 1)
            for name1 in result_names:
                name = name1
                print name1
                '''if(i== 0) :
		    	f1.write(name)
		    elif (i==1):
		    	f2.write(name)
		    elif (i==2):
		    	f3.write(name)'''

                if (centroids[i] == max1):
                    #for name1 in result_names:
                    toprated.append(name)
                    name = name1 + "\n"
                    f1.write(name)
                elif (centroids[i] == min1):
                    lowrated.append(name)
                    name = name1 + "\n"
                    f3.write(name)
                else:
                    medrated.append(name)
                    name = name1 + "\n"
                    f2.write(name)

        print "--------------------------------------------------------------------------"
        print "toprated:"
        print toprated
        print "--------------------------------------------------------------------------"
        print "medrated:"
        print medrated
        print "--------------------------------------------------------------------------"
        print "lowrated:"
        print lowrated
        print "--------------------------------------------------------------------------"
        '''plot(centroids[:,0],
		 centroids[:,1],
		 'sg',markersize=8)'''

        show()
Ejemplo n.º 53
0
 def quantize(pixels, palette):
     """quantize an image with a given color palette"""
     # pixels = np.reshape(img, (img.shape[0] * img.shape[1], 3))
     qnt, _ = vq(pixels, palette)
     centers_idx = np.reshape(qnt, (pixels.shape[0]))
     return centers_idx
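#
# quantize() above maps each pixel to the index of its nearest palette colour
# with vq; the image is then rebuilt by indexing the palette with those
# labels.  A short self-contained usage sketch with random pixels and an
# illustrative 4-colour palette:
#
import numpy as np
from scipy.cluster.vq import vq

pixels = np.random.randint(0, 256, size=(64, 3)).astype(float)   # 64 RGB pixels
palette = np.array([[0, 0, 0], [255, 0, 0], [0, 255, 0], [0, 0, 255]], float)
idx, _ = vq(pixels, palette)        # nearest palette entry per pixel
quantized = palette[idx]            # reconstruct pixels from palette colours
print(quantized[:5])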
Ejemplo n.º 54
0
 def test_vq(self):
     initc = np.concatenate(([[X[0]], [X[1]], [X[2]]]))
     for tp in np.array, np.matrix:
         label1, dist = _vq.vq(tp(X), tp(initc))
         assert_array_equal(label1, LABEL1)
         tlabel1, tdist = vq(tp(X), tp(initc))
Ejemplo n.º 55
0
def make_codeword(features, codebook):
    codeword, _ = vq(features, codebook)
    return codeword
Ejemplo n.º 56
0
        '''
        Draw Keypoints
        '''
        # img=cv2.drawKeypoints(gray,kp,img)
        # cv2.imwrite(target + '_keypoints1.jpg',img)

        if (count % batch_size == 0 or len(image_paths) - count  < 3):
            print "saving:", count, "/", len(image_paths)
            '''
            K-Means
            '''
            # kmeans.partial_fit(descriptors)
            i = 0
            for des in des_list:
                words, distance = vq(des, kmeans.cluster_centers_)
                for w in words:
                    im_features[i][w] += 1
                i += 1
            des_list = []
            # descriptors = np.array([], dtype=np.int32).reshape(0,128)
            print "K-MEANS Partial Completed", str(time.time() - t0)

            '''
            SVM
            '''
            lin_clf.fit(im_features, im_classes)
            with open("model_svm.pickle", "wb") as f1, open("clusters.pickle", "wb") as f2:
                pickle.dump(lin_clf, f1)
                pickle.dump(kmeans, f2)
            if count % test_split == 0:
Ejemplo n.º 57
0
 def computeHistograms(self, codebook, descriptors):
     code, dist = vq.vq(descriptors, codebook)
     histogram_of_words, bin_edges = histogram(
         code, bins=range(codebook.shape[0] + 1), normed=True)
     return histogram_of_words
Ejemplo n.º 58
0
def train_categorical_feature(feature_input, outcome, limit,
                              number_of_clusters):

    input = feature_input.values

    if len(pd.unique(input)) == 2:
        vocabulary = np.unique(input)
        p = np.array([0, 1])
        d = np.zeros(len(input), dtype=np.int)
        d[input == vocabulary[1]] = 1
        output = dict(zip(["d", "vocabulary", "p"], [d, vocabulary, p]))
        print output
        return output

    vocabulary_t = pd.unique(input)
    count_1 = np.zeros(len(vocabulary_t), dtype=int)
    count_0 = np.copy(count_1)

    outcome_1 = outcome.values == 1
    outcome_0 = outcome.values == 0
    for index, item in enumerate(vocabulary_t):
        if pd.notnull(item):
            count_1[index] = sum((input == item) * (outcome_1))
            count_0[index] = sum((input == item) * (outcome_0))
        else:
            count_1[index] = sum(pd.isnull(input) * (outcome_1))
            count_0[index] = sum(pd.isnull(input) * (outcome_0))

    condition = (count_0 + count_1) >= limit
    condition[pd.isnull(vocabulary_t)] = True

    #    n = sum(condition)
    #    vocabulary = np.zeros(n, dtype = str)
    #    p = np.zeros(n)

    def log_ratio(count_1, count_0):
        if count_1 == 0:
            return log(1 / (2 * float(count_0)))
        elif count_0 == 0:
            return log(2 * count_1)
        else:
            return log(count_1 / float(count_0))

    v_log_ratio = np.vectorize(log_ratio)

    vocabulary = vocabulary_t[condition]
    p = v_log_ratio(count_1[condition], count_0[condition])

    #    index = 0
    #    for i in range(len(vocabulary_t)):
    #        if (condition[i]):
    #            vocabulary[index] = str(vocabulary_t[index])
    #            p[index] = log_ratio(count_1[index], count_0[index])
    #            index = index + 1
    #            if (count_1[index] == 0):
    #                p[index] = log(1./(2*count_0[index]))
    #            elif (count_0[index] == 0):
    #                p[index] = log(2*count_1[index])
    #            else:
    #                p[index] = log(count_1[index]./count_0[index])
    # print "sum(condition == 0) is {0}".format(sum(condition == 0))
    if sum(condition == 0) <= 1:
        if sum(condition == 0) == 1:
            p = np.append(
                p,
                log_ratio(count_1[condition == 0][0],
                          count_0[condition == 0][0]))
            #           if (count_1[condition == 0][0] == 0):
            #                p[condition == 0] = log(1./(2*count_0[condition == 0][0]))
            #           elif (count_0[condition == 0] == 0):
            #                p[condition == 0] = log(2*count_1[condition == 0][0])
            #           else:
            #                p[condition == 0] = log(count_1[condition == 0][0]./count_0[condtion == 0][0])
            vocabulary = np.append(vocabulary, vocabulary_t[condition == 0])
    else:
        # print "number of clusters {0}".format(number_of_clusters)
        cl = min(number_of_clusters,
                 sum(condition == 0) - 1)  # why is it -1 here?
        # cl_vocabulary = pd.DataFrame()
        # print "cl {0}".format(cl)
        residual_1 = count_1[condition == 0]
        residual_0 = count_0[condition == 0]
        # print "length of the residual_1 {0}".format(len(residual_1))
        #        s = np.zeros(len(residual_1))
        s = v_log_ratio(residual_1, residual_0).reshape([len(residual_1), 1])
        whitened = whiten(s)
        codebook = kmeans(whitened, cl)[0]
        code = vq(whitened, codebook)[0]
        # print "length of code {0}".format(len(code))
        s1 = pd.Series(data=vocabulary_t[condition == 0])  # .astype(str)
        s2 = pd.Series(data=code)
        cl_vocabulary = pd.DataFrame.from_dict({
            "cat_feature_input": s1,
            "cluster_id": s2
        })

        #print cl_vocabulary.axes

        cl_p = np.zeros(cl, dtype=float)
        # print cl_p, len(cl_p)

        for i in range(cl):
            # print i
            c1 = residual_1[code == i]
            c0 = residual_0[code == i]
            cl_p[i] = log_ratio(sum(c1), sum(c0))
            # print "Hey"

    d = np.zeros(len(input))
    d[pd.isnull(input)] = p[pd.isnull(vocabulary)]

    for i in range(len(vocabulary)):
        d[input == vocabulary[i]] = p[i]
    vocabulary = vocabulary.astype(str)

    if 'cl_vocabulary' in locals():
        print "cl_vocabulary in locals()"
        for i in range(len(cl_vocabulary)):
            d[input == cl_vocabulary.loc[i, "cat_feature_input"]] = cl_p[
                cl_vocabulary.loc[i, "cluster_id"]]
        #print cl_vocabulary.axes
        cl_vocabulary.loc[:, "cat_feature_input"] = cl_vocabulary[
            "cat_feature_input"].astype(str)
        # print cl_vocabulary["cat_feature_input"].apply(type)

        output = dict(
            zip(["d", "vocabulary", "cl_vocabulary", "p", "cl_p"],
                [d, vocabulary, cl_vocabulary, p, cl_p]))
    else:
        output = dict(zip(["d", "vocabulary", "p"], [d, vocabulary, p]))

    #print output
    return output
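#
# The clustering branch above groups rare categories by their smoothed
# log(count_1 / count_0) ratios: the one-column ratio vector is whitened,
# k-means finds a small codebook, and vq assigns each rare category to a
# cluster.  A minimal sketch of that step with illustrative log-odds values:
#
import numpy as np
from scipy.cluster.vq import kmeans, vq, whiten

log_odds = np.array([-1.2, -1.1, 0.1, 0.2, 1.4, 1.5]).reshape(-1, 1)
whitened = whiten(log_odds)
codebook, _ = kmeans(whitened, 3)        # group rare categories into 3 clusters
cluster_id, _ = vq(whitened, codebook)
print(cluster_id)                        # categories with similar odds share an id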
Ejemplo n.º 59
0
    os.mkdir(base_path+os.sep+str(id))
    #f, axes = plt.subplots(nrows=2, ncols=1, figsize=(15, 10))
    sarraster = plt.imread(base_path+os.sep+'subj'+os.sep+str(id)+'.gif')
    # Removing speckles
    sarraster = ndi.median_filter(sarraster , size=2)
    # Flatten image to get line of values
    flatsarraster = sarraster.flatten().astype(float)

    print(flatsarraster.shape)

    # In remaining subplots add k-means classified images
    for i in range(2,6):
        #This scipy code classifies k-mean, code has same length as flattened
        #SAR raster and defines which class the SAR value corresponds to
        centroids, variance = kmeans(flatsarraster, i)
        code, distance = vq(flatsarraster, centroids)
        fig = plt.figure()
        fig.suptitle('K-Means Classification')

        # In first subplot add original SAR image
        ax = plt.subplot(241)
        plt.axis('off')
        ax.set_title('Original Image')
        plt.imshow(sarraster, cmap = 'gray')
        print(sarraster.shape)
        #Since code contains the classified values, reshape into SAR dimensions
        codeim = code.reshape(sarraster.shape[0], -1)
        print(codeim.shape)
        #codeim = ndi.median_filter(codeim , size=4)
        for j in range(i):
            #Plot the subplot with (i+2)th k-means
Ejemplo n.º 60
0
def displayResult():


    noOfCluster=0
    #Get Radio button input to check user choice
    chart = request.form['radio']
    #If user choice is cluster
    if chart == 'cluster':
        noOfCluster =long(request.form['cluster'])
        data_arr = []
        meal_name_arr = []
        #Url of data csv
        url='https://storage.googleapis.com/cloudbucket786/imptry4.csv'
        response=urllib2.urlopen(url)
        reader = csv.reader(response)
        
        for row in reader:
                if row[5] is None:
                    row[5]=0
                if row[5]=='':
                    row[5]=0
                if "," in row[6] :
                    rowVal=row[6].split(",")
                    row[6]=rowVal[0]+''+rowVal[1]
                    row[6]=float(row[6])
                if row[6]=='':
                    row[6]=0
                if row[6]=='N' :
                    row[6]=0
                if "," in row[7] :
                    rowVal=row[7].split(",")
                    row[7]=rowVal[0]+''+rowVal[1]
                    row[7]=float(row[7])
                if row[7]=='':
                    row[7]=0
                if row[7]=='N' :
                    row[7]=0
                data_arr.append([float(x) for x in row[5:]])#adding data to data_array
                meal_name_arr.append([row[0]])#adding ids to second array



    #print data_arr
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')#We are using 3D projection as we are plotting 3D data
        data = vstack( data_arr )

        meal_name = vstack(meal_name_arr)
    # normalization
        data = whiten(data)#Before running k-means, it is beneficial to rescale each feature dimension of the observation set with whitening.
    #Each feature is divided by its standard deviation across all observations to give it unit variance.

    # computing K-Means with K (clusters)
        centroids, distortion = kmeans(data,noOfCluster)


    # assign each sample to a cluster
        idx,_ = vq(data,centroids)

    # some plotting using numpy's logical indexing
        listOfColor=['ob','or','og','oc','om','ok','oy']
        for index in range(noOfCluster):
            plot(data[idx==index,0], data[idx==index,1],data[idx==index,2],listOfColor[index])# using 3 objects for 3D projection
        for index in range(noOfCluster):
            result_names = meal_name[idx==index, 0]
            print "================================="
            print "Cluster " + str(index+1)
            for name in result_names:
                print name

        plot(centroids[:,0],
             centroids[:,1],
             centroids[:,2],
             'oy',markersize=8)
        #saving file to temp image
        #Assigning labels to axis
        ax.set_xlabel('X Label')
        ax.set_ylabel('Y Label')
        ax.set_zlabel('Z Label')
        pylab.savefig('temp.jpg')
        pylab.clf()
        

        image="https://www.pythonanywhere.com/user/abhitej/files/home/abhitej/temp.jpg"
        #Overwrites the image on pythonanywhere.com

        return render_template('home.html',image=image,display='display:block;')

    else:
        list=[]
        words=request.form['words']
        list=words.split(",")

        list1=[]
        for s in list:
            list1.append(s.encode('ascii','ignore'))

        return render_template('home.html',list1=list1,display='display:none;')# Assigning display none for cluster if user chooce wordcloud