import random

import numpy
import numpy.random

from ms import MeanShift

# Create a simple data set...
a = numpy.random.normal(3.0, 1.0, 100)
b = numpy.random.normal(5.0, 0.5, 50)

data = numpy.concatenate((a, b))

# Setup the mean shift object...
ms = MeanShift()
ms.set_data(data, 'd')

ms.set_kernel(random.choice(filter(lambda s: s != 'fisher', ms.kernels())))
ms.set_spatial(random.choice(ms.spatials()))

# Print out basic stats...
print 'kernel = %s; spatial = %s' % (ms.get_kernel(), ms.get_spatial())
print 'exemplars = %i; features = %i' % (ms.exemplars(), ms.features())
print 'quality = %.3f; epsilon = %.3f; iter_cap = %i' % (ms.quality, ms.epsilon, ms.iter_cap)
print

# Query the mode of various points...
for x in numpy.arange(0.0, 7.0, 0.4):
  mode = ms.mode(numpy.array([x]))
  print '%.3f: mode = %.3f' % (x, mode)
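# Aside: a minimal sketch of the fixed-point iteration that mode() performs
# (assumed behaviour, for illustration only) - plain numpy, and it uses a unit
# bandwidth Gaussian kernel rather than whichever kernel was selected above...
def mode_sketch(data, x, epsilon = 1e-3, iter_cap = 1024):
  for _ in xrange(iter_cap):
    w = numpy.exp(-0.5 * numpy.square(data - x)) # Kernel weight of each exemplar.
    nx = (w * data).sum() / w.sum() # Weighted mean = one mean shift step.
    if abs(nx - x) < epsilon:
      break
    x = nx
  return x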
data = numpy.concatenate((a, b, c, d), axis=0)

# Use mean shift to cluster it...
ms = MeanShift()
ms.set_data(data, 'df')

ms.set_kernel(random.choice(filter(lambda s: s != 'fisher', ms.kernels())))
ms.set_spatial(random.choice(ms.spatials()))

modes, indices = ms.cluster()

# Print out basic stats...
print 'kernel = %s; spatial = %s' % (ms.get_kernel(), ms.get_spatial())
print 'exemplars = %i; features = %i' % (ms.exemplars(), ms.features())
print 'quality = %.3f; epsilon = %.3f; iter_cap = %i' % (ms.quality, ms.epsilon, ms.iter_cap)
print

# Print out a grid of cluster assignments...
for j in xrange(20):
  for i in xrange(20):
    fv = numpy.array([0.25*j, 0.25*i])
    c = ms.assign_cluster(fv)
    print c,
  print
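# Aside: a minimal sketch of the merge step that cluster() implies (assumed
# behaviour) - exemplars whose converged modes land within merge_range of an
# already-seen mode share that cluster, otherwise a new cluster is started...
def merge_modes_sketch(converged, merge_range):
  modes = []
  indices = numpy.empty(converged.shape[0], dtype=numpy.int32)
  for i, mode in enumerate(converged):
    for ci, existing in enumerate(modes):
      if numpy.sqrt(numpy.square(mode - existing).sum()) < merge_range:
        indices[i] = ci # Close enough to an existing mode - reuse its cluster.
        break
    else:
      indices[i] = len(modes) # A new mode - start a new cluster.
      modes.append(mode)
  return numpy.array(modes), indices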
# Create a dataset - equally spaced samples weighted by a Gaussian, such that it should estimate a Gaussian...
x = numpy.arange(-5.0, 5.0, 0.02)
y = numpy.exp(-0.5 * x**2.0 / 2.0)

data = numpy.concatenate((x.reshape((-1, 1)), y.reshape((-1, 1))), axis=1)

# Setup the mean shift object...
ms = MeanShift()
ms.set_data(data, "df")

ms.set_kernel(random.choice(filter(lambda s: s != "fisher", ms.kernels())))
ms.set_spatial(random.choice(ms.spatials()))
ms.set_scale(numpy.ones(2), 1)

# Iterate and calculate the probability at every point...
sam = numpy.arange(-5.0, 5.0, 0.15)
prob = numpy.array(map(lambda v: ms.prob(numpy.array([v, 1.0])), sam))

# Print out basic stats...
print "kernel = %s; spatial = %s" % (ms.get_kernel(), ms.get_spatial())
print "exemplars = %i; features = %i" % (ms.exemplars(), ms.features())
print

# Visualise the output...
for threshold in numpy.arange(prob.max(), 0.0, -prob.max() / 15.0):
  print "".join(map(lambda p: "|" if p > threshold else " ", prob))
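# Aside: prob() is evaluating a kernel density estimate - for a unit bandwidth
# Gaussian kernel it is the mean of a normalised Gaussian centred on each
# exemplar (a sketch with an identity scale, for illustration only)...
def kde_prob_sketch(data, x):
  delta = data - x[numpy.newaxis, :]
  norm = (2.0 * numpy.pi) ** (-0.5 * data.shape[1]) # Gaussian normalising constant.
  return numpy.mean(norm * numpy.exp(-0.5 * numpy.square(delta).sum(axis=1)))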
# Now for the directional kernels...
for kernel in ['fisher', 'mirror_fisher']:
  for dim, area in zip(dir_dimensions, dir_area):
    for conc in dir_conc:
      # Create a mean shift object pointing in the [1, 0, ...] direction with the given concentration...
      data = numpy.array([1.0] + [0.0]*(dim-1), dtype=numpy.float32)

      ms = MeanShift()
      ms.set_data(data, 'f')
      ms.set_kernel('%s(%.1f)' % (kernel, conc))
      ms.quality = 1.0

      # Create uniform samples on the hyper-sphere with which we are dealing - abuse the MeanShift object by drawing with a Gaussian kernel and normalising...
      uniform = MeanShift()
      uniform.set_data(numpy.array([0.0]*dim, dtype=numpy.float32), 'f')
      uniform.set_kernel('gaussian')

      sample = uniform.draws(samples)
      div = numpy.sqrt(numpy.square(sample).sum(axis=1))
      sample /= div[:, numpy.newaxis]

      # Evaluate the probabilities of the uniform directions...
      p = ms.probs(sample)

      # Print their average - should again be one...
      volume = p.mean() * area
      print 'Kernel = %s; Dims = %i | Monte-Carlo vol = %.3f (max = %.1f)' % (ms.get_kernel(), dim, volume, ms.prob(data))
      print
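# Aside: the dir_area values used above should be the surface areas of the
# relevant unit hyper-spheres, which in d dimensions is 2 * pi^(d/2) / Gamma(d/2)
# - a sketch of that calculation (scipy assumed available), handy for checking
# that mean * area ~= 1 is testing the right normalisation...
import scipy.special

def sphere_area(dim):
  return 2.0 * numpy.pi**(0.5 * dim) / scipy.special.gamma(0.5 * dim)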
kernel = 'composite(2:composite(1:gaussian, 1:gaussian), 2:fisher(%(ca)s), 2:mirror_fisher(%(cb)s))' # Don't ever do this - just checking that a composite kernel within a composite kernel doesn't break things!

ms = MeanShift()
ms.set_data(data, 'df', None, 'rAA')
ms.set_kernel(kernel % {'ca': 64.0, 'cb': 64.0})

# Use the MeanShiftCompositeScale object to optimise...
optimise_scale = MeanShiftCompositeScale(kernel)
optimise_scale.add_param_scale(0)
optimise_scale.add_param_kernel('ca')
optimise_scale.add_param_kernel('cb')

steps = optimise_scale(ms)

print 'Optimisation of "a" took %i steps' % steps
print 'kernel = %s' % ms.get_kernel()
print 'scale = %s' % ms.get_scale()
print

# Visualise - input and a draw from the input...
def visualise(fn, data):
  img = numpy.zeros((size, size, 3), dtype=numpy.float32)

  for sample in data:
    bx = numpy.cos(sample[0]) * sample[1]
    by = numpy.sin(sample[0]) * sample[1]

    s_x = (size-1) * 0.5 * (1.0 + bx / scale)
    s_y = (size-1) * 0.5 * (1.0 + by / scale)

    e_x = s_x + angle_len * numpy.cos(sample[2])
    e_y = s_y + angle_len * numpy.sin(sample[2]) # Assumed continuation - mirrors e_x.
# Build a MeanShift object for a given Fisher concentration (the def line and
# object creation are assumed from the call sites below: a power of two, plus
# an optional approximation code)...
def ms_by_conc(power, code = ''):
  ms = MeanShift()
  ms.quality = 0.5
  ms.set_data(numpy.array([1, 0, 0], dtype=numpy.float32), 'f')
  ms.set_kernel('fisher(%.1f%s)' % (2**power, code))
  ms.set_spatial('kd_tree')
  return ms

options = map(ms_by_conc, xrange(8)) + [ms_by_conc(8, 'c'), ms_by_conc(8, 'a')] + map(ms_by_conc, xrange(9, 16))

# Create it and do the bandwidth estimation...
ms = MeanShift()
ms.set_data(data, 'df')

p = ProgBar()
best = ms.scale_loo_nll_array(options, p.callback)
del p

print 'Selected kernel =', ms.get_kernel()
print 'LOO score =', best

# Visualise the best option...
visualise('bandwidth_fisher.png', ms)

# Also visualise correct vs approximate, for sanity checking...
for option in [ms_by_conc(8, 'c'), ms_by_conc(8, 'a')]: #options:
  ms.copy_all(option)
  visualise('bandwidth_fisher_%s.png' % option.get_kernel(), ms)
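# Aside: a minimal sketch of the leave-one-out negative log-likelihood that
# scale_loo_nll_array is minimising (assumed behaviour), written for a plain
# unit-bandwidth Gaussian KDE rather than the Fisher kernels above...
def loo_nll_sketch(data):
  norm = (2.0 * numpy.pi) ** (-0.5 * data.shape[1])
  nll = 0.0
  for i in xrange(data.shape[0]):
    rest = numpy.delete(data, i, axis=0) # All exemplars except the held-out one.
    p = numpy.mean(norm * numpy.exp(-0.5 * numpy.square(rest - data[i]).sum(axis=1)))
    nll -= numpy.log(max(p, 1e-64)) # Clamp to dodge log(0).
  return nll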
ms = MeanShift()
ms.set_data(numpy.array([1.0] + [0.0]*(dim-1), dtype=numpy.float32), 'f')
ms.set_kernel('%s(%.1f)' % (kernel, conc))
ms.quality = 1.0

# Draw lots of samples from it...
sample = ms.draws(samples_dir)

# Get the probability of each...
p1 = ms.probs(sample)

# Throw away samples where p1 is 0 - they are a result of the range optimisation, and break the below...
keep = p1 > 1e-6
sample = sample[keep, :]
p1 = p1[keep]

# Do a KDE of the samples, including bandwidth estimation...
kde = MeanShift()
kde.set_data(sample, 'df')
kde.set_kernel('fisher(%f)' % (conc*32)) # Hardly ideal - need something more independent/safer!
kde.set_spatial('kd_tree')

# Calculate a stochastic KL-divergence between the kde and the actual distribution...
p2 = kde.probs(sample)
kld = numpy.sum(numpy.log(p1/p2)) / samples_dir

# Print output to screen...
print 'Kernel = %s; Dims = %i | KL-divergence = %.6f' % (ms.get_kernel(), dim, kld)
print
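# Aside: with the samples drawn from the true distribution (p1), the quantity
# above is the standard Monte-Carlo estimate of KL(true || kde),
#   KL ~= (1/N) * sum_i log(p1(x_i) / p2(x_i)),
# so it should head towards zero as the KDE converges on the truth...
def kl_sketch(p_true, p_est):
  return numpy.mean(numpy.log(p_true / p_est))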
  axis=1)
  data.append(block)

data = numpy.concatenate(data, axis=0)

# Construct the mean shift object from it, including a composite kernel...
ms = MeanShift()
ms.set_data(data, 'df')
ms.set_kernel('composite(2:gaussian,2:fisher(32.0))')
ms.set_spatial('kd_tree')
ms.set_scale(numpy.array([10.0, 5.0, 1.0, 1.0]))
ms.merge_range = 0.05

# Print out information in a convoluted way to test some convoluted features!..
ms2 = MeanShift()
ms2.copy_kernel(ms)
print 'Kernel:', ms2.get_kernel()
del ms2

# For our first trick visualise the data set...
img = numpy.zeros((size, size, 3), dtype=numpy.float32)

for sample in data:
  s_x = (size-1) * sample[1] / scale
  s_y = (size-1) * sample[0] / scale
  e_x = (size-1) * (sample[1] + angle_len * sample[3]) / scale
  e_y = (size-1) * (sample[0] + angle_len * sample[2]) / scale

  for i in xrange(angle_step):
    t = float(i) / (angle_step-1)
    t_x = int(t * s_x + (1-t) * e_x)
    t_y = int(t * s_y + (1-t) * e_y)
    img[t_y, t_x, :] = 1.0 # Assumed continuation - paint each interpolated pixel along the segment.
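# Aside: the composite kernel spec appears to read <#features>:<kernel> per
# entry, so 'composite(2:gaussian,2:fisher(32.0))' gives the two position
# features a Gaussian kernel and the two direction features (a unit vector) a
# Fisher kernel with concentration 32 - which is why set_scale above supplies
# exactly four values, one per feature.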