def test_call_and_closest_pair(self, PointSet):
    """Check that calling the structure agrees with `closest_pair`.

    The result is also compared with the brute-force search: distances
    within a small tolerance, point pairs exactly.
    """
    fast_pair = FastPair().build(PointSet)
    expected = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()

    # Calling the instance must give the same answer as `closest_pair()`
    assert fast_pair() == expected
    # Distances are floats, so allow a tolerance; the pair must be identical
    assert abs(expected[0] - brute[0]) < 1e-8
    assert expected[1] == brute[1]
def test_call_and_closest_pair(self):
    """Check that calling the structure agrees with `closest_pair`.

    A fresh `PointSet()` is used as input; the result is also compared with
    the brute-force search (distance within tolerance, pair exactly).
    """
    point_set = PointSet()
    fast_pair = FastPair().build(point_set)
    via_method = fast_pair.closest_pair()
    via_brute_force = fast_pair.closest_pair_brute_force()

    # Calling the instance must give the same answer as `closest_pair()`
    assert fast_pair() == via_method
    # Distances are floats, so allow a tolerance; the pair must be identical
    assert abs(via_method[0] - via_brute_force[0]) < 1e-8
    assert via_method[1] == via_brute_force[1]
def test_merge_closest(self):
    """Merge the closest pair repeatedly until a single point is left.

    This needs to be fleshed out more; there are lots of things to test
    here.
    """
    random.seed(1234)
    point_set = PointSet(d=4)
    fast_pair = FastPair().build(point_set)

    remaining = len(point_set)
    while remaining >= 2:
        _, (kept, dropped) = fast_pair.closest_pair()
        merged = interact(kept, dropped)
        fast_pair -= dropped  # Drop the second point of the pair
        fast_pair._update_point(kept, merged)  # Replace the first with the merge
        remaining -= 1

    assert len(fast_pair) == 1
    assert remaining == 1

    # Expected final point for the fixed seed above
    expected = (
        0.69903599809571437,
        0.52457534006594131,
        0.7614753848101149,
        0.37011695654655385,
    )
    assert all_close(fast_pair.points[0], expected)

    # Fewer than two points remain, so asking for a pair must fail
    with pytest.raises(ValueError):
        fast_pair.closest_pair()
def test_all_closest_pairs(self):
    """Compare the closest pair with brute force and an exhaustive scan."""
    point_set = PointSet()
    fast_pair = FastPair().build(point_set)
    found = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()

    # Brute force is expected to report the pair in the same order
    assert abs(found[0] - brute[0]) < 1e-8
    assert found[1] == brute[1]  # Tuple comparison

    # Independent scan over every 2-combination of the input points
    scanned = min(
        ((fast_pair.dist(p, q), (p, q)) for p, q in combinations(point_set, 2)),
        key=itemgetter(0),
    )
    assert abs(found[0] - scanned[0]) < 1e-8
    # The scan may report the pair in either order, so compare sorted
    assert sorted(found[1]) == sorted(scanned[1])
def test_call_and_closest_pair_min_points(self, image_array):
    """Closest pair on a structure filled one point at a time.

    After adding every item of `image_array` the structure must still
    report `initialized is False` and hold 6 points; `closest_pair` must
    nevertheless agree with calling the instance and with brute force.
    """
    fast_pair = FastPair(dist=image_distance)
    for image in image_array:
        fast_pair += image

    assert fast_pair.initialized is False
    assert len(fast_pair) == 6

    expected = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()
    assert fast_pair() == expected
    # Distances are floats, so allow a tolerance; the pair must be identical
    assert abs(expected[0] - brute[0]) < 1e-8
    assert expected[1] == brute[1]
def test_all_closest_pairs(self, PointSet):
    """Compare the closest pair with brute force and an exhaustive scan."""
    fast_pair = FastPair().build(PointSet)
    closest = fast_pair.closest_pair()
    brute = fast_pair.closest_pair_brute_force()

    # Brute force is expected to report the pair in the same order
    assert abs(closest[0] - brute[0]) < 1e-8
    assert closest[1] == brute[1]  # Tuple comparison

    # Independent scan over every 2-combination of the input points
    candidates = (
        (fast_pair.dist(first, second), (first, second))
        for first, second in combinations(PointSet, 2)
    )
    reference = min(candidates, key=itemgetter(0))
    assert abs(closest[0] - reference[0]) < 1e-8
    # The scan may report the pair in either order, so compare sorted
    assert sorted(closest[1]) == sorted(reference[1])
def test_merge_closest(self):
    """Merge the closest pair repeatedly until a single point is left.

    This needs to be fleshed out more; there are lots of things to test
    here.
    """
    random.seed(1234)
    samples = [rand_tuple(4) for _ in range(50)]
    fast_pair = FastPair().build(samples)

    points_left = len(samples)
    while points_left >= 2:
        _, (survivor, absorbed) = fast_pair.closest_pair()
        replacement = interact(survivor, absorbed)
        fast_pair -= absorbed  # Drop the second point of the pair
        fast_pair._update_point(survivor, replacement)
        points_left -= 1

    assert len(fast_pair) == 1
    assert points_left == 1

    # Expected final point for the fixed seed above
    expected_point = (
        0.69903599809571437,
        0.52457534006594131,
        0.7614753848101149,
        0.37011695654655385,
    )
    assert all_close(fast_pair.points[0], expected_point)

    # Fewer than two points remain, so asking for a pair must fail
    with pytest.raises(ValueError):
        fast_pair.closest_pair()
def test_cluster(self):
    """Cluster by repeated merging and verify each step with a full scan.

    On every iteration the closest pair reported by FastPair is merged, and
    the same step is replayed on a plain point list using an exhaustive
    search; both must stay in agreement until one point remains.
    """
    reference = PointSet()
    fast_pair = FastPair().build(reference)

    for _ in range(len(fast_pair) - 1):
        # FastPair side: remove both points of the pair, insert the merge
        fp_dist, (a, b) = fast_pair.closest_pair()
        merged = interact(a, b)
        fast_pair -= b
        fast_pair -= a
        fast_pair += merged

        # Reference side: exhaustive scan (pair order may be reversed here)
        scan_dist, (e, f) = min(
            ((fast_pair.dist(p, q), (p, q)) for p, q in combinations(reference, 2)),
            key=itemgetter(0),
        )
        expected = interact(e, f)

        assert abs(scan_dist - fp_dist) < 1e-8
        assert (a == e or b == e) and (b == f or a == f)
        assert merged == expected

        reference.remove(e)
        reference.remove(f)
        reference.append(expected)
        assert contains_same(fast_pair.points, reference)

    assert len(fast_pair.points) == len(reference) == 1
def test_cluster(self, PointSet):
    """Cluster by repeated merging and verify each step with a full scan.

    The `PointSet` fixture is mutated in step with the FastPair structure:
    each merge chosen by FastPair must match the pair found by an exhaustive
    search over the fixture, until one point remains in both.
    """
    fast_pair = FastPair().build(PointSet)
    merge_count = len(fast_pair) - 1

    for _ in range(merge_count):
        # FastPair side: remove both points of the pair, insert the merge
        dist, (a, b) = fast_pair.closest_pair()
        c = interact(a, b)
        fast_pair -= b
        fast_pair -= a
        fast_pair += c

        # Fixture side: exhaustive scan (pair order may be reversed here)
        scored_pairs = (
            (fast_pair.dist(left, right), (left, right))
            for left, right in combinations(PointSet, 2)
        )
        d, (e, f) = min(scored_pairs, key=itemgetter(0))
        g = interact(e, f)

        assert abs(d - dist) < 1e-8
        assert (a == e or b == e) and (b == f or a == f)
        assert c == g

        PointSet.remove(e)
        PointSet.remove(f)
        PointSet.append(g)
        assert contains_same(fast_pair.points, PointSet)

    assert len(fast_pair.points) == len(PointSet) == 1
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation
    of a redundant centroid; the redundant centroid is then set equal to the
    new data point. At any time, the data-structure can be queried for the
    current set of centroids/clusters, or updated with additional data
    points.
    """

    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a
            higher resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function passed to the underlying FastPair structure.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to build the centroid object
            that is stored in the sketch.
        """
        self.kmax = kmax
        self.npoints = 0  # Number of points added so far
        self.centroid_factory = centroid_factory
        # NOTE(review): the positional 10 is presumably FastPair's minimum
        # point count -- confirm against FastPair's signature.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is updated in place and `self` is returned, which is what
        lets `sketch += p` (as used by `batch`) work.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroid objects held by the FastPair structure."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: Move the closest centroid towards the point."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: Merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            _, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: Set redundant centroid equal to new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point in `points` in order and return `self`.

        No checks, no nothing... just batch processing, pure and simple.
        """
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean `size` of the centroids whose
        size is positive; centroids with `size >= threshold` are returned.
        When no centroid has a positive size (including an empty sketch)
        the mean is undefined, so an empty list is returned instead of
        raising `ZeroDivisionError`.
        """
        sub = [x.size for x in self if x.size > 0]
        if not sub:
            return []
        t = (sum(sub) / len(sub)) * p
        return [x for x in self if x.size >= t]

    @property
    def centroids(self):
        """List of the `center` of every stored centroid (for plotting)."""
        return [c.center for c in self.fastpair]
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation
    of a redundant centroid; the redundant centroid is then set equal to the
    new data point. At any time, the data-structure can be queried for the
    current set of centroids/clusters, or updated with additional data
    points.
    """

    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a
            higher resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function passed to the underlying FastPair structure.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to build the centroid object
            that is stored in the sketch.
        """
        self.kmax = kmax
        self.npoints = 0  # Number of points added so far
        self.centroid_factory = centroid_factory
        # NOTE(review): the positional 10 is presumably FastPair's minimum
        # point count -- confirm against FastPair's signature.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is updated in place and `self` is returned, which is what
        lets `sketch += p` (as used by `batch`) work.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroid objects held by the FastPair structure."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: Move the closest centroid towards the point."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: Merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            _, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: Set redundant centroid equal to new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point in `points` in order and return `self`.

        No checks, no nothing... just batch processing, pure and simple.
        """
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean `size` of the centroids whose
        size is positive; centroids with `size >= threshold` are returned.
        When no centroid has a positive size (including an empty sketch)
        the mean is undefined, so an empty list is returned instead of
        raising `ZeroDivisionError`.
        """
        sub = [x.size for x in self if x.size > 0]
        if not sub:
            return []
        t = (sum(sub) / len(sub)) * p
        return [x for x in self if x.size >= t]

    @property
    def centroids(self):
        """List of the `center` of every stored centroid (for plotting)."""
        return [c.center for c in self.fastpair]