def test_update_point_less_points(self, PointSet):
    """Update a point while the structure is still below its init threshold.

    Only nine points from the `PointSet` fixture are added one at a time,
    after which the test checks that `initialized` is still False before
    replacing the first point with a fresh random tuple.

    NOTE(review): the expected length of 1 after adding nine points looks
    surprising -- confirm against how FastPair reports its length before
    it has been initialized.
    """
    source_points = PointSet
    pairs = FastPair()
    for point in source_points[:9]:
        pairs += point
    assert pairs.initialized is False

    # Any stored point will do as the one to replace; take the first.
    original = source_points[0]
    replacement = rand_tuple(len(source_points[0]))
    pairs._update_point(original, replacement)
    assert len(pairs) == 1
def test_merge_closest(self):
    """Merge the closest pair over and over until one point is left.

    The random seed is fixed so that the single surviving point can be
    compared against a known value.
    """
    # This needs to be 'fleshed' out more... lots of things to test here
    random.seed(1234)
    cloud = PointSet(d=4)
    pairs = FastPair().build(cloud)

    remaining = len(cloud)
    while remaining > 1:
        _, (kept, dropped) = pairs.closest_pair()
        merged = interact(kept, dropped)
        pairs -= dropped  # Drop the second point of the pair...
        pairs._update_point(kept, merged)  # ...and move the first one
        remaining -= 1
    assert len(pairs) == 1 == remaining

    expected = (
        0.69903599809571437,
        0.52457534006594131,
        0.7614753848101149,
        0.37011695654655385,
    )
    assert all_close(pairs.points[0], expected)

    # Fewer than two points are left now, so asking for a pair must fail
    with pytest.raises(ValueError):
        pairs.closest_pair()
def test_update_point(self, PointSet):
    """Replace one point and check the bookkeeping for its replacement.

    After `_update_point(old, new)` the test checks that `old` is gone,
    `new` is present, the size is unchanged, and that the neighbor entry
    stored for `new` matches a brute-force nearest-neighbor search.
    """
    ps = PointSet
    fp = FastPair().build(ps)
    assert len(fp) == len(ps)

    old = ps[0]  # Just grab the first point...
    new = rand_tuple(len(ps[0]))
    fp._update_point(old, new)
    assert old not in fp
    assert new in fp
    assert len(fp) == len(ps)  # Size shouldn't change

    # Brute-force nearest neighbor of `new`. `old` is skipped on purpose:
    # it is still listed in `ps` but has just been asserted to be absent
    # from `fp`, so it can never be the neighbor the structure reports.
    candidates = [(fp.dist(new, q), q) for q in ps if q != old]
    res = min(candidates, key=itemgetter(0))
    neigh = fp.neighbors[new]
    assert abs(res[0] - neigh["dist"]) < 1e-8
    assert res[1] == neigh["neigh"]
def test_merge_closest(self):
    """Collapse fifty random 4-tuples into one point by repeated merging.

    The random seed is fixed so that the single surviving point can be
    compared against a known value.
    """
    # This needs to be 'fleshed' out more... lots of things to test here
    random.seed(1234)
    cloud = [rand_tuple(4) for _ in range(50)]
    pairs = FastPair().build(cloud)

    remaining = len(cloud)
    while remaining > 1:
        _, (kept, dropped) = pairs.closest_pair()
        merged = interact(kept, dropped)
        pairs -= dropped  # Drop the second point of the pair...
        pairs._update_point(kept, merged)  # ...and move the first one
        remaining -= 1
    assert len(pairs) == 1 == remaining

    expected = (
        0.69903599809571437,
        0.52457534006594131,
        0.7614753848101149,
        0.37011695654655385,
    )
    assert all_close(pairs.points[0], expected)

    # Fewer than two points are left now, so asking for a pair must fail
    with pytest.raises(ValueError):
        pairs.closest_pair()
def test_update_point(self):
    """Replace one point and check the bookkeeping for its replacement.

    After `_update_point(old, new)` the test checks that `old` is gone,
    `new` is present, the size is unchanged, and that the neighbor entry
    stored for `new` matches a brute-force nearest-neighbor search.
    """
    ps = PointSet()
    fp = FastPair().build(ps)
    assert len(fp) == len(ps)

    old = ps[0]  # Just grab the first point...
    new = rand_tuple(len(ps[0]))
    fp._update_point(old, new)
    assert old not in fp
    assert new in fp
    assert len(fp) == len(ps)  # Size shouldn't change

    # Brute-force nearest neighbor of `new`. `old` is skipped on purpose:
    # it is still listed in `ps` but has just been asserted to be absent
    # from `fp`. Including it made the expected neighbor occasionally be a
    # point the structure cannot report, which made this test flaky.
    candidates = [(fp.dist(new, q), q) for q in ps if q != old]
    res = min(candidates, key=itemgetter(0))
    neigh = fp.neighbors[new]
    assert abs(res[0] - neigh["dist"]) < 1e-8
    assert res[1] == neigh["neigh"]
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation of
    a redundant centroid; the redundant centroid is then set equal to the new
    data point. At any time, the data-structure can be queried for the current
    set of centroids/clusters, or updated with additional data points.
    """

    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a higher
            resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the underlying FastPair structure.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to build the centroid object that
            is stored in the sketch.
        """
        self.kmax = kmax
        self.npoints = 0
        self.centroid_factory = centroid_factory
        # NOTE(review): 10 is presumably FastPair's minimum number of points
        # before it builds its neighbor structure -- confirm.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is modified in place and returned, which is what lets
        `sketch += p` (as used by `batch`) work.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroids stored in the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: Move the closest centroid towards the point."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances.
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: Merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            dist, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: Set redundant centroid equal to new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point in `points` to the sketch and return the sketch."""
        # No checks, no nothing... just batch processing, pure and simple
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean size of the centroids that have
        a positive size. When no centroid has a positive size (including an
        empty sketch) there is no mean to compare against, so an empty list
        is returned instead of raising ZeroDivisionError.
        """
        sizes = [x.size for x in self if x.size > 0]
        if not sizes:
            return []
        threshold = (sum(sizes) / len(sizes)) * p
        return [x for x in self if x.size >= threshold]

    @property
    def centroids(self):
        """Centers of all stored centroids (for plotting)."""
        return [c.center for c in self.fastpair]
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation of
    a redundant centroid; the redundant centroid is then set equal to the new
    data point. At any time, the data-structure can be queried for the current
    set of centroids/clusters, or updated with additional data points.
    """

    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a higher
            resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the underlying FastPair structure.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to build the centroid object that
            is stored in the sketch.
        """
        self.kmax = kmax
        self.npoints = 0
        self.centroid_factory = centroid_factory
        # NOTE(review): 10 is presumably FastPair's minimum number of points
        # before it builds its neighbor structure -- confirm.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is modified in place and returned, which is what lets
        `sketch += p` (as used by `batch`) work.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroids stored in the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: Move the closest centroid towards the point."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances.
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: Merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            dist, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: Set redundant centroid equal to new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point in `points` to the sketch and return the sketch."""
        # No checks, no nothing... just batch processing, pure and simple
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean size of the centroids that have
        a positive size. When no centroid has a positive size (including an
        empty sketch) there is no mean to compare against, so an empty list
        is returned instead of raising ZeroDivisionError.
        """
        sizes = [x.size for x in self if x.size > 0]
        if not sizes:
            return []
        threshold = (sum(sizes) / len(sizes)) * p
        return [x for x in self if x.size >= threshold]

    @property
    def centroids(self):
        """Centers of all stored centroids (for plotting)."""
        return [c.center for c in self.fastpair]