def test_find_neighbor_and_sdist(self, PointSet):
    """Cross-check `_find_neighbor` against a brute-force scan and `sdist`.

    A random query point (never inserted) is looked up three ways; all three
    must agree on the nearest stored point and on its distance.
    """
    tolerance = 1e-8
    points = PointSet
    pairs = FastPair().build(points)
    query = rand_tuple(len(points[0]))

    # `_find_neighbor` is a private helper that is deliberately (ab)used here
    # with a point that is not part of the structure.
    found = pairs._find_neighbor(query)
    recomputed = pairs.dist(query, found["neigh"])
    assert abs(recomputed - found["dist"]) < tolerance

    # The lookup must not have inserted the query point.
    assert len(pairs) == len(points)

    # Brute force: distance from the query to every original point.
    candidates = [(pairs.dist(query, other), other) for other in points]
    best = min(candidates, key=itemgetter(0))
    assert abs(best[0] - found["dist"]) < tolerance
    assert best[1] == found["neigh"]

    # `sdist` is expected to report the same nearest point.
    best = min(pairs.sdist(query), key=itemgetter(0))
    assert abs(found["dist"] - best[0]) < tolerance
    assert found["neigh"] == best[1]
def test_find_neighbor_and_sdist(self):
    """Verify `_find_neighbor` agrees with a linear scan and with `sdist`.

    The query is a random tuple that is never added to the structure, so the
    structure's length must be unchanged after the lookup.
    """
    eps = 1e-8
    point_set = PointSet()
    fast_pair = FastPair().build(point_set)
    probe = rand_tuple(len(point_set[0]))

    # Private helper called on purpose with a point that was never inserted.
    hit = fast_pair._find_neighbor(probe)
    direct = fast_pair.dist(probe, hit["neigh"])
    assert abs(direct - hit["dist"]) < eps

    # Looking up a neighbor must not grow the structure.
    assert len(fast_pair) == len(point_set)

    # Linear scan over every original point as the reference answer.
    scanned = [(fast_pair.dist(probe, pt), pt) for pt in point_set]
    closest = min(scanned, key=itemgetter(0))
    assert abs(closest[0] - hit["dist"]) < eps
    assert closest[1] == hit["neigh"]

    # Same answer expected from the structure's own `sdist` generator.
    closest = min(fast_pair.sdist(probe), key=itemgetter(0))
    assert abs(hit["dist"] - closest[0]) < eps
    assert hit["neigh"] == closest[1]
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation
    of a redundant centroid; the redundant centroid is then set equal to the
    new data point. At any time, the data-structure can be queried for the
    current set of centroids/clusters, or updated with additional data points.
    """

    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a
            higher resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the underlying `FastPair` structure.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to build the centroid object
            that is stored in the sketch.
        """
        self.kmax = kmax
        self.npoints = 0  # Number of points added so far
        self.centroid_factory = centroid_factory
        # NOTE(review): the positional 10 is presumably FastPair's
        # `min_points` setting -- confirm against the FastPair signature.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is updated in place and `self` is returned, which is what
        lets `sketch += p` (as used by `batch`) work without `__iadd__`.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroid objects held by the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: Move the closest centroid towards the point."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: Merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            # Only the pair is needed; the reported distance is ignored.
            _, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: Set redundant centroid equal to new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point in `points` in order and return `self`.

        No checks, no nothing... just batch processing, pure and simple.
        """
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean `size` of all centroids whose
        `size` is positive. Centroids with `size` at or above it are kept.

        Parameters
        ----------
        p : float, default=0.01
            Fraction of the mean positive cluster size used as the cut-off.

        Returns
        -------
        list
            The centroid objects that pass the threshold. Empty when no
            centroid has a positive `size` (including an empty sketch),
            since the mean is undefined in that case.
        """
        sizes = [x.size for x in self if x.size > 0]
        if not sizes:
            # Previously this fell through to a division by zero.
            return []
        threshold = (sum(sizes) / len(sizes)) * p
        return [x for x in self if x.size >= threshold]

    @property
    def centroids(self):
        """List of the `center` of every stored centroid (for plotting)."""
        return [c.center for c in self.fastpair]
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation
    of a redundant centroid; the redundant centroid is then set equal to the
    new data point. At any time, the data-structure can be queried for the
    current set of centroids/clusters, or updated with additional data points.
    """

    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a
            higher resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the underlying `FastPair` structure.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to build the centroid object
            that is stored in the sketch.
        """
        self.kmax = kmax
        self.npoints = 0  # Number of points added so far
        self.centroid_factory = centroid_factory
        # NOTE(review): the positional 10 is presumably FastPair's
        # `min_points` setting -- confirm against the FastPair signature.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is updated in place and `self` is returned, which is what
        lets `sketch += p` (as used by `batch`) work without `__iadd__`.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of points in the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroid objects held by the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: Move the closest centroid towards the point."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: Merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            # Only the pair is needed; the reported distance is ignored.
            _, (a, b) = self.fastpair.closest_pair()
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: Set redundant centroid equal to new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point in `points` in order and return `self`.

        No checks, no nothing... just batch processing, pure and simple.
        """
        for point in points:
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean `size` of all centroids whose
        `size` is positive. Centroids with `size` at or above it are kept.

        Parameters
        ----------
        p : float, default=0.01
            Fraction of the mean positive cluster size used as the cut-off.

        Returns
        -------
        list
            The centroid objects that pass the threshold. Empty when no
            centroid has a positive `size` (including an empty sketch),
            since the mean is undefined in that case.
        """
        sizes = [x.size for x in self if x.size > 0]
        if not sizes:
            # Previously this fell through to a division by zero.
            return []
        threshold = (sum(sizes) / len(sizes)) * p
        return [x for x in self if x.size >= threshold]

    @property
    def centroids(self):
        """List of the `center` of every stored centroid (for plotting)."""
        return [c.center for c in self.fastpair]