Esempio n. 1
0
 def test_find_neighbor_and_sdist(self, PointSet):
     """Cross-check `_find_neighbor` and `sdist` against a brute-force scan.

     A query point produced by `rand_tuple` is compared with every point of
     the `PointSet` fixture; the closest point found that way must agree
     with what `_find_neighbor` and `sdist` report.
     """
     points = PointSet
     pairs = FastPair().build(points)
     query = rand_tuple(len(points[0]))
     # `_find_neighbor` is a private helper; it is (ab)used here to look up
     # the neighbor of a point that was never inserted.
     found = pairs._find_neighbor(query)
     # The reported distance must match a direct distance computation.
     direct = pairs.dist(query, found["neigh"])
     assert abs(direct - found["dist"]) < 1e-8
     # The lookup must not have added the query point to the structure.
     assert len(pairs) == len(points)
     # Brute force: distance from the query to every original point.
     brute = [(pairs.dist(query, other), other) for other in points]
     best_dist, best_point = min(brute, key=itemgetter(0))
     assert abs(best_dist - found["dist"]) < 1e-8
     assert best_point == found["neigh"]
     # The smallest entry yielded by `sdist` must name the same neighbor.
     closest = min(pairs.sdist(query), key=itemgetter(0))
     assert abs(found["dist"] - closest[0]) < 1e-8
     assert found["neigh"] == closest[1]
Esempio n. 2
0
 def test_find_neighbor_and_sdist(self):
     """Cross-check `_find_neighbor` and `sdist` against a brute-force scan.

     A query point produced by `rand_tuple` is compared with every point
     returned by `PointSet()`; the closest point found that way must agree
     with what `_find_neighbor` and `sdist` report.
     """
     points = PointSet()
     pairs = FastPair().build(points)
     query = rand_tuple(len(points[0]))
     # `_find_neighbor` is a private helper; it is (ab)used here to look up
     # the neighbor of a point that was never inserted.
     found = pairs._find_neighbor(query)
     # The reported distance must match a direct distance computation.
     direct = pairs.dist(query, found["neigh"])
     assert abs(direct - found["dist"]) < 1e-8
     # The lookup must not have added the query point to the structure.
     assert len(pairs) == len(points)
     # Brute force: distance from the query to every original point.
     brute = [(pairs.dist(query, other), other) for other in points]
     best_dist, best_point = min(brute, key=itemgetter(0))
     assert abs(best_dist - found["dist"]) < 1e-8
     assert best_point == found["neigh"]
     # The smallest entry yielded by `sdist` must name the same neighbor.
     closest = min(pairs.sdist(query), key=itemgetter(0))
     assert abs(found["dist"] - closest[0]) < 1e-8
     assert found["neigh"] == closest[1]
Esempio n. 3
0
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation of
    a redundant centroid; the redundant centroid is then set equal to the new
    data point. At any time, the data-structure can be queried for the current
    set of centroids/clusters, or updated with additional data points.
    """
    def __init__(self,
                 kmax=100,
                 dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a higher
            resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the underlying `FastPair` structure.
            The default is evaluated once, when the class is defined, and is
            therefore shared by every instance that does not override it.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to wrap it in a centroid object.
        """
        self.kmax = kmax
        self.npoints = 0  # Number of points added so far
        self.centroid_factory = centroid_factory
        # NOTE(review): the positional `10` is presumably FastPair's
        # minimum-points setting -- confirm against FastPair's signature.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is modified in place and `self` is returned, so both
        `sketch + p` and `sketch += p` update (and evaluate to) this object.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of centroids currently held by the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroids stored in the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: move the closest stored centroid towards centroid `c`."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            # The pair's distance is not needed, only the two centroids.
            _, (a, b) = self.fastpair.closest_pair()
            # `b` must be removed before `a` is replaced by the merged
            # centroid; keep this statement order.
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: set the redundant centroid equal to the new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point of the iterable `points`, in order; return self.

        No checks, no nothing... just batch processing, pure and simple.
        """
        for point in points:
            # No `__iadd__` is defined, so `+=` falls back to `__add__`,
            # which mutates this sketch and returns it.
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean size of the centroids whose size
        is positive.

        Parameters
        ----------
        p : float, default=0.01
            Fraction of the mean positive centroid size a centroid must reach
            to be kept.

        Returns
        -------
        list
            The centroids whose `size` is at least the threshold. An empty
            list is returned when no centroid has a positive size (for
            example, when the sketch is empty).
        """
        sizes = [x.size for x in self if x.size > 0]
        if not sizes:
            # No positive sizes means the mean is undefined; bail out rather
            # than raise ZeroDivisionError.
            return []
        t = (sum(sizes) / len(sizes)) * p
        return [x for x in self if x.size >= t]

    @property
    def centroids(self):
        """List of the `center` of every stored centroid (for plotting)."""
        return [c.center for c in self.fastpair]
Esempio n. 4
0
class AddC(object):
    """Implements the AddC clustering algorithm.

    For each data point arriving, the closest centroid to the incoming point
    is moved towards the point. If there are more than `kmax` centroids,
    then the two closest centroids are merged. This results in the creation of
    a redundant centroid; the redundant centroid is then set equal to the new
    data point. At any time, the data-structure can be queried for the current
    set of centroids/clusters, or updated with additional data points.
    """
    def __init__(self, kmax=100, dist=kernel_dist(gaussian),
                 centroid_factory=KernelCentroid):
        """Initialize an empty AddC sketch.

        Parameters
        ----------
        kmax : int, default=100
            The maximum number of cluster centroids to store (i.e., size of
            memory). This parameter controls the 'scale' of the desired
            solution, such that larger values of `kmax` will lead to a higher
            resolution cluster solution.
        dist : callable, default=kernel_dist(gaussian)
            Distance function handed to the underlying `FastPair` structure.
            The default is evaluated once, when the class is defined, and is
            therefore shared by every instance that does not override it.
        centroid_factory : callable, default=KernelCentroid
            Called with each incoming point to wrap it in a centroid object.
        """
        self.kmax = kmax
        self.npoints = 0  # Number of points added so far
        self.centroid_factory = centroid_factory
        # NOTE(review): the positional `10` is presumably FastPair's
        # minimum-points setting -- confirm against FastPair's signature.
        self.fastpair = FastPair(10, dist=dist)

    def __add__(self, p):
        """Add a point to the AddC sketch.

        The sketch is modified in place and `self` is returned, so both
        `sketch + p` and `sketch += p` update (and evaluate to) this object.
        """
        c = self.centroid_factory(p)  # Create an 'empty' centroid at point `p`
        self._step_one(c)
        self._step_two()
        self._step_three(c)
        self.npoints += 1  # Update count of points seen so far
        return self

    def __len__(self):
        """Number of centroids currently held by the AddC sketch."""
        return len(self.fastpair)

    def __call__(self):
        """Return the current set of cluster centroids."""
        return self.centroids

    def __contains__(self, p):
        """Test if a given cluster centroid is in the AddC sketch."""
        return p in self.fastpair

    def __iter__(self):
        """Iterate over the centroids stored in the sketch."""
        return iter(self.fastpair)

    def _step_one(self, c):
        """Step 1: move the closest stored centroid towards centroid `c`."""
        if len(self.fastpair) > 0:
            # Single pass through list of neighbor points... this could also
            # be sped up with a spatial index, though harder to do with
            # kernel-induced distances
            # Alternatively, if it was possible to insert the new data point
            # _before_ querying for the closest pair (`step_two`), then we
            # could do it that way...
            old = min(self.fastpair.sdist(c), key=itemgetter(0))[1]
            self.fastpair._update_point(old, old.add(c))

    def _step_two(self):
        """Step 2: merge the two closest centroids once `kmax` is reached."""
        if len(self.fastpair) >= self.kmax and len(self.fastpair) > 1:
            # The pair's distance is not needed, only the two centroids.
            _, (a, b) = self.fastpair.closest_pair()
            # `b` must be removed before `a` is replaced by the merged
            # centroid; keep this statement order.
            self.fastpair -= b
            self.fastpair._update_point(a, a.merge(b))  # Update point `a`

    def _step_three(self, c):
        """Step 3: set the redundant centroid equal to the new point."""
        self.fastpair += c

    def batch(self, points):
        """Add every point of the iterable `points`, in order; return self.

        No checks, no nothing... just batch processing, pure and simple.
        """
        for point in points:
            # No `__iadd__` is defined, so `+=` falls back to `__add__`,
            # which mutates this sketch and returns it.
            self += point
        return self

    def trim(self, p=0.01):
        """Return only clusters over threshold.

        The threshold is `p` times the mean size of the centroids whose size
        is positive.

        Parameters
        ----------
        p : float, default=0.01
            Fraction of the mean positive centroid size a centroid must reach
            to be kept.

        Returns
        -------
        list
            The centroids whose `size` is at least the threshold. An empty
            list is returned when no centroid has a positive size (for
            example, when the sketch is empty).
        """
        sizes = [x.size for x in self if x.size > 0]
        if not sizes:
            # No positive sizes means the mean is undefined; bail out rather
            # than raise ZeroDivisionError.
            return []
        t = (sum(sizes)/len(sizes)) * p
        return [x for x in self if x.size >= t]

    @property
    def centroids(self):
        """List of the `center` of every stored centroid (for plotting)."""
        return [c.center for c in self.fastpair]